jhansi1 committed on
Commit
db1baea
1 Parent(s): 009fbcd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -6
app.py CHANGED
@@ -3,16 +3,30 @@ import streamlit as st
3
  from transformers import pipeline
4
  from datasets import load_dataset
5
  from huggingface_hub import hf_hub_download
6
- from datasets import load_dataset
 
 
 
 
 
7
 
 
 
8
 
 
 
 
9
 
10
  # Initialize text-generation pipeline with the model
11
  model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
12
  pipe = pipeline("text-generation", model=model_name)
13
 
14
- # Load the dataset from the cloned local directory
15
- ds = load_dataset("./canadian-legal-data", split="train",verify=False)
 
 
 
 
16
 
17
  # Gradio Interface setup
18
  def respond(
@@ -51,9 +65,10 @@ def streamlit_interface():
51
  st.title("Canadian Legal Text Generator")
52
  st.write("Enter a prompt related to Canadian legal data and generate text using Llama-3.1.")
53
 
54
- # Show dataset sample
55
  st.subheader("Sample Data from Canadian Legal Dataset:")
56
- st.write(ds[:5]) # Display the first 5 rows of the dataset
 
57
 
58
  # Prompt input
59
  prompt = st.text_area("Enter your prompt:", placeholder="Type something...")
@@ -86,4 +101,4 @@ if __name__ == "__main__":
86
  gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
87
  ],
88
  )
89
- demo.launch()
 
3
  from transformers import pipeline
4
  from datasets import load_dataset
5
  from huggingface_hub import hf_hub_download
6
+ import subprocess
7
+ import os
8
+
9
+ # Clone the dataset repository with git unless repo_dir already exists. NOTE(review): no target directory is passed to git clone, so this presumably relies on the checkout being named after the repo to match repo_dir - confirm.
10
+ repo_url = "https://huggingface.co/datasets/BEE-spoke-data/survivorslib-law-books"
11
+ repo_dir = "./survivorslib-law-books"
12
 
13
+ if not os.path.exists(repo_dir):
14
+ subprocess.run(["git", "clone", repo_url], check=True)
15
 
16
+ # Load train.parquet from the cloned repository through the generic "parquet" loader. NOTE(review): assumes the file sits at the repository root and holds real data after the clone - TODO confirm.
17
+ dataset_path = os.path.join(repo_dir, "train.parquet")
18
+ ds = load_dataset("parquet", data_files=dataset_path)
19
 
20
  # Initialize text-generation pipeline with the model
21
  model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
22
  pipe = pipeline("text-generation", model=model_name)
23
 
24
+ # Preprocess dataset (assuming it has a 'text' or 'content' column for feeding to the model)
25
+ # If the dataset is different, update the field names accordingly
26
+ def preprocess_data(dataset):
27
+ # Return a small sample for display: the first five items of dataset['content'].
28
+ # NOTE(review): assumes the dataset exposes a 'content' column; it might be 'text' or 'paragraph' instead - confirm against the dataset schema.
29
+ return dataset['content'][:5] # Select the 'content' column, then keep only its first 5 entries
30
 
31
  # Gradio Interface setup
32
  def respond(
 
65
  st.title("Canadian Legal Text Generator")
66
  st.write("Enter a prompt related to Canadian legal data and generate text using Llama-3.1.")
67
 
68
+ # Show dataset sample (first 5 entries)
69
  st.subheader("Sample Data from Canadian Legal Dataset:")
70
+ sample_data = preprocess_data(ds['train']) # Assuming 'train' split
71
+ st.write(sample_data) # Display the first 5 rows of the dataset
72
 
73
  # Prompt input
74
  prompt = st.text_area("Enter your prompt:", placeholder="Type something...")
 
101
  gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
102
  ],
103
  )
104
+ demo.launch()