Spaces:
Running
on
Zero
Running
on
Zero
zR
committed on
Commit
•
49cfbf7
1
Parent(s):
5254142
zero GPU
Browse files- README.md +1 -1
- app.py +115 -40
- requirement.txt +3 -5
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 💬
|
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
suggested_hardware: a100-large
|
9 |
app_port: 7860
|
10 |
app_file: app.py
|
|
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.42.0
|
8 |
suggested_hardware: a100-large
|
9 |
app_port: 7860
|
10 |
app_file: app.py
|
app.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import subprocess
|
2 |
import gradio as gr
|
3 |
import torch
|
4 |
from transformers import (
|
@@ -7,14 +6,18 @@ from transformers import (
|
|
7 |
)
|
8 |
import docx
|
9 |
import PyPDF2
|
|
|
|
|
10 |
|
11 |
def convert_to_txt(file):
|
12 |
doc_type = file.split(".")[-1].strip()
|
13 |
if doc_type in ["txt", "md", "py"]:
|
14 |
-
data = [file.read().decode(
|
15 |
elif doc_type in ["pdf"]:
|
16 |
pdf_reader = PyPDF2.PdfReader(file)
|
17 |
-
data = [
|
|
|
|
|
18 |
elif doc_type in ["docx"]:
|
19 |
doc = docx.Document(file)
|
20 |
data = [p.text for p in doc.paragraphs]
|
@@ -23,9 +26,12 @@ def convert_to_txt(file):
|
|
23 |
text = "\n\n".join(data)
|
24 |
return text
|
25 |
|
|
|
26 |
model_name = "THUDM/LongCite-glm4-9b"
|
27 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
28 |
-
model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
29 |
|
30 |
html_styles = """<style>
|
31 |
.reference {
|
@@ -48,19 +54,21 @@ html_styles = """<style>
|
|
48 |
}
|
49 |
</style>\n"""
|
50 |
|
|
|
51 |
def process_text(text):
|
52 |
-
special_char={
|
53 |
-
|
54 |
-
'
|
55 |
-
'"':
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
}
|
60 |
for x, y in special_char.items():
|
61 |
text = text.replace(x, y)
|
62 |
return text
|
63 |
|
|
|
64 |
def convert_to_html(statements, clicked=-1):
|
65 |
html = html_styles + '<br><span class="label">Answer:</span><br>\n'
|
66 |
all_cite_html = []
|
@@ -68,7 +76,7 @@ def convert_to_html(statements, clicked=-1):
|
|
68 |
cite_num2idx = {}
|
69 |
idx = 0
|
70 |
for i, js in enumerate(statements):
|
71 |
-
statement, citations = process_text(js[
|
72 |
if clicked == i:
|
73 |
html += f"""<span class="statement">{statement}</span>"""
|
74 |
else:
|
@@ -79,19 +87,47 @@ def convert_to_html(statements, clicked=-1):
|
|
79 |
for c in citations:
|
80 |
idx += 1
|
81 |
idxs.append(str(idx))
|
82 |
-
cite =
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
all_cite_html.extend(cite_html)
|
85 |
-
cite_num =
|
86 |
cite_num2idx[cite_num] = i
|
87 |
-
cite_num_html = """ <span class="reference" style="color: blue" id={}>{}</span>""".format(
|
|
|
|
|
88 |
html += cite_num_html
|
89 |
-
html +=
|
90 |
if clicked == i:
|
91 |
-
clicked_cite_html =
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
return html, all_cite_html, clicked_cite_html, cite_num2idx
|
94 |
|
|
|
95 |
def render_context(file):
|
96 |
if hasattr(file, "name"):
|
97 |
context = convert_to_txt(file.name)
|
@@ -99,24 +135,35 @@ def render_context(file):
|
|
99 |
else:
|
100 |
raise gr.Error(f"ERROR: no uploaded document")
|
101 |
|
|
|
|
|
102 |
def run_llm(context, query):
|
103 |
if not context:
|
104 |
raise gr.Error("Error: no uploaded document")
|
105 |
if not query:
|
106 |
raise gr.Error("Error: no query")
|
107 |
-
result = model.query_longcite(
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
cite_nums = list(cite_num2idx_dict.keys())
|
111 |
return {
|
112 |
statements: gr.JSON(all_statements),
|
113 |
answer: gr.HTML(answer_html, visible=True),
|
114 |
all_citations: gr.HTML(all_cite_html, visible=True),
|
115 |
cite_num2idx: gr.JSON(cite_num2idx_dict),
|
116 |
-
citation_choices: gr.Radio(cite_nums, visible=len(cite_nums)>0),
|
117 |
clicked_citations: gr.HTML(visible=False),
|
118 |
}
|
119 |
-
|
|
|
120 |
def chose_citation(statements, cite_num2idx, clicked_cite_num):
|
121 |
clicked = cite_num2idx[clicked_cite_num]
|
122 |
answer_html, _, clicked_cite_html, _ = convert_to_html(statements, clicked=clicked)
|
@@ -125,6 +172,7 @@ def chose_citation(statements, cite_num2idx, clicked_cite_num):
|
|
125 |
clicked_citations: gr.HTML(clicked_cite_html, visible=True),
|
126 |
}
|
127 |
|
|
|
128 |
with gr.Blocks() as demo:
|
129 |
gr.Markdown(
|
130 |
"""
|
@@ -142,31 +190,58 @@ with gr.Blocks() as demo:
|
|
142 |
</div>
|
143 |
"""
|
144 |
)
|
145 |
-
|
146 |
with gr.Row():
|
147 |
with gr.Column(scale=4):
|
148 |
-
file = gr.File(
|
149 |
-
|
|
|
|
|
150 |
submit_btn = gr.Button("Submit")
|
151 |
|
152 |
-
with gr.Column(scale=4):
|
153 |
-
context = gr.Textbox(
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
file.upload(render_context, [file], [context])
|
156 |
-
|
157 |
with gr.Row():
|
158 |
with gr.Column(scale=4):
|
159 |
statements = gr.JSON(label="statements", visible=False)
|
160 |
answer = gr.HTML(label="Answer", visible=True)
|
161 |
cite_num2idx = gr.JSON(label="cite_num2idx", visible=False)
|
162 |
-
citation_choices = gr.Radio(
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
166 |
all_citations = gr.HTML(label="All citations", visible=False)
|
167 |
-
|
168 |
-
submit_btn.click(
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
demo.queue()
|
172 |
-
demo.launch()
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
from transformers import (
|
|
|
6 |
)
|
7 |
import docx
|
8 |
import PyPDF2
|
9 |
+
import spaces
|
10 |
+
|
11 |
|
12 |
def convert_to_txt(file):
|
13 |
doc_type = file.split(".")[-1].strip()
|
14 |
if doc_type in ["txt", "md", "py"]:
|
15 |
+
data = [file.read().decode("utf-8")]
|
16 |
elif doc_type in ["pdf"]:
|
17 |
pdf_reader = PyPDF2.PdfReader(file)
|
18 |
+
data = [
|
19 |
+
pdf_reader.pages[i].extract_text() for i in range(len(pdf_reader.pages))
|
20 |
+
]
|
21 |
elif doc_type in ["docx"]:
|
22 |
doc = docx.Document(file)
|
23 |
data = [p.text for p in doc.paragraphs]
|
|
|
26 |
text = "\n\n".join(data)
|
27 |
return text
|
28 |
|
29 |
+
|
30 |
model_name = "THUDM/LongCite-glm4-9b"
|
31 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
32 |
+
model = AutoModelForCausalLM.from_pretrained(
|
33 |
+
model_name, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
|
34 |
+
)
|
35 |
|
36 |
html_styles = """<style>
|
37 |
.reference {
|
|
|
54 |
}
|
55 |
</style>\n"""
|
56 |
|
57 |
+
|
58 |
def process_text(text):
|
59 |
+
special_char = {
|
60 |
+
"&": "&amp;",
|
61 |
+
"'": "&apos;",
|
62 |
+
'"': "&quot;",
|
63 |
+
"<": "&lt;",
|
64 |
+
">": "&gt;",
|
65 |
+
"\n": "<br>",
|
66 |
}
|
67 |
for x, y in special_char.items():
|
68 |
text = text.replace(x, y)
|
69 |
return text
|
70 |
|
71 |
+
|
72 |
def convert_to_html(statements, clicked=-1):
|
73 |
html = html_styles + '<br><span class="label">Answer:</span><br>\n'
|
74 |
all_cite_html = []
|
|
|
76 |
cite_num2idx = {}
|
77 |
idx = 0
|
78 |
for i, js in enumerate(statements):
|
79 |
+
statement, citations = process_text(js["statement"]), js["citation"]
|
80 |
if clicked == i:
|
81 |
html += f"""<span class="statement">{statement}</span>"""
|
82 |
else:
|
|
|
87 |
for c in citations:
|
88 |
idx += 1
|
89 |
idxs.append(str(idx))
|
90 |
+
cite = (
|
91 |
+
"[Sentence: {}-{}\t|\tChar: {}-{}]<br>\n<span {}>{}</span>".format(
|
92 |
+
c["start_sentence_idx"],
|
93 |
+
c["end_sentence_idx"],
|
94 |
+
c["start_char_idx"],
|
95 |
+
c["end_char_idx"],
|
96 |
+
'class="highlight"' if clicked == i else "",
|
97 |
+
process_text(c["cite"].strip()),
|
98 |
+
)
|
99 |
+
)
|
100 |
+
cite_html.append(
|
101 |
+
f"""<span><span class="Bold">Snippet [{idx}]:</span><br>{cite}</span>"""
|
102 |
+
)
|
103 |
all_cite_html.extend(cite_html)
|
104 |
+
cite_num = "[{}]".format(",".join(idxs))
|
105 |
cite_num2idx[cite_num] = i
|
106 |
+
cite_num_html = """ <span class="reference" style="color: blue" id={}>{}</span>""".format(
|
107 |
+
i, cite_num
|
108 |
+
)
|
109 |
html += cite_num_html
|
110 |
+
html += "\n"
|
111 |
if clicked == i:
|
112 |
+
clicked_cite_html = (
|
113 |
+
html_styles
|
114 |
+
+ """<br><span class="label">Citations of current statement:</span><br><div style="overflow-y: auto; padding: 20px; border: 0px dashed black; border-radius: 6px; background-color: #EFF2F6;">{}</div>""".format(
|
115 |
+
"<br><br>\n".join(cite_html)
|
116 |
+
)
|
117 |
+
)
|
118 |
+
all_cite_html = (
|
119 |
+
html_styles
|
120 |
+
+ """<br><span class="label">All citations:</span><br>\n<div style="overflow-y: auto; padding: 20px; border: 0px dashed black; border-radius: 6px; background-color: #EFF2F6;">{}</div>""".format(
|
121 |
+
"<br><br>\n".join(all_cite_html).replace(
|
122 |
+
'<span class="highlight">', "<span>"
|
123 |
+
)
|
124 |
+
if len(all_cite_html)
|
125 |
+
else "No citation in the answer"
|
126 |
+
)
|
127 |
+
)
|
128 |
return html, all_cite_html, clicked_cite_html, cite_num2idx
|
129 |
|
130 |
+
|
131 |
def render_context(file):
|
132 |
if hasattr(file, "name"):
|
133 |
context = convert_to_txt(file.name)
|
|
|
135 |
else:
|
136 |
raise gr.Error(f"ERROR: no uploaded document")
|
137 |
|
138 |
+
|
139 |
+
@spaces.GPU()
|
140 |
def run_llm(context, query):
|
141 |
if not context:
|
142 |
raise gr.Error("Error: no uploaded document")
|
143 |
if not query:
|
144 |
raise gr.Error("Error: no query")
|
145 |
+
result = model.query_longcite(
|
146 |
+
context,
|
147 |
+
query,
|
148 |
+
tokenizer=tokenizer,
|
149 |
+
max_input_length=128000,
|
150 |
+
max_new_tokens=1024,
|
151 |
+
)
|
152 |
+
all_statements = result["all_statements"]
|
153 |
+
answer_html, all_cite_html, clicked_cite_html, cite_num2idx_dict = convert_to_html(
|
154 |
+
all_statements
|
155 |
+
)
|
156 |
cite_nums = list(cite_num2idx_dict.keys())
|
157 |
return {
|
158 |
statements: gr.JSON(all_statements),
|
159 |
answer: gr.HTML(answer_html, visible=True),
|
160 |
all_citations: gr.HTML(all_cite_html, visible=True),
|
161 |
cite_num2idx: gr.JSON(cite_num2idx_dict),
|
162 |
+
citation_choices: gr.Radio(cite_nums, visible=len(cite_nums) > 0),
|
163 |
clicked_citations: gr.HTML(visible=False),
|
164 |
}
|
165 |
+
|
166 |
+
|
167 |
def chose_citation(statements, cite_num2idx, clicked_cite_num):
|
168 |
clicked = cite_num2idx[clicked_cite_num]
|
169 |
answer_html, _, clicked_cite_html, _ = convert_to_html(statements, clicked=clicked)
|
|
|
172 |
clicked_citations: gr.HTML(clicked_cite_html, visible=True),
|
173 |
}
|
174 |
|
175 |
+
|
176 |
with gr.Blocks() as demo:
|
177 |
gr.Markdown(
|
178 |
"""
|
|
|
190 |
</div>
|
191 |
"""
|
192 |
)
|
193 |
+
|
194 |
with gr.Row():
|
195 |
with gr.Column(scale=4):
|
196 |
+
file = gr.File(
|
197 |
+
label="Upload a document (supported type: pdf, docx, txt, md, py)"
|
198 |
+
)
|
199 |
+
query = gr.Textbox(label="Question")
|
200 |
submit_btn = gr.Button("Submit")
|
201 |
|
202 |
+
with gr.Column(scale=4):
|
203 |
+
context = gr.Textbox(
|
204 |
+
label="Document content",
|
205 |
+
autoscroll=False,
|
206 |
+
placeholder="No uploaded document.",
|
207 |
+
max_lines=10,
|
208 |
+
visible=False,
|
209 |
+
)
|
210 |
+
|
211 |
file.upload(render_context, [file], [context])
|
212 |
+
|
213 |
with gr.Row():
|
214 |
with gr.Column(scale=4):
|
215 |
statements = gr.JSON(label="statements", visible=False)
|
216 |
answer = gr.HTML(label="Answer", visible=True)
|
217 |
cite_num2idx = gr.JSON(label="cite_num2idx", visible=False)
|
218 |
+
citation_choices = gr.Radio(
|
219 |
+
label="Chose citations for details", visible=False, interactive=True
|
220 |
+
)
|
221 |
+
|
222 |
+
with gr.Column(scale=4):
|
223 |
+
clicked_citations = gr.HTML(
|
224 |
+
label="Citations of the chosen statement", visible=False
|
225 |
+
)
|
226 |
all_citations = gr.HTML(label="All citations", visible=False)
|
227 |
+
|
228 |
+
submit_btn.click(
|
229 |
+
run_llm,
|
230 |
+
[context, query],
|
231 |
+
[
|
232 |
+
statements,
|
233 |
+
answer,
|
234 |
+
all_citations,
|
235 |
+
cite_num2idx,
|
236 |
+
citation_choices,
|
237 |
+
clicked_citations,
|
238 |
+
],
|
239 |
+
)
|
240 |
+
citation_choices.change(
|
241 |
+
chose_citation,
|
242 |
+
[statements, cite_num2idx, citation_choices],
|
243 |
+
[answer, clicked_citations],
|
244 |
+
)
|
245 |
+
|
246 |
demo.queue()
|
247 |
+
demo.launch()
|
requirement.txt
CHANGED
@@ -1,11 +1,9 @@
|
|
1 |
-
gradio==4.
|
2 |
-
torch==2.
|
3 |
-
transformers==4.
|
4 |
spaces==0.29.2
|
5 |
accelerate==0.33.0
|
6 |
sentencepiece==0.2.0
|
7 |
-
huggingface-hub==0.24.5
|
8 |
-
sentencepiece==0.2.0
|
9 |
jinja2==3.1.4
|
10 |
sentence_transformers==3.0.1
|
11 |
tiktoken==0.7.0
|
|
|
1 |
+
gradio==4.42.0
|
2 |
+
torch==2.2.0
|
3 |
+
transformers==4.44.2
|
4 |
spaces==0.29.2
|
5 |
accelerate==0.33.0
|
6 |
sentencepiece==0.2.0
|
|
|
|
|
7 |
jinja2==3.1.4
|
8 |
sentence_transformers==3.0.1
|
9 |
tiktoken==0.7.0
|