zR committed
Commit 49cfbf7
1 Parent(s): 5254142
Files changed (3)
  1. README.md +1 -1
  2. app.py +115 -40
  3. requirement.txt +3 -5
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💬
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 4.41.0
+sdk_version: 4.42.0
 suggested_hardware: a100-large
 app_port: 7860
 app_file: app.py
app.py CHANGED
@@ -1,4 +1,3 @@
-import subprocess
 import gradio as gr
 import torch
 from transformers import (
@@ -7,14 +6,18 @@ from transformers import (
 )
 import docx
 import PyPDF2
+import spaces
+

 def convert_to_txt(file):
     doc_type = file.split(".")[-1].strip()
     if doc_type in ["txt", "md", "py"]:
-        data = [file.read().decode('utf-8')]
+        data = [file.read().decode("utf-8")]
     elif doc_type in ["pdf"]:
         pdf_reader = PyPDF2.PdfReader(file)
-        data = [pdf_reader.pages[i].extract_text() for i in range(len(pdf_reader.pages))]
+        data = [
+            pdf_reader.pages[i].extract_text() for i in range(len(pdf_reader.pages))
+        ]
     elif doc_type in ["docx"]:
         doc = docx.Document(file)
         data = [p.text for p in doc.paragraphs]
@@ -23,9 +26,12 @@ def convert_to_txt(file):
     text = "\n\n".join(data)
     return text

+
 model_name = "THUDM/LongCite-glm4-9b"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map='auto')
+model = AutoModelForCausalLM.from_pretrained(
+    model_name, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
+)

 html_styles = """<style>
 .reference {
@@ -48,19 +54,21 @@ html_styles = """<style>
 }
 </style>\n"""

+
 def process_text(text):
-    special_char={
-        '&': '&amp;',
-        '\'': '&apos;',
-        '"': '&quot;',
-        '<': '&lt;',
-        '>': '&gt;',
-        '\n': '<br>',
+    special_char = {
+        "&": "&amp;",
+        "'": "&apos;",
+        '"': "&quot;",
+        "<": "&lt;",
+        ">": "&gt;",
+        "\n": "<br>",
     }
     for x, y in special_char.items():
         text = text.replace(x, y)
     return text

+
 def convert_to_html(statements, clicked=-1):
     html = html_styles + '<br><span class="label">Answer:</span><br>\n'
     all_cite_html = []
@@ -68,7 +76,7 @@ def convert_to_html(statements, clicked=-1):
     cite_num2idx = {}
     idx = 0
     for i, js in enumerate(statements):
-        statement, citations = process_text(js['statement']), js['citation']
+        statement, citations = process_text(js["statement"]), js["citation"]
         if clicked == i:
             html += f"""<span class="statement">{statement}</span>"""
         else:
@@ -79,19 +87,47 @@
         for c in citations:
             idx += 1
             idxs.append(str(idx))
-            cite = '[Sentence: {}-{}\t|\tChar: {}-{}]<br>\n<span {}>{}</span>'.format(c['start_sentence_idx'], c['end_sentence_idx'], c['start_char_idx'], c['end_char_idx'], 'class="highlight"' if clicked==i else "", process_text(c['cite'].strip()))
-            cite_html.append(f"""<span><span class="Bold">Snippet [{idx}]:</span><br>{cite}</span>""")
+            cite = (
+                "[Sentence: {}-{}\t|\tChar: {}-{}]<br>\n<span {}>{}</span>".format(
+                    c["start_sentence_idx"],
+                    c["end_sentence_idx"],
+                    c["start_char_idx"],
+                    c["end_char_idx"],
+                    'class="highlight"' if clicked == i else "",
+                    process_text(c["cite"].strip()),
+                )
+            )
+            cite_html.append(
+                f"""<span><span class="Bold">Snippet [{idx}]:</span><br>{cite}</span>"""
+            )
         all_cite_html.extend(cite_html)
-        cite_num = '[{}]'.format(','.join(idxs))
+        cite_num = "[{}]".format(",".join(idxs))
         cite_num2idx[cite_num] = i
-        cite_num_html = """ <span class="reference" style="color: blue" id={}>{}</span>""".format(i, cite_num)
+        cite_num_html = """ <span class="reference" style="color: blue" id={}>{}</span>""".format(
+            i, cite_num
+        )
         html += cite_num_html
-        html += '\n'
+        html += "\n"
         if clicked == i:
-            clicked_cite_html = html_styles + """<br><span class="label">Citations of current statement:</span><br><div style="overflow-y: auto; padding: 20px; border: 0px dashed black; border-radius: 6px; background-color: #EFF2F6;">{}</div>""".format("<br><br>\n".join(cite_html))
-    all_cite_html = html_styles + """<br><span class="label">All citations:</span><br>\n<div style="overflow-y: auto; padding: 20px; border: 0px dashed black; border-radius: 6px; background-color: #EFF2F6;">{}</div>""".format("<br><br>\n".join(all_cite_html).replace('<span class="highlight">', '<span>') if len(all_cite_html) else "No citation in the answer")
+            clicked_cite_html = (
+                html_styles
+                + """<br><span class="label">Citations of current statement:</span><br><div style="overflow-y: auto; padding: 20px; border: 0px dashed black; border-radius: 6px; background-color: #EFF2F6;">{}</div>""".format(
+                    "<br><br>\n".join(cite_html)
+                )
+            )
+    all_cite_html = (
+        html_styles
+        + """<br><span class="label">All citations:</span><br>\n<div style="overflow-y: auto; padding: 20px; border: 0px dashed black; border-radius: 6px; background-color: #EFF2F6;">{}</div>""".format(
+            "<br><br>\n".join(all_cite_html).replace(
+                '<span class="highlight">', "<span>"
+            )
+            if len(all_cite_html)
+            else "No citation in the answer"
+        )
+    )
     return html, all_cite_html, clicked_cite_html, cite_num2idx

+
 def render_context(file):
     if hasattr(file, "name"):
         context = convert_to_txt(file.name)
@@ -99,24 +135,35 @@ def render_context(file):
     else:
         raise gr.Error(f"ERROR: no uploaded document")

+
+@spaces.GPU()
 def run_llm(context, query):
     if not context:
         raise gr.Error("Error: no uploaded document")
     if not query:
         raise gr.Error("Error: no query")
-    result = model.query_longcite(context, query, tokenizer=tokenizer, max_input_length=128000, max_new_tokens=1024)
-    all_statements = result['all_statements']
-    answer_html, all_cite_html, clicked_cite_html, cite_num2idx_dict = convert_to_html(all_statements)
+    result = model.query_longcite(
+        context,
+        query,
+        tokenizer=tokenizer,
+        max_input_length=128000,
+        max_new_tokens=1024,
+    )
+    all_statements = result["all_statements"]
+    answer_html, all_cite_html, clicked_cite_html, cite_num2idx_dict = convert_to_html(
+        all_statements
+    )
     cite_nums = list(cite_num2idx_dict.keys())
     return {
         statements: gr.JSON(all_statements),
         answer: gr.HTML(answer_html, visible=True),
         all_citations: gr.HTML(all_cite_html, visible=True),
         cite_num2idx: gr.JSON(cite_num2idx_dict),
-        citation_choices: gr.Radio(cite_nums, visible=len(cite_nums)>0),
+        citation_choices: gr.Radio(cite_nums, visible=len(cite_nums) > 0),
         clicked_citations: gr.HTML(visible=False),
     }
-
+
+
 def chose_citation(statements, cite_num2idx, clicked_cite_num):
     clicked = cite_num2idx[clicked_cite_num]
     answer_html, _, clicked_cite_html, _ = convert_to_html(statements, clicked=clicked)
@@ -125,6 +172,7 @@ def chose_citation(statements, cite_num2idx, clicked_cite_num):
         clicked_citations: gr.HTML(clicked_cite_html, visible=True),
     }

+
 with gr.Blocks() as demo:
     gr.Markdown(
         """
@@ -142,31 +190,58 @@ with gr.Blocks() as demo:
         </div>
         """
     )
-
+
     with gr.Row():
         with gr.Column(scale=4):
-            file = gr.File(label="Upload a document (supported type: pdf, docx, txt, md, py)")
-            query = gr.Textbox(label='Question')
+            file = gr.File(
+                label="Upload a document (supported type: pdf, docx, txt, md, py)"
+            )
+            query = gr.Textbox(label="Question")
             submit_btn = gr.Button("Submit")

-        with gr.Column(scale=4):
-            context = gr.Textbox(label="Document content", autoscroll=False, placeholder="No uploaded document.", max_lines=10, visible=False)
-
+        with gr.Column(scale=4):
+            context = gr.Textbox(
+                label="Document content",
+                autoscroll=False,
+                placeholder="No uploaded document.",
+                max_lines=10,
+                visible=False,
+            )
+
     file.upload(render_context, [file], [context])
-
+
     with gr.Row():
         with gr.Column(scale=4):
             statements = gr.JSON(label="statements", visible=False)
             answer = gr.HTML(label="Answer", visible=True)
             cite_num2idx = gr.JSON(label="cite_num2idx", visible=False)
-            citation_choices = gr.Radio(label="Chose citations for details", visible=False, interactive=True)
-
-        with gr.Column(scale=4):
-            clicked_citations = gr.HTML(label="Citations of the chosen statement", visible=False)
+            citation_choices = gr.Radio(
+                label="Chose citations for details", visible=False, interactive=True
+            )
+
+        with gr.Column(scale=4):
+            clicked_citations = gr.HTML(
+                label="Citations of the chosen statement", visible=False
+            )
             all_citations = gr.HTML(label="All citations", visible=False)
-
-    submit_btn.click(run_llm, [context, query], [statements, answer, all_citations, cite_num2idx, citation_choices, clicked_citations])
-    citation_choices.change(chose_citation, [statements, cite_num2idx, citation_choices], [answer, clicked_citations])
-
+
+    submit_btn.click(
+        run_llm,
+        [context, query],
+        [
+            statements,
+            answer,
+            all_citations,
+            cite_num2idx,
+            citation_choices,
+            clicked_citations,
+        ],
+    )
+    citation_choices.change(
+        chose_citation,
+        [statements, cite_num2idx, citation_choices],
+        [answer, clicked_citations],
+    )
+
 demo.queue()
-demo.launch()
+demo.launch()
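
For reference, a minimal sketch (not part of the commit) of the all_statements payload that convert_to_html renders: the keys mirror the accesses in app.py above (statement, citation, start_sentence_idx, end_sentence_idx, start_char_idx, end_char_idx, cite), while the values are invented for illustration.

# Hypothetical example of the all_statements structure consumed by convert_to_html();
# keys mirror the accesses in app.py, values are invented for illustration only.
example_statements = [
    {
        "statement": "LongCite answers the question with sentence-level citations.",
        "citation": [
            {
                "start_sentence_idx": 3,
                "end_sentence_idx": 4,
                "start_char_idx": 120,
                "end_char_idx": 260,
                "cite": "A span copied from the uploaded document.",
            }
        ],
    },
    {
        # A statement without supporting spans carries an empty citation list.
        "statement": "Unsupported statements simply have no citations.",
        "citation": [],
    },
]

# In the app, this payload comes from model.query_longcite(...)["all_statements"]
# and is rendered with convert_to_html(example_statements).
print(example_statements[0]["citation"][0]["cite"])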
requirement.txt CHANGED
@@ -1,11 +1,9 @@
-gradio==4.41.0
-torch==2.3.1
-transformers==4.43.0
+gradio==4.42.0
+torch==2.2.0
+transformers==4.44.2
 spaces==0.29.2
 accelerate==0.33.0
 sentencepiece==0.2.0
-huggingface-hub==0.24.5
-sentencepiece==0.2.0
 jinja2==3.1.4
 sentence_transformers==3.0.1
 tiktoken==0.7.0