liujch1998 committed on
Commit
8c4a00c
1 Parent(s): 3649303

Sync changes

Browse files
Files changed (2) hide show
  1. app.py +127 -2
  2. constants.py +1 -0
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import datetime
3
  import json
 
4
  import requests
5
  from constants import *
6
 
@@ -149,6 +150,83 @@ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_di
149
  docs.append([])
150
  return tuple([latency, tokenization_info, message] + metadatas + docs)
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  with gr.Blocks() as demo:
153
  with gr.Column():
154
  gr.HTML(
@@ -183,7 +261,7 @@ with gr.Blocks() as demo:
183
  <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
184
  <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
185
  <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
186
- <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
187
  <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
188
  </ul>
189
  ''')
@@ -311,7 +389,7 @@ with gr.Blocks() as demo:
311
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
312
  infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
313
 
314
- with gr.Tab('6. Search documents'):
315
  with gr.Column():
316
  gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
317
  with gr.Accordion(label='Click to view instructions', open=False):
@@ -362,6 +440,53 @@ with gr.Blocks() as demo:
362
  search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
363
  search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  with gr.Row():
366
  gr.Markdown('''
367
  If you find this tool useful, please kindly cite our paper:
 
1
  import gradio as gr
2
  import datetime
3
  import json
4
+ import random
5
  import requests
6
  from constants import *
7
 
 
150
  docs.append([])
151
  return tuple([latency, tokenization_info, message] + metadatas + docs)
152
 
153
# Most recent search response, cached at module level so that get_another_doc
# can fetch further documents for the same query without re-running the search.
# NOTE(review): a module-level global is shared by every request served by this
# process -- confirm whether concurrent users can overwrite each other's result.
find_result = None


def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
    """Run a document search and display one randomly chosen matching document.

    A query containing ' AND ' or ' OR ' is sent to the 'find_cnf' endpoint,
    anything else to 'find'. The response is stored in the module-level
    ``find_result`` (tagged with a 'type' of 'cnf' or 'simple').

    Returns a 6-tuple ``(latency, tokenization_info, message, idx, metadata,
    doc)``. ``idx`` is a ``gr.Number`` update: disabled and fixed at 0 when the
    search failed or nothing is retrievable, otherwise enabled over
    ``[0, cnt_retrievable - 1]`` with the randomly picked index as its value.
    """
    global find_result

    def no_document():
        # Index control and empty outputs used whenever there is nothing to show.
        return gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False), '', []

    is_cnf = ' AND ' in query or ' OR ' in query
    if is_cnf:
        find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
        find_result['type'] = 'cnf'
    else:
        find_result = process('find', index_desc, query=query)
        find_result['type'] = 'simple'

    latency = f'{find_result["latency"]:.3f}' if 'latency' in find_result else ''
    tokenization_info = format_tokenization_info(find_result)

    if 'error' in find_result:
        message = find_result['error']
        idx, metadata, doc = no_document()
        return latency, tokenization_info, message, idx, metadata, doc

    if is_cnf:
        # Only occurrences that come with a pointer can actually be displayed.
        cnt_retrievable = sum(len(ptrs) for ptrs in find_result['ptrs_by_shard'])
        if find_result["approx"]:
            message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
        else:
            message = f'{find_result["cnt"]} occurrences found'
    else:
        message = f'{find_result["cnt"]} occurrences found'
        cnt_retrievable = find_result['cnt']

    if cnt_retrievable == 0:
        idx, metadata, doc = no_document()
        return latency, tokenization_info, message, idx, metadata, doc

    last_idx = cnt_retrievable - 1
    picked = random.randint(0, last_idx)
    metadata, doc = get_another_doc(index_desc, picked, max_disp_len)
    idx = gr.Number(minimum=0, maximum=last_idx, step=1, value=picked, interactive=True)
    return latency, tokenization_info, message, idx, metadata, doc
191
+
192
def clear_search_docs_new():
    """Drop the cached search response and return a reset index control.

    Sets the module-level ``find_result`` back to ``None`` and returns a
    ``gr.Number`` update that is disabled and pinned to the range [0, 0].
    """
    global find_result
    find_result = None
    return gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
197
+
198
def get_another_doc(index_desc, idx, max_disp_len):
    """Fetch the ``idx``-th retrievable document of the cached search result.

    Reads the module-level ``find_result`` left behind by the last search.
    ``idx`` is a global index over all shards; it is translated into a shard
    number plus a position inside that shard, and the document is then fetched
    with 'get_doc_by_ptr' (CNF results) or 'get_doc_by_rank' (simple results).

    Args:
        index_desc: index description forwarded unchanged to ``process``.
        idx: zero-based index of the document to show. Integral floats are
            accepted and converted to int.
        max_disp_len: forwarded unchanged to ``process``.

    Returns:
        ``(metadata, doc)``. ``('', [])`` is returned when there is no cached
        result or ``idx`` is not a valid index into it; ``(error_message, [])``
        when the fetch request reports an error; otherwise the formatted
        metadata and the ``'spans'`` entry of the response.
    """
    # Snapshot the global once so the whole call works on one consistent result.
    found = find_result
    if not isinstance(found, dict):
        # No search has run yet, or the result was cleared.
        return '', []

    if isinstance(idx, float) and idx.is_integer():
        idx = int(idx)
    if isinstance(idx, bool) or not isinstance(idx, int) or idx < 0:
        return '', []

    is_cnf = found.get('type') == 'cnf'
    if is_cnf:
        ptrs_by_shard = found.get('ptrs_by_shard', [])
        cnt_by_shard = [len(ptrs) for ptrs in ptrs_by_shard]
    else:  # simple query
        segment_by_shard = found.get('segment_by_shard', [])
        cnt_by_shard = [end - start for (start, end) in segment_by_shard]

    # The reported 'cnt' may exceed what is actually retrievable (approximate
    # CNF results), and an error payload may carry no count at all, so bound
    # the index by the per-shard totals as well as by 'cnt'.
    if idx >= min(found.get('cnt', 0), sum(cnt_by_shard)):
        return '', []

    # Walk the shards until the remaining offset falls inside one of them.
    # The bound check above guarantees this terminates within range.
    s = 0
    while idx >= cnt_by_shard[s]:
        idx -= cnt_by_shard[s]
        s += 1

    if is_cnf:
        ptr = ptrs_by_shard[s][idx]
        result = process('get_doc_by_ptr', index_desc, s=s, ptr=ptr, max_disp_len=max_disp_len, query_ids=found['token_ids'])
    else:
        rank = segment_by_shard[s][0] + idx
        result = process('get_doc_by_rank', index_desc, s=s, rank=rank, max_disp_len=max_disp_len, query_ids=found['token_ids'])

    if 'error' in result:
        return result['error'], []
    return format_doc_metadata(result), result['spans']
229
+
230
  with gr.Blocks() as demo:
231
  with gr.Column():
232
  gr.HTML(
 
261
  <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
262
  <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
263
  <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
264
+ <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
265
  <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
266
  </ul>
267
  ''')
 
389
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
390
  infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
391
 
392
+ with gr.Tab('6. Search documents', visible=False):
393
  with gr.Column():
394
  gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
395
  with gr.Accordion(label='Click to view instructions', open=False):
 
440
  search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
441
  search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
442
 
443
+ with gr.Tab('6. Search documents'):
444
+ with gr.Column():
445
+ gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
446
+ with gr.Accordion(label='Click to view instructions', open=False):
447
+ gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
448
+ <br>
449
+ <p style="font-size: 16px;">Example queries:</p>
450
+ <ul style="font-size: 16px;">
451
+ <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
452
+ <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
453
+ <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
454
+ </ul>
455
+ <br>
456
+ <p style="font-size: 16px;">Notes on CNF queries:</p>
457
+ <ul style="font-size: 16px;">
458
+ <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
459
+ <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
460
+ <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
461
+ <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
462
+ <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
463
+ </ul>
464
+ <br>
465
+ <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
466
+ ''')
467
+ with gr.Row():
468
+ with gr.Column(scale=1):
469
+ search_docs_new_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
470
+ search_docs_new_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
471
+ with gr.Accordion(label='Advanced options', open=False):
472
+ with gr.Row():
473
+ search_docs_new_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
474
+ search_docs_new_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
475
+ with gr.Row():
476
+ search_docs_new_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
477
+ search_docs_new_submit = gr.Button(value='Submit', variant='primary', visible=True)
478
+ search_docs_new_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
479
+ search_docs_new_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
480
+ with gr.Column(scale=2):
481
+ search_docs_new_message = gr.Label(label='Message', num_top_classes=0)
482
+ search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
483
+ search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
484
+ search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
485
+ search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
486
+ search_docs_new_clear.click(clear_search_docs_new, inputs=[], outputs=[search_docs_new_idx], api_name=False)
487
+ search_docs_new_submit.click(search_docs_new, inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len, search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens], outputs=[search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output], api_name=False)
488
+ search_docs_new_idx.input(get_another_doc, inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len], outputs=[search_docs_new_metadata, search_docs_new_output], api_name=False)
489
+
490
  with gr.Row():
491
  gr.Markdown('''
492
  If you find this tool useful, please kindly cite our paper:
constants.py CHANGED
@@ -2,6 +2,7 @@ import os
2
 
3
  # options
4
  INDEX_BY_DESC = {
 
5
  'Dolma-v1.6 (3.1T tokens)': 'v4_dolma-v1_6_llama',
6
  'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
7
  'Pile-train (380B tokens)': 'v4_piletrain_llama',
 
2
 
3
  # options
4
  INDEX_BY_DESC = {
5
+ 'Dolma-v1.7 (2.6T tokens)': 'v4_dolma-v1_7_llama',
6
  'Dolma-v1.6 (3.1T tokens)': 'v4_dolma-v1_6_llama',
7
  'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
8
  'Pile-train (380B tokens)': 'v4_piletrain_llama',