Spaces:

liujch1998
/

infini-gram

Running

App Files Files Community

liujch1998 committed on Jun 28

Commit

8c4a00c

•

1 Parent(s): 3649303

Sync changes

Browse files

Files changed (2) hide show

app.py +127 -2
constants.py +1 -0

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import datetime
 import json
 import requests
 from constants import *
@@ -149,6 +150,83 @@ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_di
         docs.append([])
     return tuple([latency, tokenization_info, message] + metadatas + docs)
 with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML(
@@ -183,7 +261,7 @@ with gr.Blocks() as demo:
                                             <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
                                             <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
                                             <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
-                                            <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
                                             <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         </ul>
                                     ''')
@@ -311,7 +389,7 @@ with gr.Blocks() as demo:
                     infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
                     infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
-                with gr.Tab('6. Search documents'):
                     with gr.Column():
                         gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
                         with gr.Accordion(label='Click to view instructions', open=False):
@@ -362,6 +440,53 @@ with gr.Blocks() as demo:
                     search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
                     search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
         with gr.Row():
             gr.Markdown('''
 If you find this tool useful, please kindly cite our paper:

 import gradio as gr
 import datetime
 import json
+import random
 import requests
 from constants import *
         docs.append([])
     return tuple([latency, tokenization_info, message] + metadatas + docs)
+find_result = None
+def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
+    global find_result
+    if ' AND ' in query or ' OR ' in query: # CNF query
+        find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
+        find_result['type'] = 'cnf'
+    else: # simple query
+        find_result = process('find', index_desc, query=query)
+        find_result['type'] = 'simple'
+    latency = '' if 'latency' not in find_result else f'{find_result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info(find_result)
+    if 'error' in find_result:
+        message = find_result['error']
+        idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
+        metadata = ''
+        doc = []
+        return latency, tokenization_info, message, idx, metadata, doc
+    if ' AND ' in query or ' OR ' in query: # CNF query
+        ptrs_by_shard = find_result['ptrs_by_shard']
+        cnt_retrievable = sum([len(ptrs) for ptrs in ptrs_by_shard])
+        if find_result["approx"]:
+            message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
+        else:
+            message = f'{find_result["cnt"]} occurrences found'
+    else: # simple query
+        message = f'{find_result["cnt"]} occurrences found'
+        cnt_retrievable = find_result['cnt']
+    if cnt_retrievable == 0:
+        idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
+        metadata = ''
+        doc = []
+        return latency, tokenization_info, message, idx, metadata, doc
+    idx = random.randint(0, cnt_retrievable-1)
+    metadata, doc = get_another_doc(index_desc, idx, max_disp_len)
+    idx = gr.Number(minimum=0, maximum=cnt_retrievable-1, step=1, value=idx, interactive=True)
+    return latency, tokenization_info, message, idx, metadata, doc
+def clear_search_docs_new():
+    global find_result
+    find_result = None
+    idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
+    return idx
+def get_another_doc(index_desc, idx, max_disp_len):
+    global find_result
+    if not (type(idx) == int and 0 <= idx and idx < find_result['cnt']):
+        metadata = ''
+        doc = []
+        return metadata, doc
+    if find_result['type'] == 'cnf':
+        ptrs_by_shard = find_result['ptrs_by_shard']
+        cnt_by_shard = [len(ptrs) for ptrs in ptrs_by_shard]
+        s = 0
+        while idx >= cnt_by_shard[s]:
+            idx -= cnt_by_shard[s]
+            s += 1
+        ptr = ptrs_by_shard[s][idx]
+        result = process('get_doc_by_ptr', index_desc, s=s, ptr=ptr, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])
+    else: # simple query
+        segment_by_shard = find_result['segment_by_shard']
+        cnt_by_shard = [end - start for (start, end) in segment_by_shard]
+        s = 0
+        while idx >= cnt_by_shard[s]:
+            idx -= cnt_by_shard[s]
+            s += 1
+        rank = segment_by_shard[s][0] + idx
+        result = process('get_doc_by_rank', index_desc, s=s, rank=rank, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])
+    if 'error' in result:
+        metadata = result['error']
+        doc = []
+        return metadata, doc
+    metadata = format_doc_metadata(result)
+    doc = result['spans']
+    return metadata, doc
 with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML(
                                             <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
                                             <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
                                             <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
+                                            <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
                                             <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         </ul>
                                     ''')
                     infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
                     infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
+                with gr.Tab('6. Search documents', visible=False):
                     with gr.Column():
                         gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
                         with gr.Accordion(label='Click to view instructions', open=False):
                     search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
                     search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
+                with gr.Tab('6. Search documents'):
+                    with gr.Column():
+                        gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
+                        with gr.Accordion(label='Click to view instructions', open=False):
+                            gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
+                                        <br>
+                                        <p style="font-size: 16px;">Example queries:</p>
+                                        <ul style="font-size: 16px;">
+                                            <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
+                                            <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
+                                            <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
+                                        </ul>
+                                        <br>
+                                        <p style="font-size: 16px;">Notes on CNF queries:</p>
+                                        <ul style="font-size: 16px;">
+                                            <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
+                                            <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
+                                            <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
+                                            <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
+                                            <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
+                                        </ul>
+                                        <br>
+                                        <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
+                                    ''')
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                search_docs_new_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
+                                search_docs_new_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
+                                with gr.Accordion(label='Advanced options', open=False):
+                                    with gr.Row():
+                                        search_docs_new_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
+                                        search_docs_new_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
+                                with gr.Row():
+                                    search_docs_new_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
+                                    search_docs_new_submit = gr.Button(value='Submit', variant='primary', visible=True)
+                                search_docs_new_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
+                                search_docs_new_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
+                            with gr.Column(scale=2):
+                                search_docs_new_message = gr.Label(label='Message', num_top_classes=0)
+                                search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
+                                search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
+                                search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
+                    search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
+                    search_docs_new_clear.click(clear_search_docs_new, inputs=[], outputs=[search_docs_new_idx], api_name=False)
+                    search_docs_new_submit.click(search_docs_new, inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len, search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens], outputs=[search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output], api_name=False)
+                    search_docs_new_idx.input(get_another_doc, inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len], outputs=[search_docs_new_metadata, search_docs_new_output], api_name=False)
         with gr.Row():
             gr.Markdown('''
 If you find this tool useful, please kindly cite our paper:

constants.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 # options
 INDEX_BY_DESC = {
     'Dolma-v1.6 (3.1T tokens)': 'v4_dolma-v1_6_llama',
     'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
     'Pile-train (380B tokens)': 'v4_piletrain_llama',

 # options
 INDEX_BY_DESC = {
+    'Dolma-v1.7 (2.6T tokens)': 'v4_dolma-v1_7_llama',
     'Dolma-v1.6 (3.1T tokens)': 'v4_dolma-v1_6_llama',
     'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
     'Pile-train (380B tokens)': 'v4_piletrain_llama',