Spaces:
Running
Running
File size: 17,924 Bytes
d8d9fba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 |
import gradio as gr
import json
import os
import requests
# Human-readable corpus label (shown in the UI) -> corpus identifier that
# process() sends to the backend under the 'corpus' key.
CORPUS_BY_DESC = {
    'RedPajama (LLaMA tokenizer)': 'rpj_v3_c4_llama2',
    'Pile-val (GPT-2 tokenizer)': 'pile_v3_val',
}
# Radio-button choices, in the insertion order of the mapping above.
CORPUS_DESCS = list(CORPUS_BY_DESC)

# Human-readable query label (shown in the UI) -> query type that process()
# sends to the backend under the 'query_type' key.
QUERY_TYPE_BY_DESC = {
    '1. Count an n-gram': 'count',
    '2. Compute the probability of the last token in an n-gram': 'compute_prob',
    '3. Compute the next-token distribution of an (n-1)-gram': 'get_next_token_distribution_approx',
    '4. Compute the β-gram probability of the last token': 'compute_infgram_prob',
    '5. Compute the β-gram next-token distribution': 'get_infgram_next_token_distribution_approx',
    '6. Searching for document containing n-gram(s)': 'get_a_random_document_from_cnf_query_fast_approx',
    # '7. Analyze an (AI-generated) document using β-gram': 'analyze_document',
}
# Reverse lookup (query type -> label), used to decide which form row to show.
QUERY_DESC_BY_TYPE = dict(zip(QUERY_TYPE_BY_DESC.values(), QUERY_TYPE_BY_DESC.keys()))
QUERY_DESCS = list(QUERY_TYPE_BY_DESC)

# Numeric limits. Only some of them are interpolated into the help text of this
# file (MAX_CNT_FOR_NTD, MAX_OUTPUT_DOC_TOKENS, MAX_DIFF_TOKENS,
# MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD); the rest are not referenced in the
# visible code.
# NOTE(review): presumably these mirror limits enforced by the backend - confirm.
MAX_QUERY_CHARS = 1000
MAX_INPUT_DOC_TOKENS = 1000
MAX_OUTPUT_DOC_TOKENS = 5000  # must be an even number!
MAX_CNT_FOR_NTD = 1000
MAX_CLAUSE_FREQ = 10000
MAX_CLAUSE_FREQ_FAST = 1000000
MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = 50000
MAX_DIFF_TOKENS = 100
MAX_DIFF_BYTES = MAX_DIFF_TOKENS * 2
MAX_CLAUSES_IN_CNF = 4
MAX_TERMS_IN_DISJ_CLAUSE = 4
def _env_int(name, default):
    """Return environment variable *name* parsed as an int, or *default* if unset/blank.

    os.environ values are always strings, so they must be converted before
    being handed to APIs that expect numbers.

    Raises:
        ValueError: if the variable is set to something that is not an integer.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(f'{name} envvar must be an integer, got {raw!r}') from None


def _env_flag(name, default=False):
    """Return environment variable *name* as a bool, or *default* if unset.

    Any non-empty value still enables the flag (as plain truthiness did
    before), except the explicit negatives '0', 'false', 'no' and 'off'
    (case-insensitive), which would otherwise be truthy strings.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ('', '0', 'false', 'no', 'off')


# Address of the backend API; process() refuses to run while this is None.
API_IPADDR = os.environ.get('API_IPADDR', None)
# Settings forwarded to demo.queue(...) / .launch(...) at the bottom of the file.
default_concurrency_limit = _env_int('default_concurrency_limit', 10)
max_size = _env_int('max_size', 100)
max_threads = _env_int('max_threads', 40)
debug = _env_flag('debug', False)
# (connect, read) timeouts in seconds for the backend call. Without a timeout,
# requests waits indefinitely and a stalled backend would pin a worker forever.
# NOTE(review): the read timeout is a guess at a generous upper bound - tune it
# to the slowest query the backend is expected to serve.
API_TIMEOUT_SECONDS = (5, 120)


def process(corpus_desc, query_desc, query):
    """Send one query to the backend API and return its JSON-decoded reply.

    Args:
        corpus_desc: Corpus label chosen in the UI; a key of CORPUS_BY_DESC.
        query_desc: Query-type label chosen in the UI; a key of QUERY_TYPE_BY_DESC.
        query: The raw query text entered by the user.

    Returns:
        Whatever ``response.json()`` yields for a 200 response.
        NOTE(review): the click handlers bind this value to two or three output
        components, so the backend presumably returns a sequence of matching
        length - confirm against the server.

    Raises:
        KeyError: if either label is not in its lookup table.
        ValueError: if API_IPADDR is not configured, or the backend answers
            with a status code other than 200.
        requests.exceptions.RequestException: on connection failure or when
            the backend does not answer within API_TIMEOUT_SECONDS.
    """
    data = {
        'corpus': CORPUS_BY_DESC[corpus_desc],
        'query_type': QUERY_TYPE_BY_DESC[query_desc],
        'query': query,
    }
    # Log every request (even ones that fail below) as one JSON line on stdout.
    print(json.dumps(data))

    if API_IPADDR is None:
        raise ValueError('API_IPADDR envvar is not set!')

    response = requests.post(
        f'http://{API_IPADDR}:5000/',
        json=data,
        timeout=API_TIMEOUT_SECONDS,
    )
    if response.status_code != 200:
        raise ValueError(f'Invalid response: {response.status_code}')
    return response.json()
# Build the whole UI: a corpus selector, a query-type selector, one form row per
# query type (only the selected one is visible), and the event wiring.
# NOTE(review): the indentation of this block was lost in the source this was
# recovered from; the nesting below is reconstructed from the order of the
# `with` statements - confirm the layout renders as intended.
with gr.Blocks() as demo:
    with gr.Column():
        gr.HTML(
            '''<h1 style="text-align: center;">Infini-gram: An Engine for n-gram / β-gram Language Models with Trillion-Token Corpora</h1>
<p style='font-size: 16px;'>This is an engine that processes n-gram / β-gram queries on a text corpus. Please first select the corpus and the type of query, then enter your query and submit.</p>
'''
        )
        with gr.Row():
            with gr.Column(scale=1):
                corpus_desc = gr.Radio(choices=CORPUS_DESCS, label='Corpus', value=CORPUS_DESCS[0])
            with gr.Column(scale=4):
                query_desc = gr.Radio(
                    choices=QUERY_DESCS, label='Query Type', value=QUERY_DESCS[0],
                )

        # Query 1: count. This is the only row visible on load, matching the
        # default value of the query-type radio (QUERY_DESCS[0]).
        with gr.Row(visible=True) as row_1:
            with gr.Column():
                gr.HTML('<h2>1. Count an n-gram</h2>')
                gr.HTML('<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus.</p>')
                gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is Cnt(natural language processing))</p>')
                with gr.Row():
                    with gr.Column(scale=1):
                        count_input = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
                        with gr.Row():
                            count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                            count_submit = gr.Button(value='Submit', variant='primary', visible=True)
                        count_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
                    with gr.Column(scale=1):
                        count_output = gr.Label(label='Count', num_top_classes=0)

        # Query 2: n-gram probability of the last token.
        with gr.Row(visible=False) as row_2:
            with gr.Column():
                gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
                gr.HTML('<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. (n-1)-gram).</p>')
                gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and take the division between the two)</p>')
                gr.HTML('<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</p>')
                with gr.Row():
                    with gr.Column(scale=1):
                        ngram_input = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
                        with gr.Row():
                            ngram_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                            ngram_submit = gr.Button(value='Submit', variant='primary', visible=True)
                        ngram_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
                    with gr.Column(scale=1):
                        ngram_output = gr.Label(label='Probability', num_top_classes=0)

        # Query 3: next-token distribution of an (n-1)-gram.
        with gr.Row(visible=False) as row_3:
            with gr.Column():
                gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
                gr.HTML('<p style="font-size: 16px;">This is an extension of the Query 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>')
                gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
                gr.HTML(f'<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_CNT_FOR_NTD} times in the corpus, the result will be approximate.</p>')
                with gr.Row():
                    with gr.Column(scale=1):
                        a_ntd_input = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
                        with gr.Row():
                            a_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                            a_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
                        a_ntd_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
                    with gr.Column(scale=1):
                        a_ntd_output = gr.Label(label='Distribution', num_top_classes=10)

        # Query 4: β-gram probability of the last token.
        with gr.Row(visible=False) as row_4:
            with gr.Column():
                gr.HTML('<h2>4. Compute the β-gram probability of the last token</h2>')
                gr.HTML('<p style="font-size: 16px;">This computes the β-gram probability of the last token conditioned on the previous tokens. Compared to Query 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>')
                gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (the output is P(processing | natural language), because "natural language" appears in the corpus but "love natural language" doesn\'t; in this case the effective n = 3)</p>')
                gr.HTML('<p style="font-size: 16px;">Note: It may be possible that the effective n = 1, in which case it reduces to the uni-gram probability of the last token.</p>')
                with gr.Row():
                    with gr.Column(scale=1):
                        infgram_input = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
                        with gr.Row():
                            infgram_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                            infgram_submit = gr.Button(value='Submit', variant='primary', visible=True)
                        infgram_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
                        infgram_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
                    with gr.Column(scale=1):
                        infgram_output = gr.Label(label='Probability', num_top_classes=0)

        # Query 5: β-gram next-token distribution.
        with gr.Row(visible=False) as row_5:
            with gr.Column():
                gr.HTML('<h2>5. Compute the β-gram next-token distribution</h2>')
                gr.HTML('<p style="font-size: 16px;">This is similar to Query 3, but with β-gram instead of n-gram.</p>')
                gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
                with gr.Row():
                    with gr.Column(scale=1):
                        a_infntd_input = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
                        with gr.Row():
                            a_infntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                            a_infntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
                        a_infntd_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
                        a_infntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
                    with gr.Column(scale=1):
                        a_infntd_output = gr.Label(label='Distribution', num_top_classes=10)

        # Query 6: random document matching a CNF query.
        with gr.Row(visible=False) as row_6:
            with gr.Column():
                gr.HTML(f'''<h2>6. Searching for document containing n-gram(s)</h2>
<p style="font-size: 16px;">This displays a random document in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
<p style="font-size: 16px;">Example queries:</p>
<ul style="font-size: 16px;">
<li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
<li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
<li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
</ul>
<p style="font-size: 16px;">If you want another random document, simply hit the Submit button again :)</p>
<p style="font-size: 16px;">A few notes:</p>
<ul style="font-size: 16px;">
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
<li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
<li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
<li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD} matches (per shard), we will search within a random subset of all documents containing that clause.</li>
<li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
</ul>
''')
                with gr.Row():
                    with gr.Column(scale=1):
                        a_ard_cnf_input = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
                        with gr.Row():
                            a_ard_cnf_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                            a_ard_cnf_submit = gr.Button(value='Submit', variant='primary', visible=True)
                        a_ard_cnf_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
                    with gr.Column(scale=1):
                        a_ard_cnf_output_message = gr.Label(label='Message', num_top_classes=0)
                        a_ard_cnf_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})

        # Query 7: document analysis. Its entry in QUERY_TYPE_BY_DESC is
        # commented out and nothing below ever makes this row visible, so the
        # form is built but stays hidden.
        with gr.Row(visible=False) as row_7:
            with gr.Column():
                gr.HTML('<h2>7. Analyze an (AI-generated) document using β-gram</h2>')
                gr.HTML('<p style="font-size: 16px;">This analyzes the document you entered using the β-gram. Each token is highlighted where (1) the color represents its β-gram probability (red is 0.0, blue is 1.0), and (2) the alpha represents the effective n (higher alpha means higher n).</p>')
                gr.HTML('<p style="font-size: 16px;">If you hover over a token, the tokens preceding it are each highlighted where (1) the color represents the n-gram probability of your selected token, with the n-gram starting from that highlighted token (red is 0.0, blue is 1.0), and (2) the alpha represents the count of the (n-1)-gram starting from that highlighted token (and up to but excluding your selected token) (higher alpha means higher count).</p>')
                with gr.Row():
                    with gr.Column(scale=1):
                        doc_analysis_input = gr.Textbox(placeholder='Enter a document here', label='Query', interactive=True, lines=10)
                        with gr.Row():
                            doc_analysis_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                            doc_analysis_submit = gr.Button(value='Submit', variant='primary', visible=True)
                    with gr.Column(scale=1):
                        doc_analysis_output = gr.HTML(value='', label='Analysis')

    # Each Clear button resets its form's input and every output of that form.
    count_clear.add([count_input, count_output, count_output_tokens])
    ngram_clear.add([ngram_input, ngram_output, ngram_output_tokens])
    a_ntd_clear.add([a_ntd_input, a_ntd_output, a_ntd_output_tokens])
    # The longest-suffix box is an output of this form too, so it must be
    # cleared along with the others (as the Query 5 form already does).
    infgram_clear.add([infgram_input, infgram_output, infgram_output_tokens, infgram_longest_suffix])
    a_infntd_clear.add([a_infntd_input, a_infntd_output, a_infntd_output_tokens, a_infntd_longest_suffix])
    a_ard_cnf_clear.add([a_ard_cnf_input, a_ard_cnf_output, a_ard_cnf_output_tokens, a_ard_cnf_output_message])
    doc_analysis_clear.add([doc_analysis_input, doc_analysis_output])

    # Every Submit button calls process() with the selected corpus, the
    # selected query type and that form's own input box.
    count_submit.click(process, inputs=[corpus_desc, query_desc, count_input], outputs=[count_output, count_output_tokens])
    ngram_submit.click(process, inputs=[corpus_desc, query_desc, ngram_input], outputs=[ngram_output, ngram_output_tokens])
    a_ntd_submit.click(process, inputs=[corpus_desc, query_desc, a_ntd_input], outputs=[a_ntd_output, a_ntd_output_tokens])
    infgram_submit.click(process, inputs=[corpus_desc, query_desc, infgram_input], outputs=[infgram_output, infgram_output_tokens, infgram_longest_suffix])
    a_infntd_submit.click(process, inputs=[corpus_desc, query_desc, a_infntd_input], outputs=[a_infntd_output, a_infntd_output_tokens, a_infntd_longest_suffix])
    a_ard_cnf_submit.click(process, inputs=[corpus_desc, query_desc, a_ard_cnf_input], outputs=[a_ard_cnf_output, a_ard_cnf_output_tokens, a_ard_cnf_output_message])
    doc_analysis_submit.click(process, inputs=[corpus_desc, query_desc, doc_analysis_input], outputs=[doc_analysis_output])

    # Backend query type -> the row holding its form. One table drives both the
    # visibility update and the list of outputs, so the two cannot drift apart.
    row_by_query_type = {
        'count': row_1,
        'compute_prob': row_2,
        'get_next_token_distribution_approx': row_3,
        'compute_infgram_prob': row_4,
        'get_infgram_next_token_distribution_approx': row_5,
        'get_a_random_document_from_cnf_query_fast_approx': row_6,
        # 'analyze_document': row_7,
    }

    def update_query_desc(selection):
        """Return visibility updates that show only the row of the selected query label.

        Args:
            selection: The label currently chosen in the query-type radio.

        Returns:
            A dict mapping each form row to a ``gr.Row`` update whose
            ``visible`` flag is True only for the row matching *selection*.
        """
        return {
            row: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE[query_type]))
            for query_type, row in row_by_query_type.items()
        }

    query_desc.change(
        fn=update_query_desc,
        inputs=query_desc,
        outputs=list(row_by_query_type.values()),
    )
# Configure the request queue first, then launch whatever queue() hands back.
# All four settings come from the environment-driven values defined near the
# top of the file.
queued_demo = demo.queue(
    max_size=max_size,
    default_concurrency_limit=default_concurrency_limit,
)
queued_demo.launch(debug=debug, max_threads=max_threads)
|