Spaces:
Paused
Paused
flagging refactor
Browse files
app.py
CHANGED
@@ -19,31 +19,39 @@ roots_datasets = {
|
|
19 |
def get_docid_html(docid):
|
20 |
data_org, dataset, docid = docid.split("/")
|
21 |
metadata = roots_datasets[dataset]
|
|
|
|
|
22 |
if metadata.private:
|
23 |
docid_html = """
|
24 |
<a title="This dataset is private. See the introductory text for more information"
|
25 |
-
style="color
|
26 |
-
onmouseover="style='color
|
27 |
-
onmouseout="style='color
|
28 |
href="https://huggingface.co/datasets/bigscience-data/{dataset}"
|
29 |
target="_blank">
|
30 |
🔒{dataset}
|
31 |
</a>
|
32 |
-
<span style="color
|
33 |
-
dataset=dataset,
|
|
|
|
|
|
|
34 |
)
|
35 |
else:
|
36 |
docid_html = """
|
37 |
<a title="This dataset is licensed {metadata}"
|
38 |
-
style="color
|
39 |
-
onmouseover="style='color
|
40 |
-
onmouseout="style='color
|
41 |
href="https://huggingface.co/datasets/bigscience-data/{dataset}"
|
42 |
target="_blank">
|
43 |
{dataset}
|
44 |
</a>
|
45 |
-
<span style="color
|
46 |
-
metadata=metadata.tags[0].split(":")[-1],
|
|
|
|
|
|
|
47 |
)
|
48 |
return docid_html
|
49 |
|
@@ -63,30 +71,6 @@ def process_pii(text):
|
|
63 |
return text
|
64 |
|
65 |
|
66 |
-
def flag(query, language, num_results, issue_description):
|
67 |
-
try:
|
68 |
-
post_data = {
|
69 |
-
"query": query,
|
70 |
-
"k": num_results,
|
71 |
-
"flag": True,
|
72 |
-
"description": issue_description,
|
73 |
-
}
|
74 |
-
if language != "detect_language":
|
75 |
-
post_data["lang"] = language
|
76 |
-
|
77 |
-
output = requests.post(
|
78 |
-
os.environ.get("address"),
|
79 |
-
headers={"Content-type": "application/json"},
|
80 |
-
data=json.dumps(post_data),
|
81 |
-
timeout=120,
|
82 |
-
)
|
83 |
-
|
84 |
-
results = json.loads(output.text)
|
85 |
-
except:
|
86 |
-
print("Error flagging")
|
87 |
-
return ""
|
88 |
-
|
89 |
-
|
90 |
def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
91 |
text, url, docid = result
|
92 |
if datasets_filter is not None:
|
@@ -133,8 +117,9 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
|
133 |
language = "FIXME"
|
134 |
result_html = """{}
|
135 |
<span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</span>
|
136 |
-
<a href="https://forms.gle/AdBLLwRApqcLkHYA8" target="_blank"
|
137 |
-
|
|
|
138 |
<!-- <span style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</span><br> -->
|
139 |
<span style='font-family: Arial;'>{}</span><br>
|
140 |
<br>
|
@@ -273,9 +258,7 @@ information and instructions on how to access the full corpus check [this form](
|
|
273 |
|
274 |
|
275 |
if __name__ == "__main__":
|
276 |
-
demo = gr.Blocks(
|
277 |
-
css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
|
278 |
-
)
|
279 |
|
280 |
with demo:
|
281 |
processed_results_state = gr.State([])
|
@@ -283,7 +266,6 @@ if __name__ == "__main__":
|
|
283 |
num_results_state = gr.State(0)
|
284 |
exact_search_state = gr.State(False)
|
285 |
lang_state = gr.State("")
|
286 |
-
max_page_size_state = gr.State(100)
|
287 |
received_results_state = gr.State(0)
|
288 |
|
289 |
with gr.Row():
|
@@ -319,7 +301,13 @@ if __name__ == "__main__":
|
|
319 |
value="en",
|
320 |
label="Language",
|
321 |
)
|
322 |
-
k = gr.Slider(
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
with gr.Row():
|
324 |
submit_btn = gr.Button("Submit")
|
325 |
with gr.Row(visible=False) as datasets_filter:
|
@@ -336,19 +324,17 @@ if __name__ == "__main__":
|
|
336 |
with gr.Row(visible=False) as pagination:
|
337 |
next_page_btn = gr.Button("Next Page")
|
338 |
|
339 |
-
def run_query(query, lang, k, dropdown_input,
|
340 |
query = query.strip()
|
341 |
exact_search = False
|
342 |
if query.startswith('"') and query.endswith('"') and len(query) >= 2:
|
343 |
exact_search = True
|
344 |
query = query[1:-1]
|
345 |
-
k = max_page_size
|
346 |
else:
|
347 |
query = " ".join(query.split())
|
348 |
if query == "" or query is None:
|
349 |
return None
|
350 |
|
351 |
-
print("submitting", query, lang, k)
|
352 |
payload = request_payload(query, lang, exact_search, k, received_results)
|
353 |
err = extract_error_from_payload(payload)
|
354 |
if err is not None:
|
@@ -377,7 +363,8 @@ if __name__ == "__main__":
|
|
377 |
ds,
|
378 |
)
|
379 |
|
380 |
-
def submit(query, lang, k, dropdown_input
|
|
|
381 |
(
|
382 |
processed_results,
|
383 |
highlight_terms,
|
@@ -385,8 +372,8 @@ if __name__ == "__main__":
|
|
385 |
exact_search,
|
386 |
result_page_html,
|
387 |
datasets,
|
388 |
-
) = run_query(query, lang, k, dropdown_input,
|
389 |
-
has_more_results = exact_search and (num_results >
|
390 |
return [
|
391 |
processed_results,
|
392 |
highlight_terms,
|
@@ -404,7 +391,6 @@ if __name__ == "__main__":
|
|
404 |
lang,
|
405 |
k,
|
406 |
dropdown_input,
|
407 |
-
max_page_size,
|
408 |
received_results,
|
409 |
processed_results,
|
410 |
):
|
@@ -415,11 +401,9 @@ if __name__ == "__main__":
|
|
415 |
exact_search,
|
416 |
result_page_html,
|
417 |
datasets,
|
418 |
-
) = run_query(
|
419 |
-
query, lang, k, dropdown_input, max_page_size, received_results
|
420 |
-
)
|
421 |
num_processed_results = len(next(iter(processed_results.values())))
|
422 |
-
has_more_results = exact_search and (num_results >
|
423 |
print("num_processed_results", num_processed_results)
|
424 |
print("has_more_results", has_more_results)
|
425 |
print("received_results", received_results)
|
@@ -430,9 +414,7 @@ if __name__ == "__main__":
|
|
430 |
exact_search,
|
431 |
gr.update(visible=True),
|
432 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
433 |
-
gr.update(
|
434 |
-
visible=num_processed_results >= max_page_size and has_more_results
|
435 |
-
),
|
436 |
received_results + num_processed_results,
|
437 |
result_page_html,
|
438 |
]
|
@@ -457,7 +439,7 @@ if __name__ == "__main__":
|
|
457 |
|
458 |
query.submit(
|
459 |
fn=submit,
|
460 |
-
inputs=[query, lang, k, available_datasets
|
461 |
outputs=[
|
462 |
processed_results_state,
|
463 |
highlight_terms_state,
|
@@ -472,7 +454,7 @@ if __name__ == "__main__":
|
|
472 |
)
|
473 |
submit_btn.click(
|
474 |
submit,
|
475 |
-
inputs=[query, lang, k, available_datasets
|
476 |
outputs=[
|
477 |
processed_results_state,
|
478 |
highlight_terms_state,
|
@@ -493,7 +475,6 @@ if __name__ == "__main__":
|
|
493 |
lang,
|
494 |
k,
|
495 |
available_datasets,
|
496 |
-
max_page_size_state,
|
497 |
received_results_state,
|
498 |
processed_results_state,
|
499 |
],
|
|
|
19 |
def get_docid_html(docid):
    """Render a document id as an HTML snippet linking to its dataset.

    Args:
        docid: Identifier of the form ``"<data_org>/<dataset>/<docid>"``.
            Only the first two ``/`` act as separators, so the trailing
            document part may itself contain slashes.

    Returns:
        An HTML string: a bold link to the dataset page under
        ``https://huggingface.co/datasets/bigscience-data/`` followed by a
        ``<span>`` holding ``/<docid>``. Datasets whose metadata is flagged
        ``private`` are drawn in the locked color with a lock prefix;
        other datasets use the open color and put the text after the last
        ``:`` of their first tag into the link tooltip as the license.

    Raises:
        ValueError: If ``docid`` has fewer than three ``/``-separated parts.
        KeyError: If the dataset is not a key of ``roots_datasets``.
    """
    # maxsplit=2 keeps any further "/" inside the document part instead of
    # failing the three-way unpacking.
    _data_org, dataset, docid = docid.split("/", 2)
    metadata = roots_datasets[dataset]
    locked_color = "LightGray"
    open_color = "#7978FF"

    # The two variants differ only in tooltip, link color and link label.
    if metadata.private:
        title = "This dataset is private. See the introductory text for more information"
        link_color = locked_color
        label = "🔒" + dataset
    else:
        title = "This dataset is licensed {}".format(
            metadata.tags[0].split(":")[-1]
        )
        link_color = open_color
        label = dataset

    # The trailing "/<docid>" span always uses the open color, also for
    # private datasets.
    return """
    <a title="{title}"
       style="color:{link_color}; font-weight: bold; text-decoration:none"
       onmouseover="style='color:{link_color}; font-weight: bold; text-decoration:underline'"
       onmouseout="style='color:{link_color}; font-weight: bold; text-decoration:none'"
       href="https://huggingface.co/datasets/bigscience-data/{dataset}"
       target="_blank">
        {label}
    </a>
    <span style="color:{open_color}; ">/{docid}</span>""".format(
        title=title,
        link_color=link_color,
        dataset=dataset,
        label=label,
        open_color=open_color,
        docid=docid,
    )
|
57 |
|
|
|
71 |
return text
|
72 |
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
75 |
text, url, docid = result
|
76 |
if datasets_filter is not None:
|
|
|
117 |
language = "FIXME"
|
118 |
result_html = """{}
|
119 |
<span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</span>
|
120 |
+
<a href="https://forms.gle/AdBLLwRApqcLkHYA8" target="_blank">
|
121 |
+
<button style="color:#ffcdf8; ">🏴‍☠️ Flag result 🏴‍☠️</button>
|
122 |
+
</a><br>
|
123 |
<!-- <span style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</span><br> -->
|
124 |
<span style='font-family: Arial;'>{}</span><br>
|
125 |
<br>
|
|
|
258 |
|
259 |
|
260 |
if __name__ == "__main__":
|
261 |
+
demo = gr.Blocks(css=".underline-on-hover:hover { text-decoration: underline; }")
|
|
|
|
|
262 |
|
263 |
with demo:
|
264 |
processed_results_state = gr.State([])
|
|
|
266 |
num_results_state = gr.State(0)
|
267 |
exact_search_state = gr.State(False)
|
268 |
lang_state = gr.State("")
|
|
|
269 |
received_results_state = gr.State(0)
|
270 |
|
271 |
with gr.Row():
|
|
|
301 |
value="en",
|
302 |
label="Language",
|
303 |
)
|
304 |
+
k = gr.Slider(
|
305 |
+
1,
|
306 |
+
100,
|
307 |
+
value=10,
|
308 |
+
step=1,
|
309 |
+
label="Max Results in fuzzy search or Max Results per page in exact search",
|
310 |
+
)
|
311 |
with gr.Row():
|
312 |
submit_btn = gr.Button("Submit")
|
313 |
with gr.Row(visible=False) as datasets_filter:
|
|
|
324 |
with gr.Row(visible=False) as pagination:
|
325 |
next_page_btn = gr.Button("Next Page")
|
326 |
|
327 |
+
def run_query(query, lang, k, dropdown_input, received_results):
|
328 |
query = query.strip()
|
329 |
exact_search = False
|
330 |
if query.startswith('"') and query.endswith('"') and len(query) >= 2:
|
331 |
exact_search = True
|
332 |
query = query[1:-1]
|
|
|
333 |
else:
|
334 |
query = " ".join(query.split())
|
335 |
if query == "" or query is None:
|
336 |
return None
|
337 |
|
|
|
338 |
payload = request_payload(query, lang, exact_search, k, received_results)
|
339 |
err = extract_error_from_payload(payload)
|
340 |
if err is not None:
|
|
|
363 |
ds,
|
364 |
)
|
365 |
|
366 |
+
def submit(query, lang, k, dropdown_input):
|
367 |
+
print("submitting", query, lang, k)
|
368 |
(
|
369 |
processed_results,
|
370 |
highlight_terms,
|
|
|
372 |
exact_search,
|
373 |
result_page_html,
|
374 |
datasets,
|
375 |
+
) = run_query(query, lang, k, dropdown_input, 0)
|
376 |
+
has_more_results = exact_search and (num_results > k)
|
377 |
return [
|
378 |
processed_results,
|
379 |
highlight_terms,
|
|
|
391 |
lang,
|
392 |
k,
|
393 |
dropdown_input,
|
|
|
394 |
received_results,
|
395 |
processed_results,
|
396 |
):
|
|
|
401 |
exact_search,
|
402 |
result_page_html,
|
403 |
datasets,
|
404 |
+
) = run_query(query, lang, k, dropdown_input, received_results)
|
|
|
|
|
405 |
num_processed_results = len(next(iter(processed_results.values())))
|
406 |
+
has_more_results = exact_search and (num_results > k)
|
407 |
print("num_processed_results", num_processed_results)
|
408 |
print("has_more_results", has_more_results)
|
409 |
print("received_results", received_results)
|
|
|
414 |
exact_search,
|
415 |
gr.update(visible=True),
|
416 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
417 |
+
gr.update(visible=num_processed_results >= k and has_more_results),
|
|
|
|
|
418 |
received_results + num_processed_results,
|
419 |
result_page_html,
|
420 |
]
|
|
|
439 |
|
440 |
query.submit(
|
441 |
fn=submit,
|
442 |
+
inputs=[query, lang, k, available_datasets],
|
443 |
outputs=[
|
444 |
processed_results_state,
|
445 |
highlight_terms_state,
|
|
|
454 |
)
|
455 |
submit_btn.click(
|
456 |
submit,
|
457 |
+
inputs=[query, lang, k, available_datasets],
|
458 |
outputs=[
|
459 |
processed_results_state,
|
460 |
highlight_terms_state,
|
|
|
475 |
lang,
|
476 |
k,
|
477 |
available_datasets,
|
|
|
478 |
received_results_state,
|
479 |
processed_results_state,
|
480 |
],
|