# Spaces: Sleeping  (status text captured ahead of the code; not part of the program)
from fasthtml.common import *
import json
# Names offered in the "Data source" drop-down, in display order.
# Not every entry has curated sample files; sources without any are rendered
# with empty placeholder documents.
data_sources = [
    "Freelaw",
    "Wikipedia",
    "PhilPapers",
    "Arxiv",
    "S2ORC",
    "S2ORC Abstract",
    "Pubmeds",
    "USPTO",
    "Hackernews",
    "Ubuntu IRC",
    "StackExchange",
    "DM Maths",
    "PG19",
    "Europarl",
]
# Curated sample files per data source, as (raw_path, extracted_path).
# An extracted_path of None means the source has a single sample file whose
# documents are shown in both the "raw" and the "extracted" column.
_SAMPLE_FILES = {
    "Freelaw": (
        "data/curated_samples/freelaw_raw.json",
        "data/curated_samples/freelaw_extract.json",
    ),
    "Wikipedia": ("data/curated_samples/wiki.json", None),
    "StackExchange": (
        "data/curated_samples/stackexchange_raw.json",
        "data/curated_samples/stackexchange_extract.json",
    ),
    "PhilPapers": ("data/curated_samples/philpapers_raw.json", None),
    "Arxiv": (
        "data/curated_samples/arxiv_raw.json",
        "data/curated_samples/arxiv_extract.json",
    ),
    "S2ORC": ("data/curated_samples/s2orc_raw.json", None),
    "S2ORC Abstract": ("data/curated_samples/s2orc_abstract_raw.json", None),
}


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    # JSON is read as UTF-8 explicitly instead of relying on the locale default.
    with open(path, encoding="utf-8") as sample_file:
        return json.load(sample_file)


def _load_sample_docs(data_source):
    """Return ``(raw_docs, extracted_docs)`` for *data_source*.

    Sources without an entry in ``_SAMPLE_FILES`` get ten empty dicts, shared
    by both elements of the returned pair.
    """
    paths = _SAMPLE_FILES.get(data_source)
    if paths is None:
        placeholder = [{} for _ in range(10)]
        return placeholder, placeholder

    raw_path, extracted_path = paths
    raw_docs = _load_json(raw_path)
    if extracted_path is None:
        # Single-file source: both columns show the same documents.
        return raw_docs, raw_docs
    return raw_docs, _load_json(extracted_path)


def _json_column(title, document, side):
    """Build one half-width column showing *document* as indented JSON."""
    return Div(
        H3(title),
        Pre(
            json.dumps(document, indent=4),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style=f"width: 48%; float: {side}; overflow-x: auto;",
    )


def get_data(data_source: str = "Freelaw", data_ext_doc_id: int = 3, htmx=None):
    """Render the curated-sample viewer for one data source and document.

    Args:
        data_source: Name of the data source to show. Names without curated
            sample files are rendered with empty documents.
        data_ext_doc_id: Index of the sample document; coerced with ``int()``
            and clamped to the range 0..9.
        htmx: Not used by this function.
            NOTE(review): presumably supplied by the web framework - confirm.

    Returns:
        A 1-tuple holding the ``Div`` with id ``colcontent`` that contains the
        selection form and the raw/extracted JSON columns.

    Raises:
        OSError: If a sample file cannot be opened.
        ValueError: If ``data_ext_doc_id`` is not convertible to ``int`` or a
            sample file is not valid JSON.
        IndexError: If the sample file holds fewer documents than the
            requested index.
    """
    data_ext_doc_id = max(0, min(int(data_ext_doc_id), 9))

    raw_sample_doc, extracted_sample_doc = _load_sample_docs(data_source)
    raw_json = raw_sample_doc[data_ext_doc_id]
    extracted_json = extracted_sample_doc[data_ext_doc_id]

    # Both controls re-render the viewer in place whenever their value changes.
    # NOTE(review): htmx GET requests send only the triggering element's own
    # value, so the other control falls back to its default - confirm whether
    # hx_include="closest form" is wanted here.
    refresh_attrs = dict(
        hx_get="/curated",
        hx_target="#colcontent",
        hx_trigger="change",
        hx_swap="innerHTML",
    )

    drop_down = Select(
        *[Option(ds, value=ds, selected=(ds == data_source)) for ds in data_sources],
        name="data_source",
        **refresh_attrs,
    )
    # NOTE(review): the oninput handler targets an element with id
    # "doc_id_value", which this function does not create - confirm it exists
    # elsewhere on the page.
    slider = Input(
        type="range",
        name="data_ext_doc_id",
        min="0",
        max="9",
        value=str(data_ext_doc_id),
        **refresh_attrs,
        oninput='document.getElementById("doc_id_value").innerText = "Selected document index: " + this.value',
    )

    form = Form(
        Div(
            Label("Data source:", drop_down),
            style="margin-bottom: 20px;",
        ),
        Div(
            Label("Data sample:", slider, f"{data_ext_doc_id}"),
            style="margin-bottom: 20px;",
        ),
    )

    col1 = _json_column("Raw format", raw_json, "left")
    col2 = _json_column("Extracted format", extracted_json, "right")
    data_display = Div(col1, col2, style="overflow: auto; clear: both; height: 600px;")

    return (Div(form, data_display, style="margin-top: 10px;", id="colcontent"),)