from fasthtml.common import *
import json
# Data-source names, in the order they are listed as options in the
# "Data source" drop-down built by get_data().
data_sources = [
    "Freelaw", "Wikipedia", "PhilPapers", "Arxiv",
    "S2ORC", "S2ORC Abstract", "Pubmed", "USPTO",
    "Hackernews", "Ubuntu IRC", "StackExchange", "DM Maths",
    "PG19", "Europarl",
]
# Curated sample files per data source, as (raw_path, extracted_path).
# Sources whose raw and extracted views come from one file repeat the path.
_CURATED_SAMPLE_FILES = {
    "Freelaw": (
        "data/curated_samples/freelaw_raw.json",
        "data/curated_samples/freelaw_extract.json",
    ),
    "Wikipedia": (
        "data/curated_samples/wiki.json",
        "data/curated_samples/wiki.json",
    ),
    "StackExchange": (
        "data/curated_samples/stackexchange_raw.json",
        "data/curated_samples/stackexchange_extract.json",
    ),
    "PhilPapers": (
        "data/curated_samples/philpapers_raw.json",
        "data/curated_samples/philpapers_raw.json",
    ),
    "Arxiv": (
        "data/curated_samples/arxiv_raw.json",
        "data/curated_samples/arxiv_extract.json",
    ),
    "S2ORC": (
        "data/curated_samples/s2orc_raw.json",
        "data/curated_samples/s2orc_raw.json",
    ),
    "S2ORC Abstract": (
        "data/curated_samples/s2orc_abstract_raw.json",
        "data/curated_samples/s2orc_abstract_raw.json",
    ),
    "Pubmed": (
        "data/curated_samples/pubmed_raw.json",
        "data/curated_samples/pubmed_extract.json",
    ),
    "DM Maths": (
        "data/curated_samples/dm_maths_raw.json",
        "data/curated_samples/dm_maths_extract.json",
    ),
    "PG19": (
        "data/curated_samples/pg19_raw.json",
        "data/curated_samples/pg19_raw.json",
    ),
    "Europarl": (
        "data/curated_samples/europarl_raw.json",
        "data/curated_samples/europarl_raw.json",
    ),
}


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    # Binary mode lets the json module detect the file's Unicode encoding
    # itself instead of relying on the platform's locale encoding.
    with open(path, "rb") as fh:
        return json.load(fh)


def get_data(data_source: str = "Freelaw", doc_id: int = 3):
    """Build the curated-sample viewer for one data source and sample index.

    Loads the raw and extracted sample documents for ``data_source``, picks
    the entry at ``doc_id`` from each, and renders them side by side below a
    form holding a data-source drop-down and a sample slider. Both controls
    issue an htmx GET to ``/curated`` on change and swap the result into
    ``#colcontent``.

    Args:
        data_source: Name of the data source to display. A name with no
            entry in ``_CURATED_SAMPLE_FILES`` is shown with empty
            placeholder documents instead of raising.
        doc_id: Index of the sample to display. It is converted with
            ``int()`` and clamped to the range 0..9.

    Returns:
        A ``Div`` with ``id="colcontent"`` containing the form and the
        two-column raw/extracted display.

    Raises:
        ValueError: If ``doc_id`` cannot be converted to an integer.
        OSError: If a sample file cannot be opened.
        IndexError: If a sample file holds fewer entries than ``doc_id + 1``.
    """
    # Keep the index inside the slider's 0..9 range regardless of the input.
    doc_id = max(0, min(int(doc_id), 9))

    sample_paths = _CURATED_SAMPLE_FILES.get(data_source)
    if sample_paths is None:
        # Unknown source: show empty documents for every slider position.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
    else:
        raw_path, extracted_path = sample_paths
        raw_sample_doc = _load_json(raw_path)
        # Parse a shared file only once and reuse it for both views.
        if extracted_path == raw_path:
            extracted_sample_doc = raw_sample_doc
        else:
            extracted_sample_doc = _load_json(extracted_path)

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]

    drop_down = Select(
        *[Option(ds, value=ds, selected=(ds == data_source)) for ds in data_sources],
        name="data_source",
        hx_get="/curated",
        hx_target="#colcontent",
        hx_trigger="change",
        hx_swap="innerHTML",
    )
    slider = Input(
        type="range",
        name="doc_id",
        min="0",
        max="9",
        value=str(doc_id),
        hx_get="/curated",
        hx_target="#colcontent",
        hx_trigger="change",
        hx_swap="innerHTML",
        # Send the currently selected source along with the slider value.
        hx_include="[name='data_source']",
    )
    form = Form(
        Div(
            Label("Data source: ", drop_down),
        ),
        Div(
            Label("Data sample: ", slider, f"{doc_id}", cls="plotly_slider"),
        ),
        cls="plotly_input_container",
    )

    col1 = Div(
        H3("Raw format"),
        Pre(
            json.dumps(raw_json, indent=4),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="width: 48%; float: left; overflow-x: auto;",
    )
    col2 = Div(
        H3("Extracted format"),
        Pre(
            json.dumps(extracted_json, indent=4),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="width: 48%; float: right; overflow-x: auto;",
    )
    data_display = Div(
        col1,
        col2,
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id="colcontent")