from fasthtml.common import *
import json

# Data sources offered in the drop-down, in display order.
data_sources = [
    "Freelaw",
    "Wikipedia",
    "PhilPapers",
    "Arxiv",
    "S2ORC",
    "S2ORC Abstract",
    "Pubmed",
    "USPTO",
    "Hackernews",
    "Ubuntu IRC",
    "StackExchange",
    "DM Maths",
    "PG19",
    "Europarl",
]

# Directory holding the curated sample files (relative to the working directory).
_SAMPLE_DIR = "data/curated_samples"

# Highest selectable sample index; shared by the slider, the clamp in
# get_data and the size of the placeholder list.
_MAX_DOC_ID = 9

# data source -> (raw file, extracted file).
# When both entries name the same file it is read once and used for both
# columns.  Sources listed in ``data_sources`` but absent here ("USPTO",
# "Hackernews", "Ubuntu IRC") are rendered with empty placeholder documents.
_SAMPLE_FILES = {
    "Freelaw": ("freelaw_raw.json", "freelaw_extract.json"),
    "Wikipedia": ("wiki.json", "wiki.json"),
    "StackExchange": ("stackexchange_raw.json", "stackexchange_extract.json"),
    "PhilPapers": ("philpapers_raw.json", "philpapers_raw.json"),
    "Arxiv": ("arxiv_raw.json", "arxiv_extract.json"),
    "S2ORC": ("s2orc_raw.json", "s2orc_raw.json"),
    "S2ORC Abstract": ("s2orc_abstract_raw.json", "s2orc_abstract_raw.json"),
    "Pubmed": ("pubmed_raw.json", "pubmed_extract.json"),
    "DM Maths": ("dm_maths_raw.json", "dm_maths_extract.json"),
    "PG19": ("pg19_raw.json", "pg19_raw.json"),
    "Europarl": ("europarl_raw.json", "europarl_raw.json"),
}


def _load_json(path: str):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _load_samples(data_source: str):
    """Return ``(raw_docs, extracted_docs)`` for *data_source*.

    A source without an entry in ``_SAMPLE_FILES`` yields one shared list of
    empty dicts so that every valid ``doc_id`` can still be indexed.
    """
    files = _SAMPLE_FILES.get(data_source)
    if files is None:
        placeholder = [{} for _ in range(_MAX_DOC_ID + 1)]
        return placeholder, placeholder

    raw_name, extracted_name = files
    raw_docs = _load_json(f"{_SAMPLE_DIR}/{raw_name}")
    if extracted_name == raw_name:
        # Same file for both views: read it once and share the object.
        return raw_docs, raw_docs
    return raw_docs, _load_json(f"{_SAMPLE_DIR}/{extracted_name}")


def _json_column(title: str, payload, side: str):
    """Build one floated column showing *payload* as indented JSON.

    *side* is the CSS ``float`` value (``"left"`` or ``"right"``).
    """
    return Div(
        H3(title),
        Pre(
            json.dumps(payload, indent=4),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style=f"width: 48%; float: {side}; overflow-x: auto;",
    )


def get_data(data_source: str = "Freelaw", doc_id: int = 3):
    """Render the curated-sample viewer for one source and one sample.

    Args:
        data_source: Name of the data source to show. A name with no sample
            file mapping is displayed with empty documents.
        doc_id: Index of the sample to show. It is converted with ``int`` and
            clamped to the range 0..9.

    Returns:
        A ``Div`` with id ``colcontent`` containing the selection form and the
        raw/extracted JSON shown side by side.

    Raises:
        ValueError: If ``doc_id`` cannot be converted to ``int``.
        OSError: If a sample file cannot be opened.
        IndexError: If the loaded sample list has no entry at ``doc_id``.
    """
    doc_id = max(0, min(int(doc_id), _MAX_DOC_ID))

    raw_docs, extracted_docs = _load_samples(data_source)
    raw_json = raw_docs[doc_id]
    extracted_json = extracted_docs[doc_id]

    # Both controls re-request the viewer and swap it into #colcontent.
    htmx_attrs = dict(
        hx_get="/curated",
        hx_target="#colcontent",
        hx_trigger="change",
        hx_swap="innerHTML",
    )

    # NOTE(review): the drop-down has no hx_include for doc_id, so a source
    # change presumably falls back to the default sample - confirm intended.
    drop_down = Select(
        *[Option(ds, value=ds, selected=(ds == data_source)) for ds in data_sources],
        name="data_source",
        **htmx_attrs,
    )
    slider = Input(
        type="range",
        name="doc_id",
        min="0",
        max=str(_MAX_DOC_ID),
        value=str(doc_id),
        **htmx_attrs,
        # Send the currently selected source along with the slider value.
        hx_include="[name='data_source']",
    )

    form = Form(
        Div(
            Label("Data source: ", drop_down),
            style="margin-bottom: 20px;",
        ),
        Div(
            Label("Data sample: ", slider, f"{doc_id}"),
            style="margin-bottom: 20px;",
        ),
    )

    data_display = Div(
        _json_column("Raw format", raw_json, "left"),
        _json_column("Extracted format", extracted_json, "right"),
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )

    return Div(form, data_display, style="margin-top: 10px;", id="colcontent")