Spaces:
Sleeping
Sleeping
File size: 4,240 Bytes
6770b66 b6c56e9 6770b66 7e8dbcd 6770b66 b6c56e9 6770b66 7e8dbcd 6770b66 7e8dbcd 6770b66 7e8dbcd 6770b66 7e8dbcd 6770b66 7e8dbcd 6770b66 85da60b 6770b66 85da60b 6770b66 7e8dbcd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
from fasthtml.common import *
import json
# Names of the curated data sources, in the order they are offered in the
# source-selection drop-down built by get_data().
data_sources = [
    "Freelaw", "Wikipedia", "PhilPapers", "Arxiv",
    "S2ORC", "S2ORC Abstract", "Pubmed", "USPTO",
    "Hackernews", "Ubuntu IRC", "StackExchange", "DM Maths",
    "PG19", "Europarl",
]
# Directory holding the curated sample JSON files.
_SAMPLE_DIR = "data/curated_samples"

# Number of sample documents selectable per source; the slider exposes the
# indices 0 .. _NUM_SAMPLES - 1.
_NUM_SAMPLES = 10

# Maps a data source name to its (raw file, extracted file) pair. Sources
# that have a single sample file list it twice, so the raw and extracted
# columns show the same document. Any source without an entry here is
# rendered with empty placeholder documents.
_SAMPLE_FILES = {
    "Freelaw": ("freelaw_raw.json", "freelaw_extract.json"),
    "Wikipedia": ("wiki.json", "wiki.json"),
    "StackExchange": ("stackexchange_raw.json", "stackexchange_extract.json"),
    "PhilPapers": ("philpapers_raw.json", "philpapers_raw.json"),
    "Arxiv": ("arxiv_raw.json", "arxiv_extract.json"),
    "S2ORC": ("s2orc_raw.json", "s2orc_raw.json"),
    "S2ORC Abstract": ("s2orc_abstract_raw.json", "s2orc_abstract_raw.json"),
    "Pubmed": ("pubmed_raw.json", "pubmed_extract.json"),
    "DM Maths": ("dm_maths_raw.json", "dm_maths_extract.json"),
    "PG19": ("pg19_raw.json", "pg19_raw.json"),
    "Europarl": ("europarl_raw.json", "europarl_raw.json"),
}


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _load_sample_docs(data_source):
    """Return the ``(raw_docs, extracted_docs)`` pair for *data_source*."""
    file_names = _SAMPLE_FILES.get(data_source)
    if file_names is None:
        # No curated samples for this source: show empty documents.
        placeholder = [{} for _ in range(_NUM_SAMPLES)]
        return placeholder, placeholder

    raw_name, extracted_name = file_names
    raw_docs = _load_json(f"{_SAMPLE_DIR}/{raw_name}")
    if extracted_name == raw_name:
        # Single-file source: read it once and share it between both columns.
        return raw_docs, raw_docs
    return raw_docs, _load_json(f"{_SAMPLE_DIR}/{extracted_name}")


def get_data(data_source: str = "Freelaw", doc_id: int = 3):
    """Build the curated-sample viewer for one document of one data source.

    Args:
        data_source: Name of the source to display. Sources without an entry
            in ``_SAMPLE_FILES`` are shown with empty documents.
        doc_id: Index of the sample document. It is converted with ``int()``
            and clamped to the range ``0 .. 9``.

    Returns:
        A ``Div`` with id ``colcontent`` containing the selection form (source
        drop-down and sample slider) followed by the raw and extracted
        documents rendered side by side as indented JSON.

    Raises:
        ValueError: If ``doc_id`` cannot be converted to an integer.
        OSError: If a sample file cannot be opened.
        IndexError: If a sample file holds fewer documents than ``doc_id + 1``.
    """
    doc_id = max(0, min(int(doc_id), _NUM_SAMPLES - 1))

    raw_sample_doc, extracted_sample_doc = _load_sample_docs(data_source)
    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]

    # Both controls re-request /curated on change. The slider also sends the
    # current drop-down value (hx_include) so the selected source is kept.
    # NOTE(review): the returned Div reuses id "colcontent" while both controls
    # swap the innerHTML of #colcontent - confirm the /curated route does not
    # end up nesting duplicate ids.
    drop_down = Select(
        *[Option(ds, value=ds, selected=(ds == data_source)) for ds in data_sources],
        name="data_source",
        hx_get="/curated",
        hx_target="#colcontent",
        hx_trigger="change",
        hx_swap="innerHTML",
    )
    slider = Input(
        type="range",
        name="doc_id",
        min="0",
        max=str(_NUM_SAMPLES - 1),
        value=str(doc_id),
        hx_get="/curated",
        hx_target="#colcontent",
        hx_trigger="change",
        hx_swap="innerHTML",
        hx_include="[name='data_source']",
    )
    form = Form(
        Div(
            Label("Data source: ", drop_down),
        ),
        Div(
            Label("Data sample: ", slider, f"{doc_id}", cls="plotly_slider"),
        ),
        cls="plotly_input_container",
    )

    col1 = Div(
        H3("Raw format"),
        Pre(
            json.dumps(raw_json, indent=4),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="width: 48%; float: left; overflow-x: auto;",
    )
    col2 = Div(
        H3("Extracted format"),
        Pre(
            json.dumps(extracted_json, indent=4),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="width: 48%; float: right; overflow-x: auto;",
    )
    data_display = Div(
        col1,
        col2,
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id="colcontent")
|