victormiller committed
Commit 8a16e84
1 Parent(s): 0bc171c

Update curated.py

Files changed (1)
  1. curated.py +201 -24
curated.py CHANGED
@@ -571,6 +571,183 @@ phil_examples = Div(
     ),
 )
 
+arx_examples = Div(
+    Div(
+        get_arx_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC",
+        data_sources="S2ORC",
+        target=target,
+    )
+
+s2o_examples = Div(
+    Div(
+        get_S2ORC_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC Abstract":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_abstract_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC Abstract",
+        data_sources="S2ORC Abstract",
+        target=target,
+    )
+
+s2oa_examples = Div(
+    Div(
+        get_S2ORCA_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Pubmed":
+        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/pubmed_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Pubmed",
+        data_sources="Pubmed",
+        target=target,
+    )
+
+pubmed_examples = Div(
+    Div(
+        get_pubmed_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "DM Math":
+        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/dm_maths_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="DM Math",
+        data_sources="DM Math",
+        target=target,
+    )
+
+dmm_examples = Div(
+    Div(
+        get_dmm_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "PG19":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/pg19_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="PG19",
+        data_sources="PG19",
+        target=target,
+    )
+
+pg19_examples = Div(
+    Div(
+        get_pg19_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Europarl":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/europarl_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Europarl",
+        data_sources="Europarl",
+        target=target,
+    )
+
+eu_examples = Div(
+    Div(
+        get_eu_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
 filtering_process = Div(
     Section(
         H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
@@ -605,10 +782,10 @@ filtering_process = Div(
                 Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
             ),
             table_div_arx,
-            # Details(
-            #     Summary("ArXiv Filtering Examples"),
-            #     arx_examples,
-            # ),
+            Details(
+                Summary("ArXiv Filtering Examples"),
+                arx_examples,
+            ),
         ),
     ),
     Section(
@@ -647,10 +824,10 @@ filtering_process = Div(
                 Li("This data was part of the paper domain; after local deduplication, a MinHash was generated and deduplicated together with all the other datasets."),
            ),
            table_div_s2o,
-            # Details(
-            #     Summary("FreeLaw Filtering Examples -- need to update"),
-            #     freelaw_examples,
-            # ),
+            Details(
+                Summary("FreeLaw Filtering Examples -- need to update"),
+                freelaw_examples,
+            ),
         ),
     ),
     Section(
@@ -683,10 +860,10 @@ filtering_process = Div(
                 Li("This data was part of the paper domain; after local deduplication, a MinHash was generated and deduplicated together with all the other datasets."),
            ),
            table_div_med,
-            # Details(
-            #     Summary("PubMed Filtering Examples"),
-            #     pubmed_examples,
-            # ),
+            Details(
+                Summary("PubMed Filtering Examples"),
+                pubmed_examples,
+            ),
         ),
     ),
     Section(
@@ -715,10 +892,10 @@ filtering_process = Div(
             H4("Filtering"),
             P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS', which were removed."),
             table_div_up,
-            # Details(
-            #     Summary("EuroParl Filtering Examples"),
-            #     eu_examples,
-            # ),
+            Details(
+                Summary("EuroParl Filtering Examples"),
+                eu_examples,
+            ),
         ),
     ),
     Section(
@@ -860,10 +1037,10 @@ filtering_process = Div(
                 Li("None"),
             ),
             table_div_dmm,
-            # Details(
-            #     Summary("DM Math Filtering Examples"),
-            #     dmm_examples,
-            # ),
+            Details(
+                Summary("DM Math Filtering Examples"),
+                dmm_examples,
+            ),
         ),
     ),
     Section(
@@ -881,10 +1058,10 @@ filtering_process = Div(
                 Li("Unigram Log Probability"),
             ),
             table_div_pg19,
-            #Details(
-            # Summary("PG-19 Filtering Examples"),
-            # pg19_examples,
-            #),
+            Details(
+                Summary("PG-19 Filtering Examples"),
+                pg19_examples,
+            ),
         ),
     ),
 )
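
For reference, the Frequency Filter described in the ArXiv hunk above amounts to a single check: split the document on whitespace, find the most frequent alpha-only word, and compare its share of all words against the 7.5% threshold. A minimal sketch of that check (a hypothetical helper, not part of curated.py):

    from collections import Counter

    def passes_frequency_filter(text: str, threshold: float = 0.075) -> bool:
        # Words come from splitting on whitespace, as the note states.
        # A document passes when its most frequent alpha-only word
        # accounts for less than `threshold` (7.5%) of all words.
        words = text.split()
        if not words:
            return True  # nothing to measure; treat empty docs as passing
        alpha_counts = Counter(w for w in words if w.isalpha())
        if not alpha_counts:
            return True  # no alpha-only words, so the check does not apply
        _, top_count = alpha_counts.most_common(1)[0]
        return top_count / len(words) < threshold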
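Several hunks repeat that each source was deduplicated locally and that a MinHash was then generated and deduplicated together with the other curated datasets. The diff does not include that pipeline; the toy sketch below only illustrates the MinHash idea it relies on (character shingles and all parameters are illustrative assumptions, not the project's actual settings):

    import hashlib

    def minhash_signature(text: str, num_perm: int = 64, k: int = 5) -> tuple:
        # Character k-shingles stand in for whatever tokenization the
        # real pipeline uses (an assumption; the diff does not show it).
        shingles = {text[i : i + k] for i in range(max(1, len(text) - k + 1))}
        signature = []
        for seed in range(num_perm):
            salt = seed.to_bytes(2, "big")  # one hash function per seed
            signature.append(
                min(
                    int.from_bytes(
                        hashlib.blake2b(s.encode(), digest_size=8, salt=salt).digest(),
                        "big",
                    )
                    for s in shingles
                )
            )
        return tuple(signature)

    def estimated_jaccard(a: tuple, b: tuple) -> float:
        # The fraction of agreeing positions estimates the Jaccard
        # similarity of the shingle sets; near-duplicates score near 1.0.
        return sum(x == y for x, y in zip(a, b)) / len(a)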
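The EuroParl hunk names two download-time steps: dropping documents under 200 characters and stripping 'TAGS'. The exact tag format is not shown in the diff; the sketch below is hypothetical and assumes angle-bracket markup:

    import re

    TAG_RE = re.compile(r"<[^>]+>")  # assumed tag shape; the diff only says 'TAGS'

    def filter_europarl_doc(text: str, min_chars: int = 200):
        # Strip tags first, then apply the 200-character minimum from the note.
        cleaned = TAG_RE.sub("", text).strip()
        return cleaned if len(cleaned) >= min_chars else None  # None = dropped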