victormiller committed
Commit 0bc171c
Parent: beddb3a

Update curated.py

Files changed (1):
  curated.py  +65 -8
curated.py CHANGED

@@ -514,6 +514,63 @@ freelaw_examples = Div(
     ),
 )
 
+def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "StackExchange":
+        raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/stackexchange_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="StackExchange",
+        data_sources="StackExchange",
+        target=target,
+    )
+
+se_examples = Div(
+    Div(
+        get_se_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "PhilPapers":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/philpapers_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="PhilPapers",
+        data_sources="PhilPapers",
+        target=target,
+    )
+
+phil_examples = Div(
+    Div(
+        get_phil_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
 filtering_process = Div(
     Section(
         H3("This section contains the specific filtering steps taken for all 14 curated datasets."),

@@ -643,10 +700,10 @@ filtering_process = Div(
                 Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
             ),
             table_div_phil,
-            # Details(
-            #     Summary("Phil Papers Filtering Examples"),
-            #     phil_examples,
-            # ),
+            Details(
+                Summary("Phil Papers Filtering Examples"),
+                phil_examples,
+            ),
         ),
     ),
     Section(

@@ -751,10 +808,10 @@ filtering_process = Div(
                 Li("Minimum Word Count Filter: 10"),
             ),
             table_div_se,
-            # Details(
-            #     Summary("StackExchange Filtering Examples"),
-            #     se_examples,
-            # ),
+            Details(
+                Summary("StackExchange Filtering Examples"),
+                se_examples,
+            ),
         ),
     ),
     Section(
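
Both helpers added in this commit follow the same pattern: clamp doc_id to the ten bundled sample documents, load the raw and extracted JSON, and hand the pair to view_data for display inside the new Details/Summary collapsibles. The sketch below is illustrative only and not part of commit 0bc171c: the name load_curated_sample is made up, it uses context managers instead of the bare open() calls in the diff, and it returns the raw/extracted pair rather than calling the app's view_data so it runs standalone.

    import json

    # Illustrative sketch -- mirrors the clamp-and-load pattern of
    # get_se_data()/get_phil_data() in curated.py; not part of the commit.
    def load_curated_sample(raw_path, extract_path=None, doc_id=3):
        # The bundled sample files hold ten documents, so clamp doc_id to 0-9.
        doc_id = max(0, min(int(doc_id), 9))

        with open(raw_path) as f:
            raw_docs = json.load(f)

        # PhilPapers ships a single sample file, so the extracted view
        # falls back to the raw documents, as in get_phil_data().
        if extract_path is None:
            extracted_docs = raw_docs
        else:
            with open(extract_path) as f:
                extracted_docs = json.load(f)

        return raw_docs[doc_id], extracted_docs[doc_id]

    # Example: the StackExchange pair that get_se_data(doc_id=3) would render.
    raw_json, extracted_json = load_curated_sample(
        "data/curated_samples/stackexchange_raw.json",
        "data/curated_samples/stackexchange_extract.json",
        doc_id=3,
    )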