victormiller committed
Commit 5094bb7
Parent: 103b5cf

Update curated.py

Files changed (1)
  1. curated.py +25 -1
curated.py CHANGED
@@ -458,6 +458,7 @@ filtering_process = Div(
 ),
 ),
 Section(
+ Div(
 H3("ArXiv"),
 H4("Download and Extraction"),
 P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
@@ -474,8 +475,10 @@ filtering_process = Div(
 Li("Local dedup was done with all papers combined."),
 ),
 table_div_arx,
+ ),
 ),
 Section(
+ Div(
 H3("S2ORC - NEED TO MAKE S2ORC ABSTRACT AND UPDATE THIS FILTERING SECTION"),
 H4("Download and Extraction"),
 Ol(
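The dedup bullets in this and the following hunks mention local dedup followed by a MinHash pass over the combined datasets. The snippet below is only an illustration of that idea with the datasketch library; the library choice, whitespace shingling, num_perm=128 and the 0.8 threshold are all assumptions.

from datasketch import MinHash, MinHashLSH

def signature(text: str, num_perm: int = 128) -> MinHash:
    # Whitespace tokens as shingles is an assumption, not the actual recipe.
    m = MinHash(num_perm=num_perm)
    for token in text.lower().split():
        m.update(token.encode("utf-8"))
    return m

lsh = MinHashLSH(threshold=0.8, num_perm=128)   # threshold is an assumption
kept = []
for doc_id, text in [("a", "first paper text ..."), ("b", "second paper text ...")]:
    sig = signature(text)
    if lsh.query(sig):          # a near-duplicate is already indexed: drop
        continue
    lsh.insert(doc_id, sig)     # otherwise keep and index the document
    kept.append(doc_id)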
@@ -509,8 +512,10 @@ filtering_process = Div(
 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
 ),
 table_div_s2o,
+ ),
 ),
 Section(
+ Div(
 H3("PubMed - need to update with abstract vs central"),
 H4("Download and Extraction"),
 Ol(
@@ -538,8 +543,10 @@ filtering_process = Div(
 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
 ),
 table_div_med,
+ ),
 ),
 Section(
+ Div(
 H3("Phil Papers"),
 H4("Download and Extraction"),
 P("Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
@@ -552,8 +559,10 @@ filtering_process = Div(
 Li("Local dedup was done with all papers combined."),
 ),
 table_div_phil,
+ ),
 ),
 Section(
+ Div(
 H3("Europarl"),
 H4("Download and Extraction"),
 P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
@@ -565,7 +574,9 @@ filtering_process = Div(
 ),
 table_div_up,
 ),
+ ),
 Section(
+ Div(
 H3("HackerNews"),
 H4("Download and Extraction"),
 P("The dataset was downloaded from the HackerNews repo here:", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
@@ -581,8 +592,10 @@ filtering_process = Div(
 Li("Local dedup was done within hackernews itself"),
 ),
 table_div_hn,
+ ),
 ),
 Section(
+ Div(
 H3("USPTO"),
 H4("Download and Extraction"),
 P("Data was downloaded and extracted using tags from", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004."),
@@ -597,8 +610,10 @@ filtering_process = Div(
 Li("Local dedup was done within USPTO itself"),
 ),
 table_div_uspto,
+ ),
 ),
 Section(
+ Div(
 H3("FreeLaw"),
 H4("Download and Extraction"),
 #P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), )#". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
@@ -623,8 +638,10 @@ filtering_process = Div(
 Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
 ),
 table_div_freelaw,
+ ),
 ),
 Section(
+ Div(
 H3("StackExchange"),
 H4("Download and Extraction"),
 P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
@@ -648,8 +665,10 @@ filtering_process = Div(
 Li("Local dedup was done within stackexchange itself"),
 ),
 table_div_se,
+ ),
 ),
 Section(
+ Div(
 H3("Ubuntu IRC"),
 H4("Download and Extraction"),
 P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
@@ -675,9 +694,11 @@ filtering_process = Div(
 Li("Local dedup was done within Ubuntu IRC itself"),
 ),
 table_div_uirc,
+ ),
 ),
 Section(
- H3("DM Maths"),
+ Div(
+ H3("DM Math"),
 H4("Download and Extraction"),
 P("The dataset was downloaded rirectly downloaded from the Huggingface repo:", A("https://huggingface.co/datasets/deepmind/math_dataset",href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where lines is represented as:"),
 D_code("""
@@ -692,8 +713,10 @@ filtering_process = Div(
 Li("None"),
 ),
 table_div_dmm,
+ ),
 ),
 Section(
+ Div(
 H3("PG19"),
 H4("Download and Extraction"),
 Ol(
@@ -710,6 +733,7 @@ filtering_process = Div(
 Li("Local dedup was done within PG19 itself"),
 ),
 table_div_pg19,
+ ),
 ),
 )
 
 