victormiller committed
Commit 5094bb7
Parent: 103b5cf

Update curated.py

Files changed (1)
  1. curated.py +25 -1
curated.py CHANGED
@@ -458,6 +458,7 @@ filtering_process = Div(
 ),
 ),
 Section(
+ Div(
 H3("ArXiv"),
 H4("Download and Extraction"),
 P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
@@ -474,8 +475,10 @@ filtering_process = Div(
 Li("Local dedup was done with all papers combined."),
 ),
 table_div_arx,
+ ),
 ),
 Section(
+ Div(
 H3("S2ORC - NEED TO MAKE S2ORC ABSTRACT AND UPDATE THIS FILTERING SECTION"),
 H4("Download and Extraction"),
 Ol(
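The dedup bullets in this and the following hunks mention local dedup followed by a MinHash pass over the combined datasets. The snippet below is only an illustration of that idea with the datasketch library; the library choice, whitespace shingling, num_perm=128 and the 0.8 threshold are all assumptions.

from datasketch import MinHash, MinHashLSH

def signature(text: str, num_perm: int = 128) -> MinHash:
    # Whitespace tokens as shingles is an assumption, not the actual recipe.
    m = MinHash(num_perm=num_perm)
    for token in text.lower().split():
        m.update(token.encode("utf-8"))
    return m

lsh = MinHashLSH(threshold=0.8, num_perm=128)   # threshold is an assumption
kept = []
for doc_id, text in [("a", "first paper text ..."), ("b", "second paper text ...")]:
    sig = signature(text)
    if lsh.query(sig):          # a near-duplicate is already indexed: drop
        continue
    lsh.insert(doc_id, sig)     # otherwise keep and index the document
    kept.append(doc_id)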
@@ -509,8 +512,10 @@ filtering_process = Div(
 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
 ),
 table_div_s2o,
+ ),
 ),
 Section(
+ Div(
 H3("PubMed - need to update with abstract vs central"),
 H4("Download and Extraction"),
 Ol(
@@ -538,8 +543,10 @@ filtering_process = Div(
 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
 ),
 table_div_med,
+ ),
 ),
 Section(
+ Div(
 H3("Phil Papers"),
 H4("Download and Extraction"),
 P("Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
@@ -552,8 +559,10 @@ filtering_process = Div(
 Li("Local dedup was done with all papers combined."),
 ),
 table_div_phil,
+ ),
 ),
 Section(
+ Div(
 H3("Europarl"),
 H4("Download and Extraction"),
 P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
@@ -565,7 +574,9 @@ filtering_process = Div(
 ),
 table_div_up,
 ),
+ ),
 Section(
+ Div(
 H3("HackerNews"),
 H4("Download and Extraction"),
 P("The dataset was downloaded from the HackerNews repo here:", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
@@ -581,8 +592,10 @@ filtering_process = Div(
 Li("Local dedup was done within hackernews itself"),
 ),
 table_div_hn,
+ ),
 ),
 Section(
+ Div(
 H3("USPTO"),
 H4("Download and Extraction"),
 P("Data was downloaded and extracted using tags from", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004."),
@@ -597,8 +610,10 @@ filtering_process = Div(
 Li("Local dedup was done within USPTO itself"),
 ),
 table_div_uspto,
+ ),
 ),
 Section(
+ Div(
 H3("FreeLaw"),
 H4("Download and Extraction"),
 #P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), )#". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
@@ -623,8 +638,10 @@ filtering_process = Div(
 Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
 ),
 table_div_freelaw,
+ ),
 ),
 Section(
+ Div(
 H3("StackExchange"),
 H4("Download and Extraction"),
 P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
@@ -648,8 +665,10 @@ filtering_process = Div(
 Li("Local dedup was done within stackexchange itself"),
 ),
 table_div_se,
+ ),
 ),
 Section(
+ Div(
 H3("Ubuntu IRC"),
 H4("Download and Extraction"),
 P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
@@ -675,9 +694,11 @@ filtering_process = Div(
 Li("Local dedup was done within Ubuntu IRC itself"),
 ),
 table_div_uirc,
+ ),
 ),
 Section(
- H3("DM Maths"),
+ Div(
+ H3("DM Math"),
 H4("Download and Extraction"),
 P("The dataset was downloaded rirectly downloaded from the Huggingface repo:", A("https://huggingface.co/datasets/deepmind/math_dataset",href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where lines is represented as:"),
 D_code("""
@@ -692,8 +713,10 @@ filtering_process = Div(
 Li("None"),
 ),
 table_div_dmm,
+ ),
 ),
 Section(
+ Div(
 H3("PG19"),
 H4("Download and Extraction"),
 Ol(
@@ -710,6 +733,7 @@ filtering_process = Div(
 Li("Local dedup was done within PG19 itself"),
 ),
 table_div_pg19,
+ ),
 ),
 )
 
 