victormiller committed
Commit 913dc7b
Parent(s): 8acb3f0

Update curated.py

Files changed (1):
  curated.py +63 -31
curated.py CHANGED
@@ -680,17 +680,17 @@ filtering_process = Div(
  P(B("Download and Extraction: "), "Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text with Java via ", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After conversion to text, the language of each document was detected and recorded with the langdetect library (citation needed)."),
  H4("Filtering"),
  Ul(
- Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
- Li(P(B("Newline Filtering:"), D_code("This is/na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
- Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed ",)),
- Li(P(B("Double Whitespace Filtering:"), D_code("This is a test.", language="python"), " becomes ", D_code("This is a test.", language="python"))),
- Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0")),
- Li(P(B("CID Percentage Filter: "), "removes LaTex heavy paragraphs that contain over 10% “CID” font artifacts.")),
- Li(P(B("Letterness Filter: "), "discards paragraphs with a low proportion of letters")),
- Li(P(B("Removing Leading/Trailing Numbers: "), "removes numbers at the start or end of paragraphs. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
- Li(P(B("Fixing Unicode Issues: "), "fixes Unicode issues.")),
- Li(P(B("Combining Diacritics Correction: "), D_code("a'", language="python"), " becomes ", D_code("å", language="python"))),
- Li(P(B("Unigram Log Probability: "), "the document must have higher than -20 average unigram log probability.")),
  ),
  table_div_phil,
  Details(
@@ -714,7 +714,39 @@ filtering_process = Div(
  P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset previously used for translation tasks."),
  P(B("Download and Extraction: "), "The original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to JSON Lines (jsonl) format for filtering."),
  H4("Filtering"),
- P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
  table_div_up,
  Details(
  Summary("EuroParl Filtering Examples"),
@@ -736,12 +768,12 @@ filtering_process = Div(
  H3("HackerNews"),
  P("A high-quality dialog-based dataset of user comments on links submitted to Hacker News, the link aggregator run by Y Combinator."),
  P(B("Download and Extraction: "), "The dataset was downloaded from the HackerNews API: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is treated as a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for every Story ID was requested; if an ID returned an error, the ID was removed. Each request was given a 2-second wait to account for network time."),
- P("The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest threads included stories from the 3rd level onwards. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
  H4("Filtering"),
- Ol(
- Li("Language Filter: English"),
- Li("Minimum Word Count Filter: 10"),
- Li("Unigram Log Probability"),
  ),
  table_div_hn,
  ),
@@ -750,12 +782,12 @@ filtering_process = Div(
  Div(
  H3("USPTO"),
  P("Patent documents from the United States Patent and Trademark Office."),
- P(B("Download and Extraction: "), "Data was downloaded and extracted using tags from", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004."),
  H4("Filtering"),
  Ol(
- Li("Language Filter: English"),
- Li("Minimum Word Count Filter: 50"),
- Li("Unigram Log Probability"),
  ),
  table_div_uspto,
  ),
@@ -778,9 +810,9 @@ filtering_process = Div(
  P("All content was downloaded, leading to a high number of documents filtered out during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
  H4("Filtering"),
  Ol(
- Li("Language Filter: English"),
- Li("Minimum Word Count Filter: 50"),
- Li("Unigram Log Probability"),
  ),
  H4("Local Deduplication Process"),
  Ol(
@@ -821,7 +853,7 @@ filtering_process = Div(
  """),
  H4("Filtering"),
  Ol(
- Li("Minimum Word Count Filter: 10"),
  ),
  table_div_se,
  Details(
@@ -859,9 +891,9 @@ filtering_process = Div(
  """, block="block", language="python" ),
  H4("Filtering"),
  Ol(
- Li("Language Filter: English"),
- Li("Minimum Word Count Filter: 10"),
- Li("Unigram Log Probability"),
  ),
  table_div_uirc,
  ),
@@ -905,9 +937,9 @@ filtering_process = Div(
  P(B("Download and Extraction: "), "The dataset was downloaded directly from Hugging Face: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
  H4("Filtering"),
  Ol(
- Li("Language Filter: ???"),
- Li("Minimum Word Count Filter: 20"),
- Li("Unigram Log Probability"),
  ),
  table_div_pg19,
  Details(
 
  P(B("Download and Extraction: "), "Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text with Java via ", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After conversion to text, the language of each document was detected and recorded with the langdetect library (citation needed)."),
  H4("Filtering"),
  Ul(
+ Li(P(B("Hyphenation Removal: "), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python")), style="margin-bottom: 2px"),
+ Li(P(B("Newline Filtering: "), D_code("This is\\na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style="margin-bottom: 2px"),
+ Li(P(B("Header/Footer Filtering: "), D_code("(c) 2023 Company Name.", language="python"), " is removed"), style="margin-bottom: 2px"),
+ Li(P(B("Double Whitespace Filtering: "), D_code("This is  a test.", language="python"), " becomes ", D_code("This is a test.", language="python")), style="margin-bottom: 2px"),
+ Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length below 2.0 characters"), style="margin-bottom: 2px"),
+ Li(P(B("CID Percentage Filter: "), "removes LaTeX-heavy paragraphs in which more than 10% of characters are “CID” font artifacts"), style="margin-bottom: 2px"),
+ Li(P(B("Letterness Filter: "), "discards paragraphs with a low proportion of letters"), style="margin-bottom: 2px"),
+ Li(P(B("Removing Leading/Trailing Numbers: "), "removes numbers at the start or end of paragraphs, e.g. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style="margin-bottom: 2px"),
+ Li(P(B("Fixing Unicode Issues: "), "repairs malformed or mis-encoded Unicode characters"), style="margin-bottom: 2px"),
+ Li(P(B("Combining Diacritics Correction: "), "merges a letter and its combining diacritic into a single character: ", D_code("a'", language="python"), " becomes ", D_code("å", language="python")), style="margin-bottom: 2px"),
+ Li(P(B("Unigram Log Probability: "), "the document must have an average unigram log probability above -20"), style="margin-bottom: 2px"),
  ),
  table_div_phil,
  Details(
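
A minimal sketch of the PhilArchive extraction step above, assuming the pdfbox jar path from the command shown and the langdetect package; file paths and the record layout are illustrative:

    import json
    import subprocess
    import tempfile

    from langdetect import detect  # pip install langdetect

    PDFBOX_JAR = "../philpapers_resources/src/pdfbox-app-2.0.21.jar"

    def pdf_to_text(pdf_path):
        # Shell out to Apache PDFBox, exactly as in the command above
        with tempfile.NamedTemporaryFile(suffix=".txt") as fout:
            subprocess.run(
                ["java", "-jar", PDFBOX_JAR, "ExtractText", pdf_path, fout.name],
                check=True,
            )
            with open(fout.name, encoding="utf-8", errors="replace") as f:
                return f.read()

    def to_record(pdf_path):
        # One jsonl-style record: extracted text plus detected language
        text = pdf_to_text(pdf_path)
        return json.dumps({"text": text, "language": detect(text)})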
 
  P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset previously used for translation tasks."),
  P(B("Download and Extraction: "), "The original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to JSON Lines (jsonl) format for filtering."),
  H4("Filtering"),
+ P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained HTML-style markup tags, which were removed."),
+ D_code("""
+ Raw single line in data: <P> Hi I am speaker
+ After tag removal: P Hi I am speaker
+ We drop every line whose tag is in ["P", "BRK", "CHAPTER", "/P"]
+ and only keep tagname == "SPEAKER",
+ because a line starting with <SPEAKER> TEXT TEXT ... carries the relevant text.
+ """, block="block", language="python"),
+ D_code("""
+ import bs4
+
+ def process_tag(original_tag):
+     tag = original_tag.strip(">").strip("<")
+
+     # Skip empty tags
+     if not tag:
+         return None
+
+     tagname = tag.split()[0]
+
+     # Skip paragraph, break, and chapter tags
+     if tagname in ["P", "BRK", "CHAPTER", "/P"]:
+         return None
+
+     # For speaker tags, return the speaker's name
+     if tagname == "SPEAKER":
+         soup = bs4.BeautifulSoup(original_tag, "html.parser")
+         return soup.speaker["name"]
+
+     # Raise an error if there is a tag we don't know
+     raise ValueError(f"Unknown tag {tag}")
+ """, block="block", language="python"),
  table_div_up,
  Details(
  Summary("EuroParl Filtering Examples"),
 
  H3("HackerNews"),
  P("A high-quality dialog-based dataset of user comments on links submitted to Hacker News, the link aggregator run by Y Combinator."),
  P(B("Download and Extraction: "), "The dataset was downloaded from the HackerNews API: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is treated as a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for every Story ID was requested; if an ID returned an error, the ID was removed. Each request was given a 2-second wait to account for network time."),
+ P("The HackerNews dataset contains a vast number of stories and is known for lively discussions. Because of the number of replies a story may contain, only the longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
  H4("Filtering"),
+ Ul(
+ Li("Language Filter: English", style="margin-bottom: 2px"),
+ Li("Minimum Word Count Filter: 10", style="margin-bottom: 2px"),
+ Li("Unigram Log Probability Threshold: -20", style="margin-bottom: 2px"),
  ),
  table_div_hn,
  ),
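
A sketch of the HackerNews crawl described above, assuming the requests package; the fixed 2-second wait and the drop-on-error behavior follow the description:

    import time

    import requests

    BASE = "https://hacker-news.firebaseio.com/v0/item/"

    def fetch_item(item_id):
        # One request per Story ID; the API returns JSON null for bad IDs
        resp = requests.get(f"{BASE}{item_id}.json", timeout=10)
        time.sleep(2)  # 2-second wait per request, as described above
        if resp.status_code != 200 or resp.json() is None:
            return None  # drop IDs that return an error
        return resp.json()

    def crawl(start=1, end=37_500_000):
        for item_id in range(start, end + 1):
            item = fetch_item(item_id)
            if item is not None:
                yield item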
 
  Div(
  H3("USPTO"),
  P("Patent documents from the United States Patent and Trademark Office."),
+ P(B("Download and Extraction: "), "Data was downloaded and extracted using tags from ", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"), ". There were three different formats that needed three different functions to download and extract the data based on year: ", I("Pre_2002"), ", ", I("2002_to_2004"), ", and ", I("post_2004"), ". We used the exact code used in The Pile (citation needed)."),
  H4("Filtering"),
  Ol(
+ Li("Language Filter: English", style="margin-bottom: 2px"),
+ Li("Minimum Word Count Filter: 50", style="margin-bottom: 2px"),
+ Li("Unigram Log Probability", style="margin-bottom: 2px"),
  ),
  table_div_uspto,
  ),
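
Several of these filter lists reuse the same unigram log probability check (threshold -20 where stated). A minimal sketch, with a hypothetical word_counts table standing in for the pipeline's reference unigram counts; the natural-log base is also an assumption:

    import math

    # Hypothetical reference unigram counts; the real pipeline supplies its own
    word_counts = {"the": 23135851162, "of": 13151942776, "and": 12997637966}
    total = sum(word_counts.values())

    def avg_unigram_logprob(text):
        # Mean log probability over in-vocabulary whitespace tokens
        logps = [
            math.log(word_counts[w] / total)
            for w in text.lower().split()
            if w in word_counts
        ]
        return sum(logps) / len(logps) if logps else float("-inf")

    def passes_unigram_filter(text, threshold=-20.0):
        return avg_unigram_logprob(text) > threshold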
 
  P("All content was downloaded, leading to a high number of documents filtered out during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
  H4("Filtering"),
  Ol(
+ Li("Language Filter: English", style="margin-bottom: 2px"),
+ Li("Minimum Word Count Filter: 50", style="margin-bottom: 2px"),
+ Li("Unigram Log Probability", style="margin-bottom: 2px"),
  ),
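
A sketch of the column-priority rule described above: plain_text first, then the remaining columns in reverse table order. The non-plain_text column names here are illustrative stand-ins:

    # First non-empty column wins; order is the assumed priority
    PRIORITY = ["plain_text", "html_with_citations", "html_lawbox", "html_columbia", "html"]

    def pick_text(row):
        for col in PRIORITY:
            if row.get(col):
                return row[col]
        return None  # nothing usable in this row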
  H4("Local Deduplication Process"),
  Ol(
 
  """),
  H4("Filtering"),
  Ol(
+ Li("Minimum Word Count Filter: 10", style="margin-bottom: 2px"),
  ),
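
The minimum word count filter used here (and with other thresholds elsewhere) amounts to a token count; a minimal sketch, assuming whitespace tokenization:

    def passes_min_words(text, min_words=10):
        # Whitespace tokenization is an assumption; the pipeline's
        # tokenizer may differ
        return len(text.split()) >= min_words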
  table_div_se,
  Details(
 
  """, block="block", language="python" ),
  H4("Filtering"),
  Ol(
+ Li("Language Filter: English", style="margin-bottom: 2px"),
+ Li("Minimum Word Count Filter: 10", style="margin-bottom: 2px"),
+ Li("Unigram Log Probability", style="margin-bottom: 2px"),
  ),
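
A sketch of the English-language filter, reusing langdetect as in the PhilArchive step; the production detector may differ:

    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException

    def is_english(text):
        try:
            return detect(text) == "en"
        except LangDetectException:  # e.g. empty or non-linguistic text
            return False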
  table_div_uirc,
  ),
 
  P(B("Download and Extraction: "), "The dataset was downloaded directly from Hugging Face: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
  H4("Filtering"),
  Ol(
+ Li("Language Filter: ???", style="margin-bottom: 2px"),
+ Li("Minimum Word Count Filter: 20", style="margin-bottom: 2px"),
+ Li("Unigram Log Probability", style="margin-bottom: 2px"),
  ),
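
The direct download described above maps to a one-line load with the Hugging Face datasets library; the split choice here is illustrative:

    from datasets import load_dataset

    pg19 = load_dataset("deepmind/pg19", split="train")  # also: validation, test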
  table_div_pg19,
  Details(