victormiller committed
Commit 7e7a96b
Parent: 26832b9

Update web.py

Files changed (1):
web.py +34 -19
web.py CHANGED
@@ -731,23 +731,24 @@ def web_data():
         ),
     ),
     H4("3.3 Statistics-based Heuristics"),
-    P("""
-    We summarize other statistics-based rules originating from Gopher [2] in this section, which include:
-    - Word count in the document,
-    - Mean word length,
-    - Number of sentences,
-    - Symbol-to-word ratio,
-    - Fraction of alphabetic words,
-    - Number of stop words.
-
-    Specifically, we remove any document which meets any of the following criteria:
-    - Contains fewer than 50 words or more than 100,000 words
-    - Has a mean word length outside the range of 3 to 10 characters
-    - Contains fewer than 3 sentences
-    - Has a symbol-to-word ratio greater than 0.1
-    - Contains less than 80% alphabetic words
-    - Contains fewer than two of the following stop words: "the," "be," "to," "of," "and," "that," "have," "with"
-    """),
+    P("We summarize other statistics-based rules originating from Gopher [7] in this section. The statistics used include:"),
+    Ul(
+        Li("the word count in the document", style="margin-bottom: 5px"),
+        Li("the mean word length", style="margin-bottom: 5px"),
+        Li("the number of sentences", style="margin-bottom: 5px"),
+        Li("the symbol-to-word ratio", style="margin-bottom: 5px"),
+        Li("the fraction of alphabetic words", style="margin-bottom: 5px"),
+        Li("and the number of stop words", style="margin-bottom: 5px"),
+    ),
+    P("Specifically, we remove any document which satisfies any of the following criteria:"),
+    Ul(
+        Li("it contains fewer than 50 words or more than 100,000 words", style="margin-bottom: 5px"),
+        Li("its mean word length is outside the range of 3 to 10 characters", style="margin-bottom: 5px"),
+        Li("it contains fewer than 3 sentences", style="margin-bottom: 5px"),
+        Li("its symbol-to-word ratio is greater than 0.1", style="margin-bottom: 5px"),
+        Li("words containing at least one alphabetic character make up less than 80% of all words", style="margin-bottom: 5px"),
+        Li("it contains fewer than two of the stop words (the, be, to, of, and, that, have, with)", style="margin-bottom: 5px"),
+    ),
     H5("Word Count"),
     P("""
     Implementations from Dolma
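The six removal criteria added in the hunk above combine into a single document-level predicate. A minimal, self-contained sketch of such a filter follows; it is not code from this commit. The `should_remove` name and the whitespace tokenization are illustrative assumptions, and counting '#' and '...' for the symbol-to-word ratio follows Gopher's convention.

import re

STOP_WORDS = ("the", "be", "to", "of", "and", "that", "have", "with")

def should_remove(text: str) -> bool:
    """Return True if a document trips any of the six Gopher-style criteria."""
    words = text.split()  # assumed tokenizer
    num_words = len(words)
    if num_words < 50 or num_words > 100_000:
        return True
    mean_word_length = sum(len(w) for w in words) / num_words
    if not 3 <= mean_word_length <= 10:
        return True
    # Naive sentence count: one sentence per terminal punctuation mark.
    if len(re.findall(r"[.!?]", text)) < 3:
        return True
    # Symbol-to-word ratio; '#' and '...' as in Gopher.
    num_symbols = text.count("#") + text.count("...")
    if num_symbols / num_words > 0.1:
        return True
    # Fraction of words containing at least one alphabetic character.
    alphabetic = sum(1 for w in words if any(c.isalpha() for c in w))
    if alphabetic / num_words < 0.8:
        return True
    # Presence of at least two distinct stop words.
    tokens = {w.lower().strip('.,;:!?"()') for w in words}
    if sum(1 for sw in STOP_WORDS if sw in tokens) < 2:
        return True
    return False

print(should_remove("word " * 10))  # True: fewer than 50 words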
 
@@ -809,7 +810,14 @@ def web_data():
     P("""
     The implementations across existing pipelines are largely identical. We adopt them and apply them to our pipeline.
     """),
-    Img(),
+    D_code("""
+    STOP_WORDS = ('the', 'be', 'to', 'of', 'and', 'that', 'have', 'with')
+    ...
+    stop_words_pattern = re.compile("|".join(re.escape(word) for word in STOP_WORDS))
+    ...
+    # fullmatch avoids counting substrings, e.g. 'the' inside 'there'
+    attrs.num_of_stop_words = sum(1 for word in words if stop_words_pattern.fullmatch(word))
+    """, block="block", language="python"),
     H5("Our Implementations"),
     Details(
         Summary("Sample documents that are filtered out by statistics-based heuristics"),
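The snippet added above elides its surrounding context with `...`. As a sanity check, a self-contained version might look like this, with the tokenizer and the example document assumed for illustration:

import re

STOP_WORDS = ('the', 'be', 'to', 'of', 'and', 'that', 'have', 'with')

# Alternation over the escaped stop words; fullmatch keeps 'the' from
# counting inside longer words such as 'there'.
stop_words_pattern = re.compile("|".join(re.escape(word) for word in STOP_WORDS))

def num_stop_words(text: str) -> int:
    words = text.lower().split()  # assumed tokenizer
    return sum(1 for word in words if stop_words_pattern.fullmatch(word))

print(num_stop_words("We adopt the heuristics and apply them to our pipeline"))  # 3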
 
@@ -830,7 +838,14 @@ def web_data():
         DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
     ),
     H3("4. Deduplication"),
-    P("..."), # Add detailed content and images as needed
+    P("""
+    After careful filtering, although data quality has improved, a large fraction of the content is still repeated across documents. This may be due to the crawler indirectly hitting the same page multiple times, to boilerplate content being repeated (e.g., licenses), or even to plagiarism. These duplicates can strongly impact models, favoring memorization instead of generalization.
+    """),
+    P("We perform two-level deduplication: local exact deduplication and global fuzzy deduplication."),
+    P(B("Local Exact Deduplication")),
+    P("To reduce the cost of global deduplication, we first apply local exact deduplication. Specifically, each dump is split into 70 splits, and a Bloom filter is applied within each split."),
+    P(B("Global Fuzzy Deduplication")),
+    P("NEED TO UPDATE"),
     H3("5. PII Removal"),
     P("..."), # Add detailed content and images as needed
     H2("Reference"),
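The local exact deduplication added above is described only in prose. One possible reading of "a Bloom filter is applied within each split" is sketched below; the `BloomFilter` class, its sizing, and keying on the full document text are assumptions for illustration, not details taken from the commit.

import hashlib

class BloomFilter:
    """Toy Bloom filter: k hashes over a fixed-size bit array (assumed sizing)."""

    def __init__(self, num_bits: int = 1 << 20, num_hashes: int = 5):
        self.num_bits = num_bits
        self.num_hashes = num_hashes
        self.bits = bytearray(num_bits // 8)

    def _positions(self, item: str):
        # Derive k bit positions by salting SHA-256 with the hash index.
        for i in range(self.num_hashes):
            digest = hashlib.sha256(f"{i}:{item}".encode()).digest()
            yield int.from_bytes(digest[:8], "big") % self.num_bits

    def add(self, item: str) -> None:
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item: str) -> bool:
        return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(item))

def dedup_split(documents):
    """Yield the first occurrence of each exact document within one split."""
    seen = BloomFilter()
    for doc in documents:
        if doc in seen:  # false positives can rarely drop a unique doc
            continue
        seen.add(doc)
        yield doc

print(list(dedup_split(["a cat", "a dog", "a cat"])))  # ['a cat', 'a dog']

A Bloom filter bounds memory at the cost of a small false-positive rate, so a rare unique document can be dropped as though it were a duplicate; "exact" here refers to matching byte-identical content, not to the filter's guarantees.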