victormiller committed
Commit: 7e7a96b
1 Parent: 26832b9
Update web.py

web.py CHANGED
@@ -731,23 +731,24 @@ def web_data():
             ),
         ),
         H4("3.3 Statistics-based Heuristics"),
-        P(""
-
-
-
-
-
-
-
-
-        Specifically, we remove any document which
-
-
-
-
-
-
-
+        P("We summarize other statistics-based rules originating from Gopher [7] in this section. The statistics used include:"),
+        Ul(
+            Li("the word count in the document", style = "margin-bottom: 5px"),
+            Li("the mean word length", style = "margin-bottom: 5px"),
+            Li("the number of sentences", style = "margin-bottom: 5px"),
+            Li("the symbol-to-word ratio", style = "margin-bottom: 5px"),
+            Li("the fraction of alphabetic words", style = "margin-bottom: 5px"),
+            Li("and the number of stop words", style = "margin-bottom: 5px"),
+        ),
+        P("Specifically, we remove any document which satisfies any of the following criteria:"),
+        Ul(
+            Li("it contains fewer than 50 words or more than 100,000 words", style = "margin-bottom: 5px"),
+            Li("its mean word length is outside the range of 3 to 10", style = "margin-bottom: 5px"),
+            Li("it contains fewer than 3 sentences", style = "margin-bottom: 5px"),
+            Li("its symbol-to-word ratio is greater than 0.1", style = "margin-bottom: 5px"),
+            Li("words containing at least one alphabetic character make up less than 80% of all words", style = "margin-bottom: 5px"),
+            Li("it contains fewer than two of the stop words (the, be, to, of, and, that, have, with)", style = "margin-bottom: 5px"),
+        ),
         H5("Word Count"),
         P("""
             Implementations from Dolma
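Taken together, the criteria added in this hunk define a single keep/drop predicate. The sketch below illustrates them; the whitespace tokenization, the naive sentence split on ".", "!" and "?", the symbol set, and the distinct-stop-word count are assumptions for illustration, not the pipeline's actual implementation (the committed snippet in the next hunk counts stop words by substring match instead).

import re

# Stop words from the committed snippet in the next hunk.
STOP_WORDS = ('the', 'be', 'to', 'of', 'and', 'that', 'have', 'with')

# Assumed symbol set for the symbol-to-word ratio; the diff does not show it.
SYMBOLS = ('#', '...', '…')

def keep_document(text: str) -> bool:
    """Return True if the document passes all six statistics-based criteria."""
    words = text.split()  # assumed whitespace tokenization
    n_words = len(words)

    # Criterion 1: between 50 and 100,000 words.
    if n_words < 50 or n_words > 100_000:
        return False

    # Criterion 2: mean word length within [3, 10].
    mean_len = sum(len(w) for w in words) / n_words
    if mean_len < 3 or mean_len > 10:
        return False

    # Criterion 3: at least 3 sentences (naive split on ., ! and ?).
    if len(re.findall(r"[.!?]", text)) < 3:
        return False

    # Criterion 4: symbol-to-word ratio at most 0.1.
    if sum(text.count(s) for s in SYMBOLS) / n_words > 0.1:
        return False

    # Criterion 5: at least 80% of words contain an alphabetic character.
    if sum(1 for w in words if any(c.isalpha() for c in w)) / n_words < 0.8:
        return False

    # Criterion 6: at least two distinct stop words appear
    # (the committed code instead counts words matching a stop-word pattern).
    seen = {w.lower().strip('.,!?;:"\'') for w in words}
    if sum(1 for sw in STOP_WORDS if sw in seen) < 2:
        return False

    return True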
@@ -809,7 +810,14 @@ def web_data():
         P("""
             The implementations across existing pipelines are largely identical. We adopt them and apply them to our pipeline.
         """),
-
+        D_code("""
+            STOP_WORDS = ('the', 'be', 'to', 'of', 'and', 'that', 'have', 'with')
+            ...
+            stop_words_pattern = re.compile("|".join(re.escape(symbol) for symbol in STOP_WORDS))
+            ...
+            attrs.num_of_stop_words = sum(1 for word in words if stop_words_pattern.search(word))
+
+        """, block="block", language="python"),
         H5("Our Implementations"),
         Details(
             Summary("Sample documents that are filtered out by statistics-based heuristics"),
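The committed D_code snippet elides its surroundings with "...". One self-contained reading of the stop-word count is sketched below; count_stop_words, the whitespace split, and the SimpleNamespace stand-in for attrs are hypothetical, not from the diff. Note that re.search matches stop words as substrings, so a word like "them" counts via "the".

import re
from types import SimpleNamespace

STOP_WORDS = ('the', 'be', 'to', 'of', 'and', 'that', 'have', 'with')

# Same pattern construction as the committed snippet.
stop_words_pattern = re.compile("|".join(re.escape(symbol) for symbol in STOP_WORDS))

def count_stop_words(text: str) -> int:  # hypothetical wrapper, not in the diff
    words = text.split()  # the diff does not show how `words` is produced
    attrs = SimpleNamespace()  # stand-in for the pipeline's attrs object
    attrs.num_of_stop_words = sum(1 for word in words if stop_words_pattern.search(word))
    return attrs.num_of_stop_words

print(count_stop_words("We adopt the heuristics and apply them to our pipeline."))
# -> 4: "the", "and", "to", and "them" (substring match on "the")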
@@ -830,7 +838,14 @@ def web_data():
             DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
         ),
         H3("4. Deduplication"),
-        P("
+        P("""
+            After careful filtering, although data quality has improved, a large fraction of the content is repeated across documents. This may be due to the crawler indirectly hitting the same page multiple times, to boilerplate content being repeated (e.g., licences), or even to plagiarism. These duplicates can strongly impact models, favoring memorization instead of generalization.
+        """),
+        P("We perform two-level deduplication: local exact deduplication and global fuzzy deduplication."),
+        P(B("Local Exact Deduplication")),
+        P("To reduce the expensive cost of global deduplication, we apply local exact deduplication before it. Specifically, each dump is split into 70 splits, and a Bloom filter is applied within each split."),
+        P(B("Global Fuzzy Deduplication")),
+        P("NEED TO UPDATE"),
         H3("5. PII Removal"),
         P("..."), # Add detailed content and images as needed
         H2("Reference"),
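The local exact deduplication paragraph in this hunk is concrete enough to sketch: split a dump into 70 splits and run a Bloom filter within each. Beyond that one sentence, everything below is an assumption; the Bloom filter parameters, the SHA-256 document key, and the hash-based split assignment are illustrative only.

import hashlib

class BloomFilter:
    """Minimal Bloom filter; the size and hash count are illustrative."""
    def __init__(self, size_bits: int = 1 << 24, num_hashes: int = 7):
        self.size = size_bits
        self.num_hashes = num_hashes
        self.bits = bytearray(size_bits // 8)

    def _positions(self, item: bytes):
        # Derive k bit positions by hashing the item with a per-hash prefix.
        for i in range(self.num_hashes):
            digest = hashlib.sha256(i.to_bytes(2, "big") + item).digest()
            yield int.from_bytes(digest[:8], "big") % self.size

    def add(self, item: bytes) -> None:
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item: bytes) -> bool:
        return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(item))

def dedup_split(documents):
    """Exact dedup within one split: drop documents whose text was already seen."""
    seen = BloomFilter()
    for doc in documents:
        key = hashlib.sha256(doc.encode("utf-8")).digest()  # assumed document key
        if key in seen:  # probable duplicate (Bloom filters can false-positive)
            continue
        seen.add(key)
        yield doc

def dedup_dump(dump_documents, num_splits: int = 70):
    """Partition a dump into 70 splits; the real split key is not shown in the diff."""
    splits = [[] for _ in range(num_splits)]
    for doc in dump_documents:
        splits[hash(doc) % num_splits].append(doc)
    for split in splits:
        yield from dedup_split(split)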
|