victormiller committed on
Commit 41b9932
Parent: 27b61df

Update web.py

Files changed (1): web.py (+155, −0)
web.py CHANGED
@@ -9,6 +9,152 @@ from data.url_blocklist import urls_high_matches, urls_false_positives
 from data.non_web_urls import non_web_urls
 from fasthtml.components import D_code
 
+data_filtering_table_data = pd.DataFrame(
+    {
+        "Dataset": [
+            "TxT360",
+            "FineWeb",
+            "RefinedWeb",
+            "RedPajamaV2",
+            "C4",
+            "Dolma",
+            "RedPajamaV1",
+            "The Pile",
+        ],
+        "Data Reading": [
+            "warc",
+            "warc",
+            "warc",
+            "wet",
+            "wet",
+            "warc",
+            "wet",
+            "warc",
+        ],
+        "Text Extraction": [
+            "trafilatura",
+            "trafilatura",
+            "trafilatura",
+            "n/a",
+            "n/a",
+            "?",
+            "n/a",
+            "jusText",
+        ],
+        "URL Filtering": [
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "No",
+            "No",
+            "No",
+            "No",
+        ],
+        "Language Identification": [
+            "fastText",
+            "fastText",
+            "fastText",
+            "fastText",
+            "langdetect",
+            "fastText",
+            "fastText",
+            "pycld2",
+        ],
+        "Line Removal": [
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "No",
+            "No",
+        ],
+        "QF: ML-based": [
+            "No",
+            "No",
+            "No",
+            "Yes",
+            "No",
+            "No",
+            "Yes",
+            "Yes",
+        ],
+        "QF: Repetition-based": [
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "No",
+            "Yes",
+            "No",
+            "No",
+        ],
+        "QF: Correction-based": [
+            "Yes",
+            "Yes",
+            "Yes",
+            "No",
+            "No",
+            "No",
+            "No",
+            "No",
+        ],
+        "QF: Gopher Rules": [
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "No",
+            "Yes",
+            "No",
+            "No",
+        ],
+        "QF: C4 Rules": [
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "Yes",
+            "No",
+            "No",
+        ],
+        "PI Filtering": [
+            "Yes",
+            "Yes",
+            "No",
+            "No",
+            "No",
+            "Yes",
+            "No",
+            "No",
+        ],
+        "Exact Deduplication": [
+            "Bloom Filter",
+            "n/a",
+            "ExactSubStr",
+            "Bloom Filter",
+            "n/a",
+            "Bloom Filter",
+            "n/a",
+            "n/a",
+        ],
+        "Fuzzy Deduplication": [
+            "Global",
+            "Local",
+            "Local",
+            "Local",
+            "Local",
+            "Local",
+            "Local",
+            "Global",
+        ],
+    }
+)
+table_html_filter_data = data_filtering_table_data.to_html(index=False, border=0)
+table_div_filter_data = Div(NotStr(table_html_filter_data), style="margin: 40px;")
 
 def DVS(
     left,
@@ -216,6 +362,12 @@ def web_data():
             ),
             style="margin-top: 20px;",
         ),
+ H2("Web Data Processing Overview"),
366
+ P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
367
+ table_div_filter_data,
368
+ P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
369
+ Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
370
+ P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
         H3("1. Document Preparation"),
 
         H4("1.1 Text Extraction"),
@@ -226,6 +378,9 @@ def web_data():
         we found WET files to include boilerplate content like navigation menus, ads, and other irrelevant texts.
         Accordingly, our pipeline starts from raw WARC files, reads them with the warcio library, and extracts texts using trafilatura.
         """),
+ P("We directly read WARC files instead of WET files and extracted text using Trafilatura. Similar to RefinedWeb, we avoid using Machine Learning (ML)-based metrics for filtering documents to prevent bias introduced by ML models. Importantly, we apply global deduplication across the entire dataset, whereas previous works only use local deduplication. Note that although The Pile also employed global deduplication on its web data (Pile-CC), this accounted for just 0.6\% of 74 snapshots."),
382
+
383
+
384
  DV2("data/sample_wet.json", "data/sample_warc.json", 3),
385
 
386
  H4("1.2 Language Identification"),