victormiller committed on
Commit
715785a
1 Parent(s): 0e26631

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +54 -33
web.py CHANGED
@@ -72,6 +72,54 @@ data_filtering_table_data = pd.DataFrame(
72
  "No",
73
  "No",
74
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  "QF: ML-based": [
76
  "No",
77
  "No",
@@ -122,40 +170,11 @@ data_filtering_table_data = pd.DataFrame(
122
  "No",
123
  "No",
124
  ],
125
- "PI Filtering": [
126
- "Yes",
127
- "Yes",
128
- "No",
129
- "No",
130
- "No",
131
- "Yes",
132
- "No",
133
- "No",
134
- ],
135
- "Exact Deduplication": [
136
- "Bloom Filter",
137
- "n/a",
138
- "ExactSubStr",
139
- "Bloom Filter",
140
- "n/a",
141
- "Bloom Filter",
142
- "n/a",
143
- "n/a",
144
- ],
145
- "Fuzzy Deduplication": [
146
- "Global",
147
- "Local",
148
- "Local",
149
- "Local",
150
- "Local",
151
- "Local",
152
- "Local",
153
- "Global",
154
- ],
155
- }
156
  )
157
- table_html_filter_data = data_filtering_table_data.to_html(index=False, border=0)
158
- table_div_filter_data = Div(NotStr(table_html_filter_data), style="margin: 40px;")
 
159
 
160
  def DVS(
161
  left,
@@ -366,6 +385,8 @@ def web_data():
366
  H2("Web Data Processing Summary"),
367
  P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
368
  table_div_filter_data,
 
 
369
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
370
  Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
371
  P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
 
72
  "No",
73
  "No",
74
  ],
75
+ "PII Filtering": [
76
+ "Yes",
77
+ "Yes",
78
+ "No",
79
+ "No",
80
+ "No",
81
+ "Yes",
82
+ "No",
83
+ "No",
84
+ ],
85
+ "Exact Deduplication": [
86
+ "Bloom Filter",
87
+ "n/a",
88
+ "ExactSubStr",
89
+ "Bloom Filter",
90
+ "n/a",
91
+ "Bloom Filter",
92
+ "n/a",
93
+ "n/a",
94
+ ],
95
+ "Fuzzy Deduplication": [
96
+ "Global",
97
+ "Local",
98
+ "Local",
99
+ "Local",
100
+ "Local",
101
+ "Local",
102
+ "Local",
103
+ "Global",
104
+ ],
105
+ }
106
+ )
107
+ table_html_filter_data = data_filtering_table_data.to_html(index=False, border=0)
108
+ table_div_filter_data = Div(NotStr(table_html_filter_data), style="margin: 40px;")
109
+
110
+
111
+ qf_filtering_table_data = pd.DataFrame(
112
+ {
113
+ "Dataset": [
114
+ "TxT360",
115
+ "FineWeb",
116
+ "RefinedWeb",
117
+ "RedPajamaV2",
118
+ "C4",
119
+ "Dolma",
120
+ "RedPajamaV1",
121
+ "The Pile",
122
+ ],
123
  "QF: ML-based": [
124
  "No",
125
  "No",
 
170
  "No",
171
  "No",
172
  ],
173
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  )
175
+ table_html_qf_filter_data = qf_filtering_table_data.to_html(index=False, border=0)
176
+ table_div_qf_filter_data = Div(NotStr(table_html_qf_filter_data), style="margin: 40px;")
177
+
178
 
179
  def DVS(
180
  left,
 
385
  H2("Web Data Processing Summary"),
386
  P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
387
  table_div_filter_data,
388
+ P("ADD EXPLAINER TEXT ABOUT THE QUALITY FILTERS"),
389
+ table_div_qf_filter_data,
390
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
391
  Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
392
  P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),