victormiller committed
Commit 5e5aef1
1 Parent(s): e8dab56

Update curated.py

Files changed (1):
  1. curated.py +363 -13
curated.py CHANGED
@@ -74,7 +74,7 @@ wikipedia_filter = pd.DataFrame(
         "Percent Removed After Unigram Probability Filter": [
             "0.00%",
         ],
-        "Lines Remaining After Local Dedup": [
+        "Percent Removed After Local Dedup": [
             "",
         ],
         "Total Percentage Remaining": [
@@ -86,6 +86,356 @@ wikipedia_filter = pd.DataFrame(
 table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
 table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin: 40px;")

+freelaw_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
+table_div_freelaw = Div(NotStr(table_html_freelaw), style="margin: 40px;")
+
+dmm_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_dmm = dmm_filter.to_html(index=False, border=0)
+table_div_dmm = Div(NotStr(table_html_dmm), style="margin: 40px;")
+
+
+uspto_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_uspto = uspto_filter.to_html(index=False, border=0)
+table_div_uspto = Div(NotStr(table_html_uspto), style="margin: 40px;")
+
+pg19_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_pg19 = pg19_filter.to_html(index=False, border=0)
+table_div_pg19 = Div(NotStr(table_html_pg19), style="margin: 40px;")
+
+
+hn_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_hn = hn_filter.to_html(index=False, border=0)
+table_div_hn = Div(NotStr(table_html_hn), style="margin: 40px;")
+
+
+uirc_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_uirc = uirc_filter.to_html(index=False, border=0)
+table_div_uirc = Div(NotStr(table_html_uirc), style="margin: 40px;")
+
+up_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_up = up_filter.to_html(index=False, border=0)
+table_div_up = Div(NotStr(table_html_up), style="margin: 40px;")
+
+se_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_se = se_filter.to_html(index=False, border=0)
+table_div_se = Div(NotStr(table_html_se), style="margin: 40px;")
+
+arx_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_arx = arx_filter.to_html(index=False, border=0)
+table_div_arx = Div(NotStr(table_html_arx), style="margin: 40px;")
+
+s2o_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_s2o = s2o_filter.to_html(index=False, border=0)
+table_div_s2o = Div(NotStr(table_html_s2o), style="margin: 40px;")
+
+med_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_med = med_filter.to_html(index=False, border=0)
+table_div_med = Div(NotStr(table_html_med), style="margin: 40px;")
+
+phil_filter = pd.DataFrame(
+    {
+        "Dataset": [
+            "Wikipedia",
+        ],
+        "Lines Downloaded": [
+            "61614907",
+        ],
+        "Percent Removed After Language Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.86%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "",
+        ],
+        "Total Percentage Remaining": [
+            "98.14%",
+        ],
+    }
+)
+
+table_html_phil = phil_filter.to_html(index=False, border=0)
+table_div_phil = Div(NotStr(table_html_phil), style="margin: 40px;")

 filtering_process = Div(
     Section(
@@ -139,7 +489,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
-
+        table_div_arx,
     ),
     Section(
         H3("S2ORC"),
@@ -174,7 +524,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
         ),
-
+        table_div_s2o,
     ),
     Section(
         H3("PubMed"),
@@ -203,7 +553,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
-
+        table_div_med,
     ),
     Section(
         H3("Phil Papers"),
@@ -226,7 +576,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
-
+        table_div_phil,
     ),
     Section(
         H3("Europarl"),
@@ -248,7 +598,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining europarl was deduped again with all the datasets combined"),
         ),
-
+        table_div_up,
     ),
     Section(
         H3("HackerNews"),
@@ -273,7 +623,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
-
+        table_div_hn,
     ),
     Section(
         H3("USPTO"),
@@ -297,7 +647,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
-
+        table_div_uspto,
     ),
     Section(
         H3("FreeLaw"),
@@ -325,7 +675,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
-
+        table_div_freelaw,
     ),
     Section(
         H3("StackExchange"),
@@ -358,7 +708,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
-
+        table_div_se,
     ),
     Section(
         H3("Ubuntu IRC"),
@@ -382,7 +732,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
-
+        table_div_uirc,
     ),
     Section(
         H3("DM Maths"),
@@ -403,7 +753,7 @@ filtering_process = Div(
         Ol(
             Li("None"),
         ),
-
+        table_div_dmm,
     ),
     Section(
         H3("PG19"),
@@ -425,7 +775,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
-
+        table_div_pg19,
     ),
 )

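The twelve new *_filter tables are copies of the wikipedia_filter block (they still carry the Wikipedia placeholder statistics), and each repeats the same DataFrame -> to_html -> Div pattern before its table_div_* is appended to the matching dataset Section in filtering_process. Below is a minimal sketch of that pattern as one helper, assuming the FastHTML names (Div, NotStr, Section, H3, Ol, Li) that curated.py already uses are importable from fasthtml.common; the make_filter_table helper and the example call are illustrative, not part of the commit.

# Sketch only: one helper for the table pattern this commit repeats twelve times.
# make_filter_table and the example values below are illustrative assumptions.
import pandas as pd
from fasthtml.common import Div, H3, Li, NotStr, Ol, Section


def make_filter_table(dataset, lines_downloaded, pct_language, pct_min_words,
                      pct_unigram, pct_local_dedup, pct_remaining):
    # Build the per-dataset filtering-statistics table.
    df = pd.DataFrame(
        {
            "Dataset": [dataset],
            "Lines Downloaded": [lines_downloaded],
            "Percent Removed After Language Filter": [pct_language],
            "Percent Removed After Min Word Count Filter": [pct_min_words],
            "Percent Removed After Unigram Probability Filter": [pct_unigram],
            "Percent Removed After Local Dedup": [pct_local_dedup],
            "Total Percentage Remaining": [pct_remaining],
        }
    )
    # NotStr keeps FastHTML from escaping the HTML that pandas produces.
    return Div(NotStr(df.to_html(index=False, border=0)), style="margin: 40px;")


# Example: a table built from the placeholder numbers committed above,
# appended to its Section the same way table_div_freelaw is in the diff.
table_div_freelaw = make_filter_table(
    "Wikipedia", "61614907", "0.00%", "1.86%", "0.00%", "", "98.14%"
)
freelaw_section = Section(
    H3("FreeLaw"),
    Ol(Li("After local dedup, remaining data was deduped again with all the datasets combined")),
    table_div_freelaw,
)

With a helper like this, each repeated block in the hunk above collapses to a single call per dataset.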