victormiller
committed on
Commit
•
48b277d
1
Parent(s):
5e5aef1
Update curated.py
Browse files- curated.py +52 -52
curated.py
CHANGED
@@ -89,19 +89,19 @@ table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin: 40px;")
|
|
89 |
freelaw_filter = pd.DataFrame(
|
90 |
{
|
91 |
"Dataset": [
|
92 |
-
"
|
93 |
],
|
94 |
"Lines Downloaded": [
|
95 |
-
"
|
96 |
],
|
97 |
"Percent Removed After Language Filter": [
|
98 |
-
"
|
99 |
],
|
100 |
"Percent Removed After Min Word Count Filter": [
|
101 |
-
"
|
102 |
],
|
103 |
"Percent Removed After Unigram Probability Filter": [
|
104 |
-
"0.
|
105 |
],
|
106 |
"Percent Removed After Local Dedup": [
|
107 |
"",
|
@@ -118,16 +118,16 @@ table_div_freelaw = Div(NotStr(table_html_freelaw), style="margin: 40px;")
|
|
118 |
dmm_filter = pd.DataFrame(
|
119 |
{
|
120 |
"Dataset": [
|
121 |
-
"
|
122 |
],
|
123 |
"Lines Downloaded": [
|
124 |
-
"
|
125 |
],
|
126 |
"Percent Removed After Language Filter": [
|
127 |
"0.00%",
|
128 |
],
|
129 |
"Percent Removed After Min Word Count Filter": [
|
130 |
-
"
|
131 |
],
|
132 |
"Percent Removed After Unigram Probability Filter": [
|
133 |
"0.00%",
|
@@ -148,19 +148,19 @@ table_div_dmm = Div(NotStr(table_html_dmm), style="margin: 40px;")
|
|
148 |
uspto_filter = pd.DataFrame(
|
149 |
{
|
150 |
"Dataset": [
|
151 |
-
"
|
152 |
],
|
153 |
"Lines Downloaded": [
|
154 |
-
"
|
155 |
],
|
156 |
"Percent Removed After Language Filter": [
|
157 |
-
"0.
|
158 |
],
|
159 |
"Percent Removed After Min Word Count Filter": [
|
160 |
-
"1.
|
161 |
],
|
162 |
"Percent Removed After Unigram Probability Filter": [
|
163 |
-
"0.
|
164 |
],
|
165 |
"Percent Removed After Local Dedup": [
|
166 |
"",
|
@@ -177,19 +177,19 @@ table_div_uspto = Div(NotStr(table_html_uspto), style="margin: 40px;")
|
|
177 |
pg19_filter = pd.DataFrame(
|
178 |
{
|
179 |
"Dataset": [
|
180 |
-
"
|
181 |
],
|
182 |
"Lines Downloaded": [
|
183 |
-
"
|
184 |
],
|
185 |
"Percent Removed After Language Filter": [
|
186 |
-
"0.
|
187 |
],
|
188 |
"Percent Removed After Min Word Count Filter": [
|
189 |
-
"
|
190 |
],
|
191 |
"Percent Removed After Unigram Probability Filter": [
|
192 |
-
"0.
|
193 |
],
|
194 |
"Percent Removed After Local Dedup": [
|
195 |
"",
|
@@ -207,19 +207,19 @@ table_div_pg19 = Div(NotStr(table_html_pg19), style="margin: 40px;")
|
|
207 |
hn_filter = pd.DataFrame(
|
208 |
{
|
209 |
"Dataset": [
|
210 |
-
"
|
211 |
],
|
212 |
"Lines Downloaded": [
|
213 |
-
"
|
214 |
],
|
215 |
"Percent Removed After Language Filter": [
|
216 |
-
"
|
217 |
],
|
218 |
"Percent Removed After Min Word Count Filter": [
|
219 |
-
"
|
220 |
],
|
221 |
"Percent Removed After Unigram Probability Filter": [
|
222 |
-
"0.
|
223 |
],
|
224 |
"Percent Removed After Local Dedup": [
|
225 |
"",
|
@@ -237,19 +237,19 @@ table_div_hn = Div(NotStr(table_html_hn), style="margin: 40px;")
|
|
237 |
uirc_filter = pd.DataFrame(
|
238 |
{
|
239 |
"Dataset": [
|
240 |
-
"
|
241 |
],
|
242 |
"Lines Downloaded": [
|
243 |
-
"
|
244 |
],
|
245 |
"Percent Removed After Language Filter": [
|
246 |
-
"
|
247 |
],
|
248 |
"Percent Removed After Min Word Count Filter": [
|
249 |
-
"
|
250 |
],
|
251 |
"Percent Removed After Unigram Probability Filter": [
|
252 |
-
"
|
253 |
],
|
254 |
"Percent Removed After Local Dedup": [
|
255 |
"",
|
@@ -266,16 +266,16 @@ table_div_uirc = Div(NotStr(table_html_uirc), style="margin: 40px;")
|
|
266 |
up_filter = pd.DataFrame(
|
267 |
{
|
268 |
"Dataset": [
|
269 |
-
"
|
270 |
],
|
271 |
"Lines Downloaded": [
|
272 |
-
"
|
273 |
],
|
274 |
"Percent Removed After Language Filter": [
|
275 |
"0.00%",
|
276 |
],
|
277 |
"Percent Removed After Min Word Count Filter": [
|
278 |
-
"
|
279 |
],
|
280 |
"Percent Removed After Unigram Probability Filter": [
|
281 |
"0.00%",
|
@@ -295,16 +295,16 @@ table_div_up = Div(NotStr(table_html_up), style="margin: 40px;")
|
|
295 |
se_filter = pd.DataFrame(
|
296 |
{
|
297 |
"Dataset": [
|
298 |
-
"
|
299 |
],
|
300 |
"Lines Downloaded": [
|
301 |
-
"
|
302 |
],
|
303 |
"Percent Removed After Language Filter": [
|
304 |
"0.00%",
|
305 |
],
|
306 |
"Percent Removed After Min Word Count Filter": [
|
307 |
-
"
|
308 |
],
|
309 |
"Percent Removed After Unigram Probability Filter": [
|
310 |
"0.00%",
|
@@ -324,19 +324,19 @@ table_div_se = Div(NotStr(table_html_se), style="margin: 40px;")
|
|
324 |
arx_filter = pd.DataFrame(
|
325 |
{
|
326 |
"Dataset": [
|
327 |
-
"
|
328 |
],
|
329 |
"Lines Downloaded": [
|
330 |
-
"
|
331 |
],
|
332 |
"Percent Removed After Language Filter": [
|
333 |
-
"
|
334 |
],
|
335 |
"Percent Removed After Min Word Count Filter": [
|
336 |
-
"
|
337 |
],
|
338 |
"Percent Removed After Unigram Probability Filter": [
|
339 |
-
"0.
|
340 |
],
|
341 |
"Percent Removed After Local Dedup": [
|
342 |
"",
|
@@ -353,16 +353,16 @@ table_div_arx = Div(NotStr(table_html_arx), style="margin: 40px;")
|
|
353 |
s2o_filter = pd.DataFrame(
|
354 |
{
|
355 |
"Dataset": [
|
356 |
-
"
|
357 |
],
|
358 |
"Lines Downloaded": [
|
359 |
-
"
|
360 |
],
|
361 |
"Percent Removed After Language Filter": [
|
362 |
"0.00%",
|
363 |
],
|
364 |
"Percent Removed After Min Word Count Filter": [
|
365 |
-
"
|
366 |
],
|
367 |
"Percent Removed After Unigram Probability Filter": [
|
368 |
"0.00%",
|
@@ -382,19 +382,19 @@ table_div_s2o = Div(NotStr(table_html_s2o), style="margin: 40px;")
|
|
382 |
med_filter = pd.DataFrame(
|
383 |
{
|
384 |
"Dataset": [
|
385 |
-
"
|
386 |
],
|
387 |
"Lines Downloaded": [
|
388 |
-
"
|
389 |
],
|
390 |
"Percent Removed After Language Filter": [
|
391 |
-
"
|
392 |
],
|
393 |
"Percent Removed After Min Word Count Filter": [
|
394 |
-
"1.
|
395 |
],
|
396 |
"Percent Removed After Unigram Probability Filter": [
|
397 |
-
"0.
|
398 |
],
|
399 |
"Percent Removed After Local Dedup": [
|
400 |
"",
|
@@ -411,19 +411,19 @@ table_div_med = Div(NotStr(table_html_med), style="margin: 40px;")
|
|
411 |
phil_filter = pd.DataFrame(
|
412 |
{
|
413 |
"Dataset": [
|
414 |
-
"
|
415 |
],
|
416 |
"Lines Downloaded": [
|
417 |
-
"
|
418 |
],
|
419 |
"Percent Removed After Language Filter": [
|
420 |
-
"
|
421 |
],
|
422 |
"Percent Removed After Min Word Count Filter": [
|
423 |
-
"
|
424 |
],
|
425 |
"Percent Removed After Unigram Probability Filter": [
|
426 |
-
"0.
|
427 |
],
|
428 |
"Percent Removed After Local Dedup": [
|
429 |
"",
|
|
|
89 |
freelaw_filter = pd.DataFrame(
|
90 |
{
|
91 |
"Dataset": [
|
92 |
+
"FreeLaw",
|
93 |
],
|
94 |
"Lines Downloaded": [
|
95 |
+
"75971288",
|
96 |
],
|
97 |
"Percent Removed After Language Filter": [
|
98 |
+
"3.00%",
|
99 |
],
|
100 |
"Percent Removed After Min Word Count Filter": [
|
101 |
+
"7.49%",
|
102 |
],
|
103 |
"Percent Removed After Unigram Probability Filter": [
|
104 |
+
"0.07%",
|
105 |
],
|
106 |
"Percent Removed After Local Dedup": [
|
107 |
"",
|
|
|
118 |
dmm_filter = pd.DataFrame(
|
119 |
{
|
120 |
"Dataset": [
|
121 |
+
"DM Math",
|
122 |
],
|
123 |
"Lines Downloaded": [
|
124 |
+
"112559888",
|
125 |
],
|
126 |
"Percent Removed After Language Filter": [
|
127 |
"0.00%",
|
128 |
],
|
129 |
"Percent Removed After Min Word Count Filter": [
|
130 |
+
"0.00%",
|
131 |
],
|
132 |
"Percent Removed After Unigram Probability Filter": [
|
133 |
"0.00%",
|
|
|
148 |
uspto_filter = pd.DataFrame(
|
149 |
{
|
150 |
"Dataset": [
|
151 |
+
"USPTO",
|
152 |
],
|
153 |
"Lines Downloaded": [
|
154 |
+
"6880276",
|
155 |
],
|
156 |
"Percent Removed After Language Filter": [
|
157 |
+
"0.02%",
|
158 |
],
|
159 |
"Percent Removed After Min Word Count Filter": [
|
160 |
+
"1.88%",
|
161 |
],
|
162 |
"Percent Removed After Unigram Probability Filter": [
|
163 |
+
"0.01%",
|
164 |
],
|
165 |
"Percent Removed After Local Dedup": [
|
166 |
"",
|
|
|
177 |
pg19_filter = pd.DataFrame(
|
178 |
{
|
179 |
"Dataset": [
|
180 |
+
"PG-19",
|
181 |
],
|
182 |
"Lines Downloaded": [
|
183 |
+
"28752",
|
184 |
],
|
185 |
"Percent Removed After Language Filter": [
|
186 |
+
"0.24%",
|
187 |
],
|
188 |
"Percent Removed After Min Word Count Filter": [
|
189 |
+
"0.00%",
|
190 |
],
|
191 |
"Percent Removed After Unigram Probability Filter": [
|
192 |
+
"0.17%",
|
193 |
],
|
194 |
"Percent Removed After Local Dedup": [
|
195 |
"",
|
|
|
207 |
hn_filter = pd.DataFrame(
|
208 |
{
|
209 |
"Dataset": [
|
210 |
+
"HackerNews",
|
211 |
],
|
212 |
"Lines Downloaded": [
|
213 |
+
"2064931",
|
214 |
],
|
215 |
"Percent Removed After Language Filter": [
|
216 |
+
"2.62%",
|
217 |
],
|
218 |
"Percent Removed After Min Word Count Filter": [
|
219 |
+
"0.02%",
|
220 |
],
|
221 |
"Percent Removed After Unigram Probability Filter": [
|
222 |
+
"0.34%",
|
223 |
],
|
224 |
"Percent Removed After Local Dedup": [
|
225 |
"",
|
|
|
237 |
uirc_filter = pd.DataFrame(
|
238 |
{
|
239 |
"Dataset": [
|
240 |
+
"Ubuntu IRC",
|
241 |
],
|
242 |
"Lines Downloaded": [
|
243 |
+
"37966",
|
244 |
],
|
245 |
"Percent Removed After Language Filter": [
|
246 |
+
"38.10%",
|
247 |
],
|
248 |
"Percent Removed After Min Word Count Filter": [
|
249 |
+
"0.14%",
|
250 |
],
|
251 |
"Percent Removed After Unigram Probability Filter": [
|
252 |
+
"1.12%",
|
253 |
],
|
254 |
"Percent Removed After Local Dedup": [
|
255 |
"",
|
|
|
266 |
up_filter = pd.DataFrame(
|
267 |
{
|
268 |
"Dataset": [
|
269 |
+
"EuroParl",
|
270 |
],
|
271 |
"Lines Downloaded": [
|
272 |
+
"69814",
|
273 |
],
|
274 |
"Percent Removed After Language Filter": [
|
275 |
"0.00%",
|
276 |
],
|
277 |
"Percent Removed After Min Word Count Filter": [
|
278 |
+
"0.00%",
|
279 |
],
|
280 |
"Percent Removed After Unigram Probability Filter": [
|
281 |
"0.00%",
|
|
|
295 |
se_filter = pd.DataFrame(
|
296 |
{
|
297 |
"Dataset": [
|
298 |
+
"StackExchange",
|
299 |
],
|
300 |
"Lines Downloaded": [
|
301 |
+
"23246548",
|
302 |
],
|
303 |
"Percent Removed After Language Filter": [
|
304 |
"0.00%",
|
305 |
],
|
306 |
"Percent Removed After Min Word Count Filter": [
|
307 |
+
"0.00%",
|
308 |
],
|
309 |
"Percent Removed After Unigram Probability Filter": [
|
310 |
"0.00%",
|
|
|
324 |
arx_filter = pd.DataFrame(
|
325 |
{
|
326 |
"Dataset": [
|
327 |
+
"ArXiv",
|
328 |
],
|
329 |
"Lines Downloaded": [
|
330 |
+
"1911867",
|
331 |
],
|
332 |
"Percent Removed After Language Filter": [
|
333 |
+
"2.22%",
|
334 |
],
|
335 |
"Percent Removed After Min Word Count Filter": [
|
336 |
+
"5.65%",
|
337 |
],
|
338 |
"Percent Removed After Unigram Probability Filter": [
|
339 |
+
"0.07%",
|
340 |
],
|
341 |
"Percent Removed After Local Dedup": [
|
342 |
"",
|
|
|
353 |
s2o_filter = pd.DataFrame(
|
354 |
{
|
355 |
"Dataset": [
|
356 |
+
"S2ORC",
|
357 |
],
|
358 |
"Lines Downloaded": [
|
359 |
+
"12963563",
|
360 |
],
|
361 |
"Percent Removed After Language Filter": [
|
362 |
"0.00%",
|
363 |
],
|
364 |
"Percent Removed After Min Word Count Filter": [
|
365 |
+
"0.00%",
|
366 |
],
|
367 |
"Percent Removed After Unigram Probability Filter": [
|
368 |
"0.00%",
|
|
|
382 |
med_filter = pd.DataFrame(
|
383 |
{
|
384 |
"Dataset": [
|
385 |
+
"PubMed - Central",
|
386 |
],
|
387 |
"Lines Downloaded": [
|
388 |
+
"5230932",
|
389 |
],
|
390 |
"Percent Removed After Language Filter": [
|
391 |
+
"7.66%",
|
392 |
],
|
393 |
"Percent Removed After Min Word Count Filter": [
|
394 |
+
"1.29%",
|
395 |
],
|
396 |
"Percent Removed After Unigram Probability Filter": [
|
397 |
+
"0.02%",
|
398 |
],
|
399 |
"Percent Removed After Local Dedup": [
|
400 |
"",
|
|
|
411 |
phil_filter = pd.DataFrame(
|
412 |
{
|
413 |
"Dataset": [
|
414 |
+
"Phil Papers",
|
415 |
],
|
416 |
"Lines Downloaded": [
|
417 |
+
"49389",
|
418 |
],
|
419 |
"Percent Removed After Language Filter": [
|
420 |
+
"20.68%",
|
421 |
],
|
422 |
"Percent Removed After Min Word Count Filter": [
|
423 |
+
"0.00%",
|
424 |
],
|
425 |
"Percent Removed After Unigram Probability Filter": [
|
426 |
+
"0.12%",
|
427 |
],
|
428 |
"Percent Removed After Local Dedup": [
|
429 |
"",
|