victormiller committed on
Commit
1c66024
1 Parent(s): 6afc890

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +3 -147
main.py CHANGED
@@ -65,7 +65,7 @@ def main():
65
  ),
66
  Li(
67
  A(
68
- "Background",
69
  href="/intro#section2",
70
  hx_get="/intro#section2",
71
  hx_target="#inner-text",
@@ -73,7 +73,7 @@ def main():
73
  ),
74
  Li(
75
  A(
76
- "Main Content",
77
  href="/intro#section3",
78
  hx_get="/intro#section3",
79
  hx_target="#inner-text",
@@ -81,7 +81,7 @@ def main():
81
  ),
82
  Li(
83
  A(
84
- "Conclusion",
85
  href="/intro#section4",
86
  hx_get="/intro#section4",
87
  hx_target="#inner-text",
@@ -210,143 +210,6 @@ previous_content = P("""The performance of a large language model (LLM)
210
  and the process followed to create its 📚
211
  FineWeb-Edu subset.""")
212
 
213
- dataset_comparison = pd.DataFrame(
214
- {
215
- "Dataset": [
216
- "TxT360",
217
- "FineWeb",
218
- "RefinedWeb",
219
- "RedPajama-v2",
220
- "C4",
221
- "Dolma",
222
- "RedPajama-v1",
223
- "The Pile",
224
- ],
225
- "CommonCrawl": [
226
- "99 Snapshots",
227
- "96 Snapshots",
228
- "90 Snapshots",
229
- "84 Snapshots",
230
- "1 Snapshots",
231
- "24 Snapshots",
232
- "5 Snapshots",
233
- "0.6% of 74 Snapshots",
234
- ],
235
- "Papers": [
236
- "5 Sources",
237
- "-",
238
- "-",
239
- "-",
240
- "-",
241
- "1 Source",
242
- "1 Source",
243
- "4 Sources",
244
- ],
245
- "Wikipedia": [
246
- "Improves data quality by removing irrelevant documents",
247
- "Filters out low-quality or incomplete documents",
248
- "Provides additional information for analysis",
249
- "Enables language-specific analysis and insights",
250
- "Helps understand the complexity and content of documents",
251
- "Identifies important terms and topics in the dataset",
252
- "Quantifies the importance of individual words",
253
- "RedPajama-v1",
254
- ],
255
- "FreeLaw": [
256
- "May exclude documents in less common languages",
257
- "May remove documents with valuable information",
258
- "May introduce bias in the analysis",
259
- "May not accurately represent the language distribution",
260
- "May not capture the complexity of document structure",
261
- "May be sensitive to noise and outliers",
262
- "May not capture the semantic meaning of words",
263
- "RedPajama-v1",
264
- ],
265
- "DM Math": [
266
- "May exclude documents in less common languages",
267
- "May remove documents with valuable information",
268
- "May introduce bias in the analysis",
269
- "May not accurately represent the language distribution",
270
- "May not capture the complexity of document structure",
271
- "May be sensitive to noise and outliers",
272
- "May not capture the semantic meaning of words",
273
- "RedPajama-v1",
274
- ],
275
- "USPTO": [
276
- "May exclude documents in less common languages",
277
- "May remove documents with valuable information",
278
- "May introduce bias in the analysis",
279
- "May not accurately represent the language distribution",
280
- "May not capture the complexity of document structure",
281
- "May be sensitive to noise and outliers",
282
- "May not capture the semantic meaning of words",
283
- "RedPajama-v1",
284
- ],
285
- "PG-19": [
286
- "May exclude documents in less common languages",
287
- "May remove documents with valuable information",
288
- "May introduce bias in the analysis",
289
- "May not accurately represent the language distribution",
290
- "May not capture the complexity of document structure",
291
- "May be sensitive to noise and outliers",
292
- "May not capture the semantic meaning of words",
293
- "RedPajama-v1",
294
- ],
295
- "HackerNews": [
296
- "May exclude documents in less common languages",
297
- "May remove documents with valuable information",
298
- "May introduce bias in the analysis",
299
- "May not accurately represent the language distribution",
300
- "May not capture the complexity of document structure",
301
- "May be sensitive to noise and outliers",
302
- "May not capture the semantic meaning of words",
303
- "RedPajama-v1",
304
- ],
305
- "Ubuntu IRC": [
306
- "May exclude documents in less common languages",
307
- "May remove documents with valuable information",
308
- "May introduce bias in the analysis",
309
- "May not accurately represent the language distribution",
310
- "May not capture the complexity of document structure",
311
- "May be sensitive to noise and outliers",
312
- "May not capture the semantic meaning of words",
313
- "RedPajama-v1",
314
- ],
315
- "EuroParl": [
316
- "May exclude documents in less common languages",
317
- "May remove documents with valuable information",
318
- "May introduce bias in the analysis",
319
- "May not accurately represent the language distribution",
320
- "May not capture the complexity of document structure",
321
- "May be sensitive to noise and outliers",
322
- "May not capture the semantic meaning of words",
323
- "RedPajama-v1",
324
- ],
325
- "StackExchange": [
326
- "May exclude documents in less common languages",
327
- "May remove documents with valuable information",
328
- "May introduce bias in the analysis",
329
- "May not accurately represent the language distribution",
330
- "May not capture the complexity of document structure",
331
- "May be sensitive to noise and outliers",
332
- "May not capture the semantic meaning of words",
333
- "RedPajama-v1",
334
- ],
335
- "Code": [
336
- "May exclude documents in less common languages",
337
- "May remove documents with valuable information",
338
- "May introduce bias in the analysis",
339
- "May not accurately represent the language distribution",
340
- "May not capture the complexity of document structure",
341
- "May be sensitive to noise and outliers",
342
- "May not capture the semantic meaning of words",
343
- "RedPajama-v1",
344
- ],
345
- }
346
- )
347
-
348
- table_html = dataset_comparison.to_html(index=False, border=0)
349
- table_div = Div(NotStr(table_html), style="margin: 40px;")
350
 
351
 
352
 
@@ -386,13 +249,6 @@ def intro():
386
  P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
387
  id="section4",
388
  ),
389
- Section(
390
- H2("Combining the Best of Web and Curated Sources"),
391
- H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
392
- P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
393
- table_div,
394
- id="section5",
395
- ),
396
  id="inner-text",
397
  )
398
 
 
65
  ),
66
  Li(
67
  A(
68
+ "Global Deduplication",
69
  href="/intro#section2",
70
  hx_get="/intro#section2",
71
  hx_target="#inner-text",
 
73
  ),
74
  Li(
75
  A(
76
+ "Controllable Upweighting",
77
  href="/intro#section3",
78
  hx_get="/intro#section3",
79
  hx_target="#inner-text",
 
81
  ),
82
  Li(
83
  A(
84
+ "Full Documentation",
85
  href="/intro#section4",
86
  hx_get="/intro#section4",
87
  hx_target="#inner-text",
 
210
  and the process followed to create its 📚
211
  FineWeb-Edu subset.""")
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
 
215
 
 
249
  P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
250
  id="section4",
251
  ),
 
 
 
 
 
 
 
252
  id="inner-text",
253
  )
254