Files changed (6)
  1. common.py +1 -0
  2. curated.py +112 -77
  3. data/topic_charts.json +0 -0
  4. main.py +10 -4
  5. results.py +105 -64
  6. web.py +2 -0
common.py CHANGED
@@ -298,6 +298,7 @@ global_div = Div(
             style="margin-bottom: 5px",
         ),
         Li("Normalization Form C Discussion", style="margin-bottom: 5px"),
+        Li(B("Estimated Reading Time: 10 minutes"), style="margin-bottom: 5px"),
     ),
     id="section41",
 ),
curated.py CHANGED
@@ -24,6 +24,7 @@ overview = (
             "Individual Filtering Discussion for Each Source",
             style="margin-bottom: 5px",
         ),
+        Li(B("Estimated Reading Time: 25 minutes"), style="margin-bottom: 5px"),
     ),
 ),
 )
@@ -33,7 +34,9 @@ curated_sources_intro = Div(
     P(
         "Curated sources comprise high-quality datasets that contain domain-specificity.",
         B(
-            " TxT360 was strongly influenced by The Pile", D_cite(bibtex_key="thepile"), " regarding both inclusion of the dataset and filtering techniques."
+            " TxT360 was strongly influenced by The Pile",
+            D_cite(bibtex_key="thepile"),
+            " regarding both inclusion of the dataset and filtering techniques.",
         ),
         " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ",
     ),
@@ -129,16 +132,16 @@ wikipedia_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.31%",
         ],
         "Total Percentage Remaining": [
-            "",
+            "97.84%",
         ],
     }
 )
 
 table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
-table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin-left: auto; width: 80%; align: center;")
+table_div_wikipedia = Div(NotStr(table_html_wikipedia))
 
 freelaw_filter = pd.DataFrame(
     {
@@ -167,7 +170,7 @@ freelaw_filter = pd.DataFrame(
 )
 
 table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
-table_div_freelaw = Div(NotStr(table_html_freelaw), style="margin-left: auto; width: 80%; align: center;")
+table_div_freelaw = Div(NotStr(table_html_freelaw))
 
 dmm_filter = pd.DataFrame(
     {
@@ -187,16 +190,16 @@ dmm_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "100.00%",
         ],
     }
 )
 
 table_html_dmm = dmm_filter.to_html(index=False, border=0)
-table_div_dmm = Div(NotStr(table_html_dmm), style="margin-left: auto; width: 80%; align: center;")
+table_div_dmm = Div(NotStr(table_html_dmm))
 
 
 uspto_filter = pd.DataFrame(
@@ -217,16 +220,16 @@ uspto_filter = pd.DataFrame(
             "0.01%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "22.94%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "75.15%",
         ],
     }
 )
 
 table_html_uspto = uspto_filter.to_html(index=False, border=0)
-table_div_uspto = Div(NotStr(table_html_uspto), style="margin-left: auto; width: 80%; align: center;")
+table_div_uspto = Div(NotStr(table_html_uspto))
 
 pg19_filter = pd.DataFrame(
     {
@@ -246,16 +249,16 @@ pg19_filter = pd.DataFrame(
             "0.17%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.80%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "98.78%",
         ],
     }
 )
 
 table_html_pg19 = pg19_filter.to_html(index=False, border=0)
-table_div_pg19 = Div(NotStr(table_html_pg19), style="margin-left: auto; width: 80%; align: center;")
+table_div_pg19 = Div(NotStr(table_html_pg19))
 
 
 hn_filter = pd.DataFrame(
@@ -267,7 +270,7 @@ hn_filter = pd.DataFrame(
             "2064931",
         ],
         "Percent Removed After Language Filter": [
-            "2.62%%",
+            "2.62%",
         ],
         "Percent Removed After Min Word Count Filter": [
             "0.02%",
@@ -276,16 +279,16 @@ hn_filter = pd.DataFrame(
             "0.34%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "61.84%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "35.18%",
        ],
     }
 )
 
 table_html_hn = hn_filter.to_html(index=False, border=0)
-table_div_hn = Div(NotStr(table_html_hn), style="margin-left: auto; width: 80%; align: center;")
+table_div_hn = Div(NotStr(table_html_hn))
 
 
 uirc_filter = pd.DataFrame(
@@ -306,16 +309,16 @@ uirc_filter = pd.DataFrame(
             "1.12%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.66%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "59.98%",
         ],
     }
 )
 
 table_html_uirc = uirc_filter.to_html(index=False, border=0)
-table_div_uirc = Div(NotStr(table_html_uirc), style="margin-left: auto; width: 80%; align: center;")
+table_div_uirc = Div(NotStr(table_html_uirc))
 
 up_filter = pd.DataFrame(
     {
@@ -335,16 +338,16 @@ up_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "1.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "99.00%",
         ],
     }
 )
 
 table_html_up = up_filter.to_html(index=False, border=0)
-table_div_up = Div(NotStr(table_html_up), style="margin-left: auto; width: 80%; align: center;")
+table_div_up = Div(NotStr(table_html_up))
 
 se_filter = pd.DataFrame(
     {
@@ -364,16 +367,16 @@ se_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "100.00%",
         ],
     }
 )
 
 table_html_se = se_filter.to_html(index=False, border=0)
-table_div_se = Div(NotStr(table_html_se), style="margin-left: auto; width: 80%; align: center;")
+table_div_se = Div(NotStr(table_html_se))
 
 arx_filter = pd.DataFrame(
     {
@@ -393,16 +396,16 @@ arx_filter = pd.DataFrame(
             "0.07%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "92.06%",
         ],
     }
 )
 
 table_html_arx = arx_filter.to_html(index=False, border=0)
-table_div_arx = Div(NotStr(table_html_arx), style="margin-left: auto; width: 80%; align: center;")
+table_div_arx = Div(NotStr(table_html_arx))
 
 s2o_filter = pd.DataFrame(
     {
@@ -422,16 +425,16 @@ s2o_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "100.00%",
         ],
     }
 )
 
 table_html_s2o = s2o_filter.to_html(index=False, border=0)
-table_div_s2o = Div(NotStr(table_html_s2o), style="margin-left: auto; width: 80%; align: center;")
+table_div_s2o = Div(NotStr(table_html_s2o))
 
 med_filter = pd.DataFrame(
     {
@@ -451,16 +454,16 @@ med_filter = pd.DataFrame(
             "0.02%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "91.03%",
         ],
     }
 )
 
 table_html_med = med_filter.to_html(index=False, border=0)
-table_div_med = Div(NotStr(table_html_med), style="margin-left: auto; width: 80%; align: center;")
+table_div_med = Div(NotStr(table_html_med))
 
 phil_filter = pd.DataFrame(
     {
@@ -480,16 +483,16 @@ phil_filter = pd.DataFrame(
             "0.12%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "79.20%",
         ],
     }
 )
 
 table_html_phil = phil_filter.to_html(index=False, border=0)
-table_div_phil = Div(NotStr(table_html_phil), style="margin-left: auto; width: 80%; align: center;")
+table_div_phil = Div(NotStr(table_html_phil))
 ## end individual tables showing filtering
 
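A note on the arithmetic behind the tables above: "Total Percentage Remaining" is not 100% minus the sum of the other columns, because each stage removes a share of the documents that survived the previous stages. A minimal sketch of that compounding (a hypothetical helper, not part of this patch; the stage rates below are illustrative):

def remaining_percentage(stage_removal_rates):
    # Each stage removes a percentage of the documents that survived
    # all earlier stages, so the remaining fraction compounds.
    remaining = 1.0
    for pct in stage_removal_rates:
        remaining *= 1.0 - pct / 100.0
    return 100.0 * remaining

# Illustrative only: stages removing 1.86% and then 0.31% of survivors
# leave roughly 97.84% of the original documents.
print(f"{remaining_percentage([1.86, 0.31]):.2f}%")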
@@ -681,24 +684,51 @@ filtering_process = Div(
 P(
     B("Download and Extraction: "),
     "All the data was downloaded in original latex format from ArXiv official S3 repo: ",
-    A("s3://arxic/src", href="s3://arxic/src"),
-    ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format",
+    A("s3://arxiv/src", href="s3://arxiv/src"),
+    ". We aim to encode the downloaded data in UTF-8 format, and when necessary, utilize the chardet library to infer the appropriate encoding. After that, we use ",
+    A("Pandoc", href="https://pandoc.org/"),
+    " to extract information from the latex files into markdown format. The command we use is",
     D_code(
-        "pandoc -s {tex} -o out/{out_name}.md --wrap=none",
-        language="python",
+        "pandoc <raw_tex_path> -s -o <output_markdown_path> -f latex+raw_tex -t markdown_mmd [--lua-filter <lua_filter_path>]",
+        language="bash",
     ),
-    ". All markdowns were combined to create jsonl files.",
+    ". Finally, all markdowns were combined to create jsonl files.",
 ),
 P(B("Unique Data Preparation Challenges: ")),
+P(
+    "When converting LaTeX files into Markdown using Pandoc, it is crucial to account for different data formats to minimize information loss while also filtering out noisy content in LaTeX. Below, we outline our considerations and methods for handling various data types during this conversion process:"
+),
 Ul(
     Li(
-        "Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.",
+        B("Tables: "),
+        "The process for handling tables follows three main approaches. First, tables compatible with Pandoc’s built-in formats are directly converted into standard Markdown tables. Notably, LaTeX’s '\\multicolumn' and '\\multirow' commands can be successfully translated into valid Markdown tables. Second, tables unsupported by Pandoc’s native functionality, such as deluxetable or other complex LaTeX types, are preserved in their original LaTeX format to maintain the integrity of complex structures. Third, only a few remaining tables have been converted to HTML web tables.",
+        style="margin-bottom: -3px",
+    ),
+    Li(
+        B("Mathematical Expressions: "),
+        "Inline mathematical expressions are rendered in Markdown. More complex equations remain unchanged, e.g., presented as '\\begin{aligned}' blocks, to ensure accuracy and readability.",
+        style="margin-bottom: -3px",
+    ),
+    Li(
+        B("Figures: "),
+        "All figures are removed during the conversion process. Placeholder figures might not contribute to the paper’s data quality and, as such, have been omitted to streamline the output.",
+        style="margin-bottom: -3px",
+    ),
+    Li(
+        B("Section Headers: "),
+        "Section headers are converted into markdown format, using leading '#' symbols to represent the heading levels.",
+        style="margin-bottom: -3px",
+    ),
+    Li(
+        B("References: "),
+        "References are removed. Although they may be informative, references often introduce formatting inconsistencies or add little value compared to the core content of the paper.",
         style="margin-bottom: -3px",
     ),
 ),
 P(
     B(" Filters Applied: "),
-    "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset", D_cite(bibtex_key="peS2o"),
+    "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset",
+    D_cite(bibtex_key="peS2o"),
 ),
 Ul(
     Li(
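For readers who want to picture the extraction step this hunk describes, here is a minimal sketch of the encode-then-convert loop, assuming local .tex files and a pandoc binary on PATH. The helper names and paths are hypothetical; the actual TxT360 pipeline code is not part of this diff.

import subprocess
from pathlib import Path

import chardet  # used, as the patch text says, to guess non-UTF-8 encodings

def to_utf8(raw: bytes) -> str:
    """Decode as UTF-8 when possible; otherwise fall back to chardet's guess."""
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        guess = chardet.detect(raw)["encoding"] or "latin-1"
        return raw.decode(guess, errors="replace")

def tex_to_markdown(tex_path: Path, out_path: Path) -> None:
    """Re-encode the LaTeX source, then convert it with the pandoc
    invocation quoted in the hunk above (without the optional lua filter)."""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = tex_path.with_suffix(".utf8.tex")
    tmp.write_text(to_utf8(tex_path.read_bytes()), encoding="utf-8")
    subprocess.run(
        ["pandoc", str(tmp), "-s", "-o", str(out_path),
         "-f", "latex+raw_tex", "-t", "markdown_mmd"],
        check=True,
    )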
@@ -773,19 +803,19 @@ filtering_process = Div(
         ),
     ),
     table_div_s2o,
-    Details(
-        Summary("S2ORC Filtering Examples -- need to update"),
-        Div(
-            P("examples are missing"),
-            style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; ",  # Styling for the DV2 part
-        ),
-        style="""
-        background-color: #FFFAEA; /* Light yellow background */
-        padding: 15px;
-        border-radius: 12px;
-        margin-bottom: 15px
-        """,
-    ),
+    # Details(
+    #     Summary("S2ORC Filtering Examples -- need to update"),
+    #     Div(
+    #         P("examples are missing"),
+    #         style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; ",  # Styling for the DV2 part
+    #     ),
+    #     style="""
+    #     background-color: #FFFAEA; /* Light yellow background */
+    #     padding: 15px;
+    #     border-radius: 12px;
+    #     margin-bottom: 15px
+    #     """,
+    # ),
 ),
 ),
 Section(
@@ -825,19 +855,19 @@ filtering_process = Div(
         style="margin-bottom: -3px",
     ),
 ),
-Details(
-    Summary("S2ORC Abstract Filtering Examples "),
-    Div(
-        P("examples are missing"),
-        style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; ",  # Styling for the DV2 part
-    ),
-    style="""
-    background-color: #FFFAEA; /* Light yellow background */
-    padding: 15px;
-    border-radius: 12px;
-    margin-bottom: 15px
-    """,
-),
+# Details(
+#     Summary("S2ORC Abstract Filtering Examples "),
+#     Div(
+#         P("examples are missing"),
+#         style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; ",  # Styling for the DV2 part
+#     ),
+#     style="""
+#     background-color: #FFFAEA; /* Light yellow background */
+#     padding: 15px;
+#     border-radius: 12px;
+#     margin-bottom: 15px
+#     """,
+# ),
 )
 ),
 Section(
@@ -851,13 +881,16 @@ filtering_process = Div(
     href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/",
 ),
 ". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc",
-D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),
-". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
+D_code(
+    "pandoc <raw_xml_path> -s -o <output_markdown_path> -f jats -t markdown_mmd [--lua-filter <lua_filter_path>]",
+    language="bash",
+),
+". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
 ),
 P(B("Unique Data Preparation Challenges: ")),
 Ul(
     Li(
-        "Due to large amounts of meaningful data being contained in table formats, speical consideration was taken to extract the data and proper metadata.",
+        "We tried similar attempts on PMC as we did on ArXiv. The resulting markdown might differ slightly due to the different structure of the XML files.",
         style="margin-bottom: -3px",
     ),
 ),
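The PMA extraction mentioned above can be pictured with a short BeautifulSoup sketch. The tag names follow the standard PubMed XML layout (PMID, ArticleTitle, AbstractText), but treat the exact fields as an assumption, since the pipeline code itself is not in this diff.

import json
from bs4 import BeautifulSoup  # the library named in the patch text

def extract_abstracts(xml_text: str, out_path: str) -> None:
    """Pull abstract, title, and PMID from a PubMed XML batch and
    write one JSON object per line (jsonl), as described above."""
    soup = BeautifulSoup(xml_text, "xml")
    with open(out_path, "w", encoding="utf-8") as out:
        for article in soup.find_all("PubmedArticle"):
            pmid = article.find("PMID")
            title = article.find("ArticleTitle")
            record = {
                "pmid": pmid.get_text() if pmid else None,
                "title": title.get_text() if title else None,
                "abstract": " ".join(
                    node.get_text() for node in article.find_all("AbstractText")
                ),
            }
            out.write(json.dumps(record) + "\n")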
@@ -1584,7 +1617,8 @@ def curated():
     table_html = data_preparation_steps.to_html(index=False, border=0)
     table_div = Div(NotStr(table_html), style="margin: 40px;")
 
-    text = P("""This initial stage serves as the foundation for the entire
+    text = P(
+        """This initial stage serves as the foundation for the entire
     process. Here, we focus on acquiring and extracting the raw data, which can
     come from various sources such as crawling websites, using HTTP/FTP dumps,
     or working with archive dumps. For instance, to download and prepare a
@@ -1594,7 +1628,8 @@ def curated():
     preparation process: It is worth noting that some pipelines might require
     invoking additional functions or scripts to handle specific data sources or
     formats. These helper scripts can be located within specific directories
-    or modules dedicated to the dataset.""")
+    or modules dedicated to the dataset."""
+    )
 
     return Div(
         Section(
 
data/topic_charts.json ADDED
The diff for this file is too large to render.
 
main.py CHANGED
@@ -175,7 +175,7 @@ def main():
     Div(
         A(
             "TxT360",
-            href="#section1",
+            href="#section11",
         )
     ),
     Div(
@@ -352,6 +352,12 @@ def main():
                 href="#section53",
             )
         ),
+        Li(
+            A(
+                "Topic Analysis",
+                href="#section55",
+            )
+        )
     ),
 ),
 role="navigation",
@@ -359,8 +365,8 @@ def main():
         ),
     ),
     intro(),
-    curated.curated(),
     web.web_data(),
+    curated.curated(),
     common.common_steps(),
     results.results(),
 ),
@@ -757,7 +763,7 @@ dataset_sources = pd.DataFrame(
             "StackExchange",
         ],
         "Raw Data Size": [
-            "11 TB",
+            "9.2 TB",
             "712 GB",
             "210 GB",
             "23 GB",
@@ -770,7 +776,7 @@ dataset_sources = pd.DataFrame(
             "45 GB",
         ],
         "Token Count": [
-            "5.71T",
+            "4.83T",
             "154.96B",
             "4.75B",
             "7.34B",
results.py CHANGED
@@ -157,81 +157,69 @@ fig_loss.update_layout(
 lm_loss_graph = fig_loss
 
 
-#Perplexity Across Different Buckets (global)
-# The data you provided
-DATA = [
-    ["2014", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.410227605477868, 16.11176217183986, 15.632757662414805, 15.446116676532212, 16.716943171826703, 18.156821563322765]]],
-    ["2015", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.446573602753478, 16.14852530113782, 15.627408549576069, 15.0055028132117, 15.565430373421485, 17.314701050452452]]],
-    ["2016", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.307221780905284, 16.297702171159543, 15.948641884223639, 14.799690714225637, 14.935989931859659, 16.09585768919658]]],
-    ["2017", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.338525603992114, 15.960924352297502, 15.912187993988933, 14.822102470001267, 14.778913482337416, 15.428145290012955]]],
-    ["2018", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.08551151136689, 16.187802102106698, 14.935072408852303, 14.832038213200583, 14.508674264491997, 14.800605964649103]]],
-    ["2019", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.818363305107052, 16.474269837858706, 14.944741674400241, 14.568394784374943, 14.690158822673334, 15.990949424635108]]],
-    ["2020", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.98821894111693, 15.936494557783181, 14.79960386342691, 14.435682562274105, 14.58651834886038, 15.869365567783806]]],
-    ["2021", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.125795647512877, 15.780419457145868, 14.631430892394002, 14.276477514399625, 14.337146941773641, 15.872474774329305]]],
-    ["2022", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.573462144306383, 15.283018703313582, 14.378277745163881, 14.0611924390084, 13.9886330091318, 15.769421394877273]]],
-    ["2023", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [15.4293630385597, 14.608379914730168, 14.118271697056592, 13.880215644749589, 13.767106666731275, 15.05749135510839]]]
-]
-
-# Extract ranges (buckets) and years
-ranges = DATA[0][1][0]
-years = [year_data[0] for year_data in DATA]
-all_values = [year_data[1][1] for year_data in DATA]
-
-# Create the figure
-fig = go.Figure()
-
-# Add a trace for each year
-for i, year in enumerate(years):
-    values = all_values[i]
-    fig.add_trace(go.Scatter(x=ranges, y=values, mode='lines+markers', name=year))
-
-# Update layout
-fig.update_layout(
-    title="Perplexity Versus Buckets for Different Years",
-    xaxis_title="Buckets",
-    yaxis_title="Perplexity",
-    legend_title="Years",
-    hovermode="x unified"
-)
-
-Perplexity_Across_Different_Buckets_global_graph = fig
+data = {
+    "1-1": [17.410227605477868, 17.446573602753478, 17.307221780905284, 17.338525603992114, 17.08551151136689, 16.818363305107052, 16.98821894111693, 17.125795647512877, 16.573462144306383, 15.4293630385597],
+    "2-5": [16.11176217183986, 16.14852530113782, 16.297702171159543, 15.960924352297502, 16.187802102106698, 16.474269837858706, 15.936494557783181, 15.780419457145868, 15.283018703313582, 14.608379914730168],
+    "6-10": [15.632757662414805, 15.627408549576069, 15.948641884223639, 15.912187993988933, 14.935072408852303, 14.944741674400241, 14.79960386342691, 14.631430892394002, 14.378277745163881, 14.118271697056592],
+    "11-100": [15.446116676532212, 15.0055028132117, 14.799690714225637, 14.822102470001267, 14.832038213200583, 14.568394784374943, 14.435682562274105, 14.276477514399625, 14.0611924390084, 13.880215644749589],
+    "101-1000": [16.716943171826703, 15.565430373421485, 14.935989931859659, 14.778913482337416, 14.508674264491997, 14.690158822673334, 14.58651834886038, 14.337146941773641, 13.9886330091318, 13.767106666731275],
+    "1001-30000000": [18.156821563322765, 17.314701050452452, 16.09585768919658, 15.428145290012955, 14.800605964649103, 15.990949424635108, 15.869365567783806, 15.872474774329305, 15.769421394877273, 15.05749135510839],
+}
+
+# Years for the x-axis
+years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
+
+# Create a plotly figure
+fig11 = go.Figure()
+
+# Add a trace for each bucket
+for bucket, values in data.items():
+    fig11.add_trace(go.Scatter(x=years, y=values, mode='lines', name=bucket))
+
+# Update layout for better presentation
+fig11.update_layout(
+    title='Perplexity Versus Buckets for Different Years',
+    xaxis_title='Year',
+    yaxis_title='Perplexity',
+    xaxis_tickangle=-45,
+    legend_title="Buckets",
+)
+
+Perplexity_Across_Different_Buckets_global_graph = fig11
 
 ##graph 2
 
 # Data
-years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
-buckets = ["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"]
 data = {
-    "2014": [17.410227605477868, 16.11176217183986, 15.632757662414805, 15.446116676532212, 16.716943171826703, 18.156821563322765],
-    "2015": [17.446573602753478, 16.14852530113782, 15.627408549576069, 15.0055028132117, 15.565430373421485, 17.314701050452452],
-    "2016": [17.307221780905284, 16.297702171159543, 15.948641884223639, 14.799690714225637, 14.935989931859659, 16.09585768919658],
-    "2017": [17.338525603992114, 15.960924352297502, 15.912187993988933, 14.822102470001267, 14.778913482337416, 15.428145290012955],
-    "2018": [17.08551151136689, 16.187802102106698, 14.935072408852303, 14.832038213200583, 14.508674264491997, 14.800605964649103],
-    "2019": [16.818363305107052, 16.474269837858706, 14.944741674400241, 14.568394784374943, 14.690158822673334, 15.990949424635108],
-    "2020": [16.98821894111693, 15.936494557783181, 14.79960386342691, 14.435682562274105, 14.58651834886038, 15.869365567783806],
-    "2021": [17.125795647512877, 15.780419457145868, 14.631430892394002, 14.276477514399625, 14.337146941773641, 15.872474774329305],
-    "2022": [16.573462144306383, 15.283018703313582, 14.378277745163881, 14.0611924390084, 13.9886330091318, 15.769421394877273],
-    "2023": [15.4293630385597, 14.608379914730168, 14.118271697056592, 13.880215644749589, 13.767106666731275, 15.05749135510839]
+    "1-1": [17.410227605477868, 17.446573602753478, 17.307221780905284, 17.338525603992114, 17.08551151136689, 16.818363305107052, 16.98821894111693, 17.125795647512877, 16.573462144306383, 15.4293630385597],
+    "2-5": [16.11176217183986, 16.14852530113782, 16.297702171159543, 15.960924352297502, 16.187802102106698, 16.474269837858706, 15.936494557783181, 15.780419457145868, 15.283018703313582, 14.608379914730168],
+    "6-10": [15.632757662414805, 15.627408549576069, 15.948641884223639, 15.912187993988933, 14.935072408852303, 14.944741674400241, 14.79960386342691, 14.631430892394002, 14.378277745163881, 14.118271697056592],
+    "11-100": [15.446116676532212, 15.0055028132117, 14.799690714225637, 14.822102470001267, 14.832038213200583, 14.568394784374943, 14.435682562274105, 14.276477514399625, 14.0611924390084, 13.880215644749589],
+    "101-1000": [16.716943171826703, 15.565430373421485, 14.935989931859659, 14.778913482337416, 14.508674264491997, 14.690158822673334, 14.58651834886038, 14.337146941773641, 13.9886330091318, 13.767106666731275],
+    "1001-30000000": [18.156821563322765, 17.314701050452452, 16.09585768919658, 15.428145290012955, 14.800605964649103, 15.990949424635108, 15.869365567783806, 15.872474774329305, 15.769421394877273, 15.05749135510839],
 }
-
 # Create a line plot for each bucket
-fig = go.Figure()
+# Years
+years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
 
-for i, bucket in enumerate(buckets):
-    bucket_values = [data[year][i] for year in years]
-    fig.add_trace(go.Scatter(x=years, y=bucket_values, mode='lines+markers', name=bucket))
+# Create the figure
+fig22 = go.Figure()
+
+# Add lines for each bucket
+for bucket, perplexities in data.items():
+    fig22.add_trace(go.Scatter(x=years, y=perplexities, mode='lines+markers', name=bucket))
 
 # Update layout
-fig.update_layout(
-    title="Average Perplexity Over Years by Bucket",
+fig22.update_layout(
+    title="Perplexity Across Different Years (Global)",
     xaxis_title="Year",
     yaxis_title="Average Perplexity",
-    legend_title="Buckets",
-    hovermode="x unified"
+    legend_title="Bucket (duplicate count range)"
 )
 
-# Show plot
-graph2 = fig
+# Show the figure
+graph2222 = fig22
 
 #graph 3 tbd
 
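The data reshaping in the hunk above is mechanical: the removed DATA table was keyed by year with one list of bucket values per year, while the added dict is keyed by bucket with one value per year. A small sketch of the pivot that produces the new layout from the old one (using the names from the removed code):

# Pivot the removed per-year DATA rows into the added per-bucket dict.
buckets = DATA[0][1][0]                      # ["1-1", "2-5", ...]
per_year_values = [values for _, (_, values) in DATA]
data = {
    bucket: [year_values[i] for year_values in per_year_values]
    for i, bucket in enumerate(buckets)
}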
@@ -277,8 +265,11 @@ data = {
  "y": [15.4293630385597, 14.827776633211421, 14.600432832118155, 14.43330043760322, 14.30586483681026, 14.203397641081045, 14.140361413924607, 14.101673126860582, 14.05840021694595, 14.033693337279875, 13.990003714671388, 14.001106927608756, 13.982387676044238, 13.960424890216352, 13.941694305252629, 13.928958405693843, 13.922871327026984, 13.90749356497257, 13.883187320364065, 13.870538613453949, 13.853682922141118, 13.839326154723096, 13.841949693311191, 13.851284862386178, 13.853606323578846, 13.851840301257587, 13.86166226046842, 13.872513892742713, 13.867295530090015, 13.870385212514083, 13.868578068850889, 13.848685425568009, 13.838736750620761, 13.825182992628129, 13.795340100698754, 13.809629502148145, 13.767911089744738, 13.75599639262174, 13.75925151191706, 13.751975433642748, 13.731931931502134, 13.714316407794309, 13.730848265421725, 13.681606796845, 13.670846152397202, 13.685040324991581, 13.687292733648798, 13.645627677729081, 13.635021438002346, 13.670103747374988, 13.62246956240464, 13.658574692934657, 13.65219324836813, 13.663836335892329, 13.666948307267594, 13.642643510414398, 13.672961601819406, 13.663525877548398, 13.667436573958156, 13.687522639036205, 13.647544546546508, 13.670704172336292, 13.643000490240736, 13.667107110344569, 13.660218070172265, 13.675550822990397, 13.59092599672469, 13.619850375757148, 13.651551988902462, 13.684288098100867, 13.599046453546292, 13.580905963181452, 13.620956017533885, 13.648408527460056, 13.60249514150298, 13.568078301644128, 13.6145797181229, 13.647719674647586, 13.531493703586886, 13.609260600121146, 13.687447710013837, 13.602215210547463, 13.563220813507392, 13.57176728376717, 13.664222431811334, 13.586259696651297, 13.571127927324502, 13.692682818016978, 13.599437831977406, 13.646024625160337, 13.645231022004468, 13.582760709524955, 13.597408008025921, 13.640791164279545, 13.58016693386471, 13.664139165839629, 13.553901320176095, 13.616471379535914, 13.574418885444663, 13.62049913139043, 13.595296717779055, 13.618520397945241, 13.619083853308746, 13.549404239296155, 13.515187660214737, 13.728226318739061, 13.618609127026419, 13.638021829042664, 13.684199453984574, 13.707676555543845, 13.541617144639595, 13.703834138276244, 13.519323579997998, 13.6031555746482, 13.56414632339735, 13.711610914557605, 13.529701508251849, 13.631734977501925, 13.657120297408555, 13.58489249685537, 13.64403630157245, 13.608235747585796, 13.512110498832085, 13.846959297419959, 13.488917791046676, 13.696834245583917, 13.61507766454769, 13.565377802829648, 13.790197786574588, 13.564346823139237, 13.5342157764538, 13.65148446769962, 13.570646849759997, 13.538878864721305, 13.608980817839505, 13.664477237928127, 13.532411719398455, 13.737437273256564, 13.519816331488395, 13.764368990604515, 13.653588472445573, 13.5302540795639, 13.57561188810073, 13.647608383912683, 13.728643890026184, 13.710097008284352, 13.75589344505474, 13.682502650080487, 13.684339202206958, 13.86749342599449, 13.645786331176726, 13.660817039716257, 13.638011389586667, 13.732387353692099, 13.687686391049773, 13.94728981311995, 13.614547412507926, 13.460552695752389, 13.857074456574574, 13.660326212827487, 13.718393121459215, 13.76973846040823, 13.778762864123724, 13.656959991670522, 14.214462072816556, 13.622739420457773, 13.645227120727897, 13.687497326712435, 13.649272554141808, 13.503550000252996, 14.052494437028535, 13.580311058963396, 14.14017314736784, 13.64182000911326, 13.886936632170794, 13.942674178813384, 13.801754161915504, 13.931144818346747, 13.584651322907064, 
13.960391830857033, 14.178873989248789, 13.645923431288137, 13.97423140921055, 13.669569353257327, 13.915331700422486, 13.827145087821776, 13.736836827537942, 13.74766543270326, 13.770318536193258, 13.721033560006754, 13.710191655241298, 13.72744974500201, 13.845900599098753, 13.678446729208007, 13.902643304888189, 13.885717101405227, 13.800092557910519, 13.421930828385424, 14.22006344809192, 13.879986040380091, 14.134020605148754, 13.828712030750555, 13.713190050100788, 13.670401789346878, 13.948423234333882, 13.84481181587886, 13.818908981670926, 13.74582730003707, 13.94097096205736, 14.180988587482524, 13.792218775776533, 13.684855627228844, 13.881739174231223, 13.998746647008945, 14.576445279868192, 13.977974630190932, 13.537548783758675, 13.793279757360363, 13.712674433009878, 13.758696209000801, 13.912382937440464, 14.340144810498328, 14.03279240397904, 13.889996587251982, 13.862065986515601, 13.712302898403514, 13.797613183843772, 13.847295441275936, 14.422108283887903, 13.900169262027397, 14.247783256416513, 14.02056653491548, 13.99526374998638, 13.69358071050138, 13.828640744267325, 14.01321596237364, 13.824657271537166, 13.683602248925892, 13.933987455146445, 13.993921243916123, 13.720966315955001, 13.890770809341586, 13.426961283787957, 13.789886238259456, 13.904131224083592, 13.697354217113691, 13.710497864728065, 14.683006531539148, 14.315759264484964, 14.034699751802846, 13.890948305458895, 14.038907287423818, 13.298757873288494, 14.037824844116592, 14.162250808384407, 13.947058195824253, 13.806653590867487, 13.980386450979841, 13.667273172120268, 14.201737731565034, 14.447849496372461, 14.018697147254176, 14.002315381014313, 14.15231239942244, 13.73719042552595, 14.033594655653038, 14.03087108296539, 13.782578520604822, 13.660908086958573, 14.205910336717215, 13.930322566343703, 14.300591211697906, 13.808749910964323, 13.8076208806989, 14.169819824542373, 14.067537576689853, 14.306124453553517, 13.56827905088618, 13.592634469105525, 13.789396713937975, 13.853464456609156, 13.855768885291056, 13.710464058237172, 13.641314331273772, 14.295528532894258, 14.04977108393012, 14.153384726345907, 13.385524853965638, 13.484650040422745, 13.831061312642468, 14.15298734516191, 14.20289531849528, 14.116529526532855, 13.767652893233096, 13.715924776449405, 14.649027392365024, 13.603310028324861, 14.10228343978809, 13.704762155010586, 13.731715455443299],
     },
 }
+
+fig33 = go.Figure()
+
 for year, year_data in data.items():
-    fig.add_trace(go.Scatter(
+    fig33.add_trace(go.Scatter(
         x=year_data['x'],
         y=year_data['y'],
         mode='lines+markers',
@@ -287,7 +278,7 @@ for year, year_data in data.items():
     ))
 
 # Update layout
-fig.update_layout(
+fig33.update_layout(
     title="Perplexity vs. Number of Duplicate Documents Over Years",
     xaxis_title="Number of Duplicate Documents",
     yaxis_title="Average Perplexity",
@@ -295,7 +286,7 @@ fig.update_layout(
     hovermode="x unified"
 )
 
-graph3 = fig
+graph3 = fig33
 
 ##graph 4
 
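The added fig33 = go.Figure() above is the substantive fix in these hunks: the removed code appended the duplicate-document traces to the fig left over from the previous chart, so graph3 aliased that earlier figure. A compressed illustration of the aliasing (toy data):

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=[1, 2], y=[3, 4], name="chart A"))
graph_a = fig

# Without a fresh go.Figure() here, chart B's trace lands on chart A:
fig.add_trace(go.Scatter(x=[1, 2], y=[5, 6], name="chart B"))
graph_b = fig

assert graph_a is graph_b      # both names point at one figure
assert len(graph_a.data) == 2  # "chart A" now carries both traces

Giving each chart its own figure object (fig11, fig22, fig33) removes the shared mutable state.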
@@ -839,6 +830,8 @@ intro_div = Div(
     Ul(
         Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
         Li("Perplexity Analysis across time", style = "margin-bottom: 5px"),
+        Li("Topic Analysis on Data Cluster Groups", style = "margin-bottom: 5px"),
+        Li(B("Estimated Reading Time: 15 minutes"), style = "margin-bottom: 5px"),
     ),
 )
 
@@ -910,16 +903,15 @@ perp1_div = Div(
     Section(
         H3("Perplexity vs Years"),
         P("Taking the same data, we can convert it into a graph indicating the yearly trend. For most buckets, the average perplexity of dumps from more recent years seems to be lower than that of former years."),
-        Img(src="images/prep-across-diff-year-global-dup-buckets.png", height = "300", width = "600" ),
-        plotly2fasthtml(graph2),
-        P("NEED TO UPDATE - THIS GRAPH SHOULD MATCH THE IMAGE ABOVE AND YEAR SHOULD NOT BE a LINE OPTION"),
+        #Img(src="images/prep-across-diff-year-global-dup-buckets.png", height = "300", width = "600" ),
+        plotly2fasthtml(graph2222),
     ),
     Section(
         H3("Perplexity vs Document Duplication"),
         P("We can also break each bucket into distinct document counts. The graph becomes a bit noisy at the end because of insufficient samples with larger duplication counts."),
-        Img(src="images/prep-across-diff-docs-dup-count-global.png", height = "300", width = "600" ),
+        #Img(src="images/prep-across-diff-docs-dup-count-global.png", height = "300", width = "600" ),
         plotly2fasthtml(graph3),
-        P("NEED TO UPDATE - THIS GRAPH SHOULD MATCH THE IMAGE ABOVE AND BUCKET SHOULD NOT BE a LINE OPTION"),
     ),
     Section(
         H3("Perplexity vs Dump Duplication"),
@@ -974,6 +966,51 @@ llama_div = Div(
     ),
 )
 
+with open(os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"), 'r') as f:
+    topic_charts = json.load(f)
+topic_graphs = []
+
+for title, data in topic_charts:
+    if data["type"] == "barh":
+        topic_graphs.append(go.Figure(go.Bar(
+            x=data["kwargs"]["width"],
+            y=data["kwargs"]['y'],
+            orientation='h',
+            marker_color=[
+                "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
+                for rgb in data["kwargs"]["color"]
+            ]
+        )))
+    elif data["type"] == "pie":
+        topic_graphs.append(go.Figure(go.Pie(
+            values=data["kwargs"]['x'],
+            labels=data["kwargs"]["labels"],
+            marker_colors=[
+                "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
+                for rgb in data["kwargs"]["colors"]
+            ]
+        )))
+
+cluster_div = Div(
+    Section(
+        H2("Topic Analysis"),
+        P("We tried to classify data into topic groups and looked for correlations between topics and statistics of data. Data from different topic groups should manifest different characteristics of distribution, which can give us some insight into the composition of dataset."),
+        H3("Methodology"),
+        P("We took the ", A("common crawl", href="https://commoncrawl.org/"), " data and clustered them into 17 topic groups using ", A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"), ". We collected and aggregated a series of metrics which include quality signals and other useful metadata. For each topic group, we calculated average scores and generated the corresponding bar charts over different metrics for comparison and analysis."),
+        H3("Cluster Groups"),
+        P("We grouped data into the following 17 clusters"),
+        Ul(*(
+            Li(topic_name, style = "margin-bottom: 5px")
+            for topic_name in ("Arts", "Business & Economics & Finance", "Culture & Cultural geography", "Daily Life & Home & Lifestyle", "Education", "Entertainment & Travel & Hobby", "Environment", "Food & Drink & Cooking", "Health & Wellness & Medicine", "Law & Justice", "Natural Science & Formal Science & Technology", "Personal Development & Human Resources & Career", "Politics & Government", "Religion & Spirituality", "Shopping & Commodity", "Society & Social Issues & Human Rights", "Sports")
+        )),
+        H3("Results Analysis"),
+        *(
+            Section(H4(title), plotly2fasthtml(topic_graphs[i]), P(data.get("comment", '')))
+            for i, (title, data) in enumerate(topic_charts)
+        )
+    )
+)
+
 
 def results():
     return Div(
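The loader added in this hunk implies a schema for data/topic_charts.json, whose diff was too large to render: a list of [title, spec] pairs, where spec records a matplotlib-style chart type ("barh" or "pie"), the original call kwargs, and an optional comment. A hand-written sketch of one entry under that inferred schema (all values invented for illustration):

# Inferred shape of data/topic_charts.json; values are illustrative.
topic_charts = [
    [
        "Average Quality Score by Topic",   # rendered as an H4 heading
        {
            "type": "barh",                 # horizontal bar chart
            "kwargs": {
                "y": ["Arts", "Sports"],    # one bar per topic group
                "width": [0.62, 0.55],      # bar lengths
                "color": [[0.1, 0.4, 0.8], [0.9, 0.3, 0.2]],  # RGB on a 0-1 scale
            },
            "comment": "Optional caption shown under the chart.",
        },
    ],
]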
@@ -995,6 +1032,10 @@ def results():
         ),
         Section(
             llama_div,
+        ),
+        Section(
+            cluster_div,
+            id="section55"
         ),
         id="inner-text"
     )
web.py CHANGED
@@ -376,6 +376,7 @@ def web_data():
 return Div(
     Section(
         Div(
+            H1("Web Data Processing"),
             H2("Common Crawl Snapshot Processing"),
             H3("What This Section Contains"),
             P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),
@@ -387,6 +388,7 @@ def web_data():
             Li("Each section is complete with code and comparisons to Dolma,", D_cite(bibtex_key="soldaini2024dolma"),
                 "DataTrove,", D_cite(bibtex_key="penedo2024datatrove"),
                 "and/or RedPajama-V-2", D_cite(bibtex_key="redpajama-v2"), style = "margin-bottom: 5px"),
+            Li(B("Estimated Reading Time: 31 minutes"), style = "margin-bottom: 5px"),
         ),
         P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of datasets we reviewed and the comparison of filters we have applied."),
     ),
 
376
  return Div(
377
  Section(
378
  Div(
379
+ H1("Web Data Processing"),
380
  H2("Common Crawl Snapshot Processing"),
381
  H3("What This Section Contains"),
382
  P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),
 
388
  Li("Each section is complete with code and comparisons to Dolma,", D_cite(bibtex_key="soldaini2024dolma"),
389
  "DataTrove,", D_cite(bibtex_key="penedo2024datatrove"),
390
  "and/or RedPajama-V-2", D_cite(bibtex_key="redpajama-v2"), style = "margin-bottom: 5px"),
391
+ Li(B("Estimated Reading Time: 31 minutes"), style = "margin-bottom: 5px"),
392
  ),
393
  P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of datasets we reviewed the comparison of filters we have applied."),
394
  ),