victormiller committed on
Commit
a12d6cd
1 Parent(s): 8580754

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +7 -19
main.py CHANGED
@@ -136,11 +136,11 @@ def main():
136
  ),
137
  ),
138
  ),
139
- Div(
140
- A("Overview", href="#inner-text"),
141
- hx_get="/overview",
142
- hx_target="#inner-text",
143
- ),
144
  Div(
145
  A("Global Processing Steps", href="#inner-text"),
146
  hx_get="/common",
@@ -477,19 +477,7 @@ overview_div = Div(
477
  ),
478
  H2("Motivation Behind Txt360"),
479
  H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
480
- P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets. Commonly used pretraining datasets are:"),
481
- Ul(
482
- Li("RefinedWeb - cite", style = "margin-bottom: 5px"),
483
- Li("RedPajama-Data-V2- cite", style = "margin-bottom: 5px"),
484
- Li("DCLM- cite", style = "margin-bottom: 5px"),
485
- Li("FineWeb- cite", style = "margin-bottom: 5px"),
486
- ),
487
- P("Conversely, there are similar datasets that focus on filtering curated sources to for high-quality, domain specific knowledge. Commonly used curated datasets include:"),
488
- Ul(
489
- Li("The Pile - cite", style = "margin-bottom: 5px"),
490
- Li("RedPajama-Data-V1- cite", style = "margin-bottom: 5px"),
491
- Li("Dolma- cite", style = "margin-bottom: 5px"),
492
- ),
493
  P("In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Both datasets play critical for effective LLM pre-training."),
494
  H4("The Gap TxT360 Fills"),
495
  P("Despite advancements in filtering and source material for both data types, each type of dataset has its limitations. RefinedWeb is known for its high quality content but and only about 10% of the entire dataset has been disclosed and the processing scripts have not been released. For datasets that have combined curated sources with web data, the web component is relatively small (NEED TO UPDATE - citation needed)."),
@@ -552,7 +540,7 @@ def intro():
552
  )
553
 
554
 
555
- rt("/overview")(overview.overview)
556
  rt("/curated")(curated.curated)
557
  rt("/curated/{target}")(curated.update)
558
 
 
136
  ),
137
  ),
138
  ),
139
+ # Div(
140
+ # A("Overview", href="#inner-text"),
141
+ # hx_get="/overview",
142
+ # hx_target="#inner-text",
143
+ # ),
144
  Div(
145
  A("Global Processing Steps", href="#inner-text"),
146
  hx_get="/common",
 
477
  ),
478
  H2("Motivation Behind Txt360"),
479
  H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
480
+ P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets."),
 
 
 
 
 
 
 
 
 
 
 
 
481
  P("In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Both datasets play critical for effective LLM pre-training."),
482
  H4("The Gap TxT360 Fills"),
483
  P("Despite advancements in filtering and source material for both data types, each type of dataset has its limitations. RefinedWeb is known for its high quality content but and only about 10% of the entire dataset has been disclosed and the processing scripts have not been released. For datasets that have combined curated sources with web data, the web component is relatively small (NEED TO UPDATE - citation needed)."),
 
540
  )
541
 
542
 
543
+ #rt("/overview")(overview.overview)
544
  rt("/curated")(curated.curated)
545
  rt("/curated/{target}")(curated.update)
546