Spaces:
Running
Running
victormiller
committed on
Commit
•
a12d6cd
1
Parent(s):
8580754
Update main.py
Browse files
main.py
CHANGED
@@ -136,11 +136,11 @@ def main():
|
|
136 |
),
|
137 |
),
|
138 |
),
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
Div(
|
145 |
A("Global Processing Steps", href="#inner-text"),
|
146 |
hx_get="/common",
|
@@ -477,19 +477,7 @@ overview_div = Div(
|
|
477 |
),
|
478 |
H2("Motivation Behind Txt360"),
|
479 |
H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
|
480 |
-
P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets.
|
481 |
-
Ul(
|
482 |
-
Li("RefinedWeb - cite", style = "margin-bottom: 5px"),
|
483 |
-
Li("RedPajama-Data-V2- cite", style = "margin-bottom: 5px"),
|
484 |
-
Li("DCLM- cite", style = "margin-bottom: 5px"),
|
485 |
-
Li("FineWeb- cite", style = "margin-bottom: 5px"),
|
486 |
-
),
|
487 |
-
P("Conversely, there are similar datasets that focus on filtering curated sources to for high-quality, domain specific knowledge. Commonly used curated datasets include:"),
|
488 |
-
Ul(
|
489 |
-
Li("The Pile - cite", style = "margin-bottom: 5px"),
|
490 |
-
Li("RedPajama-Data-V1- cite", style = "margin-bottom: 5px"),
|
491 |
-
Li("Dolma- cite", style = "margin-bottom: 5px"),
|
492 |
-
),
|
493 |
P("In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long-tail and diverse data, while curated datasets are often information-rich and provide the 'deep-dive' domain information. Both kinds of data play a critical role in effective LLM pre-training."),
|
494 |
H4("The Gap TxT360 Fills"),
|
495 |
P("Despite advancements in filtering and source material for both data types, each type of dataset has its limitations. RefinedWeb is known for its high-quality content, but only about 10% of the entire dataset has been disclosed and the processing scripts have not been released. For datasets that have combined curated sources with web data, the web component is relatively small (NEED TO UPDATE - citation needed)."),
|
@@ -552,7 +540,7 @@ def intro():
|
|
552 |
)
|
553 |
|
554 |
|
555 |
-
rt("/overview")(overview.overview)
|
556 |
rt("/curated")(curated.curated)
|
557 |
rt("/curated/{target}")(curated.update)
|
558 |
|
|
|
136 |
),
|
137 |
),
|
138 |
),
|
139 |
+
# Div(
|
140 |
+
# A("Overview", href="#inner-text"),
|
141 |
+
# hx_get="/overview",
|
142 |
+
# hx_target="#inner-text",
|
143 |
+
# ),
|
144 |
Div(
|
145 |
A("Global Processing Steps", href="#inner-text"),
|
146 |
hx_get="/common",
|
|
|
477 |
),
|
478 |
H2("Motivation Behind Txt360"),
|
479 |
H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
|
480 |
+
P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred to as low quality if it has not been filtered to remove unwanted text. The community has introduced a variety of filtered datasets, including purely web-based datasets."),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
481 |
P("In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long-tail and diverse data, while curated datasets are often information-rich and provide the 'deep-dive' domain information. Both kinds of data play a critical role in effective LLM pre-training."),
|
482 |
H4("The Gap TxT360 Fills"),
|
483 |
P("Despite advancements in filtering and source material for both data types, each type of dataset has its limitations. RefinedWeb is known for its high-quality content, but only about 10% of the entire dataset has been disclosed and the processing scripts have not been released. For datasets that have combined curated sources with web data, the web component is relatively small (NEED TO UPDATE - citation needed)."),
|
|
|
540 |
)
|
541 |
|
542 |
|
543 |
+
#rt("/overview")(overview.overview)
|
544 |
rt("/curated")(curated.curated)
|
545 |
rt("/curated/{target}")(curated.update)
|
546 |
|