victormiller
committed on
Commit
•
1f950b4
1
Parent(s):
3d1994e
Update main.py
Browse files
main.py
CHANGED
@@ -456,7 +456,7 @@ def intro():
|
|
456 |
Section(
|
457 |
H2("About TxT360"),
|
458 |
P(
|
459 |
-
"We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models."
|
460 |
),
|
461 |
P(
|
462 |
"Building on top of the prior studies on pre-training data, TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
|
|
|
456 |
Section(
|
457 |
H2("About TxT360"),
|
458 |
P(
|
459 |
+
B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
|
460 |
),
|
461 |
P(
|
462 |
"Building on top of the prior studies on pre-training data, TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
|