victormiller committed on
Commit
1f950b4
1 Parent(s): 3d1994e

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +1 -1
main.py CHANGED
@@ -456,7 +456,7 @@ def intro():
456
  Section(
457
  H2("About TxT360"),
458
  P(
459
- "We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models."
460
  ),
461
  P(
462
  "Building on top of the prior studies on pre-training data, TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
 
456
  Section(
457
  H2("About TxT360"),
458
  P(
459
+ B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
460
  ),
461
  P(
462
  "Building on top of the prior studies on pre-training data, TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."