fh-new-vm1

Sleeping

victormiller committed on Sep 24

Commit

27361f1

•

1 Parent(s): d84fec1

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -120,14 +120,14 @@ def main():
 intro_text = P(
 """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
-intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
-1. Curates commonly used pretraining datasets, including all CommonCrawl
-2. Employs carefully selected filters designed for each data source
-3. Provides only unique data elements via globally deduplicated across all datasets
-4. Retains all deduplication metadata for custom upweighting
-5. Is Production ready! Download here [link to HF repo]
-""")
 @app.get("/intro")
@@ -137,6 +137,11 @@ def intro():
             H2("Introduction"),
             intro_text,
             intro_list,
             id="section1",
         ),
         Section(

 intro_text = P(
 """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
+intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
+intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
+intro_2 = P("2. Employs carefully selected filters designed for each data source")
+intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
+intro_4 = P("4. Retains all deduplication metadata for custom upweighting")
+intro_5 = P("5. Is Production ready! Download here [link to HF repo]")
 @app.get("/intro")
             H2("Introduction"),
             intro_text,
             intro_list,
+            intro_1,
+            intro_2,
+            intro_3,
+            intro_4,
+            intro_5,
             id="section1",
         ),
         Section(