omwdataset

Running

victormiller committed 13 days ago

Commit

83c90e0

•

1 Parent(s): ad02e17

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -183,43 +183,6 @@ def main():
     )
-intro_text = P(
-    "Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects ",
-    A("Amber-7B", href="https://huggingface.co/LLM360/Amber"),
-    ", ",
-    A("Crystal-7B", href="https://huggingface.co/LLM360/CrystalCoder"),
-    ", ",
-    A("K2-65B", href="https://huggingface.co/LLM360/K2"),
-    " have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",
-)
-intro_list = P(
-    "We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:"
-)
-intro_list1 = Ol(
-    Li(
-        "Curates commonly used pretraining datasets, including all CommonCrawl",
-        style="margin-bottom: 5px",
-    ),
-    Li(
-        "Employs carefully selected filters designed for each data source",
-        style="margin-bottom: 5px",
-    ),
-    Li(
-        "Provides only unique data elements via globally deduplicated across all datasets",
-        style="margin-bottom: 5px",
-    ),
-    Li(
-        "Retains all deduplication metadata for custom upweighting",
-        style="margin-bottom: 5px",
-    ),
-    Li(
-        "Is Production ready! Download here [link to HF repo]",
-        style="margin-bottom: 5px",
-    ),
-)
 dataset_comparison1 = pd.DataFrame(
     {

     )
 dataset_comparison1 = pd.DataFrame(
     {