victormiller committed fb20585 (Parent(s): 5d3f993): Update main.py

main.py CHANGED
@@ -117,13 +117,20 @@ def main():
         ),
     )
 
+intro_text = P(
+    """Pretraining performant large language models (LLMs) requires trillions of tokens of high-quality data. Many prior works, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B, have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
 
-
-
-
-
-
-
+intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
+
+intro_list1 = Ol(
+    Li("Curates commonly used pretraining datasets, including all of CommonCrawl"),
+    Li("Employs carefully selected filters designed for each data source"),
+    Li("Provides only unique data elements via global deduplication across all datasets"),
+    Li("Retains all deduplication metadata for custom upweighting"),
+    Li("Is production ready! Download here [link to HF repo]")
+)
+
+previous_intro = P("""We are excited to introduce TxT360, a
     large-scale, comprehensive, and fully transparent
     dataset designed for Large Language Model (LLM)
     pre-training. TxT360 is engineered to strike a
@@ -161,12 +168,9 @@ def intro():
     represents a significant step forward in the
     availability and transparency of large-scale
     training data for language models, setting a new
-    standard for dataset quality and openness.""")
-
-
-    Section(
-        H2("Background"),
-        P(
+    standard for dataset quality and openness.""")
+
+previous_background = P(
     """ The quality and size of a pre-training dataset
     play a crucial role in the performance of large
     language models (LLMs). The community has
@@ -197,11 +201,8 @@ def intro():
     rigorous standards required for state-of-the-art
     LLM pre-training. """
     ),
-
-
-    Section(
-        H2("Main Content"),
-        P("""The performance of a large language model (LLM)
+
+previous_content = P("""The performance of a large language model (LLM)
     depends heavily on the quality and size of its
     pretraining dataset. However, the pretraining
     datasets for state-of-the-art open LLMs like Llama
@@ -246,13 +247,34 @@ def intro():
     (listing and explaining all of our design choices),
     and the process followed to create its 📚
     FineWeb-Edu subset."""),
+
+previous_conclusion = P("""This is the conclusion section where we
+    summarize the key points discussed in the blog post
+    and provide final thoughts."""),
+
+@app.get("/intro")
+def intro():
+    return Div(
+        Section(
+            H2("About TxT360"),
+            intro_text,
+            intro_list,
+            intro_list1,
+            id="section1",
+        ),
+        Section(
+            H2("Background"),
+
+            id="section2",
+        ),
+        Section(
+            H2("Main Content"),
+
             id="section3",
         ),
         Section(
             H2("Conclusion"),
-
-        summarize the key points discussed in the blog post
-        and provide final thoughts."""),
+
             id="section4",
         ),
         id="inner-text",
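For context on the refactor: the commit moves the page copy into module-level FT components and has the /intro route assemble them. Below is a minimal, self-contained sketch of that pattern. It assumes the app is built with FastHTML (python-fasthtml) and adds the fast_app()/serve() wiring that this diff does not show; the component text is trimmed for brevity.

# Minimal sketch of the committed pattern (assumption: FastHTML / python-fasthtml app).
from fasthtml.common import *  # provides fast_app, serve, and FT components such as Div, Section, H2, P, Ol, Li

app, rt = fast_app()

# Page copy lives in module-level components, mirroring the commit (text trimmed here).
intro_text = P("""Pretraining performant large language models (LLMs) requires
    trillions of tokens of high-quality data.""")
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T
    token dataset for pretraining projects that:""")
intro_list1 = Ol(
    Li("Curates commonly used pretraining datasets, including all of CommonCrawl"),
    Li("Provides only unique data elements via global deduplication across all datasets"),
)

@app.get("/intro")
def intro():
    # The route only composes the pre-built components into the page sections.
    return Div(
        Section(H2("About TxT360"), intro_text, intro_list, intro_list1, id="section1"),
        Section(H2("Background"), id="section2"),
        Section(H2("Main Content"), id="section3"),
        Section(H2("Conclusion"), id="section4"),
        id="inner-text",
    )

serve()

One detail to keep in mind when reusing the committed code as-is: the previous_background, previous_content, and previous_conclusion assignments keep the trailing commas from their old call sites, so each is bound to a one-element tuple rather than a P component. The sketch above omits them, since the new /intro route never references them.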