Spaces:
Sleeping
Sleeping
victormiller
committed on
Commit
•
27361f1
1
Parent(s):
d84fec1
Update main.py
Browse files
main.py
CHANGED
@@ -120,14 +120,14 @@ def main():
|
|
120 |
intro_text = P(
|
121 |
"""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
|
122 |
|
123 |
-
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
|
124 |
|
125 |
-
1. Curates commonly used pretraining datasets, including all CommonCrawl
|
126 |
-
2. Employs carefully selected filters designed for each data source
|
127 |
-
3. Provides only unique data elements via globally deduplicated across all datasets
|
128 |
-
4. Retains all deduplication metadata for custom upweighting
|
129 |
-
5. Is Production ready! Download here [link to HF repo]
|
130 |
-
|
131 |
|
132 |
|
133 |
@app.get("/intro")
|
@@ -137,6 +137,11 @@ def intro():
|
|
137 |
H2("Introduction"),
|
138 |
intro_text,
|
139 |
intro_list,
|
|
|
|
|
|
|
|
|
|
|
140 |
id="section1",
|
141 |
),
|
142 |
Section(
|
|
|
120 |
intro_text = P(
|
121 |
"""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
|
122 |
|
123 |
+
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
|
124 |
|
125 |
+
intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
|
126 |
+
intro_2 = P("2. Employs carefully selected filters designed for each data source")
|
127 |
+
intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
|
128 |
+
intro_4 = P("4. Retains all deduplication metadata for custom upweighting")
|
129 |
+
intro_5 = P("5. Is Production ready! Download here [link to HF repo]")
|
130 |
+
|
131 |
|
132 |
|
133 |
@app.get("/intro")
|
|
|
137 |
H2("Introduction"),
|
138 |
intro_text,
|
139 |
intro_list,
|
140 |
+
intro_1,
|
141 |
+
intro_2,
|
142 |
+
intro_3,
|
143 |
+
intro_4,
|
144 |
+
intro_5,
|
145 |
id="section1",
|
146 |
),
|
147 |
Section(
|