victormiller committed on
Commit
27361f1
1 Parent(s): d84fec1

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +12 -7
main.py CHANGED
@@ -120,14 +120,14 @@ def main():
120
  intro_text = P(
121
  """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
122
 
123
- intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
124
 
125
- 1. Curates commonly used pretraining datasets, including all CommonCrawl
126
- 2. Employs carefully selected filters designed for each data source
127
- 3. Provides only unique data elements via globally deduplicated across all datasets
128
- 4. Retains all deduplication metadata for custom upweighting
129
- 5. Is Production ready! Download here [link to HF repo]
130
- """)
131
 
132
 
133
  @app.get("/intro")
@@ -137,6 +137,11 @@ def intro():
137
  H2("Introduction"),
138
  intro_text,
139
  intro_list,
 
 
 
 
 
140
  id="section1",
141
  ),
142
  Section(
 
120
  intro_text = P(
121
  """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
122
 
123
+ intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
124
 
125
+ intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
126
+ intro_2 = P("2. Employs carefully selected filters designed for each data source")
127
+ intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
128
+ intro_4 = P("4. Retains all deduplication metadata for custom upweighting")
129
+ intro_5 = P("5. Is Production ready! Download here [link to HF repo]")
130
+
131
 
132
 
133
  @app.get("/intro")
 
137
  H2("Introduction"),
138
  intro_text,
139
  intro_list,
140
+ intro_1,
141
+ intro_2,
142
+ intro_3,
143
+ intro_4,
144
+ intro_5,
145
  id="section1",
146
  ),
147
  Section(