victormiller committed on
Commit
83c90e0
1 Parent(s): ad02e17

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +0 -37
main.py CHANGED
@@ -183,43 +183,6 @@ def main():
183
  )
184
 
185
 
186
- intro_text = P(
187
- "Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects ",
188
- A("Amber-7B", href="https://huggingface.co/LLM360/Amber"),
189
- ", ",
190
- A("Crystal-7B", href="https://huggingface.co/LLM360/CrystalCoder"),
191
- ", ",
192
- A("K2-65B", href="https://huggingface.co/LLM360/K2"),
193
- " have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",
194
- )
195
-
196
- intro_list = P(
197
- "We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:"
198
- )
199
-
200
- intro_list1 = Ol(
201
- Li(
202
- "Curates commonly used pretraining datasets, including all CommonCrawl",
203
- style="margin-bottom: 5px",
204
- ),
205
- Li(
206
- "Employs carefully selected filters designed for each data source",
207
- style="margin-bottom: 5px",
208
- ),
209
- Li(
210
- "Provides only unique data elements via globally deduplicated across all datasets",
211
- style="margin-bottom: 5px",
212
- ),
213
- Li(
214
- "Retains all deduplication metadata for custom upweighting",
215
- style="margin-bottom: 5px",
216
- ),
217
- Li(
218
- "Is Production ready! Download here [link to HF repo]",
219
- style="margin-bottom: 5px",
220
- ),
221
- )
222
-
223
 
224
  dataset_comparison1 = pd.DataFrame(
225
  {
 
183
  )
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  dataset_comparison1 = pd.DataFrame(
188
  {