victormiller committed on
Commit
a552bff
1 Parent(s): 506b4ce

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +8 -3
main.py CHANGED
@@ -123,10 +123,15 @@ def main():
123
  ),
124
  )
125
 
126
- intro_text = P(
127
- """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
 
 
 
 
 
128
 
129
- intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
130
 
131
  intro_list1 = Ol(
132
  Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
 
123
  ),
124
  )
125
 
126
+ intro_text = P("Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects ",
127
+ A("Amber-7B", href = "https://huggingface.co/LLM360/Amber"),
128
+ ", ",
129
+ A("Crystal-7B", href = "https://huggingface.co/LLM360/CrystalCoder"),
130
+ ", ",
131
+ A("K2-65B", href = "https://huggingface.co/LLM360/K2"),
132
+ " have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",)
133
 
134
+ intro_list = P("We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:")
135
 
136
  intro_list1 = Ol(
137
  Li("Curates commonly used pretraining datasets, including all CommonCrawl"),