victormiller committed
Commit
adcd5e6
1 Parent(s): 8061116

Update main.py

Files changed (1)
  1. main.py +13 -13
main.py CHANGED
@@ -178,7 +178,7 @@ def main():
 new_dataset_comparison1 = pd.DataFrame(
     {
         "Data Source": [
-            "CommonCrawl",
+            "CommonCrawl Snapshots",
             "Papers",
             "Wikipedia",
             "FreeLaw",
@@ -193,7 +193,7 @@ new_dataset_comparison1 = pd.DataFrame(
 
         ],
         "TxT360": [
-            "99 Snapshots",
+            "99",
             "5 Sources",
             "310+ Languages",
             "Included",
@@ -207,7 +207,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "**",
         ],
         "FineWeb": [
-            "96 Snapshots",
+            "96",
             "-",
             "-",
             "-",
@@ -221,7 +221,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
         ],
         "RefinedWeb": [
-            "90 Snapshots",
+            "90",
             "-",
             "-",
             "-",
@@ -234,8 +234,8 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
             "-",
         ],
-        "PedPajama-V-2": [
-            "84 Snapshots",
+        "PedPajamaV2": [
+            "84",
             "-",
             "-",
             "-",
@@ -249,7 +249,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
         ],
         "C4": [
-            "1 Snapshots",
+            "1",
             "-",
             "-",
             "-",
@@ -263,7 +263,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
         ],
         "Dolma": [
-            "24 Snapshots",
+            "24",
             "1 Source",
             "checkmark",
             "-",
@@ -276,8 +276,8 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
             "Included",
         ],
-        "RedPajama-V-1": [
-            "5 Snapshots",
+        "RedPajamaV1": [
+            "5",
             "1 Source",
             "checkmark",
             "",
@@ -291,7 +291,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "Included",
         ],
         "The Pile": [
-            "0.6% of 74 Snapshots",
+            "0.6% of 74",
             "4 Sources",
             "English Only",
             "Included",
@@ -636,8 +636,8 @@ def intro():
         "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
     ),
     new_table_div_1,
-    table_div_1,
-    table_div_2,
+    #table_div_1,
+    #table_div_2,
     P(
         "In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
     ),
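
For reference, a minimal sketch of the comparison table as it reads after this commit, using only a subset of the rows and columns from the full DataFrame in main.py: the "Snapshots" label moves into the "Data Source" row name, so each dataset column carries just the bare count.

import pandas as pd

# Sketch of the updated table (subset of the columns defined in main.py).
# Values are taken from the "+" side of the diff above.
new_dataset_comparison1 = pd.DataFrame(
    {
        "Data Source": ["CommonCrawl Snapshots", "Papers"],
        "TxT360": ["99", "5 Sources"],
        "FineWeb": ["96", "-"],
        "C4": ["1", "-"],
        "The Pile": ["0.6% of 74", "4 Sources"],
    }
)
print(new_dataset_comparison1.to_string(index=False))

The second part of the commit comments out table_div_1 and table_div_2 in intro(), so only the new comparison table (new_table_div_1) is rendered on the page.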