omkarenator committed on
Commit
f5cf42d
1 Parent(s): 4334509

remove redundant refs

Browse files
Files changed (1) hide show
  1. web.py +0 -44
web.py CHANGED
@@ -1380,48 +1380,4 @@ def web_data():
1380
  P("NEED TO UPDATE"),
1381
  H3("5. PII Removal"),
1382
  P("..."), # Add detailed content and images as needed
1383
- H2("Reference"),
1384
- Ul(
1385
- Li(
1386
- P(
1387
- "The {P}ile: An 800{GB} dataset of diverse text for language modeling Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and others. 2020."
1388
- )
1389
- ),
1390
- Li(
1391
- P("""Scaling Language Models: Methods, Analysis & Insights from Training Gopher [link]
1392
- Jack W. Rae and Sebastian Borgeaud and Trevor Cai and Katie Millican and Jordan Hoffmann and H. Francis
1393
- Song and John Aslanides and Sarah Henderson and Roman Ring and Susannah Young and Eliza Rutherford and Tom
1394
- Hennigan and Jacob Menick and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters
1395
- and Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and Emma Strubell and Nishant Subramani
1396
- and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer and Noah A. Smith and Hannaneh Hajishirzi and Iz Beltagy
1397
- and Dirk Groeneveld and Jesse Dodge and Kyle Lo. 2021.""")
1398
- ),
1399
- Li(
1400
- P("""The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data, and Web Data Only
1401
- Guilherme Penedo and Quentin Malartic and Daniel Hesslow and Ruxandra Cojocaru and Alessandro Cappelli and
1402
- Hamza Alobeidli and Baptiste Pannier and Ebtesam Almazrouei and Julien Launay. 2023.""")
1403
- ),
1404
- Li(
1405
- P("""🍷 FineWeb: decanting the web for the finest text data at scale [link]
1406
- Guilherme Penedo, Hynek Kydlíček, Loubna Ben Allal, Anton Lozhkov, Colin Raffel, Leandro Werra and Thomas Wolf. 2024.""")
1407
- ),
1408
- Li(
1409
- P("""Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
1410
- Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and
1411
- Wei Li and Peter J. Liu. 2023.""")
1412
- ),
1413
- Li(
1414
- P("""Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research
1415
- Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and Russell Authur and
1416
- Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and Valentin Hofmann and Ananya Harsh Jha and
1417
- Sachin Kumar and Li Lucy and Xinxi Lyu and Nathan Lambert and Ian Magnusson and Jacob Morrison and Niklas Muennighoff and
1418
- Aakanksha Naik and Crystal Nam and Matthew E. Peters and Abhilasha Ravichander and Kyle Richardson and Zejiang Shen
1419
- and Emma Strubell and Nishant Subramani and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer and Noah A. Smith and
1420
- Hannaneh Hajishirzi and Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo. 2024.""")
1421
- ),
1422
- Li(
1423
- P("""RedPajama-Data-v2: an Open Dataset with 30 Trillion Tokens for Training Large Language Models [link]
1424
- Together Computer. 2023.""")
1425
- ),
1426
- ),
1427
  )
 
1380
  P("NEED TO UPDATE"),
1381
  H3("5. PII Removal"),
1382
  P("..."), # Add detailed content and images as needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1383
  )