omkarenator
committed on
Commit
•
f5cf42d
1
Parent(s):
4334509
remove redundant refs
Browse files
web.py
CHANGED
@@ -1380,48 +1380,4 @@ def web_data():
|
|
1380 |
P("NEED TO UPDATE"),
|
1381 |
H3("5. PII Removal"),
|
1382 |
P("..."), # Add detailed content and images as needed
|
1383 |
-
H2("Reference"),
|
1384 |
-
Ul(
|
1385 |
-
Li(
|
1386 |
-
P(
|
1387 |
-
"The {P}ile: An 800{GB} dataset of diverse text for language modeling Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and others. 2020."
|
1388 |
-
)
|
1389 |
-
),
|
1390 |
-
Li(
|
1391 |
-
P("""Scaling Language Models: Methods, Analysis & Insights from Training Gopher [link]
|
1392 |
-
Jack W. Rae and Sebastian Borgeaud and Trevor Cai and Katie Millican and Jordan Hoffmann and H. Francis
|
1393 |
-
Song and John Aslanides and Sarah Henderson and Roman Ring and Susannah Young and Eliza Rutherford and Tom
|
1394 |
-
Hennigan and Jacob Menick and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters
|
1395 |
-
and Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and Emma Strubell and Nishant Subramani
|
1396 |
-
and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer and Noah A. Smith and Hannaneh Hajishirzi and Iz Beltagy
|
1397 |
-
and Dirk Groeneveld and Jesse Dodge and Kyle Lo. 2021.""")
|
1398 |
-
),
|
1399 |
-
Li(
|
1400 |
-
P("""The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data, and Web Data Only
|
1401 |
-
Guilherme Penedo and Quentin Malartic and Daniel Hesslow and Ruxandra Cojocaru and Alessandro Cappelli and
|
1402 |
-
Hamza Alobeidli and Baptiste Pannier and Ebtesam Almazrouei and Julien Launay. 2023.""")
|
1403 |
-
),
|
1404 |
-
Li(
|
1405 |
-
P("""🍷 FineWeb: decanting the web for the finest text data at scale [link]
|
1406 |
-
Guilherme Penedo, Hynek Kydlíček, Loubna Ben Allal, Anton Lozhkov, Colin Raffel, Leandro Werra and Thomas Wolf. 2024.""")
|
1407 |
-
),
|
1408 |
-
Li(
|
1409 |
-
P("""Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
|
1410 |
-
Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and
|
1411 |
-
Wei Li and Peter J. Liu. 2023.""")
|
1412 |
-
),
|
1413 |
-
Li(
|
1414 |
-
P("""Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research
|
1415 |
-
Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and Russell Authur and
|
1416 |
-
Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and Valentin Hofmann and Ananya Harsh Jha and
|
1417 |
-
Sachin Kumar and Li Lucy and Xinxi Lyu and Nathan Lambert and Ian Magnusson and Jacob Morrison and Niklas Muennighoff and
|
1418 |
-
Aakanksha Naik and Crystal Nam and Matthew E. Peters and Abhilasha Ravichander and Kyle Richardson and Zejiang Shen
|
1419 |
-
and Emma Strubell and Nishant Subramani and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer and Noah A. Smith and
|
1420 |
-
Hannaneh Hajishirzi and Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo. 2024.""")
|
1421 |
-
),
|
1422 |
-
Li(
|
1423 |
-
P("""RedPajama-Data-v2: an Open Dataset with 30 Trillion Tokens for Training Large Language Models [link]
|
1424 |
-
Together Computer. 2023.""")
|
1425 |
-
),
|
1426 |
-
),
|
1427 |
)
|
|
|
1380 |
P("NEED TO UPDATE"),
|
1381 |
H3("5. PII Removal"),
|
1382 |
P("..."), # Add detailed content and images as needed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1383 |
)
|