File size: 10,271 Bytes
e137e27 005657d e137e27 8262fca e137e27 005657d e137e27 005657d e137e27 591cd18 e137e27 005657d e137e27 5d3f993 e137e27 591cd18 e137e27 5d3f993 e137e27 578c629 e137e27 578c629 227158f e137e27 6ff14ce e137e27 6ff14ce e137e27 005657d e137e27 005657d fb20585 005657d fb20585 005657d fb20585 9b18c90 005657d fb20585 9b18c90 005657d e137e27 37e0b82 005657d 37e0b82 005657d e137e27 005657d 578c629 e137e27 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import (
D_title,
D_article,
D_front_matter,
D_contents,
D_byline,
D_bibliography,
D_appendix,
D_cite,
)
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import overview
import curated
import web
import common
import results
from pybtex.database import parse_file
# Elements handed to fast_app() as ``hdrs``: charset/viewport metadata, the
# Distill template script, htmx, Plotly, the local stylesheet and MarkdownJS().
_page_headers = (
    Meta(charset="UTF-8"),
    Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
    Script(src="https://distill.pub/template.v2.js"),
    Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
    Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
    Link(rel="stylesheet", href="style.css"),
    MarkdownJS(),
)

# Application object and route decorator used by every handler in this file.
app, rt = fast_app(debug=True, pico=False, hdrs=_page_headers)
# Raw <d-front-matter> markup holding a JSON script block. main() embeds this
# string unescaped via NotStr(front_matter) inside a hidden Div. The title,
# description, published and author fields are left empty; the "katex" entry
# declares "$$" as a delimiter pair.
front_matter = """
<d-front-matter>
<script id='distill-front-matter' type="text/json">{
"title": "",
"description": "",
"published": "",
"affiliation": {},
"authors": [
{
"author":"",
"authorURL":""
}
],
"katex": {
"delimiters": [
{"left": "$$", "right": "$$", "display": false}
]
}
}
</script>
</d-front-matter>
"""
def read_bibs() -> list:
    """Build one ``D_cite`` element for every entry in ``bibliography.bib``.

    The file is parsed with ``parse_file`` on every call.

    Returns:
        A list of ``D_cite`` components, one per BibTeX key, in the order the
        parsed ``entries`` mapping yields its keys.
    """
    bib_data = parse_file("bibliography.bib")
    # Iterating the entries mapping yields its keys, so ``.keys()`` and the
    # manual append loop are unnecessary.
    return [D_cite(bibtex_key=key) for key in bib_data.entries]
@app.get("/bibliography.bib")
def get():
    """Serve the local ``bibliography.bib`` file at ``/bibliography.bib``."""
    bib_file = "bibliography.bib"
    return FileResponse(bib_file)
@app.get("/")
def main():
    """Render the landing page.

    The page consists of the title block, a hidden byline/front-matter Div,
    the article (table of contents followed by the intro content), the
    bibliography appendix, and a hidden Div holding one citation per
    bibliography entry.
    """
    # Every table-of-contents entry swaps its response into this element.
    swap_target = "#inner-text"

    # (label, URL) pairs for the sub-sections of the intro page.
    intro_sections = (
        ("About TxT360", "/intro#section1"),
        ("Globally Deduplicated", "/intro#section2"),
        ("Controllable Upweighting", "/intro#section3"),
        ("Fully Documented", "/intro#section4"),
    )
    # (label, endpoint) pairs for the remaining top-level pages.
    pages = (
        ("Overview", "/overview"),
        ("Global Processing Steps", "/common"),
        ("Web Data Processing", "/webdata"),
        ("Curated Sources Processing", "/curated"),
        ("TxT360 Results", "/results"),
    )

    title = D_title(
        H1(
            "TxT360: the most comprehensive, highest quality, and production ready pretraining dataset",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )
    hidden_front_matter = Div(
        D_byline(), NotStr(front_matter), style="display: none;"
    )

    section_items = [
        Li(A(label, href=url, hx_get=url, hx_target=swap_target))
        for label, url in intro_sections
    ]
    page_entries = [
        Div(A(label, href=swap_target), hx_get=endpoint, hx_target=swap_target)
        for label, endpoint in pages
    ]
    toc = Nav(
        H3("Table of Contents"),
        Div(
            A("TxT360", href="#_self"),
            hx_get="/intro",
            hx_target=swap_target,
        ),
        Div(Ul(*section_items)),
        *page_entries,
        role="navigation",
        cls="l-text figcaption",
    )

    return Div(
        title,
        hidden_front_matter,
        D_article(D_contents(toc), intro()),
        D_appendix(D_bibliography(src="bibliography.bib")),
        Div(*read_bibs(), style="display: none;"),
    )
# Opening paragraph of the "About TxT360" section, with links to three
# earlier LLM360 models.
intro_text = P(
    "Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects ",
    A("Amber-7B", href="https://huggingface.co/LLM360/Amber"),
    ", ",
    A("Crystal-7B", href="https://huggingface.co/LLM360/CrystalCoder"),
    ", ",
    A("K2-65B", href="https://huggingface.co/LLM360/K2"),
    " have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",
)

# Lead-in sentence for the ordered list below.
intro_list = P(
    "We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:"
)

# Ordered list of dataset properties; every item shares the same bottom margin.
intro_list1 = Ol(
    *(
        Li(claim, style="margin-bottom: 5px")
        for claim in (
            "Curates commonly used pretraining datasets, including all CommonCrawl",
            "Employs carefully selected filters designed for each data source",
            "Provides only unique data elements via globally deduplicated across all datasets",
            "Retains all deduplication metadata for custom upweighting",
            "Is Production ready! Download here [link to HF repo]",
        )
    )
)
@app.get("/intro")
def intro():
    """Return the introduction content as a Div with id ``inner-text``.

    The Div holds four sections (ids ``section1`` to ``section4``): the
    about text, global deduplication, controllable upweighting, and
    documentation/availability.
    """
    about = Section(
        H2("About TxT360"),
        intro_text,
        intro_list,
        intro_list1,
        id="section1",
    )

    deduplication = Section(
        H3("Global Deduplication"),
        P(
            "TxT360 curated a wide range of datasets, including a whopping 99 Common Crawl Dumps and a list of high quality datasets: StackExchange, Wikipedia, Arxiv, USPTO, DM Math, HackerNews, Ubuntu IRC, Europarl, FreeLaw, PG19, S2ORC, PhilPapers, PubMed Abstracts, and PubMed Central. For the first time in a released dataset, we locally and globally deduplicated the data across each dataset creating the highest quality data available."
        ),
        id="section2",
    )

    upweighting = Section(
        H3("Controllable Upweighting for Flexible Data Sample Weight Control"),
        P(
            "In large-scale corpora like CommonCrawl, text duplication is a frequent occurrence. Duplication can be considered as a natural upsampling of some data points. Recent studies have highlighted the potential drawbacks of oversampling specific data points, which can negatively impact pretraining performance [2205.10487]. However, when samples are repeated appropriately, the performance can actually improve [2306.01116, 2305.16264, 2406.11794, FineWeb]. Despite this, there is currently no widely accepted best practice for data sampling, and it’s unlikely that a one-size-fits-all approach will emerge given the scale of these datasets. Previous work either leaves the deduplication process to the user (as seen in RedPajama V2 and DCLM-Pool) or provides a corpus that has been downsampled in a specific manner (such as in FineWeb",
            D_cite(bibtex_key="fineweb"),
            "and RefinedWeb",
            D_cite(bibtex_key="refinedweb"),
            ").",
        ),
        P(
            "Given the high cost of deduplication, TxT360 offers a complete deduplication across all datasets (so you don’t have to). Additionally, TxT360 maintains detailed metadata for each sample, including the frequency and location of duplicates. This metadata gives pretrainers the flexibility to adjust the weight of samples as needed. In principle, one can recover the original dataset distribution (footnote: this approach also means a smaller size on disk). We will demonstrate a simple upsampling strategy that results in an effective pretraining dataset. "
        ),
        id="section3",
    )

    documentation = Section(
        H3("Full and Openly Documented Production Ready Pretraining Corpus"),
        P(
            "We cover every aspect of the decisions made to produce the dataset, including document selection, filtering, quality assurance, deduplication, standardization and PII. Our reasoning is thoroughly explained, ensuring transparency and replicability. "
        ),
        P("Our code is open sourced here[link to github]."),
        P(
            "The dataset is ready for immediate download directly from Hugging Face [link]."
        ),
        P(
            "In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"
        ),
        id="section4",
    )

    # The id matches the hx_target used by the table-of-contents links.
    return Div(
        about,
        deduplication,
        upweighting,
        documentation,
        id="inner-text",
    )
# Handlers implemented in sibling modules, as (URL pattern, handler) pairs,
# registered in the listed order.
_module_routes = (
    ("/overview", overview.overview),
    ("/curated", curated.curated),
    ("/curated/{target}", curated.update),
    ("/webdata", web.web_data),
    ("/webdata/{target}", web.update),
    ("/common", common.common_steps),
    ("/results", results.results),
)
for _pattern, _handler in _module_routes:
    rt(_pattern)(_handler)

serve()
|