File size: 10,271 Bytes
e137e27
 
005657d
 
 
 
 
 
 
 
 
 
e137e27
 
 
 
 
8262fca
e137e27
 
 
 
005657d
e137e27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
005657d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e137e27
 
 
 
 
591cd18
e137e27
 
 
 
 
 
 
 
 
005657d
e137e27
 
 
 
 
 
 
 
 
 
 
 
 
5d3f993
e137e27
 
 
 
 
 
 
591cd18
e137e27
 
 
 
 
 
 
5d3f993
e137e27
 
 
 
 
 
 
578c629
e137e27
 
 
 
 
 
 
578c629
 
 
 
 
227158f
 
 
 
 
e137e27
6ff14ce
e137e27
 
 
 
6ff14ce
e137e27
 
 
 
 
 
 
 
 
 
 
 
 
 
005657d
 
e137e27
 
 
005657d
 
 
 
 
 
 
 
fb20585
 
005657d
 
 
fb20585
005657d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb20585
 
 
 
 
 
 
 
 
 
 
 
 
9b18c90
005657d
 
 
fb20585
 
 
9b18c90
005657d
 
 
 
 
 
 
 
 
 
e137e27
 
 
37e0b82
005657d
 
 
37e0b82
005657d
 
 
 
 
 
e137e27
 
 
 
 
005657d
578c629
e137e27
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import (
    D_title,
    D_article,
    D_front_matter,
    D_contents,
    D_byline,
    D_bibliography,
    D_appendix,
    D_cite,
)
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import overview
import curated
import web
import common
import results
from pybtex.database import parse_file


# Elements passed to fast_app as `hdrs`: charset/viewport metadata, the
# Distill template script, htmx, Plotly, the local stylesheet, and MarkdownJS.
_head_elements = (
    Meta(charset="UTF-8"),
    Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
    Script(src="https://distill.pub/template.v2.js"),
    Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
    Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
    Link(rel="stylesheet", href="style.css"),
    MarkdownJS(),
)

# `app` is the application object; `rt` is the route-registering decorator
# used at the bottom of this file for handlers defined in other modules.
# NOTE(review): debug=True is hard-coded here — confirm that is intended
# outside of local development.
app, rt = fast_app(debug=True, pico=False, hdrs=_head_elements)


# Raw HTML for a <d-front-matter> element holding a JSON metadata script.
# The title, description, published date, and author fields are left empty;
# only the KaTeX "$$" delimiter configuration carries a value. main() wraps
# this string in NotStr and places it inside a hidden Div.
front_matter = """
<d-front-matter>
<script id='distill-front-matter' type="text/json">{
    "title": "",
    "description": "",
    "published": "",
    "affiliation": {},
    "authors": [
      {
        "author":"",
        "authorURL":""
      }
    ],
    "katex": {
      "delimiters": [
        {"left": "$$", "right": "$$", "display": false}
      ]
    }
  }
</script>
</d-front-matter>
"""


def read_bibs():
    """Return one ``D_cite`` element per entry key in ``bibliography.bib``.

    The file is parsed with pybtex's ``parse_file`` on every call; the
    resulting list follows the iteration order of the parsed entries.
    """
    entries = parse_file("bibliography.bib").entries
    return [D_cite(bibtex_key=entry_key) for entry_key in entries]


@app.get("/bibliography.bib")
def get():
    """Serve the ``bibliography.bib`` file for GET /bibliography.bib."""
    bib_path = "bibliography.bib"
    return FileResponse(bib_path)


def _toc_page_entry(label, path, href="#inner-text"):
    """Build a table-of-contents row that loads ``path`` into ``#inner-text``.

    The htmx attributes sit on the wrapping ``Div``; the inner anchor only
    carries ``href`` (``#inner-text`` unless overridden).
    """
    return Div(
        A(label, href=href),
        hx_get=path,
        hx_target="#inner-text",
    )


def _toc_section_entry(label, anchor):
    """Build a list item linking to ``/intro#<anchor>`` via both href and htmx."""
    target = f"/intro#{anchor}"
    return Li(
        A(
            label,
            href=target,
            hx_get=target,
            hx_target="#inner-text",
        )
    )


@app.get("/")
def main():
    """Render the landing page for GET /.

    The page consists of the title block with the logo, a hidden byline and
    front-matter block, the article (table of contents followed by the intro
    content produced by ``intro()``), the bibliography appendix, and a hidden
    Div holding one citation element per bibliography entry from
    ``read_bibs()``.
    """
    title_block = D_title(
        H1(
            "TxT360: the most comprehensive, highest quality, and production ready pretraining dataset",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )

    # Sub-sections of the intro page, listed under the first TOC entry.
    intro_sections = Div(
        Ul(
            _toc_section_entry("About TxT360", "section1"),
            _toc_section_entry("Globally Deduplicated", "section2"),
            _toc_section_entry("Controllable Upweighting", "section3"),
            _toc_section_entry("Fully Documented", "section4"),
        ),
    )

    table_of_contents = D_contents(
        Nav(
            H3("Table of Contents"),
            # The first entry is the only one whose anchor is not "#inner-text".
            _toc_page_entry("TxT360", "/intro", href="#_self"),
            intro_sections,
            _toc_page_entry("Overview", "/overview"),
            _toc_page_entry("Global Processing Steps", "/common"),
            _toc_page_entry("Web Data Processing", "/webdata"),
            _toc_page_entry("Curated Sources Processing", "/curated"),
            _toc_page_entry("TxT360 Results", "/results"),
            role="navigation",
            cls="l-text figcaption",
        ),
    )

    return Div(
        title_block,
        Div(D_byline(), NotStr(front_matter), style="display: none;"),
        D_article(
            table_of_contents,
            # intro() returns the element with id="inner-text", which is the
            # htmx target of every table-of-contents entry above.
            intro(),
        ),
        D_appendix(D_bibliography(src="bibliography.bib")),
        Div(*read_bibs(), style="display: none;"),
    )


# Opening paragraph of the intro section, with links to earlier LLM360 models.
intro_text = P(
    "Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects ",
    A("Amber-7B", href="https://huggingface.co/LLM360/Amber"),
    ", ",
    A("Crystal-7B", href="https://huggingface.co/LLM360/CrystalCoder"),
    ", ",
    A("K2-65B", href="https://huggingface.co/LLM360/K2"),
    " have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",
)

# Lead-in sentence for the ordered list that follows.
intro_list = P(
    "We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:"
)

# Ordered list of dataset properties; every item shares the same bottom margin.
intro_list1 = Ol(
    *[
        Li(item_text, style="margin-bottom: 5px")
        for item_text in (
            "Curates commonly used pretraining datasets, including all CommonCrawl",
            "Employs carefully selected filters designed for each data source",
            "Provides only unique data elements via globally deduplicated across all datasets",
            "Retains all deduplication metadata for custom upweighting",
            "Is Production ready! Download here [link to HF repo]",
        )
    ]
)


@app.get("/intro")
def intro():
    """Render the intro content for GET /intro.

    Returns a Div with id ``inner-text`` containing four sections with ids
    ``section1`` through ``section4``. main() also calls this function
    directly to embed the same content in the landing page.
    """
    about_section = Section(
        H2("About TxT360"),
        intro_text,
        intro_list,
        intro_list1,
        id="section1",
    )

    dedup_section = Section(
        H3("Global Deduplication"),
        P(
            "TxT360 curated a wide range of datasets, including a whopping 99 Common Crawl Dumps and  a list of high quality datasets: StackExchange, Wikipedia, Arxiv, USPTO, DM Math, HackerNews, Ubuntu IRC, Europarl, FreeLaw, PG19, S2ORC, PhilPapers, PubMed Abstracts, and PubMed Central. For the first time in a released dataset, we locally and globally deduplicated the data across each dataset creating the highest quality data available."
        ),
        id="section2",
    )

    # First paragraph interleaves plain text with two citation elements.
    upweighting_background = P(
        "In large-scale corpora like CommonCrawl, text duplication is a frequent occurrence. Duplication can be considered as a natural upsampling of some data points. Recent studies have highlighted the potential drawbacks of oversampling specific data points, which can negatively impact pretraining performance [2205.10487]. However, when samples are repeated appropriately, the performance can actually improve [2306.01116, 2305.16264, 2406.11794, FineWeb]. Despite this, there is currently no widely accepted best practice for data sampling, and it’s unlikely that a one-size-fits-all approach will emerge given the scale of these datasets. Previous work either leaves the deduplication process to the user (as seen in RedPajama V2 and DCLM-Pool) or provides a corpus that has been downsampled in a specific manner (such as in FineWeb",
        D_cite(bibtex_key="fineweb"),
        "and RefinedWeb",
        D_cite(bibtex_key="refinedweb"),
        ").",
    )
    upweighting_offer = P(
        "Given the high cost of deduplication, TxT360 offers a complete deduplication across all datasets (so you don’t have to). Additionally, TxT360 maintains detailed metadata for each sample, including the frequency and location of duplicates. This metadata gives pretrainers the flexibility to adjust the weight of samples as needed. In principle, one can recover the original dataset distribution (footnote: this approach also means a smaller size on disk). We will demonstrate a simple upsampling strategy that results in an effective pretraining dataset. "
    )
    upweighting_section = Section(
        H3("Controllable Upweighting for Flexible Data Sample Weight Control"),
        upweighting_background,
        upweighting_offer,
        id="section3",
    )

    documentation_section = Section(
        H3("Full and Openly Documented Production Ready Pretraining Corpus"),
        P(
            "We cover every aspect of the decisions made to produce the dataset, including document selection, filtering, quality assurance, deduplication, standardization and PII.  Our reasoning is thoroughly explained, ensuring transparency and replicability. "
        ),
        P("Our code is open sourced here[link to github]."),
        P(
            "The dataset is ready for immediate download directly from Hugging Face [link]."
        ),
        P(
            "In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"
        ),
        id="section4",
    )

    # id="inner-text" is the htmx swap target used by the table of contents.
    return Div(
        about_section,
        dedup_section,
        upweighting_section,
        documentation_section,
        id="inner-text",
    )


# Register handlers that live in sibling modules. The order matches the
# original one-call-per-line registration.
for _route_path, _route_handler in (
    ("/overview", overview.overview),
    ("/curated", curated.curated),
    ("/curated/{target}", curated.update),
    ("/webdata", web.web_data),
    ("/webdata/{target}", web.update),
    ("/common", common.common_steps),
    ("/results", results.results),
):
    rt(_route_path)(_route_handler)

# Start the server.
serve()