from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from fasthtml.components import HR
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print


app, rt = fast_app(debug=True)


@app.get("/")
def main():
    return Html(
        Head(
            Meta(charset="UTF-8"),
            Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
            Script(src="https://distill.pub/template.v2.js"),
            Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
            Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
            Link(rel="stylesheet", href="style.css"),
        ),
        Body(
            D_title(
                H1(
                    "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
                    cls="l-body",
                    style="text-align: center;",
                ),
                Div(
                    Img(src="images/llm360_logo.png"),
                    id="title-plot",
                    cls="main-plot-container l-page",
                ),
            ),
            D_article(
                D_contents(
                    Nav(
                        H3("Table of Contents"),
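                        # Each nav entry below swaps the #inner-text container
                        # via HTMX (hx_get / hx_target) instead of reloading
                        # the whole page.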
                        Div(
                            A("TxT360", href="#inner-text"),
                            hx_get="/intro",
                            hx_target="#inner-text",
                        ),
                        Div(
                            Ul(
                                Li(
                                    A(
                                        "Introduction",
                                        href="/intro#section1",
                                        hx_get="/intro#section1",
                                        hx_target="#inner-text",
                                    )
                                ),
                                Li(
                                    A(
                                        "Background",
                                        href="/intro#section2",
                                        hx_get="/intro#section2",
                                        hx_target="#inner-text",
                                    )
                                ),
                                Li(
                                    A(
                                        "Main Content",
                                        href="/intro#section3",
                                        hx_get="/intro#section3",
                                        hx_target="#inner-text",
                                    )
                                ),
                                Li(
                                    A(
                                        "Conclusion",
                                        href="/intro#section4",
                                        hx_get="/intro#section4",
                                        hx_target="#inner-text",
                                    )
                                ),
                            ),
                        ),
                        Div(
                            A("Web Data", href="#inner-text"),
                            hx_get="/webdata",
                            hx_target="#inner-text",
                        ),
                        Div(
                            A("Curated Sources", href="#inner-text"),
                            hx_get="/curated",
                            hx_target="#inner-text",
                        ),
                        Div(
                            A("Common Steps", href="#inner-text"),
                            hx_get="/common",
                            hx_target="#inner-text",
                        ),
                        Div(
                            A("TxT360 Results", href="#inner-text"),
                            hx_get="/results",
                            hx_target="#inner-text",
                        ),
                        role="navigation",
                        cls="l-text figcaption",
                    ),
                    prerendered="true",
                ),
                intro(),
            ),
        ),
        lang="en",
    )


@app.get("/intro")
def intro():
    return Div(
        Section(
            H2("Introduction"),
            P("""We are excited to introduce TxT360, a
                large-scale, comprehensive, and fully transparent
                dataset designed for Large Language Model (LLM)
                pre-training. TxT360 is engineered to strike a
                balance between the quantity and quality of
                pre-training data, pushing the limit on both
                fronts. This comprehensive dataset encompasses both
                expansive web-based data and highly curated data
                sources, making it one of the most robust LLM
                pre-training corpora available today.  Our web data
                component includes 99 snapshots from Common Crawl,
                amassing 5.7 trillion tokens and occupying 11 TB of
                disk space in jsonl.gz format. On the curated side,
                TxT360 integrates one of the most extensive
                collections of high-quality sources across multiple
                domains, ensuring diverse and rich content. These
                curated sources comprise 14 sources across 10
                domains. To maintain the highest quality, we
                meticulously pre-processed the web data to filter
                out low-quality content and conducted thorough
                reviews of the curated sources. This process not
                only unified their formats but also identified and
                rectified any anomalies. Not only do we 100%
                open-source our processing scripts, but we also
                release the details of our data reviews, revealing
                the decision-making processes behind data selection
                and quality assurance.  This level of transparency
                allows researchers and practitioners to fully
                understand the dataset's composition and make
                informed decisions when using TxT360 for training.
                Additionally, TxT360 includes detailed
                documentation and analysis of the data, covering
                distribution statistics, domain coverage, and
                processing pipeline, which helps users navigate and
                utilize the dataset effectively.  Overall, TxT360
                represents a significant step forward in the
                availability and transparency of large-scale
                training data for language models, setting a new
                standard for dataset quality and openness."""),
            id="section1",
        ),
        Section(
            H2("Background"),
            P(
                """ The quality and size of a pre-training dataset
                    play a crucial role in the performance of large
                    language models (LLMs). The community has
                    introduced a variety of datasets for this purpose,
                    including purely web-based datasets like RefinedWeb
                    [1], RedPajama-Data-V2 [2], DCLM [3], and
                    FineWeb [4], as well as comprehensive datasets
                    derived from multiple highly-curated data sources
                    such as The Pile [5], RedPajama-Data-V1 [6], and
                    Dolma [7]. It is commonly known that web-based
                    datasets provide a vast quantity of data, while
                    highly-curated multi-source datasets consistently
                    deliver high quality and diversity, both critical
                    for effective LLM pre-training.  However, despite
                    the advancements in both types of data, each type
                    of dataset has its limitations. For instance, the
                    processing scripts for the web dataset, RefinedWeb,
                    known for its high quality, are not public, and
                    only about 10% of the entire dataset has been
                    disclosed. Conversely, the web component of
                    existing highly-curated multi-source datasets is
                    relatively small compared to purely web-based
                    datasets, limiting their coverage and diversity
                    compared to the scale of information from the
                    internet.  By integrating the extensive reach of
                    web data with the exceptional quality of curated
                    sources, TxT360 is crafted to meet and surpass the
                    rigorous standards required for state-of-the-art
                    LLM pre-training. """
            ),
            id="section2",
        ),
        Section(
            H2("Main Content"),
            P("""The performance of a large language model (LLM)
                    depends heavily on the quality and size of its
                    pretraining dataset. However, the pretraining
                    datasets for state-of-the-art open LLMs like Llama
                    3 and Mixtral are not publicly available and very
                    little is known about how they were created.
                    Recently, we released 🍷 FineWeb, a new,
                    large-scale (15-trillion tokens, 44TB disk space)
                    dataset for LLM pretraining. FineWeb is derived
                    from 96 CommonCrawl snapshots and produces
                    better-performing LLMs than other open pretraining
                    datasets. To bring more clarity in machine learning
                    and advance the open understanding of how to train
                    good quality large language models, we carefully
                    documented and ablated all of the design choices
                    used in FineWeb, including in-depth investigations
                    of deduplication and filtering strategies. The
                    present long form report is a deep dive in how to
                    create a large and high-quality web-scale dataset
                    for LLM pretraining. The dataset itself, 🍷
                    FineWeb, is available here.  We are extremely
                    thankful to the whole distill.pub team (Christopher
                    Olah, Shan Carter, Ludwig Schubert in particular)
                    for creating the template on which we based this
                    blog post. Thanks also for inspiring us with
                    exquisitely crafted articles and blog posts.  In
                    this report we also introduce 📚 FineWeb-Edu, a
                    subset of FineWeb constructed using scalable
                    automated high-quality annotations for educational
                    value, and which outperforms all openly accessible
                    web-datasets on a number of educational benchmarks
                    such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu
                    is available in two sizes/filtering levels: 1.3
                    trillion (very high educational content) and 5.4
                    trillion (high educational content) tokens (all
                    tokens are measured with GPT2 tokenizer). You can
                    download it here.  Both datasets are released under
                    the permissive ODC-By 1.0 license. TLDR: This blog
                    covers a discussion on processing and evaluating
                    data quality at scale, the 🍷 FineWeb recipe
                    (listing and explaining all of our design choices),
                    and the process followed to create its 📚
                    FineWeb-Edu subset."""),
            id="section3",
        ),
        Section(
            H2("Conclusion"),
            P("""This is the conclusion section where we
                summarize the key points discussed in the blog post
                and provide final thoughts."""),
            id="section4",
        ),
        id="inner-text",
    )
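

# The "Main Content" section above notes that token counts are "measured with
# GPT2 tokenizer". A minimal sketch of such a count, assuming the tiktoken
# library is available; this helper is illustrative and is not part of the
# TxT360 pipeline itself.
def count_gpt2_tokens(text: str) -> int:
    import tiktoken  # local import: only needed for this sketch

    # tiktoken ships the original GPT-2 byte-pair encoding under "gpt2".
    enc = tiktoken.get_encoding("gpt2")
    return len(enc.encode(text))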


@app.get("/webdata")
def web_data():
    return Div(Section(H2("Web Data"), id="inner-text"))


def get_chart_28168342():
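    # Build a horizontal funnel chart showing, for each curated source, how
    # many documents survive each successive filtering step.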
    fig = go.Figure()
    filter_names = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    data_sources = [
        ("Wikipedia", [100, 90, 80, 70, 60, 50, 40, 30, 20]),
        ("Freelaw", [100, 90, 80, 70, 60, 50, 40, 20, 20]),
        ("DM Maths", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("USPTO", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PG19", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Hackernews", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Ubuntu IRC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Europarl", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("StackExchange", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Arxiv", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PhilPapers", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
    ]

    for name, x_values in data_sources:
        fig.add_trace(
            go.Funnel(
                name=name,
                orientation="h",
                y=filter_names,
                x=x_values,
                textinfo="value+percent total",
                textposition="inside",
            )
        )

    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return fig


@app.get("/curated")
def curated(request):
    from curated import get_data

    # Partial updates: when HTMX sends query params, return only the
    # refreshed document fragment instead of the full page.
    params = request.query_params
    if data_source := params.get("data_source"):
        return get_data(data_source, params.get("doc_id", 3))
    if doc_id := params.get("doc_id"):
        return get_data(params.get("data_source"), doc_id)

    hr = HR()
    data_preparation_steps = pd.DataFrame(
        {
            "Method": [
                "HTTP/FTP dumps",
                "Web crawling",
                "Archive snapshot",
                "Generated",
                "Curated",
            ],
            "Description": [
                "Acquiring data from HTTP/FTP dumps",
                "Crawling websites to extract data",
                "Working with archive dumps",
                "Generating synthetic data",
                "High quality curated data",
            ],
            "Source": [
                "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | PubMed",
                "USPTO | Hackernews | Ubuntu IRC",
                "StackExchange",
                "DM Maths",
                "PG19 | Europarl",
            ],
        }
    )

    table_html = data_preparation_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")

    text = P("""This initial stage serves as the foundation for the entire
    process. Here, we focus on acquiring and extracting the raw data, which can
    come from various sources such as crawling websites, using HTTP/FTP dumps,
    or working with archive dumps. For instance, to download and prepare a
    dataset, we can use specific downloaders based on the data source. Each
    dataset might have its own downloader script, which can be updated in real
    time to handle changes in the data source. The table below gives a general
    outline of the data preparation process. It's worth noting that some
    pipelines might require invoking additional functions or scripts to handle
    specific data sources or formats. These helper scripts can be located
    within specific directories or modules dedicated to the dataset.""")

    data_preparation_div = Div(
        H3("Data Preparation"),
        text,
        table_div,
        Div(get_data(), style="border: 1px solid #ccc; padding: 20px;"),
    )

    text = P("""Data preprocessing is a crucial step in the data science
    pipeline. It involves cleaning and transforming raw data into a format that
    is suitable for analysis. This process includes handling missing values,
    normalizing data, encoding categorical variables, and more.""")

    preprocessing_steps = pd.DataFrame(
        {
            "Step": [
                "Language Filter",
                "Min Word Count",
                "Title Abstract",
                "Majority Language",
                "Paragraph Count",
                "Frequency",
                "Unigram Log Probability",
            ],
            "Description": [
                "Filtering data based on language",
                "Setting a minimum word count threshold",
                "Extracting information from the title and abstract",
                "Identifying the majority language in the dataset",
                "Counting the number of paragraphs in each document",
                "Calculating the frequency of each word in the dataset",
                "Calculating the log probability of each unigram",
            ],
            "Need": [
                "To remove documents in unwanted languages",
                "To filter out documents with very few words",
                "To extract relevant information for analysis",
                "To understand the distribution of languages in the dataset",
                "To analyze the structure and length of documents",
                "To identify important words in the dataset",
                "To measure the significance of individual words",
            ],
            "Pros": [
                "Improves data quality by removing irrelevant documents",
                "Filters out low-quality or incomplete documents",
                "Provides additional information for analysis",
                "Enables language-specific analysis and insights",
                "Helps understand the complexity and content of documents",
                "Identifies important terms and topics in the dataset",
                "Quantifies the importance of individual words",
            ],
            "Cons": [
                "May exclude documents in less common languages",
                "May remove documents with valuable information",
                "May introduce bias in the analysis",
                "May not accurately represent the language distribution",
                "May not capture the complexity of document structure",
                "May be sensitive to noise and outliers",
                "May not capture the semantic meaning of words",
            ],
        }
    )

    table_html = preprocessing_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")
    data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)

    return Div(
        Section(
            H2("Curated Sources"),
            plotly2fasthtml(get_chart_28168342()),
            data_preparation_div,
            data_preprocessing_div,
            id="inner-text",
        )
    )
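

# The "Data Preparation" text rendered above describes dispatching to a
# per-source downloader script. A minimal sketch of that dispatch pattern;
# the downloader functions and registry below are hypothetical illustrations,
# not the actual TxT360 downloaders.
def _download_wikipedia() -> list:
    # Hypothetical: fetch a Wikipedia dump and return parsed documents.
    raise NotImplementedError


def _download_arxiv() -> list:
    # Hypothetical: fetch arXiv sources and return parsed documents.
    raise NotImplementedError


# Each source maps to its own downloader, so a script can be updated
# independently when the upstream format changes.
DOWNLOADERS = {
    "Wikipedia": _download_wikipedia,
    "Arxiv": _download_arxiv,
}


def download_dataset(data_source: str) -> list:
    try:
        return DOWNLOADERS[data_source]()
    except KeyError:
        raise ValueError(f"No downloader registered for {data_source!r}")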


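# The "Data Preprocessing" table above lists filters such as "Min Word Count"
# and "Unigram Log Probability". A minimal sketch of those two checks,
# assuming simple whitespace tokenization; the threshold and smoothing values
# are illustrative assumptions, not the tuned TxT360 settings.
import math


def passes_min_word_count(doc: str, min_words: int = 50) -> bool:
    # Drop documents with very few words, which are often fragments or
    # boilerplate rather than usable training text.
    return len(doc.split()) >= min_words


def mean_unigram_log_prob(doc: str, unigram_probs: dict) -> float:
    # Average log-probability of the document's words under a reference
    # unigram distribution; unusually low values flag gibberish or noise.
    words = doc.split()
    if not words:
        return float("-inf")
    floor = 1e-9  # hypothetical smoothing value for out-of-vocabulary words
    return sum(math.log(unigram_probs.get(w, floor)) for w in words) / len(words)

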
@app.get("/common")
def common_steps():
    return Div(Section(H2("Common Steps"), id="inner-text"))


@app.get("/results")
def results():
    return Div(Section(H2("Results"), id="inner-text"))


serve()