Spaces:

LLM360
/

TxT360

Running

File size: 30,371 Bytes

e137e27
 
005657d
 
 
 
 
 
 
 
 
 
e137e27
 
 
 
 
8262fca
e137e27
 
 
 
005657d
87a6313
e137e27
a4dc57a
 
e137e27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e384d00
 
 
 
005657d
e384d00
 
 
 
 
 
 
 
ddc7526
e384d00
e74bc72
e384d00
 
 
 
 
 
 
e74bc72
 
 
 
 
 
e384d00
 
ddc7526
e384d00
 
 
 
4cc0103
ddc7526
 
e384d00
 
e74bc72
 
 
 
 
 
e384d00
 
ddc7526
 
e384d00
 
 
 
ddc7526
e384d00
 
 
 
 
ddc7526
 
e384d00
 
 
 
ddc7526
 
e384d00
 
 
 
ddc7526
 
e384d00
 
 
 
ddc7526
 
e384d00
 
 
 
ddc7526
 
e384d00
 
005657d
e384d00
 
005657d
 
35a3f42
 
 
 
 
 
 
 
 
005657d
 
 
 
 
 
 
 
 
 
 
 
 
e137e27
 
e384d00
 
e137e27
 
 
9a127b5
e137e27
 
 
 
 
 
 
 
 
e384d00
 
 
 
 
 
 
 
e137e27
 
 
 
 
43e1d29
 
45ddd25
43e1d29
e137e27
 
 
 
 
5d3f993
09bef6a
e137e27
 
 
 
a4dc57a
09bef6a
e137e27
 
 
 
b2b380b
09bef6a
e137e27
 
 
 
 
fac35b0
4cc0103
09bef6a
fac35b0
e137e27
db08107
 
 
 
 
09bef6a
db08107
 
 
 
 
09bef6a
db08107
 
 
 
 
09bef6a
db08107
 
 
 
 
09bef6a
db08107
 
 
 
 
09bef6a
db08107
 
 
 
e137e27
fac35b0
4cc0103
09bef6a
fac35b0
e137e27
48d8ec3
 
 
 
 
09bef6a
48d8ec3
 
 
 
 
09bef6a
48d8ec3
 
 
 
 
09bef6a
48d8ec3
 
 
 
e137e27
fac35b0
9a127b5
09bef6a
ec2b3ce
 
fd8de54
 
 
 
 
09bef6a
fd8de54
 
 
 
a4dc57a
09bef6a
fd8de54
 
 
 
 
09bef6a
fd8de54
 
 
 
 
09bef6a
fd8de54
 
 
 
 
09bef6a
fd8de54
 
 
 
 
09bef6a
fd8de54
 
 
 
33e67c2
09bef6a
fd8de54
 
 
 
 
09bef6a
fd8de54
 
 
 
ec2b3ce
 
d673af7
09bef6a
fac35b0
e137e27
ea708b9
 
 
 
 
09bef6a
ea708b9
 
 
 
a4dc57a
09bef6a
ea708b9
 
 
 
 
09bef6a
ea708b9
 
a711d2f
 
 
 
 
 
ea708b9
 
e137e27
 
 
 
 
09bef6a
45ddd25
09bef6a
 
e137e27
35a3f42
 
 
 
 
 
005657d
e137e27
 
 
8061116
 
 
adcd5e6
8061116
 
 
 
 
 
 
 
 
 
 
 
 
adcd5e6
8061116
 
 
 
 
 
 
 
 
 
 
 
 
adcd5e6
8061116
 
 
 
 
 
 
 
 
 
 
 
 
adcd5e6
8061116
 
 
 
 
 
 
 
 
 
 
 
adcd5e6
 
8061116
 
 
 
 
 
 
 
 
 
 
 
 
adcd5e6
8061116
 
 
 
 
 
 
 
 
 
 
 
 
adcd5e6
8061116
4e6ee79
8061116
 
 
 
 
 
 
 
 
 
adcd5e6
 
8061116
4e6ee79
8061116
 
 
e384d00
8061116
 
 
 
 
 
 
adcd5e6
8061116
 
 
 
 
 
 
 
 
 
 
 
 
e384d00
8061116
 
3f67a06
 
e384d00
861154a
3f67a06
 
e384d00
 
 
8061116
5614f01
 
 
 
 
 
dbbb9f4
58a867d
 
8061116
 
 
 
dbbb9f4
5614f01
 
82df62a
5614f01
8061116
 
8580754
fac35b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8580754
12ce41f
fac35b0
 
 
 
 
 
 
2ecaabf
fac35b0
2ecaabf
fac35b0
 
 
 
 
 
b6d74c9
 
 
140edc3
8580754
 
 
fac35b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12ce41f
fac35b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbbb9f4
 
 
 
 
fac35b0
 
b6d74c9
 
a1001c2
140edc3
dbbb9f4
 
8580754
 
fac35b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89cfcec
fac35b0
 
 
 
 
 
 
 
 
 
 
 
89cfcec
fac35b0
 
 
 
 
 
 
 
 
 
 
7ab95df
fac35b0
 
 
 
 
 
 
 
 
 
 
 
 
 
12ce41f
e3b3325
 
 
 
 
 
 
 
 
 
f754e2b
 
d6d69e3
f754e2b
 
 
 
e3b3325
 
0e10a03
a1001c2
d6d69e3
e384d00
d6d69e3
2d4ad39
 
 
e384d00
8580754
 
 
 
 
 
 
4cc0103
 
 
 
 
 
fac35b0
a4dc57a
fac35b0
9a127b5
e384d00
 
 
 
ac7d8cf
4cc0103
e384d00
 
 
4cc0103
fac35b0
9cbd894
9345dee
9cbd894
 
09bef6a
e137e27
5025d3d
ac7d8cf
4cc0103
 
5025d3d
8061116
e384d00
 
3d1994e
4cc0103
3d1994e
e384d00
4cc0103
 
 
e384d00
 
 
09bef6a
5025d3d
 
4cc0103
5025d3d
4cc0103
5025d3d
 
3d1994e
5025d3d
 
4cc0103
 
5025d3d
117a05e
5025d3d
4cc0103
5025d3d
e384d00
 
3d1994e
e384d00
09bef6a
5025d3d
e137e27
 
 
005657d
87a6313
 
e137e27

from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import (
    D_title,
    D_article,
    D_front_matter,
    D_contents,
    D_byline,
    D_bibliography,
    D_appendix,
    D_cite,
)
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import overview
import curated
import web
import common
import results
from pybtex.database import parse_file
import data_viewer

from eval_result_figures import all_eval_res_figs


# Third-party scripts loaded in the page <head>, in load order.
# NOTE(review): two of these URLs are unpinned ("@next", "-latest"); confirm
# that floating versions are intended for a published article.
_cdn_scripts = (
    "https://distill.pub/template.v2.js",
    "https://unpkg.com/htmx.org@next/dist/htmx.min.js",
    "https://cdn.plot.ly/plotly-latest.min.js",
)

# Header elements handed to fast_app: meta tags, the CDN scripts above,
# the local stylesheet and the Markdown helper.
_page_headers = (
    Meta(charset="UTF-8"),
    Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
    *(Script(src=script_url) for script_url in _cdn_scripts),
    Link(rel="stylesheet", href="style.css"),
    MarkdownJS(),
)

# Application object and route decorator used by every handler below.
# pico=False presumably disables the default Pico CSS so style.css is the
# only stylesheet -- TODO confirm against the FastHTML docs.
app, rt = fast_app(debug=True, pico=False, hdrs=_page_headers)


# Distill front matter. main() serialises this dict with json.dumps into the
# <script id="distill-front-matter"> element.
#
# Each author is stored as (author, authorURL, affiliation, affiliationURL)
# and expanded into a dict with those keys, in that order.
# NOTE(review): some affiliationURL values are not absolute URLs
# ("LLM360.ai", and the "... & ..." pair for Eric P. Xing) -- confirm how the
# Distill byline renders them.
front_matter = {
    "title": "TxT360",
    "description": "A globally deduplicated dataset for LLM pretraining",
    "published": "October 7, 2024",
    "authors": [
        {
            "author": name,
            "authorURL": profile_url,
            "affiliation": affiliation,
            "affiliationURL": affiliation_url,
        }
        for name, profile_url, affiliation, affiliation_url in (
            ("Liping Tang", "https://huggingface.co/Liping", "MBZUAI", "LLM360.ai"),
            ("Nikhil Ranjan", "https://huggingface.co/nikhilranjan", "MBZUAI", "LLM360.ai"),
            ("Omkar Pangarkar", "https://huggingface.co/omkarenator", "Petuum, Inc.", ""),
            ("Xuezhi Liang", "", "MBZUAI", ""),
            ("Zhen Wang", "", "MBZUAI", ""),
            ("Li An", "https://huggingface.co/an1118", "UCSD", ""),
            ("Bhaskar Rao", "", "MBZUAI", ""),
            ("Zhoujun Cheng", "https://huggingface.co/zhoujun", "UCSD", ""),
            ("Suqi Sun", "https://huggingface.co/mylibrar", "Petuum, Inc.", ""),
            ("Cun Mu", "https://huggingface.co/CarisMu", "MBZUAI", ""),
            ("Victor Miller", "https://huggingface.co/vamiller12", "Petuum, Inc.", ""),
            ("Yue Peng", "https://huggingface.co/Dreamever", "MBZUAI", ""),
            (
                "Eric P. Xing",
                "",
                "MBZUAI",
                "https://www.mbzuai.ac.ae/ & https://www.cs.cmu.edu/",
            ),
            (
                "Zhengzhong Liu",
                "https://huggingface.co/hunterhector",
                "Petuum, Inc. / MBZUAI ",
                "",
            ),
        )
    ],
    # "display" must be a real boolean: the string "false" serialises to a
    # non-empty JSON string, which JavaScript treats as truthy.
    "katex": {"delimiters": [{"left": "$$", "right": "$$", "display": False}]},
}


# BibTeX entry shown verbatim in the appendix "Citation" box.
# Author names are joined with " and ", the separator BibTeX requires; a
# comma-separated list is not parsed as multiple authors.
# NOTE(review): this author list is shorter than front_matter["authors"] and
# spells one name as "An Li" -- confirm which list is authoritative.
citation_long = """
@misc{txt360data2024,
  title        = {TxT360: a globally deduplicated dataset for LLM pretraining},
  author       = {Liping Tang and Nikhil Ranjan and Omkar Pangarkar and Zhen Wang and An Li and Zhoujun Cheng and Suqi Sun and Cun Mu and Victor Miller and Yue Peng and Eric P. Xing and Zhengzhong Liu},
  year         = 2024
}
"""


def read_bibs():
    """Return one ``D_cite`` element per entry key found in bibliography.bib."""
    parsed = parse_file("bibliography.bib")
    return [D_cite(bibtex_key=entry_key) for entry_key in parsed.entries]


@app.get("/bibliography.bib")
def get():
    """Serve the bibliography file referenced by the D_bibliography element."""
    bib_path = "bibliography.bib"
    return FileResponse(bib_path)


@app.get("/")
def main():
    """Build the full article page.

    The returned Div contains, in order: the title block, the byline, the
    Distill front-matter script, the article (table of contents followed by
    the intro, web, curated, common-steps and results sections), the
    appendix with bibliography and citation, and a hidden Div holding one
    citation element per bibliography entry.
    """
    from fasthtml.xtend import Script

    # Table of contents: (heading, heading anchor, ((entry label, anchor), ...)).
    toc = (
        (
            "TxT360",
            "#section11",
            (
                ("About TxT360", "#section11"),
                ("Why TxT360", "#section12"),
                ("Generalizable Approach to Data Processing", "#section13"),
            ),
        ),
        (
            "Common Crawl Data",
            "#section21",
            (
                ("Common Crawl Snapshot Processing", "#section21"),
                ("Common Crawl Data Processing Summary", "#section22"),
                ("Document Preparation", "#section23"),
                ("Line-Level Removal", "#section24"),
                ("Document-Level Filtering", "#section25"),
            ),
        ),
        (
            "Curated Sources",
            "#section31",
            (
                ("Curated Sources in TxT360", "#section31"),
                ("Filtering Steps and Definitions", "#section32"),
                ("Filtering Discussion on All Curated Sources", "#section33"),
            ),
        ),
        (
            "Shared Processing Steps",
            "#section41",
            (
                ("Overview", "#section41"),
                ("Why Global Deduplication", "#section42"),
                ("MinHash Generation", "#section43"),
                ("Matching Pairs Generation", "#section44"),
                ("Finding Duplicate Pairs", "#section45"),
                ("Finding Connected Components using MapReduce", "#section46"),
                ("Personally Identifiable Information Removal", "#section47"),
                ("Normalization Form C", "#section48"),
            ),
        ),
        (
            "TxT360 Studies",
            "#section51",
            (
                ("Overview", "#section51"),
                ("A Simple Data Mix Creates a Good Learning Curve", "#section52"),
                ("Perplexity Analysis", "#section53"),
                ("Topic Analysis", "#section55"),
            ),
        ),
    )

    # Each group renders as a Div with the heading link, followed by a Div
    # wrapping the bullet list of entry links.
    toc_nodes = []
    for heading, anchor, entries in toc:
        toc_nodes.append(Div(A(heading, href=anchor)))
        toc_nodes.append(
            Div(Ul(*(Li(A(label, href=target)) for label, target in entries)))
        )

    title_block = D_title(
        H1(
            "TxT360: A Top-Quality LLM Pre-training Dataset Requires the Perfect Blend",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )

    front_matter_block = D_front_matter(
        Script(
            json.dumps(front_matter),
            id="distill-front-matter",
            type="text/json",
        )
    )

    article = D_article(
        D_contents(
            Nav(
                H3("Table of Contents"),
                *toc_nodes,
                role="navigation",
                cls="l-text figcaption",
            ),
        ),
        intro(),
        web.web_data(),
        curated.curated(),
        common.common_steps(),
        results.results(),
    )

    appendix = D_appendix(
        D_bibliography(src="bibliography.bib"),
        H3("Citation"),
        P("For attribution in academic contexts, please cite this work as"),
        Pre(citation_long, cls="citation long"),
    )

    return Div(
        title_block,
        D_byline(),
        front_matter_block,
        article,
        appendix,
        Div(*read_bibs(), style="display: none;"),
    )


# Source-coverage comparison between TxT360 and other pre-training corpora.
# One row per data source, one column per dataset. The first row holds the
# number of CommonCrawl snapshots; "-" marks a source that is not included
# and "**" points at the footnote about code rendered under the table.
# Web-only datasets list their snapshot count followed by "-" for all eleven
# remaining sources.
new_dataset_comparison1 = pd.DataFrame(
    {
        "Data Source": [
            "CommonCrawl Snapshots",
            "Papers",
            "Wikipedia",
            "FreeLaw",
            "DM Math",
            "USPTO",
            "PG-19",
            "HackerNews",
            "Ubuntu IRC",
            "EuroParl",
            "StackExchange",
            "Code",
        ],
        "TxT360": [
            "99",
            "5 Sources",
            "310+ Languages",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "**",
        ],
        "FineWeb": ["96"] + ["-"] * 11,
        "RefinedWeb": ["90"] + ["-"] * 11,
        # Column header previously misspelled "PedPajamaV2".
        "RedPajamaV2": ["84"] + ["-"] * 11,
        "C4": ["1"] + ["-"] * 11,
        "Dolma": [
            "24",
            "1 Source",
            "Included",
            "-",
            "-",
            "-",
            "Included",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "RedPajamaV1": [
            "5",
            "1 Source",
            "Included",
            # FreeLaw / DM Math / USPTO were blank strings; use the same
            # "-" marker as every other "not included" cell.
            "-",
            "-",
            "-",
            "Included",
            "-",
            "-",
            "-",
            "Included",
            "Included",
        ],
        "The Pile": [
            "0.6% of 74",
            "4 Sources",
            "English Only",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
            "Included",
        ],
    }
)

# Style the source-comparison table: the TxT360 column gets a green
# background, every other column a white one.
_txt360_col = "TxT360"
_remaining_cols = new_dataset_comparison1.columns.difference([_txt360_col])


def _green_cell(_):
    """Cell style for the highlighted TxT360 column."""
    return "background-color: #E1EEDB"


def _white_cell(_):
    """Cell style for all non-highlighted columns."""
    return "background-color: white"


styled_table = new_dataset_comparison1.style
styled_table = styled_table.applymap(
    _green_cell, subset=pd.IndexSlice[:, _txt360_col]
)
styled_table = styled_table.applymap(
    _white_cell, subset=pd.IndexSlice[:, _remaining_cols]
)
styled_table = styled_table.set_properties(
    **{
        "text-align": "left",  # left-align the text in each cell
        "padding": "10px",  # breathing room around the text
        "word-wrap": "break-word",  # wrap long text inside the cell
    }
)
styled_table = styled_table.hide(axis="index")  # drop the row-index column

# Render the styled frame to HTML and wrap it in a centering flex container.
table_html = styled_table._repr_html_()
new_table_div_1 = Div(
    NotStr(table_html),
    style="display: flex; justify-content: center; align-items: center; width: 100%; max-width: 100%; height: auto; overflow-x: auto;",
)


# Dataset-oriented comparison (first half of the sources): one row per
# dataset, one column per source. "-" marks a source that is not included.
# Leftover editorial placeholders in the Wikipedia column were replaced by
# "Included", matching the values new_dataset_comparison1 carries for Dolma
# and RedPajama-v1.
dataset_comparison1 = pd.DataFrame(
    {
        "Dataset": [
            "TxT360",
            "FineWeb",
            "RefinedWeb",
            "RedPajama-v2",
            "C4",
            "Dolma",
            "RedPajama-v1",
            "The Pile",
        ],
        "CommonCrawl": [
            "99 Snapshots",
            "96 Snapshots",
            "90 Snapshots",
            "84 Snapshots",
            "1 Snapshot",
            "24 Snapshots",
            "5 Snapshots",
            "0.6% of 74 Snapshots",
        ],
        "Papers": [
            "5 Sources",
            "-",
            "-",
            "-",
            "-",
            "1 Source",
            "1 Source",
            "4 Sources",
        ],
        "Wikipedia": [
            "310+ Languages",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "English Only",
        ],
        "FreeLaw": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "DM Math": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "USPTO": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
    }
)

# Style the dataset-oriented table: the first data row (TxT360) is light
# green, every other row is light blue.
_ROW0_GREEN = "background-color: #E1EEDB"
_OTHER_ROWS_BLUE = "background-color: rgb(237, 242, 251)"


def _first_row_green(column):
    """Return one CSS string per cell of ``column``: green first, blue after."""
    return [
        _ROW0_GREEN if position == 0 else _OTHER_ROWS_BLUE
        for position, _ in enumerate(column)
    ]


styled_table = dataset_comparison1.style
styled_table = styled_table.set_properties(
    subset=pd.IndexSlice[0, :],  # row 0 only
    **{"background-color": "#E1EEDB"},
)
styled_table = styled_table.apply(_first_row_green, axis=0)  # column by column
styled_table = styled_table.hide(axis="index")  # drop the row-index column

# Render the styled frame to HTML and wrap it in a Div with outer margin.
table_html = styled_table._repr_html_()
table_div_1 = Div(NotStr(table_html), style="margin: 40px;")

# Dataset-oriented comparison (second half of the sources): one row per
# dataset, one column per source. "-" marks a source that is not included.
# The TxT360 "Code" cell used to carry an editorial placeholder; it is now a
# plain "-", in line with the intro text stating that TxT360 does not
# include code.
dataset_comparison2 = pd.DataFrame(
    {
        "Dataset": [
            "TxT360",
            "FineWeb",
            "RefinedWeb",
            "RedPajama-v2",
            "C4",
            "Dolma",
            "RedPajama-v1",
            "The Pile",
        ],
        "PG-19": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "Included",
        ],
        "HackerNews": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "Ubuntu IRC": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "EuroParl": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "StackExchange": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
        ],
        "Code": [
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "Included",
        ],
    }
)
# Style the second dataset-oriented table: the first data row is light
# green; after that, even positions are light blue and odd positions white.
def _green_then_zebra(column):
    """Return one CSS background string per cell of ``column``."""
    css = []
    for position, _ in enumerate(column):
        if position == 0:
            css.append("background-color: #E1EEDB")
        elif position % 2 == 0:
            css.append("background-color: rgb(237, 242, 251)")
        else:
            css.append("background-color: white")
    return css


styled_table = dataset_comparison2.style
styled_table = styled_table.set_properties(
    subset=pd.IndexSlice[0, :],  # row 0 only
    **{"background-color": "#E1EEDB"},
)
styled_table = styled_table.apply(_green_then_zebra, axis=0)  # column by column
styled_table = styled_table.set_table_styles(
    [
        # Table-level CSS: automatic left margin and full width.
        {"selector": "table", "props": [("margin-left", "auto"), ("width", "100%")]},
    ]
)
styled_table = styled_table.hide(axis="index")  # drop the row-index column

# Render the styled frame to HTML and wrap it in a plain Div.
table_html2 = styled_table._repr_html_()
table_div_2 = Div(NotStr(table_html2))

# Per-source statistics shown in the "Our Approach" section.
# Each record is (data source, raw data size, token count, cut-off date).
_source_rows = [
    ("CommonCrawl", "9.2 TB", "4.83T", "2024-30"),
    ("Papers", "712 GB", "154.96B", "Q4 2023"),
    ("Wikipedia", "210 GB", "4.75B", "-"),
    ("Freelaw", "23 GB", "7.34B", "Q1 2024"),
    ("DM Math", "22 GB", "5.23B", "-"),
    ("USPTO", "45 GB", "4.95B", "Q4 2023"),
    ("PG-19", "11 GB", "2.94B", "-"),
    ("HackerNews", "4.1 GB", "1.08B", "Q4 2023"),
    ("Ubuntu IRC", "4.7 GB", "1.54B", "Q4 2023"),
    ("Europarl", "6.1 GB", "1.96B", "-"),
    ("StackExchange", "45 GB", "8.37B", "Q4 2023"),
]
_source_columns = (
    "Data Source",
    "Raw Data Size",
    "Token Count",
    "Information Cut-Off Date",
)

# Transpose the records into one list per column and build the frame from
# the resulting column mapping.
dataset_sources = pd.DataFrame(
    {
        column: list(values)
        for column, values in zip(_source_columns, zip(*_source_rows))
    }
)
# Style the per-source statistics table: zebra rows (white on even
# positions, light blue on odd ones), centered and padded cells.
def _zebra_rows(column):
    """Return one CSS background string per cell of ``column``."""
    return [
        "background-color: rgb(237, 242, 251)"
        if position % 2
        else "background-color: white"
        for position, _ in enumerate(column)
    ]


styled_table = dataset_sources.style
styled_table = styled_table.apply(_zebra_rows, axis=0)  # column by column
styled_table = styled_table.set_properties(
    **{
        "text-align": "center",  # center the text in each cell
        "padding": "10px",  # breathing room around the text
        "word-wrap": "break-word",  # wrap long text inside the cell
    }
)
styled_table = styled_table.hide(axis="index")  # drop the row-index column

# Render the styled frame to HTML and wrap it in a centering flex container.
table_html_data = styled_table._repr_html_()
table_div_data = Div(
    NotStr(table_html_data),
    style="display: flex; justify-content: center; align-items: center; width: 100%; max-width: 100%; height: auto; overflow-x: auto;",
)


@app.get("/intro")
def intro():
    """Build the introductory part of the article.

    Returns a Div (id "inner-text") with three sections: "About TxT360"
    (section11), "Why TxT360" (section12) and "Our Approach" (section13).
    The function is registered as the "/intro" route and is also called
    directly by main() when the full page is assembled.
    """
    about_section = Section(
        H2("About TxT360"),
        P(
            B("TL;DR "),
            "We introduce ",
            A(
                B("TxT360 (Trillion eXtracted Text),"),
                href="https://huggingface.co/datasets/LLM360/TxT360",
            ),
            " the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enables precise control over data distribution. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T on several key metrics. With the information, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a feature not commonly available in previous pre-training datasets. Our findings highlight the importance of both high-quality data sources and appropriate weighting for optimal blending in LLM training.",
        ),
        P(
            "In line with our 360° open source spirit, we document all detailed steps, reasons of our decisions, detailed statistics, our code (stay tuned!), analysis results and more, in addition to the dataset itself. We hope this can serve as a useful resource for future developers."
        ),
        plotly2fasthtml(all_eval_res_figs["MMLU"]),
        P(
            "Building on top of the prior studies on pre-training data",
            D_cite(bibtex_key="refinedweb"),
            D_cite(bibtex_key="fineweb"),
            D_cite(bibtex_key="c4"),
            D_cite(bibtex_key="muennighoff2023scaling"),
            D_cite(bibtex_key="dolma"),
            ", TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps. Unlike DCLM",
            D_cite(bibtex_key="dclm"),
            "and RedPajama V2,",
            D_cite(bibtex_key="redpajama-v2"),
            "we also hope to provide a dataset at this scale that is ready to go, without requiring further filtering.",
        ),
        # Two sibling paragraphs: a <p> element may not contain another <p>,
        # so the heading line and the guidance text are emitted side by side
        # instead of nested.
        P(B("How to Read this Blog Post?")),
        P(
            "This document contains all the details and is lengthy. We recommend readers to use the Table of Contents to jump to the appropriate sections. At each top level section, we provided a quick guide for the content. We also recommend readers to consider this post as a reference for some high level statistics related to pre-training datasets."
        ),
        id="section11",
    )

    why_section = Section(
        H2("Why TxT360"),
        P(
            "In this year we have seen excellent datasets released by the community. Among those, most datasets focus on one source (e.g., crawled websites, code bases, papers). However, it is not trivial to combine these sources together due to the potential duplication across them. TxT360 is the first dataset to combine most of the sources commonly used in pretraining."
        ),
        new_table_div_1,
        # table_div_1,
        # table_div_2,
        P(
            "In LLM pretraining, it is common to combine all possible text sources due to the Scaling Law. Crawled web pages are included to provide a vast quantity of data which can cover long tail and diverse information, while curated datasets such as Wikipedia are also used, which often provide the 'deep-dive' domain information. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training."
        ),
        # Footnote for the "**" marker in the comparison table above.
        P(
            "** TxT360 does not include very specific domains such as code and math. This decision was made due to the perceived low duplication code with other sources, and the different logic required to build those datasets. We leave that work to future work and recommend users refer to existing projects such as Stack V2",
            D_cite(bibtex_key="lozhkov2024starcoder2stackv2"),
            ".",
        ),
        # P("Table 2: Basic TxT360 Statistics."),
        # table_div_data,
        id="section12",
    )

    approach_section = Section(
        H2("Our Approach"),
        P(
            "To produce TxT360, a comprehensive data processing pipeline was designed to account for the nuances of both web and curated datasets. The pipeline presents a unified framework for processing both data types, making it convenient and easily adaptive for users to revise and fine-tune the pipeline for their own use cases."
        ),
        P(
            "Web datasets are inherently noisy and varied. The TxT360 pipeline implements sophisticated filtering and deduplication techniques to clean and remove redundancies while preserving data integrity."
        ),
        P(
            "Curated datasets are typically structured and consistently formatted, but also can cause troubles with their own special formatting preferences. TxT360 filters these sources with selective steps to maintain their integrity while providing seamless integration into the larger dataset. Both data source types are globally deduplicated together resulting in ~5T tokens of high-quality data. The table below shows the source distribution of TxT360 tokens. ",
            B(
                "Note that we do not recommend to use the raw distribution of the deduplicated dataset, a simple recipe is provided in the studies section."
            ),
        ),
        table_div_data,
        P(
            "We provide details and context for the choices behind TxT360 in the respective Common Crawl Data Processing and Curated Source Processing section. A deep dive describing the deduplication process can be found in the Shared Processing Steps section."
        ),
        # Img(src="images/pipeline.png", height="300", width="600"),
        # P(
        #    "Figure 1: Data processing pipeline. All the steps are adopted for processing web data while the yellow blocks are adopted for processing curated sources."
        # ),
        id="section13",
    )

    return Div(
        about_section,
        why_section,
        approach_section,
        id="inner-text",
    )


# Register data_viewer.update as the handler for the "/update/{target}" route.
rt("/update/{target}")(data_viewer.update)

# Start the application server. NOTE(review): this runs at import time with
# no __main__ guard -- presumably intentional for the FastHTML launcher; confirm.
serve()