Spaces:
Sleeping
Sleeping
from fasthtml.common import * | |
from fasthtml.components import * | |
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline | |
from fasthtml.components import HR | |
from plotly import graph_objects as go | |
from fh_plotly import plotly2fasthtml | |
import pandas as pd | |
import json | |
from rich import print | |
# Build the application object and its route decorator once at import time.
# NOTE(review): debug=True is kept from the original; confirm it is intended
# for the deployed app as well as for local development.
app, rt = fast_app(debug=True)
@app.get("/")
def main():
    """Return the complete HTML document for the landing page.

    The document contains the title block, a table of contents whose entries
    issue htmx GET requests targeting ``#inner-text``, and the output of
    ``intro()`` as the initial article body.
    """
    swap_target = "#inner-text"

    # (link label, anchor id) for the sections produced by intro().
    intro_sections = [
        ("Introduction", "section1"),
        ("Background", "section2"),
        ("Main Content", "section3"),
        ("Conclusion", "section4"),
    ]
    # (link label, route) for the pages swapped into the article body.
    pages = [
        ("Web Data", "/webdata"),
        ("Curated Sources", "/curated"),
        ("Common Steps", "/common"),
        ("TxT360 Results", "/results"),
    ]

    section_links = [
        Li(
            A(
                label,
                href=f"/intro#{anchor}",
                hx_get=f"/intro#{anchor}",
                hx_target=swap_target,
            )
        )
        for label, anchor in intro_sections
    ]
    page_links = [
        Div(
            A(label, href=swap_target),
            hx_get=route,
            hx_target=swap_target,
        )
        for label, route in pages
    ]

    head = Head(
        Meta(charset="UTF-8"),
        Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
        Script(src="https://distill.pub/template.v2.js"),
        Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
        Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
        Link(rel="stylesheet", href="style.css"),
    )

    title = D_title(
        H1(
            "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )

    table_of_contents = D_contents(
        Nav(
            H3("Table of Contents"),
            Div(
                A("TxT360", href="#_self"),
                hx_get="/intro",
                hx_target=swap_target,
            ),
            Div(Ul(*section_links)),
            *page_links,
            role="navigation",
            cls="l-text figcaption",
        ),
        prerendered="true",
    )

    return Html(
        head,
        Body(
            title,
            D_article(
                table_of_contents,
                # The introduction is rendered inline as the first article body.
                intro(),
            ),
        ),
        lang="en",
    )
@app.get("/intro")
def intro():
    """Return the introduction fragment as a ``Div`` with id ``inner-text``.

    The fragment holds four sections with ids ``section1`` .. ``section4``
    (Introduction, Background, Main Content, Conclusion). It is embedded
    directly by ``main()`` and is also the content requested by the table of
    contents links that point at ``/intro``.
    """
    return Div(
        Section(
            H2("Introduction"),
            P("""We are excited to introduce TxT360, a
                large-scale, comprehensive, and fully transparent
                dataset designed for Large Language Model (LLM)
                pre-training. TxT360 is engineered to strike a
                balance between the quantity and quality of
                pre-training data, pushing the limit on both
                fronts. This comprehensive dataset encompasses both
                expansive web-based data and highly curated data
                sources, making it one of the most robust LLM
                pre-training corpora available today. Our web data
                component includes 99 snapshots from Common Crawl,
                amassing 5.7 trillion tokens and occupying 11 TB of
                disk space in jsonl.gz format. On the curated side,
                TxT360 integrates one of the most extensive
                collections of high-quality sources across multiple
                domains, ensuring diverse and rich content referred
                to as curated sources, 14 sources across 10
                domains. To maintain the highest quality, we
                meticulously pre-processed the web data to filter
                out low-quality content and conducted thorough
                reviews of the curated sources. This process not
                only unified their formats but also identified and
                rectified any anomalies. Not only do we 100%
                open-source our processing scripts, but we also
                release the details of our data reviews, revealing
                the decision-making processes behind data selection
                and quality assurance. This level of transparency
                allows researchers and practitioners to fully
                understand the dataset’s composition and make
                informed decisions when using TxT360 for training.
                Additionally, TxT360 includes detailed
                documentation and analysis of the data, covering
                distribution statistics, domain coverage, and
                processing pipeline, which helps users navigate and
                utilize the dataset effectively. Overall, TxT360
                represents a significant step forward in the
                availability and transparency of large-scale
                training data for language models, setting a new
                standard for dataset quality and openness."""),
            id="section1",
        ),
        Section(
            H2("Background"),
            P(
                """ The quality and size of a pre-training dataset
                play a crucial role in the performance of large
                language models (LLMs). The community has
                introduced a variety of datasets for this purpose,
                including purely web-based datasets like RefinedWeb
                [1], RedPajama-Data-V2 [2], DCLM [3], and
                FineWeb [4], as well as comprehensive datasets
                derived from multiple highly-curated data sources
                such as The Pile [5], RedPajama-Data-V1 [6], and
                Dolma [7] . It is commonly known that web-based
                datasets provide a vast quantity of data, while
                highly-curated multi-source datasets consistently
                deliver high quality and diversity, both critical
                for effective LLM pre-training. However, despite
                the advancements in both types of data, each type
                of dataset has its limitations. For instance, the
                processing scripts for the web dataset, RefinedWeb,
                known for its high quality, are not public, and
                only about 10% of the entire dataset has been
                disclosed. Conversely, the web component of
                existing highly-curated multi-source datasets is
                relatively small compared to purely web-based
                datasets, limiting their coverage and diversity
                compared to the scale of information from the
                internet. By integrating the extensive reach of
                web data with the exceptional quality of curated
                sources, TxT360 is crafted to meet and surpass the
                rigorous standards required for state-of-the-art
                LLM pre-training. """
            ),
            id="section2",
        ),
        Section(
            H2("Main Content"),
            # NOTE(review): this paragraph talks about FineWeb / FineWeb-Edu
            # rather than TxT360 and looks like carried-over template text;
            # confirm whether it should be replaced.
            P("""The performance of a large language model (LLM)
                depends heavily on the quality and size of its
                pretraining dataset. However, the pretraining
                datasets for state-of-the-art open LLMs like Llama
                3 and Mixtral are not publicly available and very
                little is known about how they were created.
                Reading time: 45 min. For the best reading
                experience, we recommend not using a mobile phone.
                Recently, we released 🍷 FineWeb, a new,
                large-scale (15-trillion tokens, 44TB disk space)
                dataset for LLM pretraining. FineWeb is derived
                from 96 CommonCrawl snapshots and produces
                better-performing LLMs than other open pretraining
                datasets. To bring more clarity in machine learning
                and advance the open understanding of how to train
                good quality large language models, we carefully
                documented and ablated all of the design choices
                used in FineWeb, including in-depth investigations
                of deduplication and filtering strategies. The
                present long form report is a deep dive in how to
                create a large and high-quality web-scale dataset
                for LLM pretraining. The dataset itself, 🍷
                FineWeb, is available here. We are extremely
                thankful to the whole distill.pub team (Christopher
                Olah, Shan Carter, Ludwig Schubert in particular)
                for creating the template on which we based this
                blog post. Thanks also for inspiring us with
                exquisitely crafted articles and blog posts. In
                this report we also introduce 📚 FineWeb-Edu, a
                subset of FineWeb constructed using scalable
                automated high-quality annotations for educational
                value, and which outperforms all openly accessible
                web-datasets on a number of educational benchmarks
                such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu
                is available in two sizes/filtering-level: 1.3
                trillion (very high educational content) and 5.4
                trillion (high educational content) tokens (all
                tokens are measured with GPT2 tokenizer). You can
                download it here. Both datasets are released under
                the permissive ODC-By 1.0 license TLDR: This blog
                covers a discussion on processing and evaluating
                data quality at scale, the 🍷 FineWeb recipe
                (listing and explaining all of our design choices),
                and the process followed to create its 📚
                FineWeb-Edu subset."""),
            id="section3",
        ),
        Section(
            H2("Conclusion"),
            P("""This is the conclusion section where we
                summarize the key points discussed in the blog post
                and provide final thoughts."""),
            id="section4",
        ),
        id="inner-text",
    )
@app.get("/webdata")
def web_data():
    """Return the "Web Data" fragment, which currently holds only a heading.

    The inner ``Section`` carries the id ``inner-text``, matching the
    ``hx_target`` used by the table of contents link for ``/webdata``.
    """
    return Div(Section(H2(P("Web Data")), id="inner-text"))
def get_chart_28168342():
    """Build a horizontal funnel chart with one trace per curated source.

    Each trace plots nine values against the same nine filter-step names,
    labelled inside the bars with the value and its percentage of the total.

    Returns:
        The populated ``go.Figure`` (height 500, transparent plot background).
    """
    fig = go.Figure()

    # Step names shared by every trace (the funnel's y axis).
    filter_names = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # (source name, value per filter step).
    # NOTE(review): the round, mostly identical numbers look like placeholder
    # data — confirm before presenting them as real statistics.
    data_sources = [
        ("Wikipedia", [100, 90, 80, 70, 60, 50, 40, 30, 20]),
        ("Freelaw", [100, 90, 80, 70, 60, 50, 40, 20, 20]),
        ("DM Maths", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("USPTO", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PG19", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Hackernews", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Ubuntu IRC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Europarl", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("StackExchange", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Arxiv", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PhilPapers", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
    ]

    for name, x_values in data_sources:
        fig.add_trace(
            go.Funnel(
                name=name,
                orientation="h",
                y=filter_names,
                x=x_values,
                textinfo="value+percent total",
                textposition="inside",
            )
        )

    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return fig
@app.get("/curated")
def curated(request):
    """Return the "Curated Sources" page, or a partial update of it.

    Args:
        request: The incoming request; only ``request.query_params`` is read.

    Returns:
        The result of ``get_data(...)`` when the query string carries a
        non-empty ``data_source`` or ``doc_id``; otherwise a ``Div`` with the
        funnel chart, the data-preparation table and the raw-data expander.
    """
    from curated import get_data

    # Partial updates: a query parameter means only the data viewer is wanted.
    params = request.query_params
    if data_source := params.get("data_source"):
        # NOTE(review): doc_id arrives as a string from the query string while
        # the fallback is the int 3 — confirm get_data accepts both.
        return get_data(data_source, params.get("doc_id", 3))
    if doc_id := params.get("doc_id"):
        # Reaching this point means data_source was missing or empty, so the
        # first argument is None or "".
        # NOTE(review): presumably get_data falls back to a default source —
        # verify.
        return get_data(params.get("data_source"), doc_id)

    data_preparation_steps = pd.DataFrame(
        {
            "Method": [
                "HTTP/FTP dumps",
                "Web crawling",
                "Archive snapshot",
                "Generated",
                "Curated",
            ],
            "Description": [
                "Acquiring data from HTTP/FTP dumps",
                "Crawling websites to extract data",
                "Working with archive dumps",
                "Generating synthetic data",
                "High quality curated data",
            ],
            "Source": [
                "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | Pubmeds",
                "USPTO | Hackernews | Ubuntu IRC",
                "StackExchange",
                "DM Maths",
                "PG19 | Europarl",
            ],
        }
    )

    # Render the frame as an HTML table and wrap the markup in NotStr so it is
    # handed to the page as-is (presumably unescaped — confirm against FastHTML).
    table_html = data_preparation_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), cls="l-body-outset")

    # Expander, open by default, showing get_data() called with no arguments.
    expander = Details(
        Summary("Raw Data Extraction"),
        get_data(),
        style="border: 1px solid #ccc; padding: 20px;",
        open=True,
    )

    return Div(
        Section(
            H2("Curated Sources"),
            plotly2fasthtml(get_chart_28168342()),
            H3("Data Preparation"),
            table_div,
            H3("Data Preprocessing"),
            expander,
            id="inner-text",
        )
    )
@app.get("/common")
def common_steps():
    """Return the "Common Steps" fragment, which currently holds only a heading.

    The inner ``Section`` carries the id ``inner-text``, matching the
    ``hx_target`` used by the table of contents link for ``/common``.
    """
    return Div(Section(H2(P("Common Steps")), id="inner-text"))
@app.get("/results")
def results():
    """Return the "Results" fragment, which currently holds only a heading.

    The inner ``Section`` carries the id ``inner-text``, matching the
    ``hx_target`` used by the table of contents link for ``/results``.
    """
    return Div(Section(H2(P("Results")), id="inner-text"))
# Hand control to serve() (star-imported above; presumably FastHTML's server
# launcher for the module-level `app` — confirm).
serve()