merge
Browse files
- dist/index.html +48 -3
- dist/main.bundle.js +0 -0
- dist/main.bundle.js.map +0 -0
- src/index.html +48 -3
- src/plotting.js +1 -1
- webpack.config.js +1 -1
dist/index.html
CHANGED
@@ -75,9 +75,9 @@
 However, the pretraining datasets for state-of-the-art open LLMs like Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Mixtral<d-cite bibtex-key="jiang2024mixtral"></d-cite> are not publicly available, and very little is known about how they were created.</p>
 <aside>Reading time: 45 min. For the best reading experience, we recommend not using a mobile phone.</aside>

+<p>Recently, we released <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb"><strong>🍷 FineWeb</strong></a>, a new, large-scale (<strong>15-trillion tokens, 44TB disk space</strong>) dataset for LLM pretraining. FineWeb is derived from 96 <a href="https://commoncrawl.org/">CommonCrawl</a> snapshots and produces <strong>better-performing LLMs than other open pretraining datasets</strong>. To bring more clarity to machine learning and to advance the open understanding of how to train good-quality large language models, we carefully documented and ablated all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies. This long-form report is a deep dive into how to create a large, high-quality web-scale dataset for LLM pretraining. The dataset itself, 🍷 FineWeb, is available <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb">here</a>.
+
 <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team (Christopher Olah, Shan Carter, and Ludwig Schubert in particular) for creating the template on which we based this blog post. Thanks also for inspiring us with exquisitely crafted articles and blog posts.</aside>

 <p>In this report we also introduce <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>, a subset of FineWeb constructed using scalable, high-quality educational annotations, which outperforms all openly accessible web datasets on a number of educational benchmarks such as MMLU, ARC, and OpenBookQA.
@@ -600,6 +600,11 @@
 <p>🍷 FineWeb is thus, to our knowledge, the dataset that leads to the highest current model performance while allowing training on several trillion openly accessible unique tokens.</p>

 <h2>📚 FineWeb-Edu</h2>
+
+<figure style="text-align: center;">
+  <img src="assets/images/dataset_comparisons_agg_fw_edu.png"/>
+  <figcaption style="font-style: italic; margin-top: 10px;">📚 FineWeb-Edu outperforms 🍷 FineWeb and all other open web datasets on our group of evaluation tasks.</figcaption>
+</figure>
 <p><a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu">📚 FineWeb-Edu</a> is an additional development of FineWeb that we are excited to introduce in this tech report and openly release. FineWeb-Edu is based on an approach that recently emerged for filtering LLM training datasets: using synthetic data to develop classifiers that identify educational content. This technique was notably used in the training of Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Phi3<d-cite bibtex-key="abdin2024phi"></d-cite>, but in our opinion its large-scale impact on web data filtering has not yet been fully published or explored in public.</p>
 <p>The popular Phi3 models were trained on 3.3 and 4.8 trillion tokens, with the paper<d-cite bibtex-key="abdin2024phi"></d-cite> stating:</p>
 <blockquote>Our training data consists of heavily filtered publicly available web data (according to the 'educational level') from various open internet sources, as well as synthetic LLM-generated data.</blockquote>
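To make the synthetic-annotation idea concrete, here is a minimal sketch of how an instruction-tuned LLM can be prompted to grade the educational value of web extracts, producing labels on which a small classifier can then be trained. The prompt wording, the choice of Llama 3 70B Instruct as annotator, and the use of huggingface_hub's InferenceClient are assumptions for illustration, not the exact FineWeb-Edu setup.

```python
# Sketch: LLM-graded educational-quality annotations (assumed setup, not the
# exact FineWeb-Edu pipeline). An instruction-tuned model scores each document
# from 0 to 5; the (text, score) pairs can then train a lightweight classifier.
from huggingface_hub import InferenceClient

# Hypothetical choice of annotator model.
client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")

PROMPT = (
    "Rate the educational value of the following web page extract on a scale "
    "from 0 (none) to 5 (highly educational). Reply with a single integer.\n\n"
    "Extract:\n{text}"
)

def annotate(text: str) -> int:
    """Ask the LLM for an integer educational score for one document."""
    response = client.chat_completion(
        messages=[{"role": "user", "content": PROMPT.format(text=text[:3000])}],
        max_tokens=4,
        temperature=0.0,
    )
    reply = response.choices[0].message.content.strip()
    digits = [c for c in reply if c.isdigit()]
    return int(digits[0]) if digits else 0  # fall back to 0 on unparseable replies

if __name__ == "__main__":
    print(annotate("Photosynthesis converts light energy into chemical energy..."))
```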
@@ -646,6 +651,46 @@
 <p>Given that a threshold of 2 also demonstrated strong performance while retaining more data, we are releasing an additional dataset filtered with this threshold, containing 5.4 trillion tokens, under <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu-score-2">HuggingFaceFW/fineweb-edu-score-2</a>.</p>
 <p>You can find the two datasets along with the classifier used for the filtering in this <a href="https://huggingface.co/collections/HuggingFaceFW/fineweb-edu-6659c3f3d399d0e1d648adfd">collection</a>.</p>
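As a rough sketch of how the released classifier and the score threshold fit together (assuming the classifier exposes the standard transformers sequence-classification interface with a single regression output; see the model card in the collection above for authoritative usage):

```python
# Sketch: score documents with the FineWeb-Edu classifier and keep those at or
# above an educational-score threshold (3 for FineWeb-Edu, 2 for the more
# permissive fineweb-edu-score-2 release). Interface details are assumed to
# follow the standard transformers sequence-classification API.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_ID = "HuggingFaceFW/fineweb-edu-classifier"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
model.eval()

def edu_score(text: str) -> float:
    """Return the classifier's educational score (roughly 0-5) for one document."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits.squeeze(-1).item()

docs = [
    "The water cycle describes how water evaporates, condenses and precipitates...",
    "Click here for the best deals on used cars in your area!!!",
]
THRESHOLD = 3  # use 2 to reproduce the fineweb-edu-score-2 filtering
kept = [d for d in docs if round(edu_score(d)) >= THRESHOLD]
print(kept)
```

Scoring is embarrassingly parallel, so the same loop can be sharded across a dataset's files; the threshold is the only knob that changes between the two released variants.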

+
+
+<h2>Bonus: CommonCrawl over time</h2>
+<blockquote>
+  <p>Just like fine wine, not all crawls are created equal.</p>
+</blockquote>
+<p>While ablating filtering steps, we noticed that certain crawls outperformed others by a significant margin. We decided to investigate this phenomenon.</p>
+<h3>Benchmark performance by crawl</h3>
+<p>For each crawl, we trained two 1.8B models on 27 billion tokens randomly sampled from that crawl's data (after the base filtering and MinHash deduplication steps), with each run using a different random 27BT sample of this data. We trained 192 such models in total, using over 60 thousand H100 GPU-hours. We then took the last 3 checkpoints of both runs and plotted, for each crawl, the average of these 6 data points.</p>
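The per-crawl aggregation is simple enough to state in code; the sketch below assumes a hypothetical table of per-checkpoint scores (the file and column names are illustrative, not the actual evaluation output):

```python
# Sketch: aggregate benchmark scores per crawl by averaging the last 3
# checkpoints of each of the 2 runs (6 data points per crawl).
import pandas as pd

# columns: crawl (e.g. "2023-14"), run (0 or 1), step, agg_score
results = pd.read_csv("per_checkpoint_scores.csv")  # hypothetical file

last3 = (
    results.sort_values("step")
    .groupby(["crawl", "run"])
    .tail(3)  # last 3 checkpoints of each run
)
per_crawl = last3.groupby("crawl")["agg_score"].mean()  # average of the 6 points
print(per_crawl.sort_values())
```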
+<p>The plot below clearly shows that some dumps perform far worse than others. Each year has a different color, and the number of crawls per year also varies.</p>
+<div class="main-plot-container l-page-outset">
+  <figure>
+    <img src="assets/images/score_by_dump.png">
+  </figure>
+  <div id="plot-score_by_dump"></div>
+</div>
+
+<p>We investigated possible causes for this behaviour, such as changes in the most common URLs of each dump, as well as potential benchmark contamination, but could not find any conclusive explanation. We leave further investigation for future work.</p>
+
+<h3>Synthetic data</h3>
+<p>We wondered whether the strong performance of the last few crawls could be attributed, in part, to the presence of a larger quantity of synthetic data (data generated by LLMs). Such a change would not be surprising given the recent rise in popularity of LLMs, notably ChatGPT.</p>
+<p>Since, to the best of our knowledge, there is no foolproof method to detect synthetic data, we opted for a proxy metric: we measured the frequency of the following phrases in each crawl: <code>"delve", "as a large language model", "it's important to note", "rich tapestry", "intertwined", "certainly!", "dive into"</code>, all of which are commonly used by ChatGPT.</p>
+<p>It is important to note that not all samples containing one of these phrases were necessarily generated by ChatGPT (and that many ChatGPT-generated samples do not contain any of these phrases), but if the amount of synthetic data did not change across crawls, one would expect these frequencies to remain approximately constant over time.</p>
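Below is a minimal sketch of such a proxy measurement for a single crawl, streaming one FineWeb dump with the datasets library; the dump name, sample size, and per-million-words normalization are illustrative assumptions rather than the exact setup behind the plot below.

```python
# Sketch: frequency of ChatGPT-typical phrases in one crawl, as a crude proxy
# for the amount of synthetic data. Dump name, sample size and normalization
# (hits per million words) are illustrative assumptions.
from datasets import load_dataset

PHRASES = [
    "delve", "as a large language model", "it's important to note",
    "rich tapestry", "intertwined", "certainly!", "dive into",
]

dump = load_dataset(
    "HuggingFaceFW/fineweb", name="CC-MAIN-2023-50", split="train", streaming=True
)

hits, total_words = 0, 0
for i, doc in enumerate(dump):
    text = doc["text"].lower()
    hits += sum(text.count(p) for p in PHRASES)
    total_words += len(text.split())
    if i >= 100_000:  # a sample is enough for a rough estimate
        break

print(f"{1_000_000 * hits / total_words:.2f} phrase hits per million words")
```

Computing this number for each dump and plotting it over time gives the kind of trend discussed next.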
+<p>The results are shown in the following plot:</p>
+<figure><img src="assets/images/synthetic-data.png"/></figure>
+<p>While the frequency remained approximately constant until 2023-14 (ChatGPT was released at the end of 2022), we find a steep increase in our proxy metric in recent crawls. While this simple test is not enough to conclude that ChatGPT completions and other synthetic data are improving the quality of the most recent crawls, it at the very least does not seem to drastically harm their quality.</p>
+<p>We expect to continue seeing increasing quantities of synthetic data in new CC crawls. However, while this data does not seem to harm performance at relatively small training scales (and might actually improve it), it is not clear whether the same holds for much larger training runs.</p>
+
 <h2>Conclusion and looking forward</h2>
 <p>Through our open-science efforts we hope to open up, more and more, the black box around training high-performance large language models, and to give every model trainer the ability to create state-of-the-art LLMs. We're excited to continue iterating on FineWeb, and on increasingly better-filtered subsets of web data, in a fully open and reproducible manner.</p>
 <p>In particular, in the short term, while English currently dominates the large language model landscape, we look forward to applying the learnings from this project to make high-quality training data available in other languages as well, and as accessible as possible.</p>
dist/main.bundle.js
CHANGED
The diff for this file is too large to render. See raw diff.
dist/main.bundle.js.map
CHANGED
The diff for this file is too large to render. See raw diff.
src/index.html
CHANGED
@@ -75,9 +75,9 @@
 However, the pretraining datasets for state-of-the-art open LLMs like Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Mixtral<d-cite bibtex-key="jiang2024mixtral"></d-cite> are not publicly available, and very little is known about how they were created.</p>
 <aside>Reading time: 45 min. For the best reading experience, we recommend not using a mobile phone.</aside>

+<p>Recently, we released <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb"><strong>🍷 FineWeb</strong></a>, a new, large-scale (<strong>15-trillion tokens, 44TB disk space</strong>) dataset for LLM pretraining. FineWeb is derived from 96 <a href="https://commoncrawl.org/">CommonCrawl</a> snapshots and produces <strong>better-performing LLMs than other open pretraining datasets</strong>. To bring more clarity to machine learning and to advance the open understanding of how to train good-quality large language models, we carefully documented and ablated all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies. This long-form report is a deep dive into how to create a large, high-quality web-scale dataset for LLM pretraining. The dataset itself, 🍷 FineWeb, is available <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb">here</a>.
+
 <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team (Christopher Olah, Shan Carter, and Ludwig Schubert in particular) for creating the template on which we based this blog post. Thanks also for inspiring us with exquisitely crafted articles and blog posts.</aside>

 <p>In this report we also introduce <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu"><strong>📚 FineWeb-Edu</strong></a>, a subset of FineWeb constructed using scalable, high-quality educational annotations, which outperforms all openly accessible web datasets on a number of educational benchmarks such as MMLU, ARC, and OpenBookQA.
@@ -600,6 +600,11 @@
 <p>🍷 FineWeb is thus, to our knowledge, the dataset that leads to the highest current model performance while allowing training on several trillion openly accessible unique tokens.</p>

 <h2>📚 FineWeb-Edu</h2>
+
+<figure style="text-align: center;">
+  <img src="assets/images/dataset_comparisons_agg_fw_edu.png"/>
+  <figcaption style="font-style: italic; margin-top: 10px;">📚 FineWeb-Edu outperforms 🍷 FineWeb and all other open web datasets on our group of evaluation tasks.</figcaption>
+</figure>
 <p><a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu">📚 FineWeb-Edu</a> is an additional development of FineWeb that we are excited to introduce in this tech report and openly release. FineWeb-Edu is based on an approach that recently emerged for filtering LLM training datasets: using synthetic data to develop classifiers that identify educational content. This technique was notably used in the training of Llama 3<d-cite bibtex-key="llama3modelcard"></d-cite> and Phi3<d-cite bibtex-key="abdin2024phi"></d-cite>, but in our opinion its large-scale impact on web data filtering has not yet been fully published or explored in public.</p>
 <p>The popular Phi3 models were trained on 3.3 and 4.8 trillion tokens, with the paper<d-cite bibtex-key="abdin2024phi"></d-cite> stating:</p>
 <blockquote>Our training data consists of heavily filtered publicly available web data (according to the 'educational level') from various open internet sources, as well as synthetic LLM-generated data.</blockquote>
@@ -646,6 +651,46 @@
 <p>Given that a threshold of 2 also demonstrated strong performance while retaining more data, we are releasing an additional dataset filtered with this threshold, containing 5.4 trillion tokens, under <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu-score-2">HuggingFaceFW/fineweb-edu-score-2</a>.</p>
 <p>You can find the two datasets along with the classifier used for the filtering in this <a href="https://huggingface.co/collections/HuggingFaceFW/fineweb-edu-6659c3f3d399d0e1d648adfd">collection</a>.</p>

+
+
+<h2>Bonus: CommonCrawl over time</h2>
+<blockquote>
+  <p>Just like fine wine, not all crawls are created equal.</p>
+</blockquote>
+<p>While ablating filtering steps, we noticed that certain crawls outperformed others by a significant margin. We decided to investigate this phenomenon.</p>
+<h3>Benchmark performance by crawl</h3>
+<p>For each crawl, we trained two 1.8B models on 27 billion tokens randomly sampled from that crawl's data (after the base filtering and MinHash deduplication steps), with each run using a different random 27BT sample of this data. We trained 192 such models in total, using over 60 thousand H100 GPU-hours. We then took the last 3 checkpoints of both runs and plotted, for each crawl, the average of these 6 data points.</p>
+<p>The plot below clearly shows that some dumps perform far worse than others. Each year has a different color, and the number of crawls per year also varies.</p>
+<div class="main-plot-container l-page-outset">
+  <figure>
+    <img src="assets/images/score_by_dump.png">
+  </figure>
+  <div id="plot-score_by_dump"></div>
+</div>
+
+<p>We investigated possible causes for this behaviour, such as changes in the most common URLs of each dump, as well as potential benchmark contamination, but could not find any conclusive explanation. We leave further investigation for future work.</p>
+
+<h3>Synthetic data</h3>
+<p>We wondered whether the strong performance of the last few crawls could be attributed, in part, to the presence of a larger quantity of synthetic data (data generated by LLMs). Such a change would not be surprising given the recent rise in popularity of LLMs, notably ChatGPT.</p>
+<p>Since, to the best of our knowledge, there is no foolproof method to detect synthetic data, we opted for a proxy metric: we measured the frequency of the following phrases in each crawl: <code>"delve", "as a large language model", "it's important to note", "rich tapestry", "intertwined", "certainly!", "dive into"</code>, all of which are commonly used by ChatGPT.</p>
+<p>It is important to note that not all samples containing one of these phrases were necessarily generated by ChatGPT (and that many ChatGPT-generated samples do not contain any of these phrases), but if the amount of synthetic data did not change across crawls, one would expect these frequencies to remain approximately constant over time.</p>
+<p>The results are shown in the following plot:</p>
+<figure><img src="assets/images/synthetic-data.png"/></figure>
+<p>While the frequency remained approximately constant until 2023-14 (ChatGPT was released at the end of 2022), we find a steep increase in our proxy metric in recent crawls. While this simple test is not enough to conclude that ChatGPT completions and other synthetic data are improving the quality of the most recent crawls, it at the very least does not seem to drastically harm their quality.</p>
+<p>We expect to continue seeing increasing quantities of synthetic data in new CC crawls. However, while this data does not seem to harm performance at relatively small training scales (and might actually improve it), it is not clear whether the same holds for much larger training runs.</p>
+
 <h2>Conclusion and looking forward</h2>
 <p>Through our open-science efforts we hope to open up, more and more, the black box around training high-performance large language models, and to give every model trainer the ability to create state-of-the-art LLMs. We're excited to continue iterating on FineWeb, and on increasingly better-filtered subsets of web data, in a fully open and reproducible manner.</p>
 <p>In particular, in the short term, while English currently dominates the large language model landscape, we look forward to applying the learnings from this project to make high-quality training data available in other languages as well, and as accessible as possible.</p>
src/plotting.js
CHANGED
@@ -267,7 +267,7 @@ export const init_ablation_plot = function () {
 const metricData = await fetch(
   `${DATA_FOLDER}/${plotName}/${indexMapping[metricName]["file"]}`
 ).then((response) => response.json());
-const traces = metricData?.traces
+const traces = metricData?.traces ?? [];
 for (const [index, [key, traceData]] of Object.entries(metricData?.data ?? []).entries()) {
   const y = rollingWindow(traceData.y, sliderValue);
   const x = traceData.x.slice(0, y.length);
webpack.config.js
CHANGED
@@ -10,7 +10,7 @@ const COLOR_KEYS = ["color", "bgColor", "fillcolor"];
 const transformDataColors = async (data, path) => {
   const {getNamedColor} = await import('./src/colors.mjs');
   // if not json file, return
-  if (!path.endsWith(".json")) {
+  if (!path.endsWith(".json") || path.includes("score_by_dump")) {
     return data;
   }
   console.log(path)