victormiller
committed on
Commit
•
37e0b82
1
Parent(s):
9b18c90
Update main.py
Browse files
main.py
CHANGED
@@ -274,8 +274,11 @@ def intro():
|
|
274 |
id="section3",
|
275 |
),
|
276 |
Section(
|
277 |
-
|
278 |
-
|
|
|
|
|
|
|
279 |
id="section4",
|
280 |
),
|
281 |
id="inner-text",
|
|
|
274 |
id="section3",
|
275 |
),
|
276 |
Section(
|
277 |
+
H3("Full and Openly Documented Production Ready Pretraining Corpus"),
|
278 |
+
P("We cover every aspect of the decisions made to produce the dataset, including document selection, filtering, quality assurance, deduplication, standardization and PII. Our reasoning is thoroughly explained, ensuring transparency and replicability. "),
|
279 |
+
P("Our code is open sourced here[link to github]."),
|
280 |
+
P("The dataset is ready for immediate download directly from Hugging Face [link]."),
|
281 |
+
P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
|
282 |
id="section4",
|
283 |
),
|
284 |
id="inner-text",
|