Spaces:
Running
Running
victormiller
committed on
Commit
•
6058099
1
Parent(s):
872ea67
Update web.py
Browse files
web.py
CHANGED
@@ -254,7 +254,7 @@ def web_data():
|
|
254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
255 |
Li("Each section is complete with code and comparisons to Dolma,", D_cite(bibtex_key="soldaini2024dolma"),
|
256 |
"DataTrove,", D_cite(bibtex_key="penedo2024datatrove"),
|
257 |
-
"and/or RedPajama-V-2" D_cite(bibtex_key="redpajama-v2"), style = "margin-bottom: 5px"),
|
258 |
),
|
259 |
P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of datasets we reviewed the comparison of filters we have applied."),
|
260 |
),
|
|
|
254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
255 |
Li("Each section is complete with code and comparisons to Dolma,", D_cite(bibtex_key="soldaini2024dolma"),
|
256 |
"DataTrove,", D_cite(bibtex_key="penedo2024datatrove"),
|
257 |
+
"and/or RedPajama-V-2", D_cite(bibtex_key="redpajama-v2"), style = "margin-bottom: 5px"),
|
258 |
),
|
259 |
P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of datasets we reviewed the comparison of filters we have applied."),
|
260 |
),
|