victormiller
committed on
Commit
•
f36591a
1
Parent(s):
564e0a1
Update curated.py
Browse files- curated.py +4 -17
curated.py
CHANGED
@@ -445,14 +445,9 @@ filtering_process = Div(
|
|
445 |
Section(
|
446 |
H3("Wikipedia"),
|
447 |
H4("Download and Extraction"),
|
448 |
-
|
449 |
-
Li("The Wikimedia dataset was downloaded from the official snapshot on Huggingface", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main")),
|
450 |
-
Li("Data is originally in parqet format so we used the", D_code("huggingface dataset.to_json"), " function to convert the data to the jsonl format"),
|
451 |
-
),
|
452 |
H4("Filtering"),
|
453 |
-
|
454 |
-
Li("As we expect the dataset to be already of high quality so only one filter is applied which is to remove all documents (articles) with less than 10 words (not inclusive)"),
|
455 |
-
),
|
456 |
H4("Local Deduplication Process"),
|
457 |
Ol(
|
458 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
@@ -463,12 +458,7 @@ filtering_process = Div(
|
|
463 |
Section(
|
464 |
H3("ArXiv"),
|
465 |
H4("Download and Extraction"),
|
466 |
-
|
467 |
-
Li("All the data was downloaded in original latex format from Arxiv official S3 dump s3://arxic/src"),
|
468 |
-
Li("We try to encode the downloaded data into utf-8 or guess encoding using chardet library"),
|
469 |
-
Li("After that pandoc was used to extract information from the latex files and saved as markdown format - code: pandoc -s {tex} -o out/{out_name}.md --wrap=none"),
|
470 |
-
Li("All markdowns were combined to create jsonl files"),
|
471 |
-
),
|
472 |
H4("Filtering"),
|
473 |
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
|
474 |
Ol(
|
@@ -1207,12 +1197,9 @@ def curated(request):
|
|
1207 |
H2("Curated Sources Defined"),
|
1208 |
table_desc,
|
1209 |
data_preprocessing_div,
|
1210 |
-
# plotly2fasthtml(get_chart_28168342()),
|
1211 |
-
# plotly2fasthtml(get_chart_new()),
|
1212 |
-
# plotly2fasthtml(stacked_bar),
|
1213 |
-
# plotly2fasthtml(diff_stacked_bar),
|
1214 |
plotly2fasthtml(diff2_stacked_bar),
|
1215 |
H2("Curated Sources Processing"),
|
|
|
1216 |
filtering_process,
|
1217 |
data_preparation_div,
|
1218 |
H2("Local Deduplication"),
|
|
|
445 |
Section(
|
446 |
H3("Wikipedia"),
|
447 |
H4("Download and Extraction"),
|
448 |
+
P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The ", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
|
|
|
|
|
|
|
449 |
H4("Filtering"),
|
450 |
+
P("Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based on normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
|
|
|
|
|
451 |
H4("Local Deduplication Process"),
|
452 |
Ol(
|
453 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
|
|
458 |
Section(
|
459 |
H3("ArXiv"),
|
460 |
H4("Download and Extraction"),
|
461 |
+
P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format: ", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
|
|
|
|
|
|
|
|
|
|
462 |
H4("Filtering"),
|
463 |
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
|
464 |
Ol(
|
|
|
1197 |
H2("Curated Sources Defined"),
|
1198 |
table_desc,
|
1199 |
data_preprocessing_div,
|
|
|
|
|
|
|
|
|
1200 |
plotly2fasthtml(diff2_stacked_bar),
|
1201 |
H2("Curated Sources Processing"),
|
1202 |
+
H3("TALK ABOUT THE DIFFERENT FILTERS BEFOREHAND"),
|
1203 |
filtering_process,
|
1204 |
data_preparation_div,
|
1205 |
H2("Local Deduplication"),
|