victormiller committed on
Commit
103b5cf
1 Parent(s): c642284

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +3 -1
curated.py CHANGED
@@ -443,8 +443,10 @@ filtering_process = Div(
443
  P("This section contains the specific steps taken to filter all 14 curated source datasets.")
444
  ),
445
  Section(
 
446
  H3("Wikipedia"),
447
  H4("Download and Extraction"),
 
448
  P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parqet format to the jsonl format."),
449
  H4("Filtering"),
450
  P("Manual inspection of the dataset demostrated high quality content. Only one filter was used to remove articles with few words. Based normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
@@ -453,7 +455,7 @@ filtering_process = Div(
453
  Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
454
  ),
455
  table_div_wikipedia,
456
-
457
  ),
458
  Section(
459
  H3("ArXiv"),
 
443
  P("This section contains the specific steps taken to filter all 14 curated source datasets.")
444
  ),
445
  Section(
446
+ Div(
447
  H3("Wikipedia"),
448
  H4("Download and Extraction"),
449
+
450
  P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parqet format to the jsonl format."),
451
  H4("Filtering"),
452
  P("Manual inspection of the dataset demostrated high quality content. Only one filter was used to remove articles with few words. Based normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
 
455
  Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
456
  ),
457
  table_div_wikipedia,
458
+ ),
459
  ),
460
  Section(
461
  H3("ArXiv"),