victormiller committed on
Commit
c642284
1 Parent(s): eee4211

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +1 -1
curated.py CHANGED
@@ -464,7 +464,7 @@ filtering_process = Div(
464
  Ol(
465
  Li("Language Filter: any language other than English are discarded"),
466
  Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
467
- Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by". A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
468
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
469
  ),
470
  H4("Local Deduplication Process"),
 
464
  Ol(
465
  Li("Language Filter: any language other than English are discarded"),
466
  Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
467
+ Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
468
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
469
  ),
470
  H4("Local Deduplication Process"),