victormiller committed on
Commit
b4e3ff3
1 Parent(s): 583d7c5

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +3 -3
curated.py CHANGED
@@ -41,7 +41,7 @@ filtering_process = Div(
41
  Li("All markdowns were combined to create jsonl files"),
42
  ),
43
  H4("Filtering"),
44
- P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset")
45
  Ol(
46
  Li("min_word: less than 500 words (not inclusive) are discarded"),
47
  Li("Language: any language other than English are discarded"),
@@ -76,7 +76,7 @@ filtering_process = Div(
76
  Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
77
  ),
78
  H4("Filtering - S2ORC Abstract"),
79
- P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually")
80
  Ol(
81
  Li("title_abstract: must have title and abstract"),
82
  Li("language: abstract must be in English"),
@@ -105,7 +105,7 @@ filtering_process = Div(
105
  Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
106
  ),
107
  H4("Filtering"),
108
- P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset.")
109
  Ol(
110
  Li("min_word: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
111
  Li("Language: any language other than English are discarded"),
 
41
  Li("All markdowns were combined to create jsonl files"),
42
  ),
43
  H4("Filtering"),
44
+ P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
45
  Ol(
46
  Li("min_word: less than 500 words (not inclusive) are discarded"),
47
  Li("Language: any language other than English are discarded"),
 
76
  Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
77
  ),
78
  H4("Filtering - S2ORC Abstract"),
79
+ P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
80
  Ol(
81
  Li("title_abstract: must have title and abstract"),
82
  Li("language: abstract must be in English"),
 
105
  Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
106
  ),
107
  H4("Filtering"),
108
+ P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
109
  Ol(
110
  Li("min_word: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
111
  Li("Language: any language other than English are discarded"),