victormiller committed on
Commit
189eeae
1 Parent(s): d5c9701

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +15 -2
curated.py CHANGED
@@ -679,8 +679,21 @@ filtering_process = Div(
679
  P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
680
  P(B("Download and Extraction: "), "Original PDF files were downloaded from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
681
  H4("Filtering"),
682
- Ol(
683
- Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
 
 
 
 
 
 
 
 
 
 
 
 
 
684
  ),
685
  table_div_phil,
686
  Details(
 
679
  P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
680
  P(B("Download and Extraction: "), "Original PDF files were downloaded from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
681
  H4("Filtering"),
682
+ Ul(
683
+ Li(P(B("Hyphenation Removal"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
684
+ Li(P(B("Newline Filtering"), "")),
685
+ Li(P(B("Header/Footer Filtering"), "")),
686
+ Li(P(B("Double Whitespace Filtering"), "")),
687
+ Li(P(B("Mean Line Length Check"), "")),
688
+ Li(P(B("CID Percentage Filter "), "")),
689
+ Li(P(B("Letterness Filter"), "")),
690
+ Li(P(B("Removing Leading/Trailing Numbers"), "")),
691
+ Li(P(B("Fixing Unicode Issues"), "")),
692
+ Li(P(B("Combining Diacritics Correction"), "")),
693
+ Li(P(B("Unigram Log Probability"), "")),
694
+ Li(""),
695
+
696
+
697
  ),
698
  table_div_phil,
699
  Details(