victormiller
committed on
Commit
•
189eeae
1
Parent(s):
d5c9701
Update curated.py
Browse files- curated.py +15 -2
curated.py
CHANGED
@@ -679,8 +679,21 @@ filtering_process = Div(
|
|
679 |
P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
|
680 |
P(B("Download and Extraction: "), "Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
|
681 |
H4("Filtering"),
|
682 |
-
|
683 |
-
Li("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
684 |
),
|
685 |
table_div_phil,
|
686 |
Details(
|
|
|
679 |
P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
|
680 |
P(B("Download and Extraction: "), "Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
|
681 |
H4("Filtering"),
|
682 |
+
Ul(
|
683 |
+
Li(P(B("Hyphenation Removal"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
|
684 |
+
Li(P(B("Newline Filtering"), "")),
|
685 |
+
Li(P(B("Header/Footer Filtering"), "")),
|
686 |
+
Li(P(B("Double Whitespace Filtering"), "")),
|
687 |
+
Li(P(B("Mean Line Length Check"), "")),
|
688 |
+
Li(P(B("CID Percentage Filter "), "")),
|
689 |
+
Li(P(B("Letterness Filter"), "")),
|
690 |
+
Li(P(B("Removing Leading/Trailing Numbers"), "")),
|
691 |
+
Li(P(B("Fixing Unicode Issues"), "")),
|
692 |
+
Li(P(B("Combining Diacritics Correction"), "")),
|
693 |
+
Li(P(B("Unigram Log Probability"), "")),
|
694 |
+
Li(""),
|
695 |
+
|
696 |
+
|
697 |
),
|
698 |
table_div_phil,
|
699 |
Details(
|