victormiller commited on
Commit
8acb3f0
1 Parent(s): dc1c900

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +2 -2
curated.py CHANGED
@@ -678,11 +678,11 @@ filtering_process = Div(
678
  H3("Phil Papers"),
679
  P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
680
  P(B("Download and Extraction: "), "Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
681
- H4("Filtering"),
682
  Ul(
683
  Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
684
  Li(P(B("Newline Filtering:"), D_code("This is/na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
685
- Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed ",))),
686
  Li(P(B("Double Whitespace Filtering:"), D_code("This is a test.", language="python"), " becomes ", D_code("This is a test.", language="python"))),
687
  Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0")),
688
  Li(P(B("CID Percentage Filter: "), "removes LaTex heavy paragraphs that contain over 10% “CID” font artifacts.")),
 
678
  H3("Phil Papers"),
679
  P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
680
  P(B("Download and Extraction: "), "Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
681
+ H4("Filtering"),
682
  Ul(
683
  Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
684
  Li(P(B("Newline Filtering:"), D_code("This is/na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
685
+ Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed ",)),
686
  Li(P(B("Double Whitespace Filtering:"), D_code("This is a test.", language="python"), " becomes ", D_code("This is a test.", language="python"))),
687
  Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0")),
688
  Li(P(B("CID Percentage Filter: "), "removes LaTex heavy paragraphs that contain over 10% “CID” font artifacts.")),