victormiller committed on
Commit
d29b7f7
1 Parent(s): db5d55c

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +11 -11
curated.py CHANGED
@@ -677,17 +677,17 @@ filtering_process = Div(
677
  P(B("Download and Extraction: "), "Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
678
  P(B("Filters Applied: ")),
679
  Ul(
680
- Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python")), style = "margin-bottom: -3px"),
681
- Li(P(B("Newline Filtering:"), D_code("This is/na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style = "margin-bottom: -3px"),
682
- Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed ",), style = "margin-bottom: -3px"),
683
- Li(P(B("Double Whitespace Filtering:"), D_code("This is a test.", language="python"), " becomes ", D_code("This is a test.", language="python")), style = "margin-bottom: -3px"),
684
- Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0"), style = "margin-bottom: -3px"),
685
- Li(P(B("CID Percentage Filter: "), "removes LaTex heavy paragraphs that contain over 10% “CID” font artifacts."), style = "margin-bottom: -3px"),
686
- Li(P(B("Letterness Filter: "), "discards paragraphs with a low proportion of letters"), style = "margin-bottom: -3px"),
687
- Li(P(B("Removing Leading/Trailing Numbers: "), "removes numbers at the start or end of paragraphs. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style = "margin-bottom: -3px"),
688
- Li(P(B("Fixing Unicode Issues: "), "fixes Unicode issues."), style = "margin-bottom: -3px"),
689
- Li(P(B("Combining Diacritics Correction: "), D_code("a'", language="python"), " becomes ", D_code("å", language="python")), style = "margin-bottom: -3px"),
690
- Li(P(B("Unigram Log Probability: "), "the document must have higher than -20 average unigram log probability."), style = "margin-bottom: -3px"),
691
  ),
692
  table_div_phil,
693
  Details(
 
677
  P(B("Download and Extraction: "), "Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
678
  P(B("Filters Applied: ")),
679
  Ul(
680
+ Li(P("Hyphenation Removal:", D_code("end-of", language="python"), " becomes ", D_code("end of", language="python")), style = "margin-bottom: -3px"),
681
+ Li(P("Newline Filtering:", D_code("This is/na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style = "margin-bottom: -3px"),
682
+ Li(P("Header/Footer Filtering:", D_code("(c) 2023 Company Name.", language="python"), " is removed ",), style = "margin-bottom: -3px"),
683
+ Li(P("Double Whitespace Filtering:", D_code("This is a test.", language="python"), " becomes ", D_code("This is a test.", language="python")), style = "margin-bottom: -3px"),
684
+ Li(P("Mean Line Length Check: ", "removes paragraphs with an average line length of < 2.0"), style = "margin-bottom: -3px"),
685
+ Li(P("CID Percentage Filter: ", "removes LaTex heavy paragraphs that contain over 10% “CID” font artifacts."), style = "margin-bottom: -3px"),
686
+ Li(P("Letterness Filter: ", "discards paragraphs with a low proportion of letters"), style = "margin-bottom: -3px"),
687
+ Li(P("Removing Leading/Trailing Numbers: ", "removes numbers at the start or end of paragraphs. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style = "margin-bottom: -3px"),
688
+ Li(P("Fixing Unicode Issues: ", "fixes Unicode issues."), style = "margin-bottom: -3px"),
689
+ Li(P("Combining Diacritics Correction: ", D_code("a'", language="python"), " becomes ", D_code("å", language="python")), style = "margin-bottom: -3px"),
690
+ Li(P("Unigram Log Probability: ", "the document must have higher than -20 average unigram log probability."), style = "margin-bottom: -3px"),
691
  ),
692
  table_div_phil,
693
  Details(