victormiller
committed on
Commit
•
5b83110
1
Parent(s):
7444772
Update web.py
Browse files
web.py
CHANGED
@@ -399,21 +399,21 @@ def web_data():
|
|
399 |
),
|
400 |
P("We summarize other statistics-based rules originated from Gopher [7] in this section. The statistics can be used include:"),
|
401 |
Ul(
|
402 |
-
Li("the word count in the document", style = "margin-bottom:
|
403 |
-
Li("the mean word length", style = "margin-bottom:
|
404 |
-
Li("the number of sentences", style = "margin-bottom:
|
405 |
-
Li("the symbol-to-word ratio", style = "margin-bottom:
|
406 |
-
Li("the fraction of alphabetic words", style = "margin-bottom:
|
407 |
-
Li("and the number of stop words", style = "margin-bottom:
|
408 |
),
|
409 |
P("Specifically, we remove any document which satisfies any of the following criteria:"),
|
410 |
Ul(
|
411 |
-
Li("it contains less than 50 words or more than 100,000 words"),
|
412 |
-
Li("its mean word length is outside the range of 3 to 10"),
|
413 |
-
Li("it contains less than 3 sentences"),
|
414 |
-
Li("its symbol-to-word ratio is greater than 0.1"),
|
415 |
-
Li("the words that contain at least one alphabetic character are less than 80% of the whole words"),
|
416 |
-
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with"),
|
417 |
),
|
418 |
|
419 |
P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
|
|
|
399 |
),
|
400 |
P("We summarize other statistics-based rules originated from Gopher [7] in this section. The statistics can be used include:"),
|
401 |
Ul(
|
402 |
+
Li("the word count in the document", style = "margin-bottom: 5px"),
|
403 |
+
Li("the mean word length", style = "margin-bottom: 5px"),
|
404 |
+
Li("the number of sentences", style = "margin-bottom: 5px"),
|
405 |
+
Li("the symbol-to-word ratio", style = "margin-bottom: 5px"),
|
406 |
+
Li("the fraction of alphabetic words", style = "margin-bottom: 5px"),
|
407 |
+
Li("and the number of stop words", style = "margin-bottom: 5px"),
|
408 |
),
|
409 |
P("Specifically, we remove any document which satisfies any of the following criteria:"),
|
410 |
Ul(
|
411 |
+
Li("it contains less than 50 words or more than 100,000 words", style = "margin-bottom: 5px"),
|
412 |
+
Li("its mean word length is outside the range of 3 to 10", style = "margin-bottom: 5px"),
|
413 |
+
Li("it contains less than 3 sentences", style = "margin-bottom: 5px"),
|
414 |
+
Li("its symbol-to-word ratio is greater than 0.1", style = "margin-bottom: 5px"),
|
415 |
+
Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
|
416 |
+
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with", style = "margin-bottom: 5px"),
|
417 |
),
|
418 |
|
419 |
P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
|