victormiller
committed on
Commit
•
cdae785
1
Parent(s):
073687e
Update web.py
Browse files
web.py
CHANGED
@@ -442,9 +442,13 @@ def web_data():
|
|
442 |
After text extraction, the non-English texts are then filtered out by fastText language identifier with a threshold of 0.65.
|
443 |
This step removes over 60% of the whole data.
|
444 |
"""),
|
445 |
-
|
|
|
|
|
|
|
|
|
446 |
|
447 |
-
DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
448 |
|
449 |
|
450 |
DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
|
|
|
442 |
After text extraction, the non-English texts are then filtered out by fastText language identifier with a threshold of 0.65.
|
443 |
This step removes over 60% of the whole data.
|
444 |
"""),
|
445 |
+
|
446 |
+
Details(
|
447 |
+
Summary("Sample documents that are classified as non-English"),
|
448 |
+
DV("data/sample_non_en.json", 3),
|
449 |
+
),
|
450 |
|
451 |
+
#DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
452 |
|
453 |
|
454 |
DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
|