omkarenator committed on
Commit
2783986
1 Parent(s): 3eba508
Files changed (2) hide show
  1. requirements.txt +1 -0
  2. web.py +40 -17
requirements.txt CHANGED
@@ -5,3 +5,4 @@ fh-plotly
5
  pandas
6
  Jinja2
7
  rich
 
 
5
  pandas
6
  Jinja2
7
  rich
8
+ jsonlines
web.py CHANGED
@@ -4,6 +4,28 @@ import json
4
  import random
5
  import string
6
  from rich import print
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def view_data(
@@ -15,7 +37,10 @@ def view_data(
15
  if target is None:
16
  target = "".join(random.choices(string.ascii_lowercase, k=8))
17
 
18
- left = json.load(open(left_file, encoding="utf-8"))
 
 
 
19
  max_doc_id = len(left) - 1
20
  slider = Input(
21
  type="range",
@@ -208,32 +233,30 @@ def web_data():
208
  articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
209
  4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
210
  """),
211
- Img(
212
- src="path/to/24_URL_domains.png",
213
- alt="24 URL domains with more than 4k matches",
214
- ),
215
  P("""
216
  We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
217
  """),
218
- Img(
219
- src="path/to/6_domains_removed.png",
220
- alt="6 URL domains that are removed from the blocklist",
221
  ),
222
- Img(
223
- src="path/to/sample_documents_blocked.png",
224
- alt="Sample documents whose URLs are blocked by the refined URL blocklist",
 
225
  ),
226
  H5("1.3.2 Excluded High Quality Sources"),
227
  P("""
228
  To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
229
  """),
230
- Img(
231
- src="path/to/curated_url_domains_excluded.png",
232
- alt="Curated URL domains that are excluded from our dataset",
233
  ),
234
- Img(
235
- src="path/to/sample_documents_curated_domains.png",
236
- alt="Sample documents whose URLs are in our curated URL domain list",
 
237
  ),
238
  H3("2. Line-Level Removal"),
239
  P("""
 
4
  import random
5
  import string
6
  from rich import print
7
+ import jsonlines
8
+ from data.url_blocklist import urls_high_matches, urls_false_positives
9
+ from data.non_web_urls import non_web_urls
10
+
11
+
12
+ def view_data_static(
13
+ left,
14
+ header,
15
+ ):
16
+ col1 = Div(
17
+ Pre(
18
+ json.dumps(left, indent=4, ensure_ascii=False),
19
+ style="white-space: pre-wrap; word-break: break-all;",
20
+ ),
21
+ style="float: left; overflow-x: auto;",
22
+ )
23
+
24
+ data_display = Div(
25
+ col1,
26
+ style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
27
+ )
28
+ return Div(H3(header), data_display, style="margin-top: 10px;")
29
 
30
 
31
  def view_data(
 
37
  if target is None:
38
  target = "".join(random.choices(string.ascii_lowercase, k=8))
39
 
40
+ if left_file.endswith("jsonl"):
41
+ left = [x for x in jsonlines.open(left_file)]
42
+ else:
43
+ left = json.load(open(left_file, encoding="utf-8"))
44
  max_doc_id = len(left) - 1
45
  slider = Input(
46
  type="range",
 
233
  articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
234
  4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
235
  """),
236
+ view_data_static(urls_high_matches, "24 URL domains with more than 4k matches"),
 
 
 
237
  P("""
238
  We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
239
  """),
240
+ view_data_static(
241
+ urls_false_positives, "6 url domains that are removed from the blocklist"
 
242
  ),
243
+ view_data(
244
+ "data/bad_url_doc.jsonl",
245
+ 3,
246
+ "Sample documents whose urls are blocked by the refined url blocklist",
247
  ),
248
  H5("1.3.2 Excluded High Quality Sources"),
249
  P("""
250
  To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
251
  """),
252
+ view_data_static(
253
+ non_web_urls,
254
+ "curated url domains that are excluded from our dataset",
255
  ),
256
+ view_data(
257
+ "data/sample_url_exclusion.json",
258
+ 0,
259
+ "Sample documents whose urls are in our curated url domain list",
260
  ),
261
  H3("2. Line-Level Removal"),
262
  P("""