victormiller
committed on
Commit
•
24b53c0
1
Parent(s):
ee3ad0f
Update curated.py
Browse files - curated.py +32 -3
curated.py
CHANGED
@@ -9,12 +9,41 @@ from rich import print
|
|
9 |
import uuid
|
10 |
import plotly.express as px
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
|
13 |
copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
|
14 |
|
15 |
local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
|
16 |
|
17 |
-
|
18 |
treemap_data = {
|
19 |
'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
|
20 |
'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
|
@@ -467,7 +496,7 @@ def curated(request):
|
|
467 |
table_html = preprocessing_steps.to_html(index=False, border=0)
|
468 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
469 |
data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
|
470 |
-
|
471 |
return Div(
|
472 |
H2("Curated Sources: Overview"),
|
473 |
overview_text,
|
@@ -475,7 +504,7 @@ def curated(request):
|
|
475 |
plotly2fasthtml(treemap_chart),
|
476 |
table_desc,
|
477 |
H2("Curated Sources: Data Gathering and Filtering"),
|
478 |
-
|
479 |
data_preparation_div,
|
480 |
H3("Data Filtering"),
|
481 |
data_preprocessing_div,
|
|
|
9 |
import uuid
|
10 |
import plotly.express as px
|
11 |
|
12 |
+
filtering_process = Div(
|
13 |
+
Section(
|
14 |
+
H3("Title"),
|
15 |
+
H4("Download and Extraction"),
|
16 |
+
Ol(
|
17 |
+
Li("one"),
|
18 |
+
Li("two"),
|
19 |
+
),
|
20 |
+
H4("Filtering"),
|
21 |
+
Ol(
|
22 |
+
Li("one"),
|
23 |
+
Li("two"),
|
24 |
+
),
|
25 |
+
H4("Local Deduplication Process"),
|
26 |
+
Ol(
|
27 |
+
Li("one"),
|
28 |
+
Li("two"),
|
29 |
+
),
|
30 |
+
H4("Global Deduplication Process"),
|
31 |
+
Ol(
|
32 |
+
Li("one"),
|
33 |
+
Li("two"),
|
34 |
+
),
|
35 |
+
|
36 |
+
),
|
37 |
+
)
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
|
43 |
copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
|
44 |
|
45 |
local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
|
46 |
|
|
|
47 |
treemap_data = {
|
48 |
'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
|
49 |
'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
|
|
|
496 |
table_html = preprocessing_steps.to_html(index=False, border=0)
|
497 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
498 |
data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
|
499 |
+
|
500 |
return Div(
|
501 |
H2("Curated Sources: Overview"),
|
502 |
overview_text,
|
|
|
504 |
plotly2fasthtml(treemap_chart),
|
505 |
table_desc,
|
506 |
H2("Curated Sources: Data Gathering and Filtering"),
|
507 |
+
filtering_process,
|
508 |
data_preparation_div,
|
509 |
H3("Data Filtering"),
|
510 |
data_preprocessing_div,
|