victormiller committed on
Commit
31b08ca
1 Parent(s): c31df73

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +163 -0
curated.py CHANGED
@@ -8,6 +8,156 @@ from data_viewer import view_data, gen_random_id
8
  from rich import print
9
  import uuid
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  data_sources = [
13
  "Freelaw",
@@ -153,6 +303,19 @@ def update(target: str, request):
153
 
154
 
155
  def curated(request):
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  data_preparation_steps = pd.DataFrame(
157
  {
158
  "Method": [
 
8
  from rich import print
9
  import uuid
10
 
11
+ overview_text = P("Curated sources comprise high-quality, domain-specific datasets. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
12
+ copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in the Pile, such as YouTube and OpenSubtitles, Reddit threads, and books.")
13
+
14
+ local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
15
+
16
+ data_pipeline_table = pd.DataFrame(
17
+ {
18
+ "Data Source": [
19
+ "Papers",
20
+ "Wikipedia",
21
+ "StackExchange",
22
+ "Europarl",
23
+ "Ubuntu IRC",
24
+ "HackerNews",
25
+ "PG-19",
26
+ "USPTO",
27
+ "Freelaw",
28
+ "DM Math",
29
+ ],
30
+ "Percent Filtered": [
31
+ "15%",
32
+ "21%",
33
+ "<0.1%",
34
+ "1%",
35
+ "0.4%",
36
+ "60%",
37
+ "0.8%",
38
+ "22.5%",
39
+ "94%",
40
+ "0",
41
+ ],
42
+ "Unique Document Percentage": [
43
+ "75.99%",
44
+ "91.91%",
45
+ "98.02%",
46
+ "98.87%",
47
+ "100%",
48
+ "99.91%",
49
+ "31.81%",
50
+ "99.94%",
51
+ "91.01%",
52
+ "0",
53
+ ],
54
+ "2 - 5 Duplicates": [
55
+ "19.4%",
56
+ "4.7%",
57
+ "1.27%",
58
+ "0.94%",
59
+ "0",
60
+ "0.05%",
61
+ "20.03%",
62
+ "0.05%",
63
+ "6.87%",
64
+ "0",
65
+ ],
66
+ "6 - 10 Duplicates": [
67
+ "2.89%",
68
+ "1.58%",
69
+ "0.35%",
70
+ "0.09%",
71
+ "0",
72
+ "0.02%",
73
+ "24.27%",
74
+ "0.01%",
75
+ "1.07%",
76
+ "0",
77
+ ],
78
+ "11 - 100 Duplicates": [
79
+ "1.17%",
80
+ "1.76%",
81
+ "0.35%",
82
+ "0.1%",
83
+ "0",
84
+ "0.02%",
85
+ "22.26%",
86
+ "0.01%",
87
+ "1.05%",
88
+ "0",
89
+ ],
90
+ "101 - 1000 Duplicates": [
91
+ "0.01%",
92
+ "0.05%",
93
+ "0.01%",
94
+ "0",
95
+ "0",
96
+ "<0.01%",
97
+ "1.58%",
98
+ "<0.01%",
99
+ "0.01%",
100
+ "0",
101
+ ],
102
+ "1001+ Duplicates": [
103
+ "<0.01%",
104
+ "<0.01%",
105
+ "<0.01%",
106
+ "0",
107
+ "0",
108
+ "<0.01%",
109
+ "0.06%",
110
+ "0",
111
+ "0",
112
+ "0",
113
+ ],
114
+ }
115
+ )
116
+
117
+ table_html_data_pipe = data_pipeline_table.to_html(index=False, border=0)
118
+ table_div_data_pipe = Div(NotStr(table_html_data_pipe), style="margin: 40px;")
119
+
120
+ data_descriptions = pd.DataFrame(
121
+ {
122
+ "Source": [
123
+ "Papers - ArXiv",
124
+ "Papers - PhilPapers",
125
+ "Papers - S2ORC",
126
+ "Papers - PubMed Central",
127
+ "Papers - PubMed Abstract",
128
+ "Wikipedia",
129
+ "StackExchange",
130
+ "EuroParl",
131
+ "Ubuntu IRC",
132
+ "Freelaw",
133
+ "PG-19",
134
+ "USPTO",
135
+ "HackerNews",
136
+ "DM Maths",
137
+ ],
138
+ "Description": [
139
+ "The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format.",
140
+ "Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario.",
141
+ "The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text.",
142
+ "The PubMed Central (PMC) dataset is a comprehensive collection of full-text biomedical and life sciences journal articles run by the United States of America’s National Center for Biotechnology Information (NCBI). It provides open access to a wealth of scientific literature, facilitating research and discovery in the medical and biological fields, starting from 2008 under the NIH Public Access Policy. Articles in PMC are available for text mining and other secondary analyses, making it an invaluable resource for researchers, developers, and downstream tasks.",
143
+ "Abstracts of more than 30 million publications of biomedical literature from various sources, mainly biomedical articles, maintained by the National Library of Medicine. ",
144
+ "Wikipedia is an encyclopedia that provides high-quality text data used for language modeling. We have included filtered and deduplicated versions of complete Wikipedia data directly provided by the Wikimedia Foundation for more than 350 languages.",
145
+ "A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories for question-answer pairs. We have included comments also to include an overall discussion on each post.",
146
+ "A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset earlier used for translation tasks.",
147
+ "Chat logs from the Ubuntu Internet Relay Chat (IRC) channels on the Freenode IRC chat server. This data is also another form of dialog dataset on niche topics.",
148
+ "Legal documents and court cases from various jurisdictions provided by the US-registered non-profit Free Law Project. We have included data from CourtListener, which includes millions of legal opinions from federal and state courts.",
149
+ "A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919.",
150
+ "Patent documents from the United States Patent and Trademark Office.",
151
+ "High-quality dialog-based dataset in which users comment on links submitted as head posts, aggregated by Y Combinator.",
152
+ "DeepMind Maths dataset with generated questions from various topics like algebra, calculus, geometry, etc. Maths data is included to improve model reasoning abilities in the downstream tasks.",
153
+ ],
154
+
155
+ }
156
+ )
157
+
158
+ table_html_desc = data_descriptions.to_html(index=False, border=0)
159
+ table_desc = Div(NotStr(table_html_desc), style="margin: 40px;")
160
+
161
 
162
  data_sources = [
163
  "Freelaw",
 
303
 
304
 
305
  def curated(request):
306
+
307
+ # Partial Updates
308
+ params = dict(request.query_params)
309
+ if target := params.get("target"):
310
+ if data_source := params.get(f"data_source_{target}"):
311
+ return get_data(
312
+ data_source, params.get(f"doc_id_{target}", 3), params.get("target")
313
+ )
314
+ if doc_id := params.get(f"doc_id_{target}"):
315
+ return get_data(
316
+ params.get(f"data_source_{target}"), doc_id, params.get("target")
317
+ )
318
+
319
  data_preparation_steps = pd.DataFrame(
320
  {
321
  "Method": [