victormiller committed
Commit 6a9172e
1 Parent(s): 90b43c6

Update curated.py

Files changed (1)
  1. curated.py +19 -16
curated.py CHANGED
@@ -544,7 +544,7 @@ data_preprocessing_div = Div(
544
  P(
545
  "The ",
546
  B("Unigram Log Probability Filter"),
547
- " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the",
548
  A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"),
549
  ". Specifically, we use the list available created by ",
550
  A(
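For illustration only, a minimal sketch of the averaging this filter performs, not taken from curated.py; the small unigram_freq table stands in for counts loaded from the 1T Web-gram corpus, and the whitespace tokenizer is an assumption.

import math

# Hypothetical frequency table standing in for the unigram counts taken
# from the 1T Web-gram corpus; real counts would be loaded from the list
# referenced above.
unigram_freq = {"the": 23135851162, "of": 13151942776, "and": 12997637966}
total_count = sum(unigram_freq.values())

def avg_log_word_prob(text):
    # Average log10 probability of the words found in the table; a plain
    # whitespace split stands in for whatever tokenizer the pipeline uses.
    words = [w for w in text.lower().split() if w in unigram_freq]
    if not words:
        return float("-inf")
    return sum(math.log10(unigram_freq[w] / total_count) for w in words) / len(words)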
@@ -909,7 +909,7 @@ filtering_process = Div(
909
  ),
910
  P(
911
  B("Download and Extraction: "),
912
- "Original PDF files download from",
913
  A(
914
  "https://philarchive.org/oai.pl",
915
  href="https://philarchive.org/oai.pl",
@@ -917,7 +917,7 @@ filtering_process = Div(
917
  ". All available PDF's were downloaded. Each PDF was converted to text using java",
918
  D_code(
919
  "-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}",
920
- language="java",
921
  ),
922
  ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library.",
923
  ),
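A rough sketch of the conversion step described in this hunk, assuming the pdfbox-app-2.0.21.jar path quoted above exists locally; the helper name and error handling are illustrative, not the pipeline's actual code.

import subprocess
from langdetect import detect  # the langdetect library named above

PDFBOX_JAR = "../philpapers_resources/src/pdfbox-app-2.0.21.jar"

def pdf_to_text(pdf_path, txt_path):
    # Run the PDFBox ExtractText command quoted in the hunk above.
    subprocess.run(
        ["java", "-jar", PDFBOX_JAR, "ExtractText", pdf_path, txt_path],
        check=True,
    )
    with open(txt_path, encoding="utf-8", errors="ignore") as fh:
        text = fh.read()
    try:
        language = detect(text)  # may raise on empty or unreadable text
    except Exception:
        language = "unknown"
    return text, language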
@@ -1034,7 +1034,7 @@ filtering_process = Div(
1034
  ),
1035
  P(
1036
  B("Download and Extraction: "),
1037
- "Original dataset was downloaded from",
1038
  A(
1039
  "http://www.statmt.org/europarl/v7/europarl.tgz",
1040
  href="http://www.statmt.org/europarl/v7/europarl.tgz",
@@ -1098,11 +1098,11 @@ filtering_process = Div(
1098
  Div(
1099
  H3("HackerNews"),
1100
  P(
1101
- "High-quality dialog-based dataset where user comments on the links as the head post aggregated by Y Combinator."
1102
  ),
1103
  P(
1104
  B("Download and Extraction: "),
1105
- "The dataset was downloaded from the HackerNews repo here:",
1106
  A(
1107
  "https://hacker-news.firebaseio.com/v0/item/",
1108
  href="https://hacker-news.firebaseio.com/v0/item/",
@@ -1110,7 +1110,7 @@ filtering_process = Div(
1110
  ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time.",
1111
  ),
1112
  P(
1113
- "The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."
1114
  ),
1115
  P(B("Unique Data Preperation Challenges: ")),
1116
  Ul(
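A hedged sketch of the pinging loop described above; the trailing ".json" on the item URL and the use of the requests library are assumptions beyond what the text states.

import time
import requests

BASE_URL = "https://hacker-news.firebaseio.com/v0/item/"

def fetch_story(story_id):
    # Ping one Story ID; the trailing ".json" is how the Firebase API is
    # usually queried and is an assumption beyond the URL quoted above.
    resp = requests.get(f"{BASE_URL}{story_id}.json", timeout=30)
    time.sleep(2)  # 2-second wait per request, as described above
    if resp.status_code != 200 or resp.json() is None:
        return None  # the ID returned an error or no item, so drop it
    return resp.json()

# Small demonstration range; the pipeline walked IDs 1 to 37500000.
stories = [s for s in (fetch_story(i) for i in range(1, 101)) if s]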
@@ -1141,7 +1141,7 @@ filtering_process = Div(
1141
  P("Patent documents from the United States Patent and Trademark Office."),
1142
  P(
1143
  B("Download and Extraction: "),
1144
- "Data was downloaded and extracted using tags from",
1145
  A(
1146
  "https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/",
1147
  href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/",
@@ -1171,7 +1171,7 @@ filtering_process = Div(
1171
  ),
1172
  P(
1173
  B("Download and Extraction"),
1174
- "The dataset was downloaded from:",
1175
  A(
1176
  "https://storage.courtlistener.com/bulk-data/",
1177
  href="https://storage.courtlistener.com/bulk-data/",
@@ -1185,7 +1185,7 @@ filtering_process = Div(
1185
  ("html_with_citations", html2text), ("xml_harvard", html2text),
1186
  plain_text
1187
  """,
1188
- language="SQL",
1189
  ),
1190
  P(
1191
  "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."
@@ -1247,7 +1247,8 @@ filtering_process = Div(
1247
  A("math.stackexchange.com", href="math.stackexchange.com"),
1248
  ". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments. We will include the full list of sub URLs in when the code is released.",
1249
  ),
1250
- P("""
 
1251
  1. Questions:
1252
  2. Comment1:
1253
  3. Comment2:
@@ -1256,8 +1257,10 @@ filtering_process = Div(
1256
  6. Comment2:
1257
  7. Answer2:
1258
  8. Comment1:
1259
- 9. Comment2:
1260
- """),
1260
+ 9. Comment2:""",
1261
+ block="block",
1262
+ language="python",
1263
+ ),
1261
  P(B("Unique Data Preperation Challenges: ")),
1262
  Ul(
1263
  Li(
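A minimal sketch of the post_id linking described above; the attribute names (Id, PostTypeId, ParentId, PostId) follow the standard Stack Exchange dump schema and are assumptions beyond what the text states.

import xml.etree.ElementTree as ET
from collections import defaultdict

def load_rows(path):
    # Every record in a Stack Exchange dump is a <row .../> element.
    return [el.attrib for _, el in ET.iterparse(path) if el.tag == "row"]

posts = load_rows("Posts.xml")
comments = load_rows("Comments.xml")

questions = {p["Id"]: p for p in posts if p.get("PostTypeId") == "1"}
answers_by_question = defaultdict(list)
for p in posts:
    if p.get("PostTypeId") == "2":          # answers point at their question
        answers_by_question[p.get("ParentId")].append(p)

comments_by_post = defaultdict(list)        # comments point at their post
for c in comments:
    comments_by_post[c.get("PostId")].append(c)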
@@ -1301,7 +1304,7 @@ filtering_process = Div(
1301
  ),
1302
  P(
1303
  B("Download and Extraction: "),
1304
- "The dataset was downloaded from:",
1305
  A(
1306
  "https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/",
1307
  href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/",
@@ -1349,7 +1352,7 @@ filtering_process = Div(
1349
  ),
1350
  P(
1351
  B("Download and Extraction: "),
1352
- "The dataset was downloaded rirectly downloaded from the Huggingface repo:",
1353
  A(
1354
  "https://huggingface.co/datasets/deepmind/math_dataset",
1355
  href="https://huggingface.co/datasets/deepmind/math_dataset",
@@ -1359,7 +1362,7 @@ filtering_process = Div(
1359
  D_code(
1360
  """
1361
  Question: TEXT
1362
- Answer: TEXT""",
1365
+ Answer: TEXT""",
1363
  block="block",
1364
  language="python",
1365
  ),
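A hedged sketch of loading the repo linked above with the Hugging Face datasets library and rendering the Question/Answer template; the configuration name and the question/answer field names are assumptions chosen for illustration.

from datasets import load_dataset

# "algebra__linear_1d" is one configuration of the dataset, chosen only for
# illustration; script-based datasets may also need trust_remote_code=True.
ds = load_dataset("deepmind/math_dataset", "algebra__linear_1d", split="train")

def to_text(example):
    # Render one record in the Question/Answer template shown above.
    return f"Question: {example['question']}\nAnswer: {example['answer']}"

print(to_text(ds[0]))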
 