victormiller committed
Commit 9fd7ac0
Parent(s): 61e28b6

Update curated.py

Files changed (1)
  1. curated.py +13 -44
curated.py CHANGED
@@ -456,6 +456,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("ArXiv"),
+ P("The ArXiv dataset is a vast collection of preprint research papers, primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from the LaTeX sources and converted it into a text format."),
  H4("Download and Extraction"),
  P("All the data was downloaded in its original LaTeX format from ArXiv's official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to decode the downloaded data as UTF-8, or guess the encoding using the chardet library. After that, pandoc was used to extract the content of the LaTeX files and save it in Markdown format: ", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All Markdown files were combined to create JSONL files."),
  H4("Filtering"),
@@ -472,6 +473,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("S2ORC - NEED TO MAKE S2ORC ABSTRACT AND UPDATE THIS FILTERING SECTION"),
+ P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata along with abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components: S2ORC abstract and S2ORC full text."),
  H4("Download and Extraction"),
  Ol(
  Li("This was downloaded directly in zip format using the S2ORC API key and a normal GET request, e.g. response = urllib.request.urlopen(url)."),
@@ -509,6 +511,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("PubMed - need to update with abstract vs central"),
+ P(""),
  H4("Download and Extraction"),
  Ol(
  Li("First, all the URLs of PMC and PMA files are parsed and stored as a text file from the FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
@@ -540,6 +543,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("Phil Papers"),
+ P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
  H4("Download and Extraction"),
  P("Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using Java: ", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text format, the language was detected and added using the langdetect library (citation needed)."),
  H4("Filtering"),
@@ -552,6 +556,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("Europarl"),
+ P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset previously used for translation tasks."),
  H4("Download and Extraction"),
  P("The original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to JSONL format for filtering."),
  H4("Filtering"),
@@ -562,6 +567,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("HackerNews"),
+ P("A high-quality dialogue-based dataset in which users comment on submitted links, aggregated by Y Combinator."),
  H4("Download and Extraction"),
  P("The dataset was downloaded from the HackerNews API here: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for each Story ID was pinged; if the ID returned an error, it was removed. Each request was given a 2-second wait to account for network time."),
  P("The HackerNews dataset contains a vast number of stories and is known for lively discussions. Due to the number of replies a story may contain, only the longest threads include content from the 3rd level onwards. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
@@ -577,6 +583,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("USPTO"),
+ P("Patent documents from the United States Patent and Trademark Office."),
  H4("Download and Extraction"),
  P("Data was downloaded and extracted using tags from ", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"), ". There were three different formats that needed three different functions to download and extract the data based on year: pre_2002, 2002_to_2004, and post_2004."),
  H4("Filtering"),
@@ -591,6 +598,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("FreeLaw"),
+ P("Legal documents and court cases from various jurisdictions provided by the US-registered non-profit Free Law Project. We have included data from CourtListener, which includes millions of legal opinions from federal and state courts."),
  H4("Download and Extraction"),
  #P("The dataset was downloaded from: ", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), ". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns, requiring a holistic extraction approach. Text was extracted from the following using the html2text function. The block below shows how each text type was extracted."),
  D_code("""
@@ -619,6 +627,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("StackExchange"),
+ P("A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories of question-answer pairs. We have also included comments, to capture the overall discussion on each post."),
  H4("Download and Extraction"),
  P("The archive dump was used to download all data from StackExchange and StackExchange's sub-URLs, including: ", A("math.stackexchange.com", href="math.stackexchange.com"), ". Raw data was extracted in XML format, and only two files, Posts.xml and Comments.xml, were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
  P("""
@@ -642,6 +651,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("Ubuntu IRC"),
+ P("Chat logs from the Ubuntu Internet Relay Chat (IRC) channels on the Freenode IRC chat server. This data is another form of dialogue dataset, covering niche topics."),
  H4("Download and Extraction"),
  P("The dataset was downloaded from: ", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
  P("During extraction, the logs were cleaned using the following functions:"),
@@ -669,6 +679,7 @@ filtering_process = Div(
  Section(
  Div(
  H3("DM Math"),
+ P("The DeepMind Mathematics dataset contains generated questions on topics such as algebra, calculus, and geometry. Math data is included to improve the model's reasoning abilities on downstream tasks."),
  H4("Download and Extraction"),
  P("The dataset was downloaded directly from the Hugging Face repo: ", A("https://huggingface.co/datasets/deepmind/math_dataset", href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to JSONL format, where each line is represented as:"),
  D_code("""
@@ -687,7 +698,8 @@ filtering_process = Div(
  ),
  Section(
  Div(
- H3("PG19"),
+ H3("PG-19"),
+ P("A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919."),
  H4("Download and Extraction"),
  Ol(
  Li("The dataset was downloaded directly from Hugging Face: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
@@ -821,47 +833,6 @@ data_pipeline_table = pd.DataFrame(
  table_html_data_pipe = data_pipeline_table.to_html(index=False, border=0)
  table_div_data_pipe = Div(NotStr(table_html_data_pipe), style="margin: 40px;")
 
- data_descriptions = pd.DataFrame(
- {
- "Source": [
- "Papers - ArXiv",
- "Papers - PhilPapers",
- "Papers - S2ORC",
- "Papers - PubMed Central",
- "Papers - PubMed Abstract",
- "Wikipedia",
- "StackExchange",
- "EuroParl",
- "Ubuntu IRC",
- "Freelaw",
- "PG-19",
- "USPTO",
- "HackerNews",
- "DM Maths",
- ],
- "Description": [
- "The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format.",
- "Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario.",
- "The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text.",
- "The PubMed Central (PMC) dataset is a comprehensive collection of full-text biomedical and life sciences journal articles run by the United States of America’s National Center for Biotechnology Information (NCBI). It provides open access to a wealth of scientific literature, facilitating research and discovery in the medical and biological fields starting from 2008 by the NIH Public Access Policy. Articles in PMC are available for text mining and other secondary analyses, making it an invaluable resource for researchers and developers and other downstream tasks.",
- "Abstracts of more than 30 million publications of biomedical literature from various sources mainly including biomedical articles run by the National Library of Medicine. ",
- "Wikipedia is an encyclopedia form of high-quality text data used for language modeling. We have included filtered and deduplicated versions of complete Wikipedia data directly provided by the Wikipedia Foundation for more than 350 languages.",
- "A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories for question-answer pairs. We have included comments also to include an overall discussion on each post.",
- "A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset earlier used for translation tasks.",
- "Chat logs from the Ubuntu Internet Relay Chat (IRC) channels on the Freenode IRC chat server. This data is also another form of dialog dataset on niche topics.",
- "Legal documents and court cases from various jurisdictions provided by US-registered non-profit firm Free Law Project. We have included data from CourtListener which included millions of legal opinions from federal and state courts.",
- "A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919.",
- "Patent documents from the United States Patent and Trademark Office.",
- "High-quality dialog-based dataset where user comments on the links as the head post aggregated by Y Combinator.",
- "DeepMind Maths dataset with generated questions from various topics like algebra, calculus, geometry, etc. Maths data is included to improve model reasoning abilities in the downstream tasks.",
- ],
-
- }
- )
-
- table_html_desc = data_descriptions.to_html(index=False, border=0)
- table_desc = Div(NotStr(table_html_desc), style="margin: 40px;")
-
 
  data_sources = [
  "Freelaw",
@@ -1135,8 +1106,6 @@ def curated(request):
  overview_text,
  copyright_disclaimer,
  plotly2fasthtml(treemap_chart),
- H2("Curated Sources Defined"),
- table_desc,
  data_preprocessing_div,
  plotly2fasthtml(diff2_stacked_bar),
  H2("Curated Sources Processing"),
 