victormiller
committed on
Commit
•
5672cf7
1
Parent(s):
f36591a
Update curated.py
Browse files- curated.py +61 -113
curated.py
CHANGED
@@ -458,15 +458,14 @@ filtering_process = Div(
|
|
458 |
Section(
|
459 |
H3("ArXiv"),
|
460 |
H4("Download and Extraction"),
|
461 |
-
P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), "We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
462 |
H4("Filtering"),
|
463 |
-
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
|
464 |
Ol(
|
465 |
-
Li("
|
466 |
-
Li("
|
467 |
-
Li("
|
468 |
-
Li("
|
469 |
-
Li("number 4 above had hyperlinks that need to be included"),
|
470 |
),
|
471 |
H4("Local Deduplication Process"),
|
472 |
Ol(
|
@@ -510,7 +509,7 @@ filtering_process = Div(
|
|
510 |
table_div_s2o,
|
511 |
),
|
512 |
Section(
|
513 |
-
H3("PubMed"),
|
514 |
H4("Download and Extraction"),
|
515 |
Ol(
|
516 |
Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
|
@@ -541,12 +540,7 @@ filtering_process = Div(
|
|
541 |
Section(
|
542 |
H3("Phil Papers"),
|
543 |
H4("Download and Extraction"),
|
544 |
-
|
545 |
-
Li("Original pdf files download location was downloaded from https://philarchive.org/oai.pl "),
|
546 |
-
Li("All pdf files were downloaded"),
|
547 |
-
Li("Pdf was converted to text using java -jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}"),
|
548 |
-
Li("Language was detected and added using langdetect library"),
|
549 |
-
),
|
550 |
H4("Filtering"),
|
551 |
Ol(
|
552 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
@@ -555,120 +549,84 @@ filtering_process = Div(
|
|
555 |
Ol(
|
556 |
Li("Local dedup was done with all papers combined."),
|
557 |
),
|
558 |
-
H4("Global Deduplication Process"),
|
559 |
-
Ol(
|
560 |
-
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
561 |
-
),
|
562 |
table_div_phil,
|
563 |
),
|
564 |
Section(
|
565 |
H3("Europarl"),
|
566 |
H4("Download and Extraction"),
|
567 |
-
|
568 |
-
Li("Original data was downloaded from http://www.statmt.org/europarl/v7/europarl.tgz"),
|
569 |
-
Li("Finally the remaining files are converted to jsonl lines"),
|
570 |
-
),
|
571 |
H4("Filtering"),
|
572 |
-
|
573 |
-
Li("Smaller than 200 characters of documents are removed while downloading so no others filtered were run"),
|
574 |
-
Li("Tags were also removed while downloading"),
|
575 |
-
),
|
576 |
H4("Local Deduplication Process"),
|
577 |
Ol(
|
578 |
Li("Local dedup was done within europarl itself"),
|
579 |
),
|
580 |
-
H4("Global Deduplication Process"),
|
581 |
-
Ol(
|
582 |
-
Li("After local dedup, remaining europarl was deduped again with all the datasets combined"),
|
583 |
-
),
|
584 |
table_div_up,
|
585 |
),
|
586 |
Section(
|
587 |
H3("HackerNews"),
|
588 |
H4("Download and Extraction"),
|
589 |
-
|
590 |
-
|
591 |
-
Li("Story ids was started from 1 till 37500000 (all stories that gives error while pinging the url was removed). Each post is a story, with each reply another story"),
|
592 |
-
Li("As there were too many requests error, there was a wait(2 sec) statement included in the code"),
|
593 |
-
Li("As the number of stories were large and containing all the replies was time consuming and possibility of introducing too much error, only longest depth threads were included from 3rd level onwards. So we include the title then all the replies (2nd level) but replies to those replies (3rd level) were only the ones which has maximum depth."),
|
594 |
-
),
|
595 |
H4("Filtering"),
|
596 |
Ol(
|
597 |
-
Li("
|
598 |
-
Li("
|
599 |
-
Li("Unigram
|
600 |
),
|
601 |
H4("Local Deduplication Process"),
|
602 |
Ol(
|
603 |
Li("Local dedup was done within hackernews itself"),
|
604 |
),
|
605 |
-
H4("Global Deduplication Process"),
|
606 |
-
Ol(
|
607 |
-
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
608 |
-
),
|
609 |
table_div_hn,
|
610 |
),
|
611 |
Section(
|
612 |
H3("USPTO"),
|
613 |
H4("Download and Extraction"),
|
614 |
-
|
615 |
-
Li("Data was downloaded and extracted using tags from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),
|
616 |
-
Li("There were three different format that needed three different functions to download and extract the data based on year: Pre_2002, 2002_to_2004, post_2004"),
|
617 |
-
|
618 |
-
),
|
619 |
H4("Filtering"),
|
620 |
Ol(
|
621 |
-
Li("
|
622 |
-
Li("
|
623 |
-
Li("Unigram
|
624 |
),
|
625 |
H4("Local Deduplication Process"),
|
626 |
Ol(
|
627 |
Li("Local dedup was done within USPTO itself"),
|
628 |
),
|
629 |
-
H4("Global Deduplication Process"),
|
630 |
-
Ol(
|
631 |
-
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
632 |
-
),
|
633 |
table_div_uspto,
|
634 |
),
|
635 |
Section(
|
636 |
H3("FreeLaw"),
|
637 |
H4("Download and Extraction"),
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
|
|
647 |
H4("Filtering"),
|
648 |
Ol(
|
649 |
-
Li("
|
650 |
-
Li("
|
651 |
-
Li("Unigram
|
652 |
),
|
653 |
H4("Local Deduplication Process"),
|
654 |
Ol(
|
655 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
656 |
),
|
657 |
-
H4("Global Deduplication Process"),
|
658 |
-
Ol(
|
659 |
-
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
660 |
-
),
|
661 |
table_div_freelaw,
|
662 |
),
|
663 |
Section(
|
664 |
H3("StackExchange"),
|
665 |
H4("Download and Extraction"),
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
Li("We parsed using post_id to connect each question to answer and then to comments so our data has same hierarchy as stackexchange UI"),
|
670 |
-
Li("""
|
671 |
-
1. Questions:
|
672 |
2. Comment1:
|
673 |
3. Comment2:
|
674 |
4. Answer1:
|
@@ -677,87 +635,77 @@ filtering_process = Div(
|
|
677 |
7. Answer2:
|
678 |
8. Comment1:
|
679 |
9. Comment2:
|
680 |
-
|
681 |
-
),
|
682 |
H4("Filtering"),
|
683 |
Ol(
|
684 |
-
Li("
|
685 |
),
|
686 |
H4("Local Deduplication Process"),
|
687 |
Ol(
|
688 |
Li("Local dedup was done within stackexchange itself"),
|
689 |
),
|
690 |
-
H4("Global Deduplication Process"),
|
691 |
-
Ol(
|
692 |
-
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
693 |
-
),
|
694 |
table_div_se,
|
695 |
),
|
696 |
Section(
|
697 |
H3("Ubuntu IRC"),
|
698 |
H4("Download and Extraction"),
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
704 |
H4("Filtering"),
|
705 |
Ol(
|
706 |
-
Li("
|
707 |
-
Li("
|
708 |
-
Li("Unigram
|
709 |
),
|
710 |
H4("Local Deduplication Process"),
|
711 |
Ol(
|
712 |
Li("Local dedup was done within Ubuntu IRC itself"),
|
713 |
),
|
714 |
-
H4("Global Deduplication Process"),
|
715 |
-
Ol(
|
716 |
-
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
717 |
-
),
|
718 |
table_div_uirc,
|
719 |
),
|
720 |
Section(
|
721 |
H3("DM Maths"),
|
722 |
H4("Download and Extraction"),
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
),
|
727 |
H4("Filtering"),
|
728 |
Ol(
|
729 |
-
Li("
|
730 |
),
|
731 |
H4("Local Deduplication Process"),
|
732 |
Ol(
|
733 |
Li("None"),
|
734 |
),
|
735 |
-
H4("Global Deduplication Process"),
|
736 |
-
Ol(
|
737 |
-
Li("None"),
|
738 |
-
),
|
739 |
table_div_dmm,
|
740 |
),
|
741 |
Section(
|
742 |
H3("PG19"),
|
743 |
H4("Download and Extraction"),
|
744 |
Ol(
|
745 |
-
Li("
|
746 |
),
|
747 |
H4("Filtering"),
|
748 |
Ol(
|
749 |
-
Li("
|
750 |
-
Li("
|
751 |
-
Li("Unigram
|
752 |
),
|
753 |
H4("Local Deduplication Process"),
|
754 |
Ol(
|
755 |
Li("Local dedup was done within PG19 itself"),
|
756 |
),
|
757 |
-
H4("Global Deduplication Process"),
|
758 |
-
Ol(
|
759 |
-
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
760 |
-
),
|
761 |
table_div_pg19,
|
762 |
),
|
763 |
)
|
|
|
458 |
Section(
|
459 |
H3("ArXiv"),
|
460 |
H4("Download and Extraction"),
|
461 |
+
P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
462 |
H4("Filtering"),
|
463 |
+
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset (citation needed)"),
|
464 |
Ol(
|
465 |
+
Li("Language Filter: documents in any language other than English are discarded"),
|
466 |
+
Li("Minimum Word Count Filter: documents with fewer than 500 words (not inclusive) are discarded"),
|
467 |
+
Li("Unigram Log Probability Filter: Documents were kept if their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the ", A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"), ". Specifically, we use the list created by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"), "."),
|
468 |
+
Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
|
|
469 |
),
|
470 |
H4("Local Deduplication Process"),
|
471 |
Ol(
|
|
|
509 |
table_div_s2o,
|
510 |
),
|
511 |
Section(
|
512 |
+
H3("PubMed - need to update with abstract vs central"),
|
513 |
H4("Download and Extraction"),
|
514 |
Ol(
|
515 |
Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
|
|
|
540 |
Section(
|
541 |
H3("Phil Papers"),
|
542 |
H4("Download and Extraction"),
|
543 |
+
P("Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using java ", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text format, the language was detected and added using the langdetect (citation needed) library."),
|
|
|
|
|
|
|
|
|
|
|
544 |
H4("Filtering"),
|
545 |
Ol(
|
546 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
|
|
549 |
Ol(
|
550 |
Li("Local dedup was done with all papers combined."),
|
551 |
),
|
|
|
|
|
|
|
|
|
552 |
table_div_phil,
|
553 |
),
|
554 |
Section(
|
555 |
H3("Europarl"),
|
556 |
H4("Download and Extraction"),
|
557 |
+
P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
|
|
|
|
|
|
|
558 |
H4("Filtering"),
|
559 |
+
P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
|
|
|
|
|
|
|
560 |
H4("Local Deduplication Process"),
|
561 |
Ol(
|
562 |
Li("Local dedup was done within europarl itself"),
|
563 |
),
|
|
|
|
|
|
|
|
|
564 |
table_div_up,
|
565 |
),
|
566 |
Section(
|
567 |
H3("HackerNews"),
|
568 |
H4("Download and Extraction"),
|
569 |
+
P("The dataset was downloaded from the HackerNews repo here: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
|
570 |
+
P("The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only the longest threads were included from the 3rd level onwards. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for the threads with maximum depth."),
|
|
|
|
|
|
|
|
|
571 |
H4("Filtering"),
|
572 |
Ol(
|
573 |
+
Li("Language Filter: English"),
|
574 |
+
Li("Minimum Word Count Filter: 10"),
|
575 |
+
Li("Unigram Log Probability"),
|
576 |
),
|
577 |
H4("Local Deduplication Process"),
|
578 |
Ol(
|
579 |
Li("Local dedup was done within hackernews itself"),
|
580 |
),
|
|
|
|
|
|
|
|
|
581 |
table_div_hn,
|
582 |
),
|
583 |
Section(
|
584 |
H3("USPTO"),
|
585 |
H4("Download and Extraction"),
|
586 |
+
P("Data was downloaded and extracted using tags from ", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: Pre_2002, 2002_to_2004, and post_2004."),
|
|
|
|
|
|
|
|
|
587 |
H4("Filtering"),
|
588 |
Ol(
|
589 |
+
Li("Language Filter: English"),
|
590 |
+
Li("Minimum Word Count Filter: 50"),
|
591 |
+
Li("Unigram Log Probability"),
|
592 |
),
|
593 |
H4("Local Deduplication Process"),
|
594 |
Ol(
|
595 |
Li("Local dedup was done within USPTO itself"),
|
596 |
),
|
|
|
|
|
|
|
|
|
597 |
table_div_uspto,
|
598 |
),
|
599 |
Section(
|
600 |
H3("FreeLaw"),
|
601 |
H4("Download and Extraction"),
|
602 |
+
P("The dataset was downloaded from: ", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"),". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using the html2text function.",
|
603 |
+
D_code("""
|
604 |
+
("html", html2text),
|
605 |
+
("html_lawbox", html2text),
|
606 |
+
("html_columbia", html2text),
|
607 |
+
("html_anon_2020", html2text),
|
608 |
+
("html_with_citations", html2text),
|
609 |
+
("xml_harvard", html2text),
|
610 |
+
plain_text
|
611 |
+
""", language="SQL")," All content was downloaded leading to a high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
|
612 |
H4("Filtering"),
|
613 |
Ol(
|
614 |
+
Li("Language Filter: English"),
|
615 |
+
Li("Minimum Word Count Filter: 50"),
|
616 |
+
Li("Unigram Log Probability"),
|
617 |
),
|
618 |
H4("Local Deduplication Process"),
|
619 |
Ol(
|
620 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
621 |
),
|
|
|
|
|
|
|
|
|
622 |
table_div_freelaw,
|
623 |
),
|
624 |
Section(
|
625 |
H3("StackExchange"),
|
626 |
H4("Download and Extraction"),
|
627 |
+
P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted in XML format and only two files, Posts.xml and Comments.xml, were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
|
628 |
+
P("""
|
629 |
+
1. Questions:
|
|
|
|
|
|
|
630 |
2. Comment1:
|
631 |
3. Comment2:
|
632 |
4. Answer1:
|
|
|
635 |
7. Answer2:
|
636 |
8. Comment1:
|
637 |
9. Comment2:
|
638 |
+
"""),
|
|
|
639 |
H4("Filtering"),
|
640 |
Ol(
|
641 |
+
Li("Minimum Word Count Filter: 10"),
|
642 |
),
|
643 |
H4("Local Deduplication Process"),
|
644 |
Ol(
|
645 |
Li("Local dedup was done within stackexchange itself"),
|
646 |
),
|
|
|
|
|
|
|
|
|
647 |
table_div_se,
|
648 |
),
|
649 |
Section(
|
650 |
H3("Ubuntu IRC"),
|
651 |
H4("Download and Extraction"),
|
652 |
+
P("The dataset was downloaded from: ", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
|
653 |
+
P("During extraction, the logs were cleaned using the following functions:"),
|
654 |
+
D_code("""
|
655 |
+
def exclude_system(x):
|
656 |
+
return '\n'.join(line for line in x.split('\n') if not line.startswith('==='))
|
657 |
+
|
658 |
+
def exclude_select_system(x):
|
659 |
+
return '\n'.join(line for line in x.split('\n') if not (line.startswith('===') and any(term in line for term in ['has joined #', 'has left #', 'Topic for #', "Topic (#", "is now known as"]) ))
|
660 |
+
|
661 |
+
def clean(x):
|
662 |
+
return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
|
663 |
+
""", block="block", language="python" ),
|
664 |
H4("Filtering"),
|
665 |
Ol(
|
666 |
+
Li("Language Filter: English"),
|
667 |
+
Li("Minimum Word Count Filter: 10"),
|
668 |
+
Li("Unigram Log Probability"),
|
669 |
),
|
670 |
H4("Local Deduplication Process"),
|
671 |
Ol(
|
672 |
Li("Local dedup was done within Ubuntu IRC itself"),
|
673 |
),
|
|
|
|
|
|
|
|
|
674 |
table_div_uirc,
|
675 |
),
|
676 |
Section(
|
677 |
H3("DM Maths"),
|
678 |
H4("Download and Extraction"),
|
679 |
+
P("The dataset was downloaded directly from the Huggingface repo: ", A("https://huggingface.co/datasets/deepmind/math_dataset",href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where each line is represented as:"),
|
680 |
+
D_code("""
|
681 |
+
Question: TEXT
|
682 |
+
Answer: TEXT""", block="block", language="python"),
|
683 |
H4("Filtering"),
|
684 |
Ol(
|
685 |
+
Li("No filtering was applied to DM Math"),
|
686 |
),
|
687 |
H4("Local Deduplication Process"),
|
688 |
Ol(
|
689 |
Li("None"),
|
690 |
),
|
|
|
|
|
|
|
|
|
691 |
table_div_dmm,
|
692 |
),
|
693 |
Section(
|
694 |
H3("PG19"),
|
695 |
H4("Download and Extraction"),
|
696 |
Ol(
|
697 |
+
Li("The dataset was downloaded directly from Huggingface:", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
|
698 |
),
|
699 |
H4("Filtering"),
|
700 |
Ol(
|
701 |
+
Li("Language Filter: ???"),
|
702 |
+
Li("Minimum Word Count Filter: 20"),
|
703 |
+
Li("Unigram Log Probability"),
|
704 |
),
|
705 |
H4("Local Deduplication Process"),
|
706 |
Ol(
|
707 |
Li("Local dedup was done within PG19 itself"),
|
708 |
),
|
|
|
|
|
|
|
|
|
709 |
table_div_pg19,
|
710 |
),
|
711 |
)
|