victormiller committed on
Commit 5f4285e
1 Parent(s): e3ed423

Update curated.py

Files changed (1)
  1. curated.py +39 -53
curated.py CHANGED
@@ -462,7 +462,7 @@ data_preprocessing_div = Div(
462
  P("The ", B("Unigram Log Probability Filter")," calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words."),
463
  H3("Data Processing for S2ORC"),
464
  P("The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
465
- P("The ", B("Title Abstract Filter")," extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
466
  P("The ", B("Majority Language Filter")," identifies the majority language in the dataset. This step displays the distribution of languages in the dataset to enable language-specific analysis and insights."),
467
  P("The ", B("Paragraph Count Filter")," counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful hueristic for document complexity."),
468
  P("The ",B("Frequency Filter")," calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
@@ -559,7 +559,7 @@ filtering_process = Div(
559
  Li("Language Filter: any language other than English are discarded"),
560
  Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
561
  Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
562
- Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
563
  ),
564
  table_div_arx,
565
  Details(
@@ -579,33 +579,24 @@ filtering_process = Div(
579
  ),
580
  Section(
581
  Div(
582
- H3("S2ORC - NEED TO MAKE S2ORC ABSTRACT AND UPDATE THIS FILTERING SECTION"),
583
  P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text."),
584
  H4("Download and Extraction"),
585
  Ol(
586
- Li("This was downloaded directly in zip format using S2ORC api key and normal get request. code: response = urllib.request.urlopen(url)"),
587
- Li("There were two kind of datasets that was downloaded S2ORC and S2ORC abstract"),
588
  ),
589
- H4("Filtering - S2ORC"),
590
  P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
591
  Ol(
592
- Li("title_abstract: must have title and abstract"),
593
- Li("The paper must be in English. To determine the language of each document, we use the pycld3 library. We run pycld3 on the first 2000 characters of each paragraph in the paper. The language of the paper is the most common language of the paragraphs."),
594
- Li("word_count: less than 500 words (not inclusive) are discarded"),
595
- Li("paragraph_count: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log world probability"),
596
- Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
597
- ),
598
- H4("Local Deduplication Process"),
599
- Ol(
600
- Li("Local dedup was done with all papers combined."),
601
- ),
602
- H4("Global Deduplication Process"),
603
- Ol(
604
- Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
605
  ),
606
  table_div_s2o,
607
  Details(
608
- Summary("FreeLaw Filtering Examples -- need to update"),
609
  Div(
610
  P("examples are missing"),
611
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
@@ -621,21 +612,33 @@ filtering_process = Div(
621
  ),
622
  Section(
623
  Div(
624
- H3("S2ORC ABSTRACT"),
625
  P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text."),
626
  H4("Download and Extraction"),
627
  Ol(
628
- Li("This was downloaded directly in zip format using S2ORC api key and normal get request. code: response = urllib.request.urlopen(url)"),
629
- Li("There were two kind of datasets that was downloaded S2ORC and S2ORC abstract"),
630
  ),
631
- H4("Filtering - S2ORC Abstract"),
632
- P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
633
  Ol(
634
- Li("title_abstract: must have title and abstract"),
635
- Li("language: abstract must be in English"),
636
- Li("word_count: less than 20 (not inclusive) are discarded"),
637
- Li("Unigram log probablity"),
638
- Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
639
  ),
640
  )
641
  ),
@@ -643,32 +646,15 @@ filtering_process = Div(
643
 
644
  Section(
645
  Div(
646
- H3("PubMed - need to update with abstract vs central"),
647
- P(""),
648
- H4("Download and Extraction"),
649
- Ol(
650
- Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
651
- Li("All the urls are downloaded and the downloaded data is in xml.tar format"),
652
- Li("For pubmed central First tar files are opened using tarfile library and then converted to markdown format using pandoc: pandoc -f jats {nxml} -o {pmcid}.md --wrap=none"),
653
- Li("All the markdown files are combined to create jsonl files. In jsonl files, 1 line correspond to 1 markdown file."),
654
- Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
655
- ),
656
  H4("Filtering"),
657
  P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
658
  Ol(
659
- Li("min_word: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
660
- Li("Language: any language other than English are discarded"),
661
- Li("Frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace. This filter is not used for pubmed abstract"),
662
- Li("Unigram log probablity: Must have higher than -20 average unigram log probability. To calculate the average log word probability, we use word frequencies extracted from the 1T Web Ngram corpus; specifically, we use the list available created by Rachel Tatman. A copy is hosted here."),
663
- Li("need to add the hyperlinks for the section above"),
664
- ),
665
- H4("Local Deduplication Process"),
666
- Ol(
667
- Li("Local dedup was done with all papers combined."),
668
- ),
669
- H4("Global Deduplication Process"),
670
- Ol(
671
- Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
672
  ),
673
  table_div_med,
674
  Details(
 
462
  P("The ", B("Unigram Log Probability Filter")," calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words."),
463
  H3("Data Processing for S2ORC"),
464
  P("The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
465
+ P("The ", B("Title and Abstract Filter")," extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
466
  P("The ", B("Majority Language Filter")," identifies the majority language in the dataset. This step displays the distribution of languages in the dataset to enable language-specific analysis and insights."),
467
  P("The ", B("Paragraph Count Filter")," counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful hueristic for document complexity."),
468
  P("The ",B("Frequency Filter")," calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
 
559
  Li("Language Filter: any language other than English are discarded"),
560
  Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
561
  Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
562
+ Li("Note: the Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
563
  ),
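
A minimal sketch of the Unigram Log Probability Filter described in the list above, assuming the word frequencies are loaded from the Rachel Tatman english-word-frequency CSV (the file name and column names here are assumptions for illustration):

import csv
import math

# Load unigram counts and convert them to log probabilities.
# "unigram_freq.csv" with columns "word" and "count" is an assumed layout.
with open("unigram_freq.csv", newline="") as f:
    counts = {row["word"]: int(row["count"]) for row in csv.DictReader(f)}
total = sum(counts.values())
log_prob = {word: math.log(count / total) for word, count in counts.items()}

def avg_unigram_log_prob(text: str) -> float:
    # Words are obtained by splitting the text on whitespace.
    words = [w.lower() for w in text.split()]
    scores = [log_prob[w] for w in words if w in log_prob]
    # Skipping out-of-vocabulary words is one possible choice, not necessarily
    # the one used in the pipeline.
    return sum(scores) / len(scores) if scores else float("-inf")

def keep_document(text: str, threshold: float = -20.0) -> bool:
    # Documents are kept if their average unigram log probability is higher than -20.
    return avg_unigram_log_prob(text) > threshold
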
564
  table_div_arx,
565
  Details(
 
579
  ),
580
  Section(
581
  Div(
582
+ H3("S2ORC"),
583
  P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text."),
584
  H4("Download and Extraction"),
585
  Ol(
586
+ Li("This was downloaded directly in zip format using S2ORC api key and a get() request: ", D_code("response = urllib.request.urlopen(url)", language = "python")),
 
587
  ),
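
A hedged sketch of the download step above; the shard URL and the x-api-key header are assumptions for illustration, while the urlopen() call matches the snippet in the list:

import urllib.request

API_KEY = "..."  # S2ORC / Semantic Scholar API key (assumed to be sent as a header)
url = "https://example.org/s2orc/shard-000.zip"  # placeholder URL for one zipped shard

# Plain GET request, as in the pipeline snippet above.
request = urllib.request.Request(url, headers={"x-api-key": API_KEY})
response = urllib.request.urlopen(request)

# Save the zipped shard to disk for later extraction.
with open("shard-000.zip", "wb") as f:
    f.write(response.read())
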
588
+ H4("Filtering"),
589
  P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
590
  Ol(
591
+ Li("Title and Abstract Filter: must have title and abstract"),
592
+ Li("Language Filter: The paper must be in English. To determine the language of each document, we use the pycld3 library. We run pycld3 on the first 2000 characters of each paragraph in the paper. The language of the paper is the most common language of the paragraphs."),
593
+ Li("Word Count Filter: less than 500 words (not inclusive) are discarded"),
594
+ Li("Paragraph Count Filter: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log world probability"),
595
+ Li("Frequency Filter: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
596
  ),
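
A sketch of the Language Filter described in the list above, assuming the pycld3 bindings are imported as cld3 and expose get_language(); the 2000-character cutoff and the majority vote follow the description:

from collections import Counter

import cld3  # pycld3 bindings

def paper_language(paragraphs: list[str]) -> str:
    # Run language identification on the first 2000 characters of each paragraph,
    # then take the most common predicted language as the paper's language.
    votes = []
    for paragraph in paragraphs:
        prediction = cld3.get_language(paragraph[:2000])
        if prediction is not None:
            votes.append(prediction.language)
    return Counter(votes).most_common(1)[0][0] if votes else "unknown"

def is_english(paragraphs: list[str]) -> bool:
    return paper_language(paragraphs) == "en"
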
597
  table_div_s2o,
598
  Details(
599
+ Summary("S2ORC Filtering Examples -- need to update"),
600
  Div(
601
  P("examples are missing"),
602
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
 
612
  ),
613
  Section(
614
  Div(
615
+ H3("S2ORC Abstract"),
616
  P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text."),
617
  H4("Download and Extraction"),
618
  Ol(
619
+ Li("This was downloaded directly in zip format using S2ORC api key and a get() request: ", D_code("response = urllib.request.urlopen(url)", language = "python")),
 
620
  ),
621
+ H4("Filtering"),
622
+ P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
623
  Ol(
624
+ Li("Title and Abstract Filter: must have title and abstract"),
625
+ Li("Majority Language Filter: abstract must be in English"),
626
+ Li("Minimum Word Count Filter: less than 20 (not inclusive) are discarded"),
627
+ Li("Unigram Log Probability Threshold: -20"),
628
+ Li("Note: Frequency Filter: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
629
+ ),
630
+ Details(
631
+ Summary("S2ORC Abstract Filtering Examples "),
632
+ Div(
633
+ P("examples are missing"),
634
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
635
+ ),
636
+ style="""
637
+ background-color: #FFFAEA; /* Light yellow background */
638
+ padding: 15px;
639
+ border-radius: 12px;
640
+ margin-bottom: 15px
641
+ """,
642
  ),
643
  )
644
  ),
 
646
 
647
  Section(
648
  Div(
649
+ H3("PubMed Central and PubMed Abstract"),
650
+ P(B("Download and Extraction: "), "All files were downloaded from", A("ttps://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/",href="ttps://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc", D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.")
651
  H4("Filtering"),
652
  P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
653
  Ol(
654
+ Li("Minimum Word Count Filter: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
655
+ Li("Language Filter: any language other than English are discarded"),
656
+ Li("Frequency Filter: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace. This filter is not used for pubmed abstract"),
657
+ Li("Unigram Log Probability Threshold: -20"),
658
  ),
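
A minimal sketch of the frequency check applied to PubMed Central above; the 7.5% cutoff is interpreted here as a share of whitespace-separated word tokens, which is an assumption:

from collections import Counter

def passes_frequency_filter(text: str, max_share: float = 0.075) -> bool:
    # Words are obtained by splitting the text on whitespace.
    words = text.split()
    if not words:
        return False
    most_common_word, count = Counter(words).most_common(1)[0]
    # The most frequent word must consist of alphabetic characters only
    # and appear in less than 7.5% of the document.
    return most_common_word.isalpha() and (count / len(words)) < max_share
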
659
  table_div_med,
660
  Details(