"""Module for fetching and parsing articles from PubMed and PMC using Entrez efetch.""" from __future__ import annotations import html import requests import unicodedata from abc import ABC, abstractmethod from io import StringIO from pathlib import Path from typing import IO, Any, Dict, Union from xml.etree.ElementTree import Element # nosec from zipfile import ZipFile from typing import Generator from defusedxml import ElementTree _ENTREZ_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" def _db_parser(article_id:str) -> str|None: """Parse the article ID to ensure it is in the correct format.""" db = None if article_id.startswith('PMC') and article_id[3:].isdigit(): db = "pmc" elif article_id.isdigit(): db = "pubmed" return db def _dl_article_xml(article_id:str, db:str|None) -> tuple[None|str,str] : xml_string = None params = {"db": db, "id": article_id, "retmode": "xml"} response = requests.get(_ENTREZ_EFETCH_URL, params=params) if response.status_code == 200: xml_string = response.text return xml_string def _parse_article(xml_string:str, db:str) -> Union[None,ArticleParser] : parsed_article = None if db == "pmc": parsed_article = JATSXMLParser.from_string(xml_string) elif db == "pubmed": parsed_article = PubMedXMLParser(xml_string) # check if parsing was successful if not parsed_article.abstract and not parsed_article.paragraphs: parsed_article = None return parsed_article def _reformat_article(parsed_article:ArticleParser) -> Dict[str,Any] : reformatted_article = {"Title":[parsed_article.title]} for sec_title,sentence in parsed_article.abstract : sec_title = "Abstract" if sec_title is None else "Abstract - " + sec_title reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence] for sec_title,sentence in parsed_article.paragraphs : reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence] return reformatted_article def dl_and_parse(article_id:str) -> Dict[str,Union[None,Any]]: """Fetch article from PubMed or PMC using the ID using Entrez efetch and parse it using the appropriate parser. Then returns dict containing keys : article_xml(raw xml of downloaded article) and article_sections (parsed sections in the form of a dictionary with keys as section titles and values as list of text content)""" parse_output = { "db" : None, "article_xml": None, "article_sections": None, } # parse id for correct db format parse_output["db"] = _db_parser(article_id) if parse_output["db"] is None: return parse_output parse_output["article_xml"] = _dl_article_xml(article_id, parse_output["db"]) article_parser = _parse_article(parse_output["article_xml"], parse_output["db"]) if article_parser is None : return parse_output parse_output["article_sections"] = _reformat_article(article_parser) return parse_output class ArticleParser(ABC): """An abstract base class for article parsers.""" @property @abstractmethod def title(self) -> str: """Get the article title. Returns ------- str The article title. """ @property @abstractmethod def abstract(self) -> list[str]: """Get a sequence of paragraphs in the article abstract. Returns ------- list of str The paragraphs of the article abstract. """ @property @abstractmethod def paragraphs(self) -> list[tuple[str, str]]: """Get all paragraphs and titles of sections they are part of. Returns ------- list of (str, str) For each paragraph a tuple with two strings is returned. The first is the section title, the second the paragraph content. """ class JATSXMLParser(ArticleParser): def __init__(self, xml_stream: IO[Any]) -> None: super().__init__() self.content = ElementTree.parse(xml_stream) if self.content.getroot().tag == "pmc-articleset": self.content = self.content.find("article") @classmethod def from_string(cls, xml_string: str) -> JATSXMLParser: with StringIO(xml_string) as stream: obj = cls(stream) return obj @classmethod def from_zip(cls, path: str | Path) -> JATSXMLParser: with ZipFile(path) as myzip: xml_files = [ x for x in myzip.namelist() if x.startswith("content/") and x.endswith(".xml") ] if len(xml_files) != 1: raise ValueError( "There needs to be exactly one .xml file inside of content/" ) xml_file = xml_files[0] # Parsing logic with myzip.open(xml_file, "r") as fh: obj = cls(fh) return obj @property def title(self) -> str: titles = self.content.find("./front/article-meta/title-group/article-title") return self._element_to_str(titles) @property def abstract(self) -> list[tuple[str, str]]: abstract = self.content.find("./front/article-meta/abstract") abstract_list: list[tuple[str, str]] = [] if abstract: for sec_title, text in self.parse_section(abstract): abstract_list.append((sec_title,text)) return abstract_list @property def paragraphs(self) -> list[tuple[str, str]]: paragraph_list: list[tuple[str, str]] = [] # Paragraphs of text body body = self.content.find("./body") if body: paragraph_list.extend(self.parse_section(body,"")) # Figure captions figs = self.content.findall("./body//fig") for fig in figs: fig_captions = fig.findall("caption") if fig_captions is None: continue caption = " ".join(self._element_to_str(c) for c in list(fig_captions)) if caption: paragraph_list.append(("Figure Caption", caption)) # Table captions tables = self.content.findall("./body//table-wrap") for table in tables: caption_elements = table.findall("./caption/p") or table.findall( "./caption/title" ) if caption_elements is None: continue caption = " ".join(self._element_to_str(c) for c in caption_elements) if caption: paragraph_list.append(("Table Caption", caption)) return paragraph_list def parse_section(self, section: Element, sec_title_path: str = "") -> Generator[tuple[str, str], None, None]: sec_title = self._element_to_str(section.find("title")) if sec_title == "Author contributions": return sec_title_path = sec_title_path + " - " + sec_title if sec_title_path else sec_title for element in section: if element.tag == "sec": yield from self.parse_section(element, sec_title_path) elif element.tag in {"title", "caption", "fig", "table-wrap", "label"}: continue else: text = self._element_to_str(element) if text: yield sec_title_path, text def _inner_text(self, element: Element) -> str: text_parts = [html.unescape(element.text or "")] for sub_element in element: # recursively parse the sub-element text_parts.append(self._element_to_str(sub_element)) # don't forget the text after the sub-element text_parts.append(html.unescape(sub_element.tail or "")) return unicodedata.normalize("NFKC", "".join(text_parts)).strip() def _element_to_str(self, element: Element | None) -> str: if element is None: return "" if element.tag in { "bold", "italic", "monospace", "p", "sc", "styled-content", "underline", "xref", }: # Mostly styling tags for which getting the inner text is enough. # Currently this is the same as the default handling. Writing it out # explicitly here to decouple from the default handling, which may # change in the future. return self._inner_text(element) elif element.tag == "sub": return f"_{self._inner_text(element)}" elif element.tag == "sup": return f"^{self._inner_text(element)}" elif element.tag in { "disp-formula", "email", "ext-link", "inline-formula", "uri", }: return "" else: # Default handling for all other element tags return self._inner_text(element) class PubMedXMLParser(ArticleParser): """Parser for PubMed abstract.""" def __init__(self, data: str | bytes) -> None: super().__init__() self.content = ElementTree.fromstring(data) @property def title(self) -> str: title = self.content.find("./PubmedArticle/MedlineCitation/Article/ArticleTitle") if title is None: return "" return "".join(title.itertext()) @property def abstract(self) -> list[tuple[str,str]]: abstract = self.content.find("./PubmedArticle/MedlineCitation/Article/Abstract") if abstract is None: # No paragraphs to parse: stop and return an empty iterable. return [] # noqa paragraphs = abstract.iter("AbstractText") abstract_list: list[tuple[str,str]] = [] if paragraphs is not None: for paragraph in paragraphs: sec_title = paragraph.get("Label") abstract_list.append((sec_title,"".join(paragraph.itertext()))) return abstract_list @property def paragraphs(self) -> list[tuple[str, str]]: # No paragraph to parse in PubMed article sets: return an empty iterable. return []