Mathieu Lai-King commited on
Commit
1a3b3aa
0 Parent(s):

first commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__/
2
+ .vscode/
3
+ models/
README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Outcome Switching Detector
3
+ emoji: 🔄
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: true
10
+ python_version: 3.11.9
11
+ models: ['aakorolyova/primary_and_secondary_outcome_extraction','Mathking/all-mpnet-outcome-similarity']
12
+ ---
13
+
14
+ # Outcome Switching Detector
15
+
16
+ ## Installation
17
+
18
+ 1. Download dependencies : `pip install -r requirements.txt`
19
+
20
+ 2. Define pretrained model paths in the config file: you must edit `config.json` so that it points to the models if you do not have them on disk. You can also redefine `ner_label2id` depending on the model you use.
21
+ ```json
22
+ {
23
+ "ner_path": "aakorolyova/primary_and_secondary_outcome_extraction",
24
+ "sim_path": "laiking/all-mpnet-outcome-similarity",
25
+ "ner_label2id" : {
26
+ "O": 0,
27
+ "B-PrimaryOutcome": 1,
28
+ "I-PrimaryOutcome": 2,
29
+ "B-SecondaryOutcome": 3,
30
+ "I-SecondaryOutcome": 4
31
+ }
32
+ }
33
+ ```
34
+
35
+ 3. Run `python3 app.py`
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import gradio as gr
from outcome_switch import OutcomeSwitchingDetector, get_sections_text
from outcome_switch.visual import (
    get_article_markdown,
    get_highlighted_text,
    get_registry_dataframe,
    get_sankey_diagram,
)

# Cosine-similarity cutoff above which a registry/article outcome pair is
# drawn as a strong link in the Sankey diagram.
_CALCULATED_COSINE_THRESHOLD = 0.44

# Use context managers so file handles are closed deterministically:
# the previous bare open(...).read() calls leaked handles until GC.
with open("front/app-description.md") as _fh:
    _app_description = _fh.read()
with open("front/examples.json") as _fh:
    _article_id_examples = json.load(_fh)
_pmcid_start_value = _article_id_examples[0]
with open('./config.json', 'r') as _fh:
    config = json.load(_fh)

# Load Detector (ner and sim model) once at startup — model loading is slow.
osd = OutcomeSwitchingDetector(
    config["ner_path"],
    config["sim_path"],
    config["ner_label2id"]
)
23
+
24
def controller(article_id: str):
    """Run the detection pipeline for one PMID/PMCID and build the four outputs.

    Returns a tuple (article_markdown, highlighted_text, registry_df,
    similarity_diagram); elements are None when the corresponding step could
    not be completed, with a gr.Warning raised to explain why.
    """
    # clean input and run detection
    article_id = str(article_id).strip()
    output = osd.detect(article_id)

    # init outputs
    article_markdown = None
    article_highlighted_text = None
    registry_df = None
    similarity_diagram = None

    # guard clauses: without a resolvable id or retrievable text, nothing
    # downstream can be rendered
    if output["db"] is None:
        gr.Warning(f"Wrong format for input id : {article_id}")
        return None, None, None, None
    if output["article_sections"] is None or output["filtered_sections"] is None:
        gr.Warning(f"Could not retrieve text for id {article_id} (id not found in database or abstract/fulltext unavailable on PubMed/PMC)")
        return None, None, None, None
    article_markdown = get_article_markdown(article_id, output["article_sections"], output["filtered_sections"])

    # highlighted NER annotations over the filtered text
    if output["raw_entities"] is not None and output["filtered_sections"] is not None:
        original_text = get_sections_text(output["filtered_sections"])
        article_highlighted_text = get_highlighted_text(output["raw_entities"], original_text)
    else:
        gr.Warning("Could not extract any outcomes entities in article text")

    # registry outcomes table; without it the diagram is impossible
    if output["ctgov_outcomes"] is None:
        gr.Warning("ClinicalTrials.Gov outcomes were not found (either no NCTID detected or no outcomes declared in registry)")
        return article_markdown, article_highlighted_text, registry_df, similarity_diagram
    registry_df = get_registry_dataframe(output["ctgov_outcomes"])

    # similarity diagram needs every intermediate result to be present
    have_everything = (
        output["connections"] is not None
        and output["raw_entities"] is not None
        and output["ctgov_outcomes"] is not None
        and output["article_outcomes"] is not None
    )
    if have_everything:
        registry_outcomes_tup = [
            (outcome["type"], outcome["measure"] + " , " + outcome["timeFrame"])
            for outcome in output["ctgov_outcomes"]
        ]
        similarity_diagram = get_sankey_diagram(
            registry_outcomes_tup,
            output["article_outcomes"],
            output["connections"],
            output["raw_entities"],
            _CALCULATED_COSINE_THRESHOLD,
        )
    else:
        gr.Warning("Could not compute similarity diagram (missing registry or article outcomes)")

    return article_markdown, article_highlighted_text, registry_df, similarity_diagram
75
+
76
def clean():
    """Reset all four result widgets to empty."""
    return (None,) * 4
78
+
79
with gr.Blocks() as blocks:
    with gr.Column():
        # Header + input widgets (id textbox, clear/detect buttons, examples)
        gr.Markdown('# Outcome Switching Detection \n' + _app_description)
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    pmid_input = gr.Textbox(value=_pmcid_start_value, label="PMID or PMCID (PMCID must be preceded by 'PMC' prefix)")
                with gr.Row():
                    clear_button = gr.ClearButton()
                    detect_button = gr.Button(value="Detect", variant="primary")
        gr.Examples(examples=_article_id_examples, inputs=pmid_input)
        # Result tabs: one per pipeline artefact
        gr.Markdown("## Results \n")
        with gr.Tabs():
            with gr.TabItem("Article Useful Sections"):
                filtered_article = gr.Markdown()
            with gr.TabItem("Article Detected Outcomes"):
                ner_output = gr.HighlightedText(
                    color_map={"primary": "lightcoral", "secondary": "lightgreen"},
                    show_legend=True,
                    combine_adjacent=True,
                )
            with gr.TabItem("Registry Outcomes"):
                ctgov_output = gr.DataFrame()
            with gr.TabItem("Similarity"):
                similarity_output = gr.Plot(show_label=False)
    # OUTPUTS AND BUTTONS
    outputs = [filtered_article, ner_output, ctgov_output, similarity_output]
    clear_button.add([pmid_input] + outputs)
    detect_button.click(fn=controller, inputs=pmid_input, outputs=outputs)

blocks.launch()
config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ner_path": "aakorolyova/primary_and_secondary_outcome_extraction",
3
+ "sim_path": "laiking/all-mpnet-outcome-similarity",
4
+ "ner_label2id" : {
5
+ "O": 0,
6
+ "B-PrimaryOutcome": 1,
7
+ "I-PrimaryOutcome": 2,
8
+ "B-SecondaryOutcome": 3,
9
+ "I-SecondaryOutcome": 4
10
+ }
11
+ }
front/app-description.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Demo of outcome switching detection using transformers models. Outcome switching is defined as the modification, inversion, suppression of a primary outcome in a Randomized Controlled Trial(RCT) between the published article and the registry entry.
2
+
3
+ What this demo is doing :
4
+ 1. Retrieve abstract (PMID given) or fulltext (PMCID given) of an article
5
+ 2. Parse the Methods section of the article and get section text
6
+ 3. Use finetuned NER model for detecting primary outcomes in that text
7
+ 4. Use a RegEx to find the NCT ID (ClinicalTrials.gov) in the full text
8
+ 5. Use CTGOV API to extract registry primary outcome (considered as ground truth)
9
+ 6. Use Semantic Textual Similarity Model to compare CTGOV outcome to article detected outcomes
front/examples.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "33476595",
3
+ "33356422",
4
+ "PMC6206648",
5
+ "PMC6935101",
6
+ "PMC8491132",
7
+ "PMC7781101",
8
+ "PMC8005085",
9
+ "29283904",
10
+ "33443017",
11
+ "31599809",
12
+ "30010751",
13
+ "29847251",
14
+ "29946728",
15
+ "29677641"
16
+ ]
outcome_switch/__init__.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ from outcome_switch.ctgov import extract_nct_outcomes
3
+ from outcome_switch.similarity import OutcomeSimilarity
4
+ from outcome_switch.entrez import dl_and_parse
5
+ from outcome_switch.filter import filter_sections, filter_outcomes, get_sections_text
6
+ from transformers import (BertConfig,
7
+ BertTokenizerFast,
8
+ BertForTokenClassification,
9
+ TokenClassificationPipeline)
10
+
11
class OutcomeSwitchingDetector:
    """Main class for the whole pipeline of outcome switching detection.

    Wraps a BERT token-classification model that extracts primary/secondary
    outcomes from article text, plus a sentence-similarity model that matches
    them against the outcomes declared on ClinicalTrials.gov.
    """

    def __init__(self, ner_path: str, sim_path: str, ner_label2id: dict[str, int]):
        """Load both models.

        Args:
            ner_path: HF hub id or local path of the outcome NER model.
            sim_path: HF hub id or local path of the similarity model.
            ner_label2id: label name -> integer id mapping for the NER head
                (fixed annotation: values are ints, not strings).
        """
        config = BertConfig.from_pretrained(
            ner_path,
            label2id=ner_label2id,
            id2label={v: k for k, v in ner_label2id.items()},
        )
        self.outcomes_ner = TokenClassificationPipeline(
            model=BertForTokenClassification.from_pretrained(ner_path, config=config),
            tokenizer=BertTokenizerFast.from_pretrained(ner_path),
            # keep "O" spans so the full text can be re-rendered with highlights
            ignore_labels=[],
            aggregation_strategy="average",
            # overlap windows so entities crossing chunk borders are preserved
            stride=64,
        )
        self.outcome_sim = OutcomeSimilarity(sim_path)

    def _extract_article_outcomes(self, article_text: str) -> dict[str, Any]:
        """Run NER over article text.

        Returns {"raw_entities": pipeline output or None,
                 "article_outcomes": [(type, text), ...] or None}.
        """
        if not article_text:
            return {"raw_entities": None, "article_outcomes": None}
        # get article outcomes (all pieces of text annotated)
        entities_list = self.outcomes_ner(article_text)
        # filter outcomes and reformat
        detected_outcomes = filter_outcomes(entities_list)
        return {"raw_entities": entities_list, "article_outcomes": detected_outcomes}

    def _compare_outcomes(
        self,
        registry_outcomes: list[dict[str, str]],
        article_outcomes: list[tuple[str, str]],
    ) -> Any:
        """Compare registry outcomes to article outcomes by semantic similarity.

        Note: registry_outcomes is a list of CTGov outcome dicts
        (type/measure/timeFrame) — the previous tuple annotation was wrong.
        Returns None when either side is empty.
        """
        if not registry_outcomes or not article_outcomes:
            return None
        registry_pairs = [
            (outcome["type"], outcome["measure"] + " , " + outcome["timeFrame"])
            for outcome in registry_outcomes
        ]
        # semantic similarity of outcomes between registry and article
        return self.outcome_sim.get_similarity(registry_pairs, article_outcomes)

    def detect(self, article_id: str) -> dict[str, Any]:
        """Detect outcome switching for one input id (pmid, pmcid).

        Returns a dictionary with the following keys:
        - db: resolved Entrez database ("pmc"/"pubmed") or None
        - article_xml: raw xml string of the article
        - article_sections: dict of all sections, key=title, value=list of text
        - check_type: type of the check for regex outcome section filtering (title or content)
        - regex_priority_name: name of the regex used for outcome section filtering
        - regex_priority_index: priority of the regex used (0 is highest)
        - filtered_sections: dict of filtered sections, key=title, value=list of text
        - raw_entities: token-classification pipeline output (including "O" spans)
        - article_outcomes: list of (type, outcome) tuples detected in the article
        - ctgov_outcomes: list of outcome dicts fetched from the registry
        - connections: similarity matches between registry and article outcomes
        """
        # download and parse article
        parse_output = dl_and_parse(article_id)
        # search nct id in text, then download and parse registry outcomes
        registry_outcomes = extract_nct_outcomes(parse_output["article_xml"])
        # filter article sections and get text
        filter_output = filter_sections(parse_output["article_sections"])
        sections_text = get_sections_text(filter_output["filtered_sections"])
        # outcomes ner in article text
        ner_output = self._extract_article_outcomes(sections_text)
        # compare outcomes between article and registry
        connections = self._compare_outcomes(registry_outcomes, ner_output["article_outcomes"])
        return parse_output | {"ctgov_outcomes": registry_outcomes} | filter_output | ner_output | {"connections": connections}
outcome_switch/ctgov.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import ast
3
+ import requests
4
+ from typing import Union
5
+
6
+ def _find_nctid(text: str) -> Union[str,None]:
7
+ "return nct string if found in text else none"
8
+ match = re.search(r"[Nn][Cc][Tt]0*[1-9]\d{0,7}", text)
9
+ return match[0] if match is not None else match
10
+
11
def _get_registry_outcomes(nct_id: str) -> Union[dict, None]:
    """Fetch the outcomesModule of a study from the ClinicalTrials.gov v2 API.

    Returns the outcomes module dict, or None when the request fails or the
    study declares no outcomes.
    """
    outcomes = None
    r = requests.get(
        f"https://clinicaltrials.gov/api/v2/studies/{nct_id}",
        params={"fields": "OutcomesModule"},
        timeout=30,  # keep the UI responsive if the registry is slow
    )
    if r.status_code == 200:
        # BUGFIX: parse the body once with r.json(); ast.literal_eval(r.text)
        # raises on JSON literals true/false/null, which CTGov responses contain.
        payload = r.json()
        if "outcomesModule" in payload.get("protocolSection", {}):
            outcomes = payload["protocolSection"]["outcomesModule"]
    return outcomes
17
+
18
+ def _reformat_outcomes(outcomes: dict) -> list[dict[str,str]]:
19
+ new_outcomes = []
20
+ for outcome_type, outcome_list in outcomes.items() :
21
+ outcome_type = outcome_type.replace("Outcomes","")
22
+ for outcome_item in outcome_list :
23
+ outcome_item["type"] = outcome_type
24
+ new_outcomes.append(outcome_item)
25
+ return new_outcomes
26
+
27
def extract_nct_outcomes(text: str) -> Union[None, list[dict[str, str]]]:
    """Extract outcomes from a text using CTGOV API v2 if an NCT id is found.

    Returns a flat list of outcome dicts tagged with their type, or None when
    the text is missing, no NCT id is present, or the registry declares no
    outcomes for that study.
    """
    if text is None:
        return None
    outcomes = None
    nct_id = _find_nctid(text)
    if nct_id is not None:
        outcomes = _get_registry_outcomes(nct_id)
        # BUGFIX: _get_registry_outcomes may return None (no outcomesModule or
        # HTTP error); the unconditional _reformat_outcomes call crashed on it.
        if outcomes is not None:
            outcomes = _reformat_outcomes(outcomes)
    return outcomes
outcome_switch/entrez.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for fetching and parsing articles from PubMed and PMC using Entrez efetch."""
2
+
3
+ from __future__ import annotations
4
+ import html
5
+ import requests
6
+ import unicodedata
7
+ from abc import ABC, abstractmethod
8
+ from io import StringIO
9
+ from pathlib import Path
10
+ from typing import IO, Any, Dict, Union
11
+ from xml.etree.ElementTree import Element # nosec
12
+ from zipfile import ZipFile
13
+ from typing import Generator
14
+ from defusedxml import ElementTree
15
+
16
+ _ENTREZ_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
17
+
18
+ def _db_parser(article_id:str) -> str|None:
19
+ """Parse the article ID to ensure it is in the correct format."""
20
+ db = None
21
+ if article_id.startswith('PMC') and article_id[3:].isdigit():
22
+ db = "pmc"
23
+ elif article_id.isdigit():
24
+ db = "pubmed"
25
+ return db
26
+
27
def _dl_article_xml(article_id: str, db: str | None) -> str | None:
    """Download the raw XML of an article via Entrez efetch.

    Returns the XML string on HTTP 200, otherwise None.
    (Fixed annotation: the function returns a single optional string,
    not the tuple previously declared.)
    """
    xml_string = None
    params = {"db": db, "id": article_id, "retmode": "xml"}
    # timeout keeps the app responsive when NCBI is slow or unreachable
    response = requests.get(_ENTREZ_EFETCH_URL, params=params, timeout=30)
    if response.status_code == 200:
        xml_string = response.text
    return xml_string
34
+
35
+ def _parse_article(xml_string:str, db:str) -> Union[None,ArticleParser] :
36
+ parsed_article = None
37
+ if db == "pmc":
38
+ parsed_article = JATSXMLParser.from_string(xml_string)
39
+ elif db == "pubmed":
40
+ parsed_article = PubMedXMLParser(xml_string)
41
+ # check if parsing was successful
42
+ if not parsed_article.abstract and not parsed_article.paragraphs:
43
+ parsed_article = None
44
+ return parsed_article
45
+
46
+ def _reformat_article(parsed_article:ArticleParser) -> Dict[str,Any] :
47
+ reformatted_article = {"Title":[parsed_article.title]}
48
+ for sec_title,sentence in parsed_article.abstract :
49
+ sec_title = "Abstract" if sec_title is None else "Abstract - " + sec_title
50
+ reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence]
51
+ for sec_title,sentence in parsed_article.paragraphs :
52
+ reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence]
53
+ return reformatted_article
54
+
55
+
56
def dl_and_parse(article_id: str) -> Dict[str, Union[None, Any]]:
    """Fetch an article from PubMed or PMC via Entrez efetch and parse it.

    Returns a dict with keys:
      - db: resolved Entrez database ("pmc"/"pubmed") or None for a bad id
      - article_xml: raw XML of the downloaded article (or None)
      - article_sections: {section title: [text, ...]} (or None)
    """
    result: Dict[str, Union[None, Any]] = {
        "db": None,
        "article_xml": None,
        "article_sections": None,
    }
    # resolve which Entrez database the id belongs to; bail on bad format
    result["db"] = _db_parser(article_id)
    if result["db"] is None:
        return result
    result["article_xml"] = _dl_article_xml(article_id, result["db"])
    parser = _parse_article(result["article_xml"], result["db"])
    if parser is None:
        return result
    result["article_sections"] = _reformat_article(parser)
    return result
77
+
78
class ArticleParser(ABC):
    """Abstract interface implemented by the JATS (PMC) and PubMed XML parsers."""

    @property
    @abstractmethod
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title.
        """

    @property
    @abstractmethod
    def abstract(self) -> list[tuple[str, str]]:
        """Get the abstract paragraphs with their section labels.

        Returns
        -------
        list of (str, str)
            For each abstract paragraph, a (section label, paragraph text)
            tuple; the label may be None for unlabelled abstracts.
            (Fixed annotation: both concrete implementations return tuples,
            not the ``list[str]`` previously declared.)
        """

    @property
    @abstractmethod
    def paragraphs(self) -> list[tuple[str, str]]:
        """Get all paragraphs and titles of sections they are part of.

        Returns
        -------
        list of (str, str)
            For each paragraph a tuple with two strings: the section title
            and the paragraph content.
        """
114
+
115
+
116
class JATSXMLParser(ArticleParser):
    """Parser for PMC full-text articles in JATS XML."""

    def __init__(self, xml_stream: IO[Any]) -> None:
        super().__init__()
        self.content = ElementTree.parse(xml_stream)
        # efetch wraps a single article in <pmc-articleset>; unwrap it
        if self.content.getroot().tag == "pmc-articleset":
            self.content = self.content.find("article")

    @classmethod
    def from_string(cls, xml_string: str) -> JATSXMLParser:
        """Build a parser from an in-memory XML string."""
        with StringIO(xml_string) as stream:
            obj = cls(stream)
        return obj

    @classmethod
    def from_zip(cls, path: str | Path) -> JATSXMLParser:
        """Build a parser from a zip archive holding exactly one content/*.xml.

        Raises
        ------
        ValueError
            If the archive does not contain exactly one content/*.xml file.
        """
        with ZipFile(path) as myzip:
            xml_files = [
                x
                for x in myzip.namelist()
                if x.startswith("content/") and x.endswith(".xml")
            ]
            if len(xml_files) != 1:
                raise ValueError(
                    "There needs to be exactly one .xml file inside of content/"
                )
            xml_file = xml_files[0]
            # Parsing logic
            with myzip.open(xml_file, "r") as fh:
                obj = cls(fh)
            return obj

    @property
    def title(self) -> str:
        titles = self.content.find("./front/article-meta/title-group/article-title")
        return self._element_to_str(titles)

    @property
    def abstract(self) -> list[tuple[str, str]]:
        abstract = self.content.find("./front/article-meta/abstract")
        abstract_list: list[tuple[str, str]] = []
        # BUGFIX: an Element with no children is falsy, so `if abstract:`
        # skipped abstracts that exist but have no sub-elements; per the
        # ElementTree docs, presence must be tested with `is not None`.
        if abstract is not None:
            for sec_title, text in self.parse_section(abstract):
                abstract_list.append((sec_title, text))
        return abstract_list

    @property
    def paragraphs(self) -> list[tuple[str, str]]:
        paragraph_list: list[tuple[str, str]] = []

        # Paragraphs of text body
        body = self.content.find("./body")
        # BUGFIX: same Element-truthiness pitfall as in `abstract`
        if body is not None:
            paragraph_list.extend(self.parse_section(body, ""))

        # Figure captions
        figs = self.content.findall("./body//fig")
        for fig in figs:
            fig_captions = fig.findall("caption")  # findall returns a list, possibly empty
            caption = " ".join(self._element_to_str(c) for c in list(fig_captions))
            if caption:
                paragraph_list.append(("Figure Caption", caption))

        # Table captions
        tables = self.content.findall("./body//table-wrap")
        for table in tables:
            caption_elements = table.findall("./caption/p") or table.findall(
                "./caption/title"
            )
            caption = " ".join(self._element_to_str(c) for c in caption_elements)
            if caption:
                paragraph_list.append(("Table Caption", caption))
        return paragraph_list

    def parse_section(self, section: Element, sec_title_path: str = "") -> Generator[tuple[str, str], None, None]:
        """Yield (section title path, text) pairs for a section, recursing into sub-sections."""
        sec_title = self._element_to_str(section.find("title"))
        # deliberately dropped: contributor lists are noise for outcome detection
        if sec_title == "Author contributions":
            return
        sec_title_path = sec_title_path + " - " + sec_title if sec_title_path else sec_title
        for element in section:
            if element.tag == "sec":
                yield from self.parse_section(element, sec_title_path)
            elif element.tag in {"title", "caption", "fig", "table-wrap", "label"}:
                # titles are already in the path; captions handled separately
                continue
            else:
                text = self._element_to_str(element)
                if text:
                    yield sec_title_path, text

    def _inner_text(self, element: Element) -> str:
        """Concatenate an element's text, its children's text, and tail text."""
        text_parts = [html.unescape(element.text or "")]
        for sub_element in element:
            # recursively parse the sub-element
            text_parts.append(self._element_to_str(sub_element))
            # don't forget the text after the sub-element
            text_parts.append(html.unescape(sub_element.tail or ""))
        return unicodedata.normalize("NFKC", "".join(text_parts)).strip()

    def _element_to_str(self, element: Element | None) -> str:
        """Render one element to plain text, with tag-specific handling."""
        if element is None:
            return ""

        if element.tag in {
            "bold",
            "italic",
            "monospace",
            "p",
            "sc",
            "styled-content",
            "underline",
            "xref",
        }:
            # Mostly styling tags for which getting the inner text is enough.
            # Currently this is the same as the default handling. Writing it out
            # explicitly here to decouple from the default handling, which may
            # change in the future.
            return self._inner_text(element)
        elif element.tag == "sub":
            return f"_{self._inner_text(element)}"
        elif element.tag == "sup":
            return f"^{self._inner_text(element)}"
        elif element.tag in {
            "disp-formula",
            "email",
            "ext-link",
            "inline-formula",
            "uri",
        }:
            # links and formulas carry no useful prose — drop them
            return ""
        else:
            # Default handling for all other element tags
            return self._inner_text(element)
254
+
255
+
256
class PubMedXMLParser(ArticleParser):
    """Parser for PubMed abstract-only XML records."""

    def __init__(self, data: str | bytes) -> None:
        super().__init__()
        self.content = ElementTree.fromstring(data)

    @property
    def title(self) -> str:
        node = self.content.find("./PubmedArticle/MedlineCitation/Article/ArticleTitle")
        return "".join(node.itertext()) if node is not None else ""

    @property
    def abstract(self) -> list[tuple[str, str]]:
        root = self.content.find("./PubmedArticle/MedlineCitation/Article/Abstract")
        if root is None:
            # No paragraphs to parse: stop and return an empty iterable.
            return []
        collected: list[tuple[str, str]] = []
        for node in root.iter("AbstractText"):
            # the Label attribute (e.g. "METHODS") may be absent -> None
            collected.append((node.get("Label"), "".join(node.itertext())))
        return collected

    @property
    def paragraphs(self) -> list[tuple[str, str]]:
        # PubMed records carry no full-text body: always empty.
        return []
290
+
291
+
outcome_switch/filter.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List, Any, Tuple
3
+
4
# Regex fragments used to rank article sections by how likely they are to
# describe trial outcomes. Raw strings fix the invalid escape sequences
# (e.g. '\s' in a non-raw literal is a SyntaxWarning on Python 3.12+).
STRICT_OUTCOME_REGEX = r'(outcome|end(\s)?point)'
OUTCOME_REGEX = r'(outcome|end(\s)?point|measure|assessment)'

METHOD_REGEX = r'(method|approach|strategy|design|protocol)'
SAMPLE_SIZE_REGEX = r'sample\s(size|number)'
ABSTRACT_REGEX = r'(abstract|summary)'

STRICT_PRIM_SEC_REGEX = f'(primary|secondary|main|)\\s([a-z]+\\s)?{STRICT_OUTCOME_REGEX}'
PRIM_SEC_REGEX = f'(primary|secondary|main|)\\s([a-z]+\\s)?{OUTCOME_REGEX}'
STRICT_METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{STRICT_PRIM_SEC_REGEX}'
METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{PRIM_SEC_REGEX}'

# Ordered (name, where-to-look, pattern) checks; the first priority level
# that matches any section decides which sections are kept.
CHECK_PRIORITY = [
    ("strict_method_and_prim_sec", "title", STRICT_METHOD_AND_PRIM_SEC_REGEX),
    ("strict_prim_sec", "title", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "title", PRIM_SEC_REGEX),
    ("outcome", "title", OUTCOME_REGEX),
    ("strict_prim_sec", "content", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "content", PRIM_SEC_REGEX),
    ("method_and_prim_sec", "title", METHOD_AND_PRIM_SEC_REGEX),
    ("outcome", "content", OUTCOME_REGEX),
    ("method", "title", METHOD_REGEX),
    ("sample_size", "title", SAMPLE_SIZE_REGEX),
    ("abstract", "title", ABSTRACT_REGEX),
]
29
+
30
def filter_sections(sections_dict: Dict[str, List[str]]) -> Dict[str, Any]:
    """Keep only the sections most likely to describe outcomes.

    Each (name, check_type, regex) entry of CHECK_PRIORITY is tried in order,
    against either the section title or its joined content; every section
    matching the first successful priority level is kept.

    Args:
        sections_dict: section title -> list of text content.

    Returns:
        Dict with keys:
        - filtered_sections: matching {title: content list} (None if no input)
        - regex_priority_index: index in CHECK_PRIORITY of the winning check
        - regex_priority_name: name of the winning check
        - check_type: whether the match was on "title" or "content"
    """
    result: Dict[str, Any] = {
        "filtered_sections": None,
        "regex_priority_index": None,
        "regex_priority_name": None,
        "check_type": None,
    }
    if not sections_dict:
        return result
    result["filtered_sections"] = {}
    for priority_index, (priority_name, check_type, pattern) in enumerate(CHECK_PRIORITY):
        compiled = re.compile(pattern, re.IGNORECASE)
        level_matched = False
        for section_title, section_content in sections_dict.items():
            haystack = section_title if check_type == "title" else '\n'.join(section_content)
            if compiled.search(haystack):
                result["check_type"] = check_type
                result["regex_priority_name"] = priority_name
                result["regex_priority_index"] = priority_index
                result["filtered_sections"][section_title] = section_content
                level_matched = True
        # stop at the first priority level that matched anything
        if level_matched:
            break
    return result
70
+
71
+
72
def filter_outcomes(entities: List[Dict[str, Any]]) -> List[Tuple[str, str]]:
    """Keep only primary/secondary outcome entities as (type, text) tuples.

    Non-entity ("O") spans and any other groups are dropped.
    """
    label_map = {"PrimaryOutcome": "primary", "SecondaryOutcome": "secondary"}
    return [
        (label_map[entity["entity_group"]], entity["word"])
        for entity in entities
        if entity["entity_group"] in label_map
    ]
84
+
85
def get_sections_text(sections: Dict[str, List[str]]) -> "str | None":
    """Concatenate section titles and their joined contents into one text blob.

    Returns None for an empty/None sections dict (fixed annotation: the
    previous signature declared ``str`` but this path returns None).
    """
    if not sections:
        return None
    parts = []
    for title, content in sections.items():
        parts.append(title + '\n' + " ".join(content) + '\n')
    return "".join(parts)
outcome_switch/similarity.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from sentence_transformers.util import cos_sim
4
+ from transformers import AutoTokenizer, AutoModel
5
+
6
+
7
class OutcomeSimilarity:
    """Similarity detector between outcome statements."""

    # human-readable names for a binary similar/different decision
    ID2LABEL = ["different", "similar"]

    def __init__(self, model_path: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path)

    def _mean_pooling(self, model_output, attention_mask: torch.Tensor):
        """Mean pooling that takes the attention mask into account for correct averaging."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(
            -1).expand(token_embeddings.size()).float()
        # clamp avoids division by zero for fully-masked rows
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _encode(self, outcomes_lot: list[tuple[str, str]]):
        """Embed the text part of (type, text) tuples into normalized vectors."""
        sentences = []
        if len(outcomes_lot) > 0:
            _, sentences = zip(*outcomes_lot)
            sentences = list(sentences)  # tokenizer expects a list, zip yields a tuple
        # Tokenize sentences
        encoded_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Perform pooling
        sentence_embeddings = self._mean_pooling(
            model_output, encoded_input['attention_mask'])
        # Normalize embeddings so cosine similarity equals the dot product
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings

    def get_similarity(
        self,
        registry_outcomes: list[tuple[str, str]],
        article_outcomes: list[tuple[str, str]],
    ) -> set[tuple[int, int, float]]:
        """Match each registry outcome with its most similar article outcome.

        Returns a set of (registry index, article index, cosine score) tuples
        (fixed annotation: a set is returned, not a list, and the score is the
        third slot, i=2). Every registry row gets its best article column, and
        any article column not already used gets its best registry row, so
        both sides are fully covered.
        """
        connections = set()
        rembs = self._encode(registry_outcomes)
        aembs = self._encode(article_outcomes)
        cosines_scores = cos_sim(rembs, aembs)
        lines_max = torch.argmax(cosines_scores, dim=1)
        col_max = torch.argmax(cosines_scores, dim=0)
        remaining_cols = set(range(len(col_max)))
        for i in range(len(lines_max)):
            connection = (i, lines_max[i].item(), cosines_scores[i, lines_max[i]].item())
            remaining_cols.discard(lines_max[i].item())
            connections.add(connection)
        for j in remaining_cols:
            connection = (col_max[j].item(), j, cosines_scores[col_max[j], j].item())
            connections.add(connection)
        return connections
outcome_switch/visual.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.graph_objects as go
3
+ from typing import List, Dict, Any, Tuple, Union
4
+
5
+
6
# URL templates for linking back to the online article; {article_id} is a
# PubMed id (numeric) or a PMC id ("PMC"-prefixed) respectively.
_PUBMED_LINK= "https://pubmed.ncbi.nlm.nih.gov/{article_id}/"
_PMC_LINK = "https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}/"
# Markdown skeleton filled in by get_article_markdown().
_MARKDOWN_TEMPLATE = """# [{article_title}]({article_link})
# Filtered sections :

{sections_md}"""
12
+
13
+ # entities highlighted text
14
# entities highlighted text
def get_highlighted_text(entities:List[Dict[str,Any]], original_text:str) -> List[Tuple[str,Union[str,None]]] :
    """Convert the output of the model to a list of tuples (entity, label)
    for `gradio.HighlightedText` output.

    Each entity dict must carry "start"/"end" character offsets into
    `original_text` and an "entity_group" label. "PrimaryOutcome" and
    "SecondaryOutcome" map to the "primary"/"secondary" highlight classes;
    "O" — and, defensively, any unexpected label — maps to None (no highlight).
    """
    conversion = {"PrimaryOutcome":"primary","SecondaryOutcome":"secondary"}
    highlighted_text = []
    for entity in entities:
        entity_original_text = original_text[entity["start"]:entity["end"]]
        # .get() returns None for "O" (same as before) and, unlike the previous
        # conversion[...] lookup, no longer raises KeyError if the model ever
        # emits a label outside the conversion table.
        highlighted_text.append((entity_original_text, conversion.get(entity["entity_group"])))
    return highlighted_text
27
+
28
+ # article filtered sections markdown output
29
# article filtered sections markdown output
def get_article_markdown(
    article_id:str,
    article_sections:dict[str,list[str]],
    filtered_sections:dict[str,list[str]]) -> str:
    """Render a linked article title plus its filtered sections as markdown."""
    # PMC ids start with "PMC"; anything else is treated as a PubMed id.
    link_template = _PMC_LINK if article_id.startswith("PMC") else _PUBMED_LINK
    article_link = link_template.format(article_id=article_id)
    # The article title is stored as the first entry of the "Title" section.
    article_title = article_sections["Title"][0]
    # One "## heading" block per filtered section, paragraphs joined by spaces.
    section_parts = []
    for heading, paragraphs in filtered_sections.items():
        section_parts.append(f"## {heading}\n")
        section_parts.append(" ".join(paragraphs) + "\n")
    return _MARKDOWN_TEMPLATE.format(
        article_link=article_link,
        article_title=article_title,
        sections_md="".join(section_parts)
    )
48
+
49
+ # registry dataframe display
50
+ def _highlight_df_rows(row):
51
+ if row['type'] =='primary':
52
+ return ['background-color: lightcoral'] * len(row)
53
+ elif row['type'] == 'secondary':
54
+ return ['background-color: lightgreen'] * len(row)
55
+ else :
56
+ return ['background-color: lightgrey'] * len(row)
57
+
58
def get_registry_dataframe(registry_outcomes: list[dict[str,str]]) -> "pd.io.formats.style.Styler":
    """Build a DataFrame of registry outcome records and color each row by its
    'type' column (see _highlight_df_rows). Returns a pandas Styler, not a str."""
    return pd.DataFrame(registry_outcomes).style.apply(_highlight_df_rows, axis=1)
60
+
61
+ # fcts for sankey diagram
62
+ def _sent_line_formatting(sentence:str, max_words:int=10) -> str:
63
+ """format a sentence to be displayed in a sankey diagram so that
64
+ each line has a maximum of `max_words` words"""
65
+ words = sentence.split()
66
+ batchs = [words[i:i+max_words] for i in range(0, len(words), max_words)]
67
+ return "<br>".join([" ".join(batch) for batch in batchs])
68
+
69
+ def _find_entity_score(entity_text, raw_entities):
70
+ for tc_output in raw_entities:
71
+ if entity_text == tc_output["word"]:
72
+ return tc_output["score"]
73
+
74
def get_sankey_diagram(
    registry_outcomes: list[tuple[str,str]],
    article_outcomes: list[tuple[str,str]],
    connections: set[tuple[int,int,float]],
    raw_entities: list[Dict[str,Any]],
    cosine_threshold: float=0.44,
) -> go.Figure:
    """Build a sankey diagram linking registry outcomes (left) to article
    outcomes (right).

    Args:
        registry_outcomes / article_outcomes: (type, sentence) tuples; type is
            one of "primary", "secondary", "other".
        connections: (registry_index, article_index, cosine_similarity) triples.
        raw_entities: raw token-classification outputs used to display the
            model confidence for each article outcome on hover.
        cosine_threshold: links at or below this similarity are greyed out.
    """
    color_map = {
        "primary": "red",
        "secondary": "green",
        "other": "grey",
    }
    # Create lists of formatted sentences and colors for the nodes
    list1 = [(_sent_line_formatting(sent), color_map[typ]) for typ, sent in registry_outcomes]
    list2 = [(_sent_line_formatting(sent), color_map[typ]) for typ, sent in article_outcomes]
    # Create a list of labels and colors for the nodes (registry first, then article)
    labels = [x[0] for x in list1 + list2]
    colors = [x[1] for x in list1 + list2]
    # Freeze the connection set once so every per-link list below (sources,
    # targets, colors, customdata) is built in the same order.
    ordered_connections = list(connections)
    # Article nodes come after the registry nodes in `labels`, so article index
    # j maps to node offset + j. (Resolving nodes with labels.index() would be
    # wrong: an article sentence identical to a registry sentence would resolve
    # to the registry-side node.)
    offset = len(list1)
    sources = [i for i, _, _ in ordered_connections]
    targets = [offset + j for _, j, _ in ordered_connections]
    # Every link gets the same unit width; color encodes the similarity test
    values = [1] * len(ordered_connections)
    connection_colors = [
        "mediumaquamarine" if cosine > cosine_threshold else "lightgray"
        for _, _, cosine in ordered_connections
    ]

    # data appearing on hover of each node (outcome)
    node_customdata = [f"from: registry<br>type:{t}" for t,_ in registry_outcomes]
    node_customdata += [f"from: article<br>type: {t}<br>confidence: " + str(_find_entity_score(s, raw_entities)) for t,s in article_outcomes]
    node_hovertemplate = "outcome: %{label}<br>%{customdata} <extra></extra>"
    # data appearing on hover of each link (node connections)
    link_customdata = [cosine for _,_,cosine in ordered_connections]
    link_hovertemplate = "similarity: %{customdata} <extra></extra>"
    # sankey diagram data filling
    sankey = go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
            color=colors,
            customdata=node_customdata,
            hovertemplate=node_hovertemplate
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            customdata=link_customdata,
            color=connection_colors,
            hovertemplate=link_hovertemplate
        )
    )
    # conversion to figure
    fig = go.Figure(data=[sankey])
    fig.update_layout(
        title_text="Registry outcomes (left) connections with article outcomes (right), similarity threshold = " + str(cosine_threshold),
        font_size=10,
        width=1200,
        xaxis=dict(rangeslider=dict(visible=True),type="linear")
    )
    return fig
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.4.0
+ pandas==2.2.2
3
+ gradio==4.44.0
4
+ plotly==5.24.0
5
+ transformers==4.44.2
6
+ sentence-transformers==3.0.1
test/parse_examples/36473651.xml ADDED
The diff for this file is too large to render. See raw diff
 
test/parse_examples/PMC11102686.xml ADDED
The diff for this file is too large to render. See raw diff
 
test/test_ctgov.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from outcome_switch.ctgov import find_nctid, get_registry_outcomes, reformat_outcomes
3
+
4
+
5
_EMPTY_STRING = ""
# NCT REGEX extraction tests (find_nctid fixtures)
_TEXT_WITH_ONE_NCT = """blablabla nct id is NCT04647656 blabla"""
# contains two ids; find_nctid is expected to return the first one
_TEXT_WITH_TWO_NCT = """blablabla nct id is NCT04647656 blabla NCT06562582 bla"""
_TEXT_WITHOUT_NCT = "blablabla blablabla"

# NCT REGISTRY API tests (get_registry_outcomes fixtures)
_CORRECT_NCT = "NCT04647656"
# not an NCT-format identifier; the registry lookup should fail
_INCORRECT_NCT = "PRN03216548"

# REGISTRY outcomes reformatting test (reformat_outcomes fixture)
# NOTE(review): presumably mirrors the ClinicalTrials.gov API outcome
# payload shape — confirm against outcome_switch.ctgov.
_CTGOV_OUTCOMES = {
    "primaryOutcomes": [
        {
            "measure": "Cognitive health assessment (NeuroTrax)",
            "description": "Memory, attention and information process will be evaluated using the NeuroTrax computerized cognitive evaluation battery.",
            "timeFrame": "Baseline, 2 months",
        }
    ],
    "secondaryOutcomes": [
        {
            "measure": "Brain perfusion",
            "description": "Cerebral blood volume and flow will be measured using perfusion MRI protocol Dynamic susceptibility contrast (DSC).",
            "timeFrame": "Baseline, 2 months",
        }
    ],
}
32
+
33
class NctidFinderTest(unittest.TestCase):
    """find_nctid() should return the first NCT id found in a text, or None."""

    def test_text_with_one_nct(self):
        found = find_nctid(_TEXT_WITH_ONE_NCT)
        self.assertEqual(found, "NCT04647656")

    def test_text_with_two_nct(self):
        # only the first of the two ids is expected back
        found = find_nctid(_TEXT_WITH_TWO_NCT)
        self.assertEqual(found, "NCT04647656")

    def test_text_without_nct(self):
        found = find_nctid(_TEXT_WITHOUT_NCT)
        self.assertIsNone(found)

    def test_empty_string(self):
        found = find_nctid(_EMPTY_STRING)
        self.assertIsNone(found)

    def test_none_input(self):
        found = find_nctid(None)
        self.assertIsNone(found)
49
+
50
class CtgovExtractionTest(unittest.TestCase):
    """get_registry_outcomes() lookups against the registry.

    NOTE(review): these appear to query the live registry — network required;
    confirm against outcome_switch.ctgov.
    """

    def test_correct_nct(self):
        outcomes = get_registry_outcomes(_CORRECT_NCT)
        self.assertIsNotNone(outcomes)

    def test_incorrect_nct(self):
        outcomes = get_registry_outcomes(_INCORRECT_NCT)
        self.assertIsNone(outcomes)

    def test_empty_string(self):
        outcomes = get_registry_outcomes(_EMPTY_STRING)
        self.assertIsNone(outcomes)
60
+
61
class CtgovReformatTest(unittest.TestCase):
    """reformat_outcomes() flattens the registry outcome dict into typed records."""

    def test_correct_reformat_outcomes(self):
        # Reformat once and assert on the result, instead of re-running
        # reformat_outcomes() for every single assertion as before.
        reformatted = reformat_outcomes(_CTGOV_OUTCOMES)
        self.assertIsInstance(reformatted, list)
        self.assertEqual(len(reformatted), 2)
        self.assertIsInstance(reformatted[0], dict)
        self.assertIsInstance(reformatted[1], dict)
        # primary outcomes are expected to precede secondary outcomes
        self.assertEqual(reformatted[0]["type"], "primary")
        self.assertEqual(reformatted[1]["type"], "secondary")
test/test_entrez.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from outcome_switch.entrez import _dl_article_xml, _parse_article, _reformat_article
3
+
4
# Efetch tests — fixtures for _dl_article_xml()
_VALID_PMCID = "PMC6206648"
_VALID_PMID_1 = "29283904"
_VALID_PMID_2 = "29214975"
# a DOI, not a PubMed/PMC id — the download should fail
_INVALID_1 = "10.1056/NEJMoa2110345"
# numeric but (presumably) not an existing PMID — verify it stays unassigned
_INVALID_2 = "0123456789"
_EMPTY = ""

# XML Parsing tests files
# TODO : tests for parsing XML files
14
+
15
+
16
class EntrezEfetchTest(unittest.TestCase):
    """_dl_article_xml() downloads an article's XML and reports the source db.

    The result is indexable: [0] is the XML payload (or None on failure) and
    [1] is the database name ("pmc"/"pubmed", or None).
    NOTE(review): these appear to hit a live NCBI service — network required.
    """

    def test_valid_pmcid(self):
        # download once per test instead of once per assertion (each call was
        # a separate network round-trip before)
        result = _dl_article_xml(_VALID_PMCID)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pmc")

    def test_valid_pmid1(self):
        result = _dl_article_xml(_VALID_PMID_1)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pubmed")

    def test_valid_pmid2(self):
        result = _dl_article_xml(_VALID_PMID_2)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pubmed")

    def test_invalid1(self):
        self.assertIsNone(_dl_article_xml(_INVALID_1)[0])

    def test_invalid2(self):
        self.assertIsNone(_dl_article_xml(_INVALID_2)[0])

    def test_empty(self):
        result = _dl_article_xml(_EMPTY)
        self.assertIsNone(result[0])
        self.assertIsNone(result[1])
test/test_filter.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from outcome_switch import filter_sections, filter_outcomes, get_sections_text
3
+
4
+ _VALID_DICT_WITH_2_SECTIONS = {
5
+ "Methods - Outcomes - Primary outcome": [
6
+ "The FIRST primary outcome is pain at 12months as measured by the VAS. The primary analysis is to assess whether surgical correction of the impingement morphology (arthroscopic osteochondroplasty) with/without labral repair, in adults aged 1850 years diagnosed with FAI, provides decreased pain at 12months compared to arthroscopic lavage of the hip joint with/without labral repair, as measured by the VAS. The VAS is a validated unidimensional scale that is easy to use, requires no verbal or reading skills, and is sufficiently versatile to be employed in a variety of settings [ \n 2 4]."
7
+ ],
8
+ "Methods - Outcomes - Secondary outcomes": [
9
+ "Secondary outcomes include:",
10
+ "Hip function as measured by the Hip Outcome Score (HOS).",
11
+ "Generic physical and mental health as measured by the Short Form-12 (SF-12).",
12
+ "Impact of hip-specific disease on function and lifestyle in the young, active patient as measured by the International Hip Outcome Tool (iHOT-12).",
13
+ "Health utility as measured by the EuroQol (EQ-5D).",
14
+ "Complications, including additional surgery and other serious and non-serious adverse events. Reasons for re-operations for the randomized hip typically include, but are not limited to re-injury of the labrum/cartilage, hip dislocation, hip instability, infection (deep or superficial), wound healing problem, soft tissue problem, and unresolved hip pain. Other hip-related adverse events to be reported include, but are not limited to, hip instability, tendinopathy, re-injury of the labrum/cartilage, hip osteoarthritis post-surgery, and infection (superficial or deep).",
15
+ "The HOS is a self-administered hip score that was designed to capture hip function and outcomes following surgical therapies such as arthroscopy [ \n 5]. The HOS has been shown to have the greatest clinimetric evidence for use in patients with FAI or labral tears [ 6, 7]. The SF-12 may be self-completed or interview-administered and will help document general health status and the burden of illness that FAI presents [ 8]. The iHOT-12 is a shorter version of the iHOT-33 designed to be easier to complete in routine clinical practice to measure both health-related quality of life and changes after treatment in young, active patients with hip disorders [ 9]. This questionnaire has been shown to be valid, reliable, and responsive to change [ 9]. The EQ-5D is a standardized instrument for use as a measure of health outcome [ 10]. The EQ-5D comprises five dimensions of health (mobility, self-care, usual activities, pain/discomfort, and anxiety/depression). The EQ-5D has been used in previous studies involving patients with hip pain and has been extensively validated [ 11, 12].",
16
+ ],
17
+ "Discussion - Analysis plan - Blinded analyses": [
18
+ "All statistical analyses will first be completed using blinded treatment groups (i.e. treatment X and Y). Interpretations for the effect of the surgical interventions will be documented based upon blinded X versus Y treatment [ \n 14]."
19
+ ],
20
+ }
21
+
22
+ _FILTERED_SECTIONS = {
23
+ "filtered_sections": {
24
+ "Methods - Outcomes - Primary outcome": [
25
+ "The FIRST primary outcome is pain at 12months as measured by the VAS. The primary analysis is to assess whether surgical correction of the impingement morphology (arthroscopic osteochondroplasty) with/without labral repair, in adults aged 1850 years diagnosed with FAI, provides decreased pain at 12months compared to arthroscopic lavage of the hip joint with/without labral repair, as measured by the VAS. The VAS is a validated unidimensional scale that is easy to use, requires no verbal or reading skills, and is sufficiently versatile to be employed in a variety of settings [ \n 2 4]."
26
+ ],
27
+ "Methods - Outcomes - Secondary outcomes": [
28
+ "Secondary outcomes include:",
29
+ "Hip function as measured by the Hip Outcome Score (HOS).",
30
+ "Generic physical and mental health as measured by the Short Form-12 (SF-12).",
31
+ "Impact of hip-specific disease on function and lifestyle in the young, active patient as measured by the International Hip Outcome Tool (iHOT-12).",
32
+ "Health utility as measured by the EuroQol (EQ-5D).",
33
+ "Complications, including additional surgery and other serious and non-serious adverse events. Reasons for re-operations for the randomized hip typically include, but are not limited to re-injury of the labrum/cartilage, hip dislocation, hip instability, infection (deep or superficial), wound healing problem, soft tissue problem, and unresolved hip pain. Other hip-related adverse events to be reported include, but are not limited to, hip instability, tendinopathy, re-injury of the labrum/cartilage, hip osteoarthritis post-surgery, and infection (superficial or deep).",
34
+ "The HOS is a self-administered hip score that was designed to capture hip function and outcomes following surgical therapies such as arthroscopy [ \n 5]. The HOS has been shown to have the greatest clinimetric evidence for use in patients with FAI or labral tears [ 6, 7]. The SF-12 may be self-completed or interview-administered and will help document general health status and the burden of illness that FAI presents [ 8]. The iHOT-12 is a shorter version of the iHOT-33 designed to be easier to complete in routine clinical practice to measure both health-related quality of life and changes after treatment in young, active patients with hip disorders [ 9]. This questionnaire has been shown to be valid, reliable, and responsive to change [ 9]. The EQ-5D is a standardized instrument for use as a measure of health outcome [ 10]. The EQ-5D comprises five dimensions of health (mobility, self-care, usual activities, pain/discomfort, and anxiety/depression). The EQ-5D has been used in previous studies involving patients with hip pain and has been extensively validated [ 11, 12].",
35
+ ],
36
+ },
37
+ "regex_priority_index": 0,
38
+ "regex_priority_name": "strict_method_and_prim_sec",
39
+ "check_type": "title",
40
+ }
41
+
42
+
43
+ _EMPTY_DICT = {}