Mathieu Lai-King commited on
Commit
1a3b3aa
0 Parent(s):

first commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__/
2
+ .vscode/
3
+ models/
README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Outcome Switching Detector
3
+ emoji: 🔄
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: true
10
+ python_version: 3.11.9
11
+ models: ['aakorolyova/primary_and_secondary_outcome_extraction','Mathking/all-mpnet-outcome-similarity']
12
+ ---
13
+
14
+ # Outcome Switching Detector
15
+
16
+ ## Installation
17
+
18
+ 1. Download dependencies : `pip install -r requirements.txt`
19
+
20
+ 2. Define pretrained model paths in the config file: you must edit `config.json` so that it points to the models if you do not have them on disk. You can also redefine `ner_label2id` depending on the model you use.
21
+ ```json
22
+ {
23
+ "ner_path": "aakorolyova/primary_and_secondary_outcome_extraction",
24
+ "sim_path": "laiking/all-mpnet-outcome-similarity",
25
+ "ner_label2id" : {
26
+ "O": 0,
27
+ "B-PrimaryOutcome": 1,
28
+ "I-PrimaryOutcome": 2,
29
+ "B-SecondaryOutcome": 3,
30
+ "I-SecondaryOutcome": 4
31
+ }
32
+ }
33
+ ```
34
+
35
+ 3. Run `python3 app.py`
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import gradio as gr
from outcome_switch import OutcomeSwitchingDetector, get_sections_text
from outcome_switch.visual import (
    get_article_markdown,
    get_highlighted_text,
    get_registry_dataframe,
    get_sankey_diagram,
)

# Cosine-similarity cutoff above which a registry/article outcome pair is
# drawn as a strong link in the Sankey diagram.
_CALCULATED_COSINE_THRESHOLD = 0.44

# Use context managers so file handles are closed deterministically:
# the previous bare open(...).read() calls leaked handles until GC.
with open("front/app-description.md") as _fh:
    _app_description = _fh.read()
with open("front/examples.json") as _fh:
    _article_id_examples = json.load(_fh)
_pmcid_start_value = _article_id_examples[0]
with open('./config.json', 'r') as _fh:
    config = json.load(_fh)

# Load Detector (ner and sim model) once at startup — model loading is slow.
osd = OutcomeSwitchingDetector(
    config["ner_path"],
    config["sim_path"],
    config["ner_label2id"]
)
23
+
24
def controller(article_id: str):
    """Run the detection pipeline for one PMID/PMCID and build the four outputs.

    Returns a tuple (article_markdown, highlighted_text, registry_df,
    similarity_diagram); elements are None when the corresponding step could
    not be completed, with a gr.Warning raised to explain why.
    """
    # clean input and run detection
    article_id = str(article_id).strip()
    output = osd.detect(article_id)

    # init outputs
    article_markdown = None
    article_highlighted_text = None
    registry_df = None
    similarity_diagram = None

    # guard clauses: without a resolvable id or retrievable text, nothing
    # downstream can be rendered
    if output["db"] is None:
        gr.Warning(f"Wrong format for input id : {article_id}")
        return None, None, None, None
    if output["article_sections"] is None or output["filtered_sections"] is None:
        gr.Warning(f"Could not retrieve text for id {article_id} (id not found in database or abstract/fulltext unavailable on PubMed/PMC)")
        return None, None, None, None
    article_markdown = get_article_markdown(article_id, output["article_sections"], output["filtered_sections"])

    # highlighted NER annotations over the filtered text
    if output["raw_entities"] is not None and output["filtered_sections"] is not None:
        original_text = get_sections_text(output["filtered_sections"])
        article_highlighted_text = get_highlighted_text(output["raw_entities"], original_text)
    else:
        gr.Warning("Could not extract any outcomes entities in article text")

    # registry outcomes table; without it the diagram is impossible
    if output["ctgov_outcomes"] is None:
        gr.Warning("ClinicalTrials.Gov outcomes were not found (either no NCTID detected or no outcomes declared in registry)")
        return article_markdown, article_highlighted_text, registry_df, similarity_diagram
    registry_df = get_registry_dataframe(output["ctgov_outcomes"])

    # similarity diagram needs every intermediate result to be present
    have_everything = (
        output["connections"] is not None
        and output["raw_entities"] is not None
        and output["ctgov_outcomes"] is not None
        and output["article_outcomes"] is not None
    )
    if have_everything:
        registry_outcomes_tup = [
            (outcome["type"], outcome["measure"] + " , " + outcome["timeFrame"])
            for outcome in output["ctgov_outcomes"]
        ]
        similarity_diagram = get_sankey_diagram(
            registry_outcomes_tup,
            output["article_outcomes"],
            output["connections"],
            output["raw_entities"],
            _CALCULATED_COSINE_THRESHOLD,
        )
    else:
        gr.Warning("Could not compute similarity diagram (missing registry or article outcomes)")

    return article_markdown, article_highlighted_text, registry_df, similarity_diagram
75
+
76
def clean():
    """Reset all four result widgets to empty."""
    return (None,) * 4
78
+
79
with gr.Blocks() as blocks:
    with gr.Column():
        # Header + input widgets (id textbox, clear/detect buttons, examples)
        gr.Markdown('# Outcome Switching Detection \n' + _app_description)
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    pmid_input = gr.Textbox(value=_pmcid_start_value, label="PMID or PMCID (PMCID must be preceded by 'PMC' prefix)")
                with gr.Row():
                    clear_button = gr.ClearButton()
                    detect_button = gr.Button(value="Detect", variant="primary")
        gr.Examples(examples=_article_id_examples, inputs=pmid_input)
        # Result tabs: one per pipeline artefact
        gr.Markdown("## Results \n")
        with gr.Tabs():
            with gr.TabItem("Article Useful Sections"):
                filtered_article = gr.Markdown()
            with gr.TabItem("Article Detected Outcomes"):
                ner_output = gr.HighlightedText(
                    color_map={"primary": "lightcoral", "secondary": "lightgreen"},
                    show_legend=True,
                    combine_adjacent=True,
                )
            with gr.TabItem("Registry Outcomes"):
                ctgov_output = gr.DataFrame()
            with gr.TabItem("Similarity"):
                similarity_output = gr.Plot(show_label=False)
    # OUTPUTS AND BUTTONS
    outputs = [filtered_article, ner_output, ctgov_output, similarity_output]
    clear_button.add([pmid_input] + outputs)
    detect_button.click(fn=controller, inputs=pmid_input, outputs=outputs)

blocks.launch()
config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ner_path": "aakorolyova/primary_and_secondary_outcome_extraction",
3
+ "sim_path": "laiking/all-mpnet-outcome-similarity",
4
+ "ner_label2id" : {
5
+ "O": 0,
6
+ "B-PrimaryOutcome": 1,
7
+ "I-PrimaryOutcome": 2,
8
+ "B-SecondaryOutcome": 3,
9
+ "I-SecondaryOutcome": 4
10
+ }
11
+ }
front/app-description.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Demo of outcome switching detection using transformers models. Outcome switching is defined as the modification, inversion, suppression of a primary outcome in a Randomized Controlled Trial(RCT) between the published article and the registry entry.
2
+
3
+ What this demo is doing :
4
+ 1. Retrieve abstract (PMID given) or fulltext (PMCID given) of an article
5
+ 2. Parse the Methods section of the article and get section text
6
+ 3. Use finetuned NER model for detecting primary outcomes in that text
7
+ 4. Use a RegEx to find the NCT ID (ClinicalTrials.gov) in the full text
8
+ 5. Use CTGOV API to extract registry primary outcome (considered as ground truth)
9
+ 6. Use Semantic Textual Similarity Model to compare CTGOV outcome to article detected outcomes
front/examples.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "33476595",
3
+ "33356422",
4
+ "PMC6206648",
5
+ "PMC6935101",
6
+ "PMC8491132",
7
+ "PMC7781101",
8
+ "PMC8005085",
9
+ "29283904",
10
+ "33443017",
11
+ "31599809",
12
+ "30010751",
13
+ "29847251",
14
+ "29946728",
15
+ "29677641"
16
+ ]
outcome_switch/__init__.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ from outcome_switch.ctgov import extract_nct_outcomes
3
+ from outcome_switch.similarity import OutcomeSimilarity
4
+ from outcome_switch.entrez import dl_and_parse
5
+ from outcome_switch.filter import filter_sections, filter_outcomes, get_sections_text
6
+ from transformers import (BertConfig,
7
+ BertTokenizerFast,
8
+ BertForTokenClassification,
9
+ TokenClassificationPipeline)
10
+
11
class OutcomeSwitchingDetector:
    """Main class for the whole pipeline of outcome switching detection.

    Wraps a BERT token-classification model that extracts primary/secondary
    outcomes from article text, plus a sentence-similarity model that matches
    them against the outcomes declared on ClinicalTrials.gov.
    """

    def __init__(self, ner_path: str, sim_path: str, ner_label2id: dict[str, int]):
        """Load both models.

        Args:
            ner_path: HF hub id or local path of the outcome NER model.
            sim_path: HF hub id or local path of the similarity model.
            ner_label2id: label name -> integer id mapping for the NER head
                (fixed annotation: values are ints, not strings).
        """
        config = BertConfig.from_pretrained(
            ner_path,
            label2id=ner_label2id,
            id2label={v: k for k, v in ner_label2id.items()},
        )
        self.outcomes_ner = TokenClassificationPipeline(
            model=BertForTokenClassification.from_pretrained(ner_path, config=config),
            tokenizer=BertTokenizerFast.from_pretrained(ner_path),
            # keep "O" spans so the full text can be re-rendered with highlights
            ignore_labels=[],
            aggregation_strategy="average",
            # overlap windows so entities crossing chunk borders are preserved
            stride=64,
        )
        self.outcome_sim = OutcomeSimilarity(sim_path)

    def _extract_article_outcomes(self, article_text: str) -> dict[str, Any]:
        """Run NER over article text.

        Returns {"raw_entities": pipeline output or None,
                 "article_outcomes": [(type, text), ...] or None}.
        """
        if not article_text:
            return {"raw_entities": None, "article_outcomes": None}
        # get article outcomes (all pieces of text annotated)
        entities_list = self.outcomes_ner(article_text)
        # filter outcomes and reformat
        detected_outcomes = filter_outcomes(entities_list)
        return {"raw_entities": entities_list, "article_outcomes": detected_outcomes}

    def _compare_outcomes(
        self,
        registry_outcomes: list[dict[str, str]],
        article_outcomes: list[tuple[str, str]],
    ) -> Any:
        """Compare registry outcomes to article outcomes by semantic similarity.

        Note: registry_outcomes is a list of CTGov outcome dicts
        (type/measure/timeFrame) — the previous tuple annotation was wrong.
        Returns None when either side is empty.
        """
        if not registry_outcomes or not article_outcomes:
            return None
        registry_pairs = [
            (outcome["type"], outcome["measure"] + " , " + outcome["timeFrame"])
            for outcome in registry_outcomes
        ]
        # semantic similarity of outcomes between registry and article
        return self.outcome_sim.get_similarity(registry_pairs, article_outcomes)

    def detect(self, article_id: str) -> dict[str, Any]:
        """Detect outcome switching for one input id (pmid, pmcid).

        Returns a dictionary with the following keys:
        - db: resolved Entrez database ("pmc"/"pubmed") or None
        - article_xml: raw xml string of the article
        - article_sections: dict of all sections, key=title, value=list of text
        - check_type: type of the check for regex outcome section filtering (title or content)
        - regex_priority_name: name of the regex used for outcome section filtering
        - regex_priority_index: priority of the regex used (0 is highest)
        - filtered_sections: dict of filtered sections, key=title, value=list of text
        - raw_entities: token-classification pipeline output (including "O" spans)
        - article_outcomes: list of (type, outcome) tuples detected in the article
        - ctgov_outcomes: list of outcome dicts fetched from the registry
        - connections: similarity matches between registry and article outcomes
        """
        # download and parse article
        parse_output = dl_and_parse(article_id)
        # search nct id in text, then download and parse registry outcomes
        registry_outcomes = extract_nct_outcomes(parse_output["article_xml"])
        # filter article sections and get text
        filter_output = filter_sections(parse_output["article_sections"])
        sections_text = get_sections_text(filter_output["filtered_sections"])
        # outcomes ner in article text
        ner_output = self._extract_article_outcomes(sections_text)
        # compare outcomes between article and registry
        connections = self._compare_outcomes(registry_outcomes, ner_output["article_outcomes"])
        return parse_output | {"ctgov_outcomes": registry_outcomes} | filter_output | ner_output | {"connections": connections}
outcome_switch/ctgov.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import ast
3
+ import requests
4
+ from typing import Union
5
+
6
+ def _find_nctid(text: str) -> Union[str,None]:
7
+ "return nct string if found in text else none"
8
+ match = re.search(r"[Nn][Cc][Tt]0*[1-9]\d{0,7}", text)
9
+ return match[0] if match is not None else match
10
+
11
def _get_registry_outcomes(nct_id: str) -> Union[dict, None]:
    """Fetch the outcomesModule of a study from the ClinicalTrials.gov v2 API.

    Returns the outcomes module dict, or None when the request fails or the
    study declares no outcomes.
    """
    outcomes = None
    r = requests.get(
        f"https://clinicaltrials.gov/api/v2/studies/{nct_id}",
        params={"fields": "OutcomesModule"},
        timeout=30,  # keep the UI responsive if the registry is slow
    )
    if r.status_code == 200:
        # BUGFIX: parse the body once with r.json(); ast.literal_eval(r.text)
        # raises on JSON literals true/false/null, which CTGov responses contain.
        payload = r.json()
        if "outcomesModule" in payload.get("protocolSection", {}):
            outcomes = payload["protocolSection"]["outcomesModule"]
    return outcomes
17
+
18
+ def _reformat_outcomes(outcomes: dict) -> list[dict[str,str]]:
19
+ new_outcomes = []
20
+ for outcome_type, outcome_list in outcomes.items() :
21
+ outcome_type = outcome_type.replace("Outcomes","")
22
+ for outcome_item in outcome_list :
23
+ outcome_item["type"] = outcome_type
24
+ new_outcomes.append(outcome_item)
25
+ return new_outcomes
26
+
27
def extract_nct_outcomes(text: str) -> Union[None, list[dict[str, str]]]:
    """Extract outcomes from a text using CTGOV API v2 if an NCT id is found.

    Returns a flat list of outcome dicts tagged with their type, or None when
    the text is missing, no NCT id is present, or the registry declares no
    outcomes for that study.
    """
    if text is None:
        return None
    outcomes = None
    nct_id = _find_nctid(text)
    if nct_id is not None:
        outcomes = _get_registry_outcomes(nct_id)
        # BUGFIX: _get_registry_outcomes may return None (no outcomesModule or
        # HTTP error); the unconditional _reformat_outcomes call crashed on it.
        if outcomes is not None:
            outcomes = _reformat_outcomes(outcomes)
    return outcomes
outcome_switch/entrez.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for fetching and parsing articles from PubMed and PMC using Entrez efetch."""
2
+
3
+ from __future__ import annotations
4
+ import html
5
+ import requests
6
+ import unicodedata
7
+ from abc import ABC, abstractmethod
8
+ from io import StringIO
9
+ from pathlib import Path
10
+ from typing import IO, Any, Dict, Union
11
+ from xml.etree.ElementTree import Element # nosec
12
+ from zipfile import ZipFile
13
+ from typing import Generator
14
+ from defusedxml import ElementTree
15
+
16
+ _ENTREZ_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
17
+
18
+ def _db_parser(article_id:str) -> str|None:
19
+ """Parse the article ID to ensure it is in the correct format."""
20
+ db = None
21
+ if article_id.startswith('PMC') and article_id[3:].isdigit():
22
+ db = "pmc"
23
+ elif article_id.isdigit():
24
+ db = "pubmed"
25
+ return db
26
+
27
def _dl_article_xml(article_id: str, db: str | None) -> str | None:
    """Download the raw XML of an article via Entrez efetch.

    Returns the XML string on HTTP 200, otherwise None.
    (Fixed annotation: the function returns a single optional string,
    not the tuple previously declared.)
    """
    xml_string = None
    params = {"db": db, "id": article_id, "retmode": "xml"}
    # timeout keeps the app responsive when NCBI is slow or unreachable
    response = requests.get(_ENTREZ_EFETCH_URL, params=params, timeout=30)
    if response.status_code == 200:
        xml_string = response.text
    return xml_string
34
+
35
+ def _parse_article(xml_string:str, db:str) -> Union[None,ArticleParser] :
36
+ parsed_article = None
37
+ if db == "pmc":
38
+ parsed_article = JATSXMLParser.from_string(xml_string)
39
+ elif db == "pubmed":
40
+ parsed_article = PubMedXMLParser(xml_string)
41
+ # check if parsing was successful
42
+ if not parsed_article.abstract and not parsed_article.paragraphs:
43
+ parsed_article = None
44
+ return parsed_article
45
+
46
+ def _reformat_article(parsed_article:ArticleParser) -> Dict[str,Any] :
47
+ reformatted_article = {"Title":[parsed_article.title]}
48
+ for sec_title,sentence in parsed_article.abstract :
49
+ sec_title = "Abstract" if sec_title is None else "Abstract - " + sec_title
50
+ reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence]
51
+ for sec_title,sentence in parsed_article.paragraphs :
52
+ reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence]
53
+ return reformatted_article
54
+
55
+
56
def dl_and_parse(article_id: str) -> Dict[str, Union[None, Any]]:
    """Fetch an article from PubMed or PMC via Entrez efetch and parse it.

    Returns a dict with keys:
      - db: resolved Entrez database ("pmc"/"pubmed") or None for a bad id
      - article_xml: raw XML of the downloaded article (or None)
      - article_sections: {section title: [text, ...]} (or None)
    """
    result: Dict[str, Union[None, Any]] = {
        "db": None,
        "article_xml": None,
        "article_sections": None,
    }
    # resolve which Entrez database the id belongs to; bail on bad format
    result["db"] = _db_parser(article_id)
    if result["db"] is None:
        return result
    result["article_xml"] = _dl_article_xml(article_id, result["db"])
    parser = _parse_article(result["article_xml"], result["db"])
    if parser is None:
        return result
    result["article_sections"] = _reformat_article(parser)
    return result
77
+
78
class ArticleParser(ABC):
    """Abstract interface implemented by the JATS (PMC) and PubMed XML parsers."""

    @property
    @abstractmethod
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title.
        """

    @property
    @abstractmethod
    def abstract(self) -> list[tuple[str, str]]:
        """Get the abstract paragraphs with their section labels.

        Returns
        -------
        list of (str, str)
            For each abstract paragraph, a (section label, paragraph text)
            tuple; the label may be None for unlabelled abstracts.
            (Fixed annotation: both concrete implementations return tuples,
            not the ``list[str]`` previously declared.)
        """

    @property
    @abstractmethod
    def paragraphs(self) -> list[tuple[str, str]]:
        """Get all paragraphs and titles of sections they are part of.

        Returns
        -------
        list of (str, str)
            For each paragraph a tuple with two strings: the section title
            and the paragraph content.
        """
114
+
115
+
116
class JATSXMLParser(ArticleParser):
    """Parser for PMC full-text articles in JATS XML."""

    def __init__(self, xml_stream: IO[Any]) -> None:
        super().__init__()
        self.content = ElementTree.parse(xml_stream)
        # efetch wraps a single article in <pmc-articleset>; unwrap it
        if self.content.getroot().tag == "pmc-articleset":
            self.content = self.content.find("article")

    @classmethod
    def from_string(cls, xml_string: str) -> JATSXMLParser:
        """Build a parser from an in-memory XML string."""
        with StringIO(xml_string) as stream:
            obj = cls(stream)
        return obj

    @classmethod
    def from_zip(cls, path: str | Path) -> JATSXMLParser:
        """Build a parser from a zip archive holding exactly one content/*.xml.

        Raises
        ------
        ValueError
            If the archive does not contain exactly one content/*.xml file.
        """
        with ZipFile(path) as myzip:
            xml_files = [
                x
                for x in myzip.namelist()
                if x.startswith("content/") and x.endswith(".xml")
            ]
            if len(xml_files) != 1:
                raise ValueError(
                    "There needs to be exactly one .xml file inside of content/"
                )
            xml_file = xml_files[0]
            # Parsing logic
            with myzip.open(xml_file, "r") as fh:
                obj = cls(fh)
            return obj

    @property
    def title(self) -> str:
        titles = self.content.find("./front/article-meta/title-group/article-title")
        return self._element_to_str(titles)

    @property
    def abstract(self) -> list[tuple[str, str]]:
        abstract = self.content.find("./front/article-meta/abstract")
        abstract_list: list[tuple[str, str]] = []
        # BUGFIX: an Element with no children is falsy, so `if abstract:`
        # skipped abstracts that exist but have no sub-elements; per the
        # ElementTree docs, presence must be tested with `is not None`.
        if abstract is not None:
            for sec_title, text in self.parse_section(abstract):
                abstract_list.append((sec_title, text))
        return abstract_list

    @property
    def paragraphs(self) -> list[tuple[str, str]]:
        paragraph_list: list[tuple[str, str]] = []

        # Paragraphs of text body
        body = self.content.find("./body")
        # BUGFIX: same Element-truthiness pitfall as in `abstract`
        if body is not None:
            paragraph_list.extend(self.parse_section(body, ""))

        # Figure captions
        figs = self.content.findall("./body//fig")
        for fig in figs:
            fig_captions = fig.findall("caption")  # findall returns a list, possibly empty
            caption = " ".join(self._element_to_str(c) for c in list(fig_captions))
            if caption:
                paragraph_list.append(("Figure Caption", caption))

        # Table captions
        tables = self.content.findall("./body//table-wrap")
        for table in tables:
            caption_elements = table.findall("./caption/p") or table.findall(
                "./caption/title"
            )
            caption = " ".join(self._element_to_str(c) for c in caption_elements)
            if caption:
                paragraph_list.append(("Table Caption", caption))
        return paragraph_list

    def parse_section(self, section: Element, sec_title_path: str = "") -> Generator[tuple[str, str], None, None]:
        """Yield (section title path, text) pairs for a section, recursing into sub-sections."""
        sec_title = self._element_to_str(section.find("title"))
        # deliberately dropped: contributor lists are noise for outcome detection
        if sec_title == "Author contributions":
            return
        sec_title_path = sec_title_path + " - " + sec_title if sec_title_path else sec_title
        for element in section:
            if element.tag == "sec":
                yield from self.parse_section(element, sec_title_path)
            elif element.tag in {"title", "caption", "fig", "table-wrap", "label"}:
                # titles are already in the path; captions handled separately
                continue
            else:
                text = self._element_to_str(element)
                if text:
                    yield sec_title_path, text

    def _inner_text(self, element: Element) -> str:
        """Concatenate an element's text, its children's text, and tail text."""
        text_parts = [html.unescape(element.text or "")]
        for sub_element in element:
            # recursively parse the sub-element
            text_parts.append(self._element_to_str(sub_element))
            # don't forget the text after the sub-element
            text_parts.append(html.unescape(sub_element.tail or ""))
        return unicodedata.normalize("NFKC", "".join(text_parts)).strip()

    def _element_to_str(self, element: Element | None) -> str:
        """Render one element to plain text, with tag-specific handling."""
        if element is None:
            return ""

        if element.tag in {
            "bold",
            "italic",
            "monospace",
            "p",
            "sc",
            "styled-content",
            "underline",
            "xref",
        }:
            # Mostly styling tags for which getting the inner text is enough.
            # Currently this is the same as the default handling. Writing it out
            # explicitly here to decouple from the default handling, which may
            # change in the future.
            return self._inner_text(element)
        elif element.tag == "sub":
            return f"_{self._inner_text(element)}"
        elif element.tag == "sup":
            return f"^{self._inner_text(element)}"
        elif element.tag in {
            "disp-formula",
            "email",
            "ext-link",
            "inline-formula",
            "uri",
        }:
            # links and formulas carry no useful prose — drop them
            return ""
        else:
            # Default handling for all other element tags
            return self._inner_text(element)
254
+
255
+
256
class PubMedXMLParser(ArticleParser):
    """Parser for PubMed abstract-only XML records."""

    def __init__(self, data: str | bytes) -> None:
        super().__init__()
        self.content = ElementTree.fromstring(data)

    @property
    def title(self) -> str:
        node = self.content.find("./PubmedArticle/MedlineCitation/Article/ArticleTitle")
        return "".join(node.itertext()) if node is not None else ""

    @property
    def abstract(self) -> list[tuple[str, str]]:
        root = self.content.find("./PubmedArticle/MedlineCitation/Article/Abstract")
        if root is None:
            # No paragraphs to parse: stop and return an empty iterable.
            return []
        collected: list[tuple[str, str]] = []
        for node in root.iter("AbstractText"):
            # the Label attribute (e.g. "METHODS") may be absent -> None
            collected.append((node.get("Label"), "".join(node.itertext())))
        return collected

    @property
    def paragraphs(self) -> list[tuple[str, str]]:
        # PubMed records carry no full-text body: always empty.
        return []
290
+
291
+
outcome_switch/filter.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List, Any, Tuple
3
+
4
# Regex fragments used to rank article sections by how likely they are to
# describe trial outcomes. Raw strings fix the invalid escape sequences
# (e.g. '\s' in a non-raw literal is a SyntaxWarning on Python 3.12+).
STRICT_OUTCOME_REGEX = r'(outcome|end(\s)?point)'
OUTCOME_REGEX = r'(outcome|end(\s)?point|measure|assessment)'

METHOD_REGEX = r'(method|approach|strategy|design|protocol)'
SAMPLE_SIZE_REGEX = r'sample\s(size|number)'
ABSTRACT_REGEX = r'(abstract|summary)'

STRICT_PRIM_SEC_REGEX = f'(primary|secondary|main|)\\s([a-z]+\\s)?{STRICT_OUTCOME_REGEX}'
PRIM_SEC_REGEX = f'(primary|secondary|main|)\\s([a-z]+\\s)?{OUTCOME_REGEX}'
STRICT_METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{STRICT_PRIM_SEC_REGEX}'
METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{PRIM_SEC_REGEX}'

# Ordered (name, where-to-look, pattern) checks; the first priority level
# that matches any section decides which sections are kept.
CHECK_PRIORITY = [
    ("strict_method_and_prim_sec", "title", STRICT_METHOD_AND_PRIM_SEC_REGEX),
    ("strict_prim_sec", "title", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "title", PRIM_SEC_REGEX),
    ("outcome", "title", OUTCOME_REGEX),
    ("strict_prim_sec", "content", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "content", PRIM_SEC_REGEX),
    ("method_and_prim_sec", "title", METHOD_AND_PRIM_SEC_REGEX),
    ("outcome", "content", OUTCOME_REGEX),
    ("method", "title", METHOD_REGEX),
    ("sample_size", "title", SAMPLE_SIZE_REGEX),
    ("abstract", "title", ABSTRACT_REGEX),
]
29
+
30
def filter_sections(sections_dict: Dict[str, List[str]]) -> Dict[str, Any]:
    """Keep only the sections most likely to describe outcomes.

    Each (name, check_type, regex) entry of CHECK_PRIORITY is tried in order,
    against either the section title or its joined content; every section
    matching the first successful priority level is kept.

    Args:
        sections_dict: section title -> list of text content.

    Returns:
        Dict with keys:
        - filtered_sections: matching {title: content list} (None if no input)
        - regex_priority_index: index in CHECK_PRIORITY of the winning check
        - regex_priority_name: name of the winning check
        - check_type: whether the match was on "title" or "content"
    """
    result: Dict[str, Any] = {
        "filtered_sections": None,
        "regex_priority_index": None,
        "regex_priority_name": None,
        "check_type": None,
    }
    if not sections_dict:
        return result
    result["filtered_sections"] = {}
    for priority_index, (priority_name, check_type, pattern) in enumerate(CHECK_PRIORITY):
        compiled = re.compile(pattern, re.IGNORECASE)
        level_matched = False
        for section_title, section_content in sections_dict.items():
            haystack = section_title if check_type == "title" else '\n'.join(section_content)
            if compiled.search(haystack):
                result["check_type"] = check_type
                result["regex_priority_name"] = priority_name
                result["regex_priority_index"] = priority_index
                result["filtered_sections"][section_title] = section_content
                level_matched = True
        # stop at the first priority level that matched anything
        if level_matched:
            break
    return result
70
+
71
+
72
def filter_outcomes(entities: List[Dict[str, Any]]) -> List[Tuple[str, str]]:
    """Keep only primary/secondary outcome entities as (type, text) tuples.

    Non-entity ("O") spans and any other groups are dropped.
    """
    label_map = {"PrimaryOutcome": "primary", "SecondaryOutcome": "secondary"}
    return [
        (label_map[entity["entity_group"]], entity["word"])
        for entity in entities
        if entity["entity_group"] in label_map
    ]
84
+
85
def get_sections_text(sections: Dict[str, List[str]]) -> "str | None":
    """Concatenate section titles and their joined contents into one text blob.

    Returns None for an empty/None sections dict (fixed annotation: the
    previous signature declared ``str`` but this path returns None).
    """
    if not sections:
        return None
    parts = []
    for title, content in sections.items():
        parts.append(title + '\n' + " ".join(content) + '\n')
    return "".join(parts)
outcome_switch/similarity.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from sentence_transformers.util import cos_sim
4
+ from transformers import AutoTokenizer, AutoModel
5
+
6
+
7
class OutcomeSimilarity:
    """Similarity detector between outcome statements."""

    # human-readable names for a binary similar/different decision
    ID2LABEL = ["different", "similar"]

    def __init__(self, model_path: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path)

    def _mean_pooling(self, model_output, attention_mask: torch.Tensor):
        """Mean pooling that takes the attention mask into account for correct averaging."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(
            -1).expand(token_embeddings.size()).float()
        # clamp avoids division by zero for fully-masked rows
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _encode(self, outcomes_lot: list[tuple[str, str]]):
        """Embed the text part of (type, text) tuples into normalized vectors."""
        sentences = []
        if len(outcomes_lot) > 0:
            _, sentences = zip(*outcomes_lot)
            sentences = list(sentences)  # tokenizer expects a list, zip yields a tuple
        # Tokenize sentences
        encoded_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Perform pooling
        sentence_embeddings = self._mean_pooling(
            model_output, encoded_input['attention_mask'])
        # Normalize embeddings so cosine similarity equals the dot product
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings

    def get_similarity(
        self,
        registry_outcomes: list[tuple[str, str]],
        article_outcomes: list[tuple[str, str]],
    ) -> set[tuple[int, int, float]]:
        """Match each registry outcome with its most similar article outcome.

        Returns a set of (registry index, article index, cosine score) tuples
        (fixed annotation: a set is returned, not a list, and the score is the
        third slot, i=2). Every registry row gets its best article column, and
        any article column not already used gets its best registry row, so
        both sides are fully covered.
        """
        connections = set()
        rembs = self._encode(registry_outcomes)
        aembs = self._encode(article_outcomes)
        cosines_scores = cos_sim(rembs, aembs)
        lines_max = torch.argmax(cosines_scores, dim=1)
        col_max = torch.argmax(cosines_scores, dim=0)
        remaining_cols = set(range(len(col_max)))
        for i in range(len(lines_max)):
            connection = (i, lines_max[i].item(), cosines_scores[i, lines_max[i]].item())
            remaining_cols.discard(lines_max[i].item())
            connections.add(connection)
        for j in remaining_cols:
            connection = (col_max[j].item(), j, cosines_scores[col_max[j], j].item())
            connections.add(connection)
        return connections
outcome_switch/visual.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.graph_objects as go
3
+ from typing import List, Dict, Any, Tuple, Union
4
+
5
+
6
# URL templates for linking back to the online article; {article_id} is a
# PubMed id (numeric) or a PMC id ("PMC"-prefixed) respectively.
_PUBMED_LINK= "https://pubmed.ncbi.nlm.nih.gov/{article_id}/"
_PMC_LINK = "https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}/"
# Markdown skeleton filled in by get_article_markdown().
_MARKDOWN_TEMPLATE = """# [{article_title}]({article_link})
# Filtered sections :

{sections_md}"""
12
+
13
+ # entities highlighted text
14
# entities highlighted text
def get_highlighted_text(entities:List[Dict[str,Any]], original_text:str) -> List[Tuple[str,Union[str,None]]] :
    """Convert the output of the model to a list of tuples (entity, label)
    for `gradio.HighlightedText` output.

    Each entity dict must carry "start"/"end" character offsets into
    `original_text` and an "entity_group" label. "PrimaryOutcome" and
    "SecondaryOutcome" map to the "primary"/"secondary" highlight classes;
    "O" — and, defensively, any unexpected label — maps to None (no highlight).
    """
    conversion = {"PrimaryOutcome":"primary","SecondaryOutcome":"secondary"}
    highlighted_text = []
    for entity in entities:
        entity_original_text = original_text[entity["start"]:entity["end"]]
        # .get() returns None for "O" (same as before) and, unlike the previous
        # conversion[...] lookup, no longer raises KeyError if the model ever
        # emits a label outside the conversion table.
        highlighted_text.append((entity_original_text, conversion.get(entity["entity_group"])))
    return highlighted_text
27
+
28
+ # article filtered sections markdown output
29
# article filtered sections markdown output
def get_article_markdown(
    article_id:str,
    article_sections:dict[str,list[str]],
    filtered_sections:dict[str,list[str]]) -> str:
    """Render a linked article title plus its filtered sections as markdown."""
    # PMC ids start with "PMC"; anything else is treated as a PubMed id.
    link_template = _PMC_LINK if article_id.startswith("PMC") else _PUBMED_LINK
    article_link = link_template.format(article_id=article_id)
    # The article title is stored as the first entry of the "Title" section.
    article_title = article_sections["Title"][0]
    # One "## heading" block per filtered section, paragraphs joined by spaces.
    section_parts = []
    for heading, paragraphs in filtered_sections.items():
        section_parts.append(f"## {heading}\n")
        section_parts.append(" ".join(paragraphs) + "\n")
    return _MARKDOWN_TEMPLATE.format(
        article_link=article_link,
        article_title=article_title,
        sections_md="".join(section_parts)
    )
48
+
49
+ # registry dataframe display
50
+ def _highlight_df_rows(row):
51
+ if row['type'] =='primary':
52
+ return ['background-color: lightcoral'] * len(row)
53
+ elif row['type'] == 'secondary':
54
+ return ['background-color: lightgreen'] * len(row)
55
+ else :
56
+ return ['background-color: lightgrey'] * len(row)
57
+
58
def get_registry_dataframe(registry_outcomes: list[dict[str,str]]) -> "pd.io.formats.style.Styler":
    """Build a DataFrame of registry outcome records and color each row by its
    'type' column (see _highlight_df_rows). Returns a pandas Styler, not a str."""
    return pd.DataFrame(registry_outcomes).style.apply(_highlight_df_rows, axis=1)
60
+
61
+ # fcts for sankey diagram
62
+ def _sent_line_formatting(sentence:str, max_words:int=10) -> str:
63
+ """format a sentence to be displayed in a sankey diagram so that
64
+ each line has a maximum of `max_words` words"""
65
+ words = sentence.split()
66
+ batchs = [words[i:i+max_words] for i in range(0, len(words), max_words)]
67
+ return "<br>".join([" ".join(batch) for batch in batchs])
68
+
69
+ def _find_entity_score(entity_text, raw_entities):
70
+ for tc_output in raw_entities:
71
+ if entity_text == tc_output["word"]:
72
+ return tc_output["score"]
73
+
74
def get_sankey_diagram(
    registry_outcomes: list[tuple[str,str]],
    article_outcomes: list[tuple[str,str]],
    connections: set[tuple[int,int,float]],
    raw_entities: list[Dict[str,Any]],
    cosine_threshold: float=0.44,
) -> go.Figure:
    """Build a sankey diagram linking registry outcomes (left) to article
    outcomes (right).

    Args:
        registry_outcomes / article_outcomes: (type, sentence) tuples; type is
            one of "primary", "secondary", "other".
        connections: (registry_index, article_index, cosine_similarity) triples.
        raw_entities: raw token-classification outputs used to display the
            model confidence for each article outcome on hover.
        cosine_threshold: links at or below this similarity are greyed out.
    """
    color_map = {
        "primary": "red",
        "secondary": "green",
        "other": "grey",
    }
    # Create lists of formatted sentences and colors for the nodes
    list1 = [(_sent_line_formatting(sent), color_map[typ]) for typ, sent in registry_outcomes]
    list2 = [(_sent_line_formatting(sent), color_map[typ]) for typ, sent in article_outcomes]
    # Create a list of labels and colors for the nodes (registry first, then article)
    labels = [x[0] for x in list1 + list2]
    colors = [x[1] for x in list1 + list2]
    # Freeze the connection set once so every per-link list below (sources,
    # targets, colors, customdata) is built in the same order.
    ordered_connections = list(connections)
    # Article nodes come after the registry nodes in `labels`, so article index
    # j maps to node offset + j. (Resolving nodes with labels.index() would be
    # wrong: an article sentence identical to a registry sentence would resolve
    # to the registry-side node.)
    offset = len(list1)
    sources = [i for i, _, _ in ordered_connections]
    targets = [offset + j for _, j, _ in ordered_connections]
    # Every link gets the same unit width; color encodes the similarity test
    values = [1] * len(ordered_connections)
    connection_colors = [
        "mediumaquamarine" if cosine > cosine_threshold else "lightgray"
        for _, _, cosine in ordered_connections
    ]

    # data appearing on hover of each node (outcome)
    node_customdata = [f"from: registry<br>type:{t}" for t,_ in registry_outcomes]
    node_customdata += [f"from: article<br>type: {t}<br>confidence: " + str(_find_entity_score(s, raw_entities)) for t,s in article_outcomes]
    node_hovertemplate = "outcome: %{label}<br>%{customdata} <extra></extra>"
    # data appearing on hover of each link (node connections)
    link_customdata = [cosine for _,_,cosine in ordered_connections]
    link_hovertemplate = "similarity: %{customdata} <extra></extra>"
    # sankey diagram data filling
    sankey = go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
            color=colors,
            customdata=node_customdata,
            hovertemplate=node_hovertemplate
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            customdata=link_customdata,
            color=connection_colors,
            hovertemplate=link_hovertemplate
        )
    )
    # conversion to figure
    fig = go.Figure(data=[sankey])
    fig.update_layout(
        title_text="Registry outcomes (left) connections with article outcomes (right), similarity threshold = " + str(cosine_threshold),
        font_size=10,
        width=1200,
        xaxis=dict(rangeslider=dict(visible=True),type="linear")
    )
    return fig
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.4.0
+ pandas==2.2.2
3
+ gradio==4.44.0
4
+ plotly==5.24.0
5
+ transformers==4.44.2
6
+ sentence-transformers==3.0.1
test/parse_examples/36473651.xml ADDED
The diff for this file is too large to render. See raw diff
 
test/parse_examples/PMC11102686.xml ADDED
The diff for this file is too large to render. See raw diff
 
test/test_ctgov.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from outcome_switch.ctgov import find_nctid, get_registry_outcomes, reformat_outcomes
3
+
4
+
5
_EMPTY_STRING = ""
# NCT REGEX extraction tests (find_nctid fixtures)
_TEXT_WITH_ONE_NCT = """blablabla nct id is NCT04647656 blabla"""
# contains two ids; find_nctid is expected to return the first one
_TEXT_WITH_TWO_NCT = """blablabla nct id is NCT04647656 blabla NCT06562582 bla"""
_TEXT_WITHOUT_NCT = "blablabla blablabla"

# NCT REGISTRY API tests (get_registry_outcomes fixtures)
_CORRECT_NCT = "NCT04647656"
# not an NCT-format identifier; the registry lookup should fail
_INCORRECT_NCT = "PRN03216548"

# REGISTRY outcomes reformatting test (reformat_outcomes fixture)
# NOTE(review): presumably mirrors the ClinicalTrials.gov API outcome
# payload shape — confirm against outcome_switch.ctgov.
_CTGOV_OUTCOMES = {
    "primaryOutcomes": [
        {
            "measure": "Cognitive health assessment (NeuroTrax)",
            "description": "Memory, attention and information process will be evaluated using the NeuroTrax computerized cognitive evaluation battery.",
            "timeFrame": "Baseline, 2 months",
        }
    ],
    "secondaryOutcomes": [
        {
            "measure": "Brain perfusion",
            "description": "Cerebral blood volume and flow will be measured using perfusion MRI protocol Dynamic susceptibility contrast (DSC).",
            "timeFrame": "Baseline, 2 months",
        }
    ],
}
32
+
33
class NctidFinderTest(unittest.TestCase):
    """find_nctid() should return the first NCT id found in a text, or None."""

    def test_text_with_one_nct(self):
        found = find_nctid(_TEXT_WITH_ONE_NCT)
        self.assertEqual(found, "NCT04647656")

    def test_text_with_two_nct(self):
        # only the first of the two ids is expected back
        found = find_nctid(_TEXT_WITH_TWO_NCT)
        self.assertEqual(found, "NCT04647656")

    def test_text_without_nct(self):
        found = find_nctid(_TEXT_WITHOUT_NCT)
        self.assertIsNone(found)

    def test_empty_string(self):
        found = find_nctid(_EMPTY_STRING)
        self.assertIsNone(found)

    def test_none_input(self):
        found = find_nctid(None)
        self.assertIsNone(found)
49
+
50
class CtgovExtractionTest(unittest.TestCase):
    """get_registry_outcomes() lookups against the registry.

    NOTE(review): these appear to query the live registry — network required;
    confirm against outcome_switch.ctgov.
    """

    def test_correct_nct(self):
        outcomes = get_registry_outcomes(_CORRECT_NCT)
        self.assertIsNotNone(outcomes)

    def test_incorrect_nct(self):
        outcomes = get_registry_outcomes(_INCORRECT_NCT)
        self.assertIsNone(outcomes)

    def test_empty_string(self):
        outcomes = get_registry_outcomes(_EMPTY_STRING)
        self.assertIsNone(outcomes)
60
+
61
class CtgovReformatTest(unittest.TestCase):
    """reformat_outcomes() flattens the registry outcome dict into typed records."""

    def test_correct_reformat_outcomes(self):
        # Reformat once and assert on the result, instead of re-running
        # reformat_outcomes() for every single assertion as before.
        reformatted = reformat_outcomes(_CTGOV_OUTCOMES)
        self.assertIsInstance(reformatted, list)
        self.assertEqual(len(reformatted), 2)
        self.assertIsInstance(reformatted[0], dict)
        self.assertIsInstance(reformatted[1], dict)
        # primary outcomes are expected to precede secondary outcomes
        self.assertEqual(reformatted[0]["type"], "primary")
        self.assertEqual(reformatted[1]["type"], "secondary")
test/test_entrez.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from outcome_switch.entrez import _dl_article_xml, _parse_article, _reformat_article
3
+
4
# Efetch tests — fixtures for _dl_article_xml()
_VALID_PMCID = "PMC6206648"
_VALID_PMID_1 = "29283904"
_VALID_PMID_2 = "29214975"
# a DOI, not a PubMed/PMC id — the download should fail
_INVALID_1 = "10.1056/NEJMoa2110345"
# numeric but (presumably) not an existing PMID — verify it stays unassigned
_INVALID_2 = "0123456789"
_EMPTY = ""

# XML Parsing tests files
# TODO : tests for parsing XML files
14
+
15
+
16
class EntrezEfetchTest(unittest.TestCase):
    """_dl_article_xml() downloads an article's XML and reports the source db.

    The result is indexable: [0] is the XML payload (or None on failure) and
    [1] is the database name ("pmc"/"pubmed", or None).
    NOTE(review): these appear to hit a live NCBI service — network required.
    """

    def test_valid_pmcid(self):
        # download once per test instead of once per assertion (each call was
        # a separate network round-trip before)
        result = _dl_article_xml(_VALID_PMCID)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pmc")

    def test_valid_pmid1(self):
        result = _dl_article_xml(_VALID_PMID_1)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pubmed")

    def test_valid_pmid2(self):
        result = _dl_article_xml(_VALID_PMID_2)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pubmed")

    def test_invalid1(self):
        self.assertIsNone(_dl_article_xml(_INVALID_1)[0])

    def test_invalid2(self):
        self.assertIsNone(_dl_article_xml(_INVALID_2)[0])

    def test_empty(self):
        result = _dl_article_xml(_EMPTY)
        self.assertIsNone(result[0])
        self.assertIsNone(result[1])
test/test_filter.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from outcome_switch import filter_sections, filter_outcomes, get_sections_text
3
+
4
+ _VALID_DICT_WITH_2_SECTIONS = {
5
+ "Methods - Outcomes - Primary outcome": [
6
+ "The FIRST primary outcome is pain at 12months as measured by the VAS. The primary analysis is to assess whether surgical correction of the impingement morphology (arthroscopic osteochondroplasty) with/without labral repair, in adults aged 1850 years diagnosed with FAI, provides decreased pain at 12months compared to arthroscopic lavage of the hip joint with/without labral repair, as measured by the VAS. The VAS is a validated unidimensional scale that is easy to use, requires no verbal or reading skills, and is sufficiently versatile to be employed in a variety of settings [ \n 2 4]."
7
+ ],
8
+ "Methods - Outcomes - Secondary outcomes": [
9
+ "Secondary outcomes include:",
10
+ "Hip function as measured by the Hip Outcome Score (HOS).",
11
+ "Generic physical and mental health as measured by the Short Form-12 (SF-12).",
12
+ "Impact of hip-specific disease on function and lifestyle in the young, active patient as measured by the International Hip Outcome Tool (iHOT-12).",
13
+ "Health utility as measured by the EuroQol (EQ-5D).",
14
+ "Complications, including additional surgery and other serious and non-serious adverse events. Reasons for re-operations for the randomized hip typically include, but are not limited to re-injury of the labrum/cartilage, hip dislocation, hip instability, infection (deep or superficial), wound healing problem, soft tissue problem, and unresolved hip pain. Other hip-related adverse events to be reported include, but are not limited to, hip instability, tendinopathy, re-injury of the labrum/cartilage, hip osteoarthritis post-surgery, and infection (superficial or deep).",
15
+ "The HOS is a self-administered hip score that was designed to capture hip function and outcomes following surgical therapies such as arthroscopy [ \n 5]. The HOS has been shown to have the greatest clinimetric evidence for use in patients with FAI or labral tears [ 6, 7]. The SF-12 may be self-completed or interview-administered and will help document general health status and the burden of illness that FAI presents [ 8]. The iHOT-12 is a shorter version of the iHOT-33 designed to be easier to complete in routine clinical practice to measure both health-related quality of life and changes after treatment in young, active patients with hip disorders [ 9]. This questionnaire has been shown to be valid, reliable, and responsive to change [ 9]. The EQ-5D is a standardized instrument for use as a measure of health outcome [ 10]. The EQ-5D comprises five dimensions of health (mobility, self-care, usual activities, pain/discomfort, and anxiety/depression). The EQ-5D has been used in previous studies involving patients with hip pain and has been extensively validated [ 11, 12].",
16
+ ],
17
+ "Discussion - Analysis plan - Blinded analyses": [
18
+ "All statistical analyses will first be completed using blinded treatment groups (i.e. treatment X and Y). Interpretations for the effect of the surgical interventions will be documented based upon blinded X versus Y treatment [ \n 14]."
19
+ ],
20
+ }
21
+
22
+ _FILTERED_SECTIONS = {
23
+ "filtered_sections": {
24
+ "Methods - Outcomes - Primary outcome": [
25
+ "The FIRST primary outcome is pain at 12months as measured by the VAS. The primary analysis is to assess whether surgical correction of the impingement morphology (arthroscopic osteochondroplasty) with/without labral repair, in adults aged 1850 years diagnosed with FAI, provides decreased pain at 12months compared to arthroscopic lavage of the hip joint with/without labral repair, as measured by the VAS. The VAS is a validated unidimensional scale that is easy to use, requires no verbal or reading skills, and is sufficiently versatile to be employed in a variety of settings [ \n 2 4]."
26
+ ],
27
+ "Methods - Outcomes - Secondary outcomes": [
28
+ "Secondary outcomes include:",
29
+ "Hip function as measured by the Hip Outcome Score (HOS).",
30
+ "Generic physical and mental health as measured by the Short Form-12 (SF-12).",
31
+ "Impact of hip-specific disease on function and lifestyle in the young, active patient as measured by the International Hip Outcome Tool (iHOT-12).",
32
+ "Health utility as measured by the EuroQol (EQ-5D).",
33
+ "Complications, including additional surgery and other serious and non-serious adverse events. Reasons for re-operations for the randomized hip typically include, but are not limited to re-injury of the labrum/cartilage, hip dislocation, hip instability, infection (deep or superficial), wound healing problem, soft tissue problem, and unresolved hip pain. Other hip-related adverse events to be reported include, but are not limited to, hip instability, tendinopathy, re-injury of the labrum/cartilage, hip osteoarthritis post-surgery, and infection (superficial or deep).",
34
+ "The HOS is a self-administered hip score that was designed to capture hip function and outcomes following surgical therapies such as arthroscopy [ \n 5]. The HOS has been shown to have the greatest clinimetric evidence for use in patients with FAI or labral tears [ 6, 7]. The SF-12 may be self-completed or interview-administered and will help document general health status and the burden of illness that FAI presents [ 8]. The iHOT-12 is a shorter version of the iHOT-33 designed to be easier to complete in routine clinical practice to measure both health-related quality of life and changes after treatment in young, active patients with hip disorders [ 9]. This questionnaire has been shown to be valid, reliable, and responsive to change [ 9]. The EQ-5D is a standardized instrument for use as a measure of health outcome [ 10]. The EQ-5D comprises five dimensions of health (mobility, self-care, usual activities, pain/discomfort, and anxiety/depression). The EQ-5D has been used in previous studies involving patients with hip pain and has been extensively validated [ 11, 12].",
35
+ ],
36
+ },
37
+ "regex_priority_index": 0,
38
+ "regex_priority_name": "strict_method_and_prim_sec",
39
+ "check_type": "title",
40
+ }
41
+
42
+
43
+ _EMPTY_DICT = {}