Spaces:
Sleeping
Sleeping
Mathieu Lai-King
committed on
Commit
•
1a3b3aa
0
Parent(s):
first commit
Browse files- .gitattributes +27 -0
- .gitignore +3 -0
- README.md +35 -0
- app.py +109 -0
- config.json +11 -0
- front/app-description.md +9 -0
- front/examples.json +16 -0
- outcome_switch/__init__.py +73 -0
- outcome_switch/ctgov.py +36 -0
- outcome_switch/entrez.py +291 -0
- outcome_switch/filter.py +91 -0
- outcome_switch/similarity.py +63 -0
- outcome_switch/visual.py +139 -0
- requirements.txt +6 -0
- test/parse_examples/36473651.xml +0 -0
- test/parse_examples/PMC11102686.xml +0 -0
- test/test_ctgov.py +69 -0
- test/test_entrez.py +38 -0
- test/test_filter.py +43 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
19 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
.vscode/
|
3 |
+
models/
|
README.md
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Outcome Switching Detector
|
3 |
+
emoji: 🔄
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: gray
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.44.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
python_version: 3.11.9
|
11 |
+
models: ['aakorolyova/primary_and_secondary_outcome_extraction','Mathking/all-mpnet-outcome-similarity']
|
12 |
+
---
|
13 |
+
|
14 |
+
# Outcome Switching Detector
|
15 |
+
|
16 |
+
## Installation
|
17 |
+
|
18 |
+
1. Download dependencies : `pip install -r requirements.txt`
|
19 |
+
|
20 |
+
2. Define the pretrained model paths in the config file: you must edit `config.json` so that it points to the models if you do not have them on disk. You can also redefine `ner_label2id` depending on the model you use
|
21 |
+
```json
|
22 |
+
{
|
23 |
+
"ner_path": "aakorolyova/primary_and_secondary_outcome_extraction",
|
24 |
+
"sim_path": "laiking/all-mpnet-outcome-similarity",
|
25 |
+
"ner_label2id" : {
|
26 |
+
"O": 0,
|
27 |
+
"B-PrimaryOutcome": 1,
|
28 |
+
"I-PrimaryOutcome": 2,
|
29 |
+
"B-SecondaryOutcome": 3,
|
30 |
+
"I-SecondaryOutcome": 4
|
31 |
+
}
|
32 |
+
}
|
33 |
+
```
|
34 |
+
|
35 |
+
3. Run `python3 app.py`
|
app.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import gradio as gr
|
3 |
+
from outcome_switch import OutcomeSwitchingDetector, get_sections_text
|
4 |
+
from outcome_switch.visual import (
|
5 |
+
get_article_markdown,
|
6 |
+
get_highlighted_text,
|
7 |
+
get_registry_dataframe,
|
8 |
+
get_sankey_diagram,
|
9 |
+
)
|
10 |
+
|
11 |
+
_CALCULATED_COSINE_THRESHOLD = 0.44
|
12 |
+
_app_description = open("front/app-description.md").read()
|
13 |
+
_article_id_examples = json.load(open("front/examples.json"))
|
14 |
+
_pmcid_start_value = _article_id_examples[0]
|
15 |
+
config = json.load(open('./config.json', 'r'))
|
16 |
+
|
17 |
+
# Load Detector (ner and sim model)
|
18 |
+
osd = OutcomeSwitchingDetector(
|
19 |
+
config["ner_path"],
|
20 |
+
config["sim_path"],
|
21 |
+
config["ner_label2id"]
|
22 |
+
)
|
23 |
+
|
24 |
+
def controller(article_id:str):
|
25 |
+
# clean input and run detection
|
26 |
+
article_id = str(article_id).strip()
|
27 |
+
output = osd.detect(article_id)
|
28 |
+
|
29 |
+
# init outputs
|
30 |
+
article_markdown=None
|
31 |
+
article_highlighted_text=None
|
32 |
+
registry_df=None
|
33 |
+
similarity_diagram=None
|
34 |
+
|
35 |
+
# check whether article markdown can be displayed
|
36 |
+
if output["db"] is None :
|
37 |
+
gr.Warning(f"Wrong format for input id : {article_id}")
|
38 |
+
return None, None, None, None
|
39 |
+
elif output["article_sections"] is None or output["filtered_sections"] is None:
|
40 |
+
gr.Warning(f"Could not retrieve text for id {article_id} (id not found in database or abstract/fulltext unavailable on PubMed/PMC)")
|
41 |
+
return None, None, None, None
|
42 |
+
else :
|
43 |
+
article_markdown = get_article_markdown(article_id, output["article_sections"], output["filtered_sections"])
|
44 |
+
|
45 |
+
# check whether annotations can be displayed
|
46 |
+
if output["raw_entities"] is not None and output["filtered_sections"] is not None:
|
47 |
+
original_text = get_sections_text(output["filtered_sections"])
|
48 |
+
article_highlighted_text = get_highlighted_text(output["raw_entities"], original_text)
|
49 |
+
else :
|
50 |
+
gr.Warning("Could not extract any outcomes entities in article text")
|
51 |
+
|
52 |
+
# check whether registry outcomes can be displayed
|
53 |
+
if output["ctgov_outcomes"] is not None:
|
54 |
+
registry_df = get_registry_dataframe(output["ctgov_outcomes"])
|
55 |
+
else:
|
56 |
+
gr.Warning("ClinicalTrials.Gov outcomes were not found (either no NCTID detected or no outcomes declared in registry)")
|
57 |
+
return article_markdown, article_highlighted_text, registry_df, similarity_diagram
|
58 |
+
|
59 |
+
# check whether similarity diagram can be displayed
|
60 |
+
if (output["connections"] is not None and output["raw_entities"] is not None and
|
61 |
+
output["ctgov_outcomes"] is not None and output["article_outcomes"] is not None):
|
62 |
+
registry_outcomes_tup = [(outcome["type"], outcome["measure"] + " , " + outcome["timeFrame"])
|
63 |
+
for outcome in output["ctgov_outcomes"]]
|
64 |
+
similarity_diagram = get_sankey_diagram(
|
65 |
+
registry_outcomes_tup,
|
66 |
+
output["article_outcomes"],
|
67 |
+
output["connections"],
|
68 |
+
output["raw_entities"],
|
69 |
+
_CALCULATED_COSINE_THRESHOLD
|
70 |
+
)
|
71 |
+
else:
|
72 |
+
gr.Warning("Could not compute similarity diagram (missing registry or article outcomes)")
|
73 |
+
|
74 |
+
return article_markdown, article_highlighted_text, registry_df, similarity_diagram
|
75 |
+
|
76 |
+
def clean():
|
77 |
+
return None, None, None, None
|
78 |
+
|
79 |
+
with gr.Blocks() as blocks:
|
80 |
+
with gr.Column():
|
81 |
+
gr.Markdown('# Outcome Switching Detection \n' + _app_description )
|
82 |
+
with gr.Row():
|
83 |
+
with gr.Column():
|
84 |
+
with gr.Row():
|
85 |
+
pmid_input = gr.Textbox(value=_pmcid_start_value, label="PMID or PMCID (PMCID must be preceded by 'PMC' prefix)")
|
86 |
+
with gr.Row():
|
87 |
+
clear_button = gr.ClearButton()
|
88 |
+
detect_button = gr.Button(value="Detect", variant="primary")
|
89 |
+
gr.Examples(examples = _article_id_examples, inputs=pmid_input)
|
90 |
+
gr.Markdown("## Results \n")
|
91 |
+
with gr.Tabs():
|
92 |
+
with gr.TabItem("Article Useful Sections"):
|
93 |
+
filtered_article = gr.Markdown()
|
94 |
+
with gr.TabItem("Article Detected Outcomes"):
|
95 |
+
ner_output = gr.HighlightedText(
|
96 |
+
color_map={"primary": "lightcoral", "secondary": "lightgreen"},
|
97 |
+
show_legend=True,
|
98 |
+
combine_adjacent=True,
|
99 |
+
)
|
100 |
+
with gr.TabItem("Registry Outcomes"):
|
101 |
+
ctgov_output = gr.DataFrame()
|
102 |
+
with gr.TabItem("Similarity"):
|
103 |
+
similarity_output = gr.Plot(show_label=False)
|
104 |
+
# OUTPUTS AND BUTTONS
|
105 |
+
outputs = [filtered_article, ner_output, ctgov_output, similarity_output]
|
106 |
+
clear_button.add([pmid_input]+outputs)
|
107 |
+
detect_button.click(fn=controller, inputs=pmid_input, outputs=outputs)
|
108 |
+
|
109 |
+
blocks.launch()
|
config.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"ner_path": "aakorolyova/primary_and_secondary_outcome_extraction",
|
3 |
+
"sim_path": "laiking/all-mpnet-outcome-similarity",
|
4 |
+
"ner_label2id" : {
|
5 |
+
"O": 0,
|
6 |
+
"B-PrimaryOutcome": 1,
|
7 |
+
"I-PrimaryOutcome": 2,
|
8 |
+
"B-SecondaryOutcome": 3,
|
9 |
+
"I-SecondaryOutcome": 4
|
10 |
+
}
|
11 |
+
}
|
front/app-description.md
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Demo of outcome switching detection using transformers models. Outcome switching is defined as the modification, inversion, suppression of a primary outcome in a Randomized Controlled Trial(RCT) between the published article and the registry entry.
|
2 |
+
|
3 |
+
What this demo is doing :
|
4 |
+
1. Retrieve abstract (PMID given) or fulltext (PMCID given) of an article
|
5 |
+
2. Parse the Methods section of the article and get section text
|
6 |
+
3. Use finetuned NER model for detecting primary outcomes in that text
|
7 |
+
4. Use a RegEx to find the NCT ID (ClinicalTrials.gov) in the full text
|
8 |
+
5. Use CTGOV API to extract registry primary outcome (considered as ground truth)
|
9 |
+
6. Use Semantic Textual Similarity Model to compare CTGOV outcome to article detected outcomes
|
front/examples.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
"33476595",
|
3 |
+
"33356422",
|
4 |
+
"PMC6206648",
|
5 |
+
"PMC6935101",
|
6 |
+
"PMC8491132",
|
7 |
+
"PMC7781101",
|
8 |
+
"PMC8005085",
|
9 |
+
"29283904",
|
10 |
+
"33443017",
|
11 |
+
"31599809",
|
12 |
+
"30010751",
|
13 |
+
"29847251",
|
14 |
+
"29946728",
|
15 |
+
"29677641"
|
16 |
+
]
|
outcome_switch/__init__.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
from outcome_switch.ctgov import extract_nct_outcomes
|
3 |
+
from outcome_switch.similarity import OutcomeSimilarity
|
4 |
+
from outcome_switch.entrez import dl_and_parse
|
5 |
+
from outcome_switch.filter import filter_sections, filter_outcomes, get_sections_text
|
6 |
+
from transformers import (BertConfig,
|
7 |
+
BertTokenizerFast,
|
8 |
+
BertForTokenClassification,
|
9 |
+
TokenClassificationPipeline)
|
10 |
+
|
11 |
+
class OutcomeSwitchingDetector:
|
12 |
+
"""Main Class for the whole pipeline of outcome switching detection"""
|
13 |
+
def __init__(self, ner_path:str, sim_path:str, ner_label2id:dict[str,str]):
|
14 |
+
# define config
|
15 |
+
config = BertConfig.from_pretrained(ner_path,
|
16 |
+
label2id=ner_label2id,
|
17 |
+
id2label={v: k for k, v in ner_label2id.items()})
|
18 |
+
self.outcomes_ner = TokenClassificationPipeline(
|
19 |
+
model = BertForTokenClassification.from_pretrained(ner_path,config=config),
|
20 |
+
tokenizer = BertTokenizerFast.from_pretrained(ner_path),
|
21 |
+
ignore_labels = [],
|
22 |
+
aggregation_strategy = "average",
|
23 |
+
stride=64
|
24 |
+
)
|
25 |
+
self.outcome_sim = OutcomeSimilarity(sim_path)
|
26 |
+
|
27 |
+
def _extract_article_outcomes(self, article_text:str) -> dict[str, Any]:
|
28 |
+
if not article_text :
|
29 |
+
return {"raw_entities" : None, "article_outcomes" : None}
|
30 |
+
# get article outcomes (all pieces of text annotated)
|
31 |
+
entities_list = self.outcomes_ner(article_text)
|
32 |
+
# filter outcomes and reformat
|
33 |
+
detected_outcomes = filter_outcomes(entities_list)
|
34 |
+
return {"raw_entities" : entities_list, "article_outcomes" : detected_outcomes}
|
35 |
+
|
36 |
+
def _compare_outcomes(
|
37 |
+
self,
|
38 |
+
registry_outcomes:list[tuple[str,str]],
|
39 |
+
article_outcomes:list[tuple[str,str]],
|
40 |
+
) -> dict[str, Any]:
|
41 |
+
if not registry_outcomes or not article_outcomes :
|
42 |
+
return None
|
43 |
+
registry_outcomes = [(outcome["type"], outcome["measure"] + " , " + outcome["timeFrame"])
|
44 |
+
for outcome in registry_outcomes]
|
45 |
+
# semantic similarity of outcomes between registry and article
|
46 |
+
return self.outcome_sim.get_similarity(registry_outcomes,article_outcomes)
|
47 |
+
|
48 |
+
def detect(self, article_id:str) -> dict[str,Any]:
|
49 |
+
"""detect outcome switching in input id (pmid, pmcid)
|
50 |
+
returns a dictionary with the following keys :
|
51 |
+
- article_xml : xml string of the article
|
52 |
+
- article_sections : dict of all sections of the article key=title, value=list of text content
|
53 |
+
- check_type : type of the check for regex outcome section filtering (title or content)
|
54 |
+
- regex_priority_name : name of the regex used for outcome section filtering
|
55 |
+
- regex_priority_index : number of priority of the regex used for outcome section filtering (0 is the highest priority)
|
56 |
+
- filtered_sections : dict of all filtered sections of the article key=title, value=list of text content
|
57 |
+
- raw_entities : output of huggingface token classification pipeline with aggregated entities but also O text (non-entity)
|
58 |
+
- article_outcomes : List of tuples (type, outcome) of all outcomes detected in the article
|
59 |
+
- detected_nct_id : first nct id detected in the article
|
60 |
+
- ctgov_outcomes : List of tuples (type, outcome) of all outcomes detected in the registry
|
61 |
+
"""
|
62 |
+
# download and parse article
|
63 |
+
parse_output = dl_and_parse(article_id)
|
64 |
+
# search nct id in text, then download and parse registry outcomes
|
65 |
+
registry_outcomes = extract_nct_outcomes(parse_output["article_xml"])
|
66 |
+
# filter article sections and get text
|
67 |
+
filter_output = filter_sections(parse_output["article_sections"])
|
68 |
+
sections_text = get_sections_text(filter_output["filtered_sections"])
|
69 |
+
# outcomes ner in article text
|
70 |
+
ner_output = self._extract_article_outcomes(sections_text)
|
71 |
+
# compare outcomes between article and registry
|
72 |
+
connections = self._compare_outcomes(registry_outcomes, ner_output["article_outcomes"])
|
73 |
+
return parse_output | {"ctgov_outcomes":registry_outcomes} | filter_output | ner_output | {"connections":connections}
|
outcome_switch/ctgov.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import ast
|
3 |
+
import requests
|
4 |
+
from typing import Union
|
5 |
+
|
6 |
+
def _find_nctid(text: str) -> Union[str,None]:
|
7 |
+
"return nct string if found in text else none"
|
8 |
+
match = re.search(r"[Nn][Cc][Tt]0*[1-9]\d{0,7}", text)
|
9 |
+
return match[0] if match is not None else match
|
10 |
+
|
11 |
+
def _get_registry_outcomes(nct_id: str) -> Union[dict,None]:
    """Fetch the outcomes module for *nct_id* from the ClinicalTrials.gov v2 API.

    Returns the "outcomesModule" dict, or None when the request fails or the
    study declares no outcomes.
    """
    outcomes = None
    r = requests.get(
        f"https://clinicaltrials.gov/api/v2/studies/{nct_id}",
        params={"fields": "OutcomesModule"},
        timeout=30,  # keep a stalled registry call from blocking the app
    )
    if r.status_code == 200:
        # Parse the JSON body once with r.json(); the previous
        # ast.literal_eval(r.text) cannot handle JSON literals such as
        # true/false/null and re-parsed an already-parsed body.
        payload = r.json()
        outcomes = payload.get("protocolSection", {}).get("outcomesModule")
    return outcomes
|
17 |
+
|
18 |
+
def _reformat_outcomes(outcomes: dict) -> list[dict[str,str]]:
|
19 |
+
new_outcomes = []
|
20 |
+
for outcome_type, outcome_list in outcomes.items() :
|
21 |
+
outcome_type = outcome_type.replace("Outcomes","")
|
22 |
+
for outcome_item in outcome_list :
|
23 |
+
outcome_item["type"] = outcome_type
|
24 |
+
new_outcomes.append(outcome_item)
|
25 |
+
return new_outcomes
|
26 |
+
|
27 |
+
def extract_nct_outcomes(text:str) -> Union[None,list[dict[str,str]]]:
    """Extract registry outcomes for the first NCT id found in *text* using
    the CTGOV API v2.

    Returns the reformatted outcome list, or None when *text* is None, no
    NCT id is present, or the registry returned no outcomes.
    """
    if text is None:
        return None
    nct_id = _find_nctid(text)
    if nct_id is None:
        return None
    outcomes = _get_registry_outcomes(nct_id)
    # Bug fix: the registry fetch may return None (non-200 response, or a
    # study with no outcomes module); reformatting None would raise
    # AttributeError.
    if outcomes is None:
        return None
    return _reformat_outcomes(outcomes)
|
outcome_switch/entrez.py
ADDED
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for fetching and parsing articles from PubMed and PMC using Entrez efetch."""
|
2 |
+
|
3 |
+
from __future__ import annotations
|
4 |
+
import html
|
5 |
+
import requests
|
6 |
+
import unicodedata
|
7 |
+
from abc import ABC, abstractmethod
|
8 |
+
from io import StringIO
|
9 |
+
from pathlib import Path
|
10 |
+
from typing import IO, Any, Dict, Union
|
11 |
+
from xml.etree.ElementTree import Element # nosec
|
12 |
+
from zipfile import ZipFile
|
13 |
+
from typing import Generator
|
14 |
+
from defusedxml import ElementTree
|
15 |
+
|
16 |
+
_ENTREZ_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
17 |
+
|
18 |
+
def _db_parser(article_id:str) -> str|None:
|
19 |
+
"""Parse the article ID to ensure it is in the correct format."""
|
20 |
+
db = None
|
21 |
+
if article_id.startswith('PMC') and article_id[3:].isdigit():
|
22 |
+
db = "pmc"
|
23 |
+
elif article_id.isdigit():
|
24 |
+
db = "pubmed"
|
25 |
+
return db
|
26 |
+
|
27 |
+
def _dl_article_xml(article_id:str, db:str|None) -> str|None :
    """Download the raw article XML for *article_id* via Entrez efetch.

    Returns the XML text on HTTP 200, otherwise None.
    (Annotation fix: the function returns a single optional string, not the
    previously declared tuple.)
    """
    xml_string = None
    params = {"db": db, "id": article_id, "retmode": "xml"}
    # timeout keeps a stalled NCBI call from blocking the app forever
    response = requests.get(_ENTREZ_EFETCH_URL, params=params, timeout=30)
    if response.status_code == 200:
        xml_string = response.text
    return xml_string
|
34 |
+
|
35 |
+
def _parse_article(xml_string:str, db:str) -> Union[None,ArticleParser] :
|
36 |
+
parsed_article = None
|
37 |
+
if db == "pmc":
|
38 |
+
parsed_article = JATSXMLParser.from_string(xml_string)
|
39 |
+
elif db == "pubmed":
|
40 |
+
parsed_article = PubMedXMLParser(xml_string)
|
41 |
+
# check if parsing was successful
|
42 |
+
if not parsed_article.abstract and not parsed_article.paragraphs:
|
43 |
+
parsed_article = None
|
44 |
+
return parsed_article
|
45 |
+
|
46 |
+
def _reformat_article(parsed_article:ArticleParser) -> Dict[str,Any] :
|
47 |
+
reformatted_article = {"Title":[parsed_article.title]}
|
48 |
+
for sec_title,sentence in parsed_article.abstract :
|
49 |
+
sec_title = "Abstract" if sec_title is None else "Abstract - " + sec_title
|
50 |
+
reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence]
|
51 |
+
for sec_title,sentence in parsed_article.paragraphs :
|
52 |
+
reformatted_article[sec_title] = reformatted_article.get(sec_title,[]) + [sentence]
|
53 |
+
return reformatted_article
|
54 |
+
|
55 |
+
|
56 |
+
def dl_and_parse(article_id:str) -> Dict[str,Union[None,Any]]:
|
57 |
+
"""Fetch article from PubMed or PMC using the ID using Entrez efetch
|
58 |
+
and parse it using the appropriate parser. Then returns dict containing keys :
|
59 |
+
article_xml(raw xml of downloaded article) and
|
60 |
+
article_sections (parsed sections in the form of a dictionary with keys as section titles
|
61 |
+
and values as list of text content)"""
|
62 |
+
parse_output = {
|
63 |
+
"db" : None,
|
64 |
+
"article_xml": None,
|
65 |
+
"article_sections": None,
|
66 |
+
}
|
67 |
+
# parse id for correct db format
|
68 |
+
parse_output["db"] = _db_parser(article_id)
|
69 |
+
if parse_output["db"] is None:
|
70 |
+
return parse_output
|
71 |
+
parse_output["article_xml"] = _dl_article_xml(article_id, parse_output["db"])
|
72 |
+
article_parser = _parse_article(parse_output["article_xml"], parse_output["db"])
|
73 |
+
if article_parser is None :
|
74 |
+
return parse_output
|
75 |
+
parse_output["article_sections"] = _reformat_article(article_parser)
|
76 |
+
return parse_output
|
77 |
+
|
78 |
+
class ArticleParser(ABC):
|
79 |
+
"""An abstract base class for article parsers."""
|
80 |
+
|
81 |
+
@property
|
82 |
+
@abstractmethod
|
83 |
+
def title(self) -> str:
|
84 |
+
"""Get the article title.
|
85 |
+
|
86 |
+
Returns
|
87 |
+
-------
|
88 |
+
str
|
89 |
+
The article title.
|
90 |
+
"""
|
91 |
+
|
92 |
+
@property
|
93 |
+
@abstractmethod
|
94 |
+
def abstract(self) -> list[str]:
|
95 |
+
"""Get a sequence of paragraphs in the article abstract.
|
96 |
+
|
97 |
+
Returns
|
98 |
+
-------
|
99 |
+
list of str
|
100 |
+
The paragraphs of the article abstract.
|
101 |
+
"""
|
102 |
+
|
103 |
+
@property
|
104 |
+
@abstractmethod
|
105 |
+
def paragraphs(self) -> list[tuple[str, str]]:
|
106 |
+
"""Get all paragraphs and titles of sections they are part of.
|
107 |
+
|
108 |
+
Returns
|
109 |
+
-------
|
110 |
+
list of (str, str)
|
111 |
+
For each paragraph a tuple with two strings is returned. The first
|
112 |
+
is the section title, the second the paragraph content.
|
113 |
+
"""
|
114 |
+
|
115 |
+
|
116 |
+
class JATSXMLParser(ArticleParser):
|
117 |
+
def __init__(self, xml_stream: IO[Any]) -> None:
|
118 |
+
super().__init__()
|
119 |
+
self.content = ElementTree.parse(xml_stream)
|
120 |
+
if self.content.getroot().tag == "pmc-articleset":
|
121 |
+
self.content = self.content.find("article")
|
122 |
+
|
123 |
+
@classmethod
|
124 |
+
def from_string(cls, xml_string: str) -> JATSXMLParser:
|
125 |
+
with StringIO(xml_string) as stream:
|
126 |
+
obj = cls(stream)
|
127 |
+
return obj
|
128 |
+
|
129 |
+
@classmethod
|
130 |
+
def from_zip(cls, path: str | Path) -> JATSXMLParser:
|
131 |
+
with ZipFile(path) as myzip:
|
132 |
+
xml_files = [
|
133 |
+
x
|
134 |
+
for x in myzip.namelist()
|
135 |
+
if x.startswith("content/") and x.endswith(".xml")
|
136 |
+
]
|
137 |
+
|
138 |
+
if len(xml_files) != 1:
|
139 |
+
raise ValueError(
|
140 |
+
"There needs to be exactly one .xml file inside of content/"
|
141 |
+
)
|
142 |
+
|
143 |
+
xml_file = xml_files[0]
|
144 |
+
|
145 |
+
# Parsing logic
|
146 |
+
with myzip.open(xml_file, "r") as fh:
|
147 |
+
obj = cls(fh)
|
148 |
+
return obj
|
149 |
+
|
150 |
+
@property
|
151 |
+
def title(self) -> str:
|
152 |
+
titles = self.content.find("./front/article-meta/title-group/article-title")
|
153 |
+
return self._element_to_str(titles)
|
154 |
+
|
155 |
+
@property
|
156 |
+
def abstract(self) -> list[tuple[str, str]]:
|
157 |
+
abstract = self.content.find("./front/article-meta/abstract")
|
158 |
+
abstract_list: list[tuple[str, str]] = []
|
159 |
+
if abstract:
|
160 |
+
for sec_title, text in self.parse_section(abstract):
|
161 |
+
abstract_list.append((sec_title,text))
|
162 |
+
return abstract_list
|
163 |
+
|
164 |
+
@property
|
165 |
+
def paragraphs(self) -> list[tuple[str, str]]:
|
166 |
+
paragraph_list: list[tuple[str, str]] = []
|
167 |
+
|
168 |
+
# Paragraphs of text body
|
169 |
+
body = self.content.find("./body")
|
170 |
+
if body:
|
171 |
+
paragraph_list.extend(self.parse_section(body,""))
|
172 |
+
|
173 |
+
# Figure captions
|
174 |
+
figs = self.content.findall("./body//fig")
|
175 |
+
for fig in figs:
|
176 |
+
fig_captions = fig.findall("caption")
|
177 |
+
if fig_captions is None:
|
178 |
+
continue
|
179 |
+
caption = " ".join(self._element_to_str(c) for c in list(fig_captions))
|
180 |
+
if caption:
|
181 |
+
paragraph_list.append(("Figure Caption", caption))
|
182 |
+
|
183 |
+
# Table captions
|
184 |
+
tables = self.content.findall("./body//table-wrap")
|
185 |
+
for table in tables:
|
186 |
+
caption_elements = table.findall("./caption/p") or table.findall(
|
187 |
+
"./caption/title"
|
188 |
+
)
|
189 |
+
if caption_elements is None:
|
190 |
+
continue
|
191 |
+
caption = " ".join(self._element_to_str(c) for c in caption_elements)
|
192 |
+
if caption:
|
193 |
+
paragraph_list.append(("Table Caption", caption))
|
194 |
+
return paragraph_list
|
195 |
+
|
196 |
+
def parse_section(self, section: Element, sec_title_path: str = "") -> Generator[tuple[str, str], None, None]:
|
197 |
+
sec_title = self._element_to_str(section.find("title"))
|
198 |
+
if sec_title == "Author contributions":
|
199 |
+
return
|
200 |
+
sec_title_path = sec_title_path + " - " + sec_title if sec_title_path else sec_title
|
201 |
+
for element in section:
|
202 |
+
if element.tag == "sec":
|
203 |
+
yield from self.parse_section(element, sec_title_path)
|
204 |
+
elif element.tag in {"title", "caption", "fig", "table-wrap", "label"}:
|
205 |
+
continue
|
206 |
+
else:
|
207 |
+
text = self._element_to_str(element)
|
208 |
+
if text:
|
209 |
+
yield sec_title_path, text
|
210 |
+
|
211 |
+
def _inner_text(self, element: Element) -> str:
|
212 |
+
text_parts = [html.unescape(element.text or "")]
|
213 |
+
for sub_element in element:
|
214 |
+
# recursively parse the sub-element
|
215 |
+
text_parts.append(self._element_to_str(sub_element))
|
216 |
+
# don't forget the text after the sub-element
|
217 |
+
text_parts.append(html.unescape(sub_element.tail or ""))
|
218 |
+
return unicodedata.normalize("NFKC", "".join(text_parts)).strip()
|
219 |
+
|
220 |
+
def _element_to_str(self, element: Element | None) -> str:
|
221 |
+
if element is None:
|
222 |
+
return ""
|
223 |
+
|
224 |
+
if element.tag in {
|
225 |
+
"bold",
|
226 |
+
"italic",
|
227 |
+
"monospace",
|
228 |
+
"p",
|
229 |
+
"sc",
|
230 |
+
"styled-content",
|
231 |
+
"underline",
|
232 |
+
"xref",
|
233 |
+
}:
|
234 |
+
# Mostly styling tags for which getting the inner text is enough.
|
235 |
+
# Currently this is the same as the default handling. Writing it out
|
236 |
+
# explicitly here to decouple from the default handling, which may
|
237 |
+
# change in the future.
|
238 |
+
return self._inner_text(element)
|
239 |
+
elif element.tag == "sub":
|
240 |
+
return f"_{self._inner_text(element)}"
|
241 |
+
elif element.tag == "sup":
|
242 |
+
return f"^{self._inner_text(element)}"
|
243 |
+
elif element.tag in {
|
244 |
+
"disp-formula",
|
245 |
+
"email",
|
246 |
+
"ext-link",
|
247 |
+
"inline-formula",
|
248 |
+
"uri",
|
249 |
+
}:
|
250 |
+
return ""
|
251 |
+
else:
|
252 |
+
# Default handling for all other element tags
|
253 |
+
return self._inner_text(element)
|
254 |
+
|
255 |
+
|
256 |
+
class PubMedXMLParser(ArticleParser):
|
257 |
+
"""Parser for PubMed abstract."""
|
258 |
+
|
259 |
+
def __init__(self, data: str | bytes) -> None:
|
260 |
+
super().__init__()
|
261 |
+
self.content = ElementTree.fromstring(data)
|
262 |
+
|
263 |
+
@property
|
264 |
+
def title(self) -> str:
|
265 |
+
title = self.content.find("./PubmedArticle/MedlineCitation/Article/ArticleTitle")
|
266 |
+
if title is None:
|
267 |
+
return ""
|
268 |
+
return "".join(title.itertext())
|
269 |
+
|
270 |
+
@property
|
271 |
+
def abstract(self) -> list[tuple[str,str]]:
|
272 |
+
abstract = self.content.find("./PubmedArticle/MedlineCitation/Article/Abstract")
|
273 |
+
|
274 |
+
if abstract is None:
|
275 |
+
# No paragraphs to parse: stop and return an empty iterable.
|
276 |
+
return [] # noqa
|
277 |
+
|
278 |
+
paragraphs = abstract.iter("AbstractText")
|
279 |
+
abstract_list: list[tuple[str,str]] = []
|
280 |
+
if paragraphs is not None:
|
281 |
+
for paragraph in paragraphs:
|
282 |
+
sec_title = paragraph.get("Label")
|
283 |
+
abstract_list.append((sec_title,"".join(paragraph.itertext())))
|
284 |
+
return abstract_list
|
285 |
+
|
286 |
+
@property
|
287 |
+
def paragraphs(self) -> list[tuple[str, str]]:
|
288 |
+
# No paragraph to parse in PubMed article sets: return an empty iterable.
|
289 |
+
return []
|
290 |
+
|
291 |
+
|
outcome_switch/filter.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
from typing import Dict, List, Any, Tuple

# Raw strings avoid invalid escape sequences ('\s' in a plain string literal
# is a SyntaxWarning in modern Python and will eventually become an error).
STRICT_OUTCOME_REGEX = r'(outcome|end(\s)?point)'
OUTCOME_REGEX = r'(outcome|end(\s)?point|measure|assessment)'

METHOD_REGEX = r'(method|approach|strategy|design|protocol)'
SAMPLE_SIZE_REGEX = r'sample\s(size|number)'
ABSTRACT_REGEX = r'(abstract|summary)'

# Composite patterns: an optional priority qualifier followed by an outcome word.
STRICT_PRIM_SEC_REGEX = f'(primary|secondary|main|)\\s([a-z]+\\s)?{STRICT_OUTCOME_REGEX}'
PRIM_SEC_REGEX = f'(primary|secondary|main|)\\s([a-z]+\\s)?{OUTCOME_REGEX}'
STRICT_METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{STRICT_PRIM_SEC_REGEX}'
METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{PRIM_SEC_REGEX}'

# Ordered filter strategies tried first-to-last: (name, where-to-search, pattern).
# "title" checks the section heading, "content" checks the section body text.
CHECK_PRIORITY = [
    ("strict_method_and_prim_sec","title",STRICT_METHOD_AND_PRIM_SEC_REGEX),
    ("strict_prim_sec","title",STRICT_PRIM_SEC_REGEX),
    ("prim_sec","title",PRIM_SEC_REGEX),
    ("outcome","title",OUTCOME_REGEX),
    ("strict_prim_sec","content",STRICT_PRIM_SEC_REGEX),
    ("prim_sec","content",PRIM_SEC_REGEX),
    ("method_and_prim_sec","title",METHOD_AND_PRIM_SEC_REGEX),
    ("outcome","content",OUTCOME_REGEX),
    ("method","title",METHOD_REGEX),
    ("sample_size","title",SAMPLE_SIZE_REGEX),
    ("abstract","title",ABSTRACT_REGEX),
]
|
29 |
+
|
30 |
+
def filter_sections(sections_dict: Dict[str, List[str]]) -> Dict[str, Any] :
|
31 |
+
"""Filter sections to keep only the ones containing relevant information if the text is a fulltext
|
32 |
+
else keep all sections of abstract
|
33 |
+
|
34 |
+
Args:
|
35 |
+
sections_dict (Dict[str,List[str]]): dictionary containing all sections titles (keys) and their corresponding text content (values)
|
36 |
+
text_type (str): type of text to filter (abstract or fulltext)
|
37 |
+
|
38 |
+
Returns:
|
39 |
+
Dict[str,Any]: dictionary containing the following keys:
|
40 |
+
- filtered_sections: dictionary containing all sections titles (keys) and their corresponding text content (values) that contain relevant information
|
41 |
+
- regex_priority_index: index of the regex used to filter the sections in the CHECK_PRIORITY list
|
42 |
+
- regex_priority_name: name of the regex used to filter the sections in the CHECK_PRIORITY list
|
43 |
+
- check_type: type of check used to filter the sections (title or content)
|
44 |
+
"""
|
45 |
+
filter_output = {
|
46 |
+
"filtered_sections" : None,
|
47 |
+
"regex_priority_index" : None,
|
48 |
+
"regex_priority_name" : None,
|
49 |
+
"check_type" : None,
|
50 |
+
}
|
51 |
+
if not sections_dict:
|
52 |
+
return filter_output
|
53 |
+
# else we filter the sections
|
54 |
+
filter_output["filtered_sections"] = {} # init
|
55 |
+
match_found = False
|
56 |
+
for i, el in enumerate(CHECK_PRIORITY) :
|
57 |
+
priority_name, content_type, current_regex = el
|
58 |
+
current_regex = re.compile(current_regex, re.IGNORECASE)
|
59 |
+
for title, content_list in sections_dict.items() :
|
60 |
+
content = title if content_type == "title" else '\n'.join(content_list)
|
61 |
+
if current_regex.search(content) :
|
62 |
+
filter_output["check_type"] = content_type
|
63 |
+
filter_output["regex_priority_name"] = priority_name
|
64 |
+
filter_output["regex_priority_index"] = i
|
65 |
+
filter_output["filtered_sections"][title] = content_list
|
66 |
+
match_found = True
|
67 |
+
if match_found :
|
68 |
+
break
|
69 |
+
return filter_output
|
70 |
+
|
71 |
+
|
72 |
+
def filter_outcomes(entities: List[Dict[str, Any]]) -> List[Tuple[str,str]]:
    """Collect primary/secondary outcome mentions from NER entity dicts.

    Entities tagged "O" (and any other non-outcome group) are dropped; the
    remaining ones are returned as (outcome_type, text) tuples in input order.
    """
    group_to_type = {"PrimaryOutcome": "primary", "SecondaryOutcome": "secondary"}
    return [
        (group_to_type[entity["entity_group"]], entity["word"])
        for entity in entities
        if entity["entity_group"] in group_to_type
    ]
|
84 |
+
|
85 |
+
def get_sections_text(sections: Dict[str, List[str]]) -> "str | None":
    """Concatenate section titles and their paragraphs into one string.

    Each section contributes its title on one line, followed by its paragraphs
    joined with spaces on the next line.

    Args:
        sections: section titles mapped to lists of paragraph strings.

    Returns:
        The concatenated text, or None when *sections* is empty or None
        (the previous ``-> str`` annotation did not reflect the None case).
    """
    if not sections:
        return None
    # str.join avoids quadratic repeated string concatenation.
    return ''.join(f"{title}\n{' '.join(content)}\n" for title, content in sections.items())
|
outcome_switch/similarity.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
from sentence_transformers.util import cos_sim
|
4 |
+
from transformers import AutoTokenizer, AutoModel
|
5 |
+
|
6 |
+
|
7 |
+
class OutcomeSimilarity:
    """Similarity detector between outcome statements.

    Sentences are embedded with a HuggingFace encoder (mean pooling over token
    embeddings, then L2 normalisation) and compared by cosine similarity.
    """
    # NOTE(review): not referenced inside this class -- presumably kept for
    # callers that map a thresholded score to a label; confirm before removing.
    ID2LABEL = ["different", "similar"]

    def __init__(self, model_path: str):
        # model_path: HuggingFace hub id or local directory of the encoder.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path)

    def _mean_pooling(self, model_output, attention_mask: torch.Tensor) -> torch.Tensor:
        """ Mean Pooling - Take attention mask into account for correct averaging"""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(
            -1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for fully-masked rows
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _encode(self, outcomes_lot: list[tuple[str,str]]) -> torch.Tensor:
        """Embed the sentence part of (type, sentence) tuples into unit vectors."""
        # Parse sentences
        sentences = []
        if len(outcomes_lot) > 0:
            # zip(*...) yields the tuple of sentences; the outcome types are dropped.
            _, sentences = zip(*outcomes_lot)
        # NOTE(review): with an empty outcomes_lot the tokenizer is called on []
        # -- confirm callers never pass an empty list.
        # Tokenize sentences
        encoded_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Perform pooling
        sentence_embeddings = self._mean_pooling(
            model_output, encoded_input['attention_mask'])
        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings

    def get_similarity(
        self,
        registry_outcomes:list[tuple[str,str]],
        article_outcomes:list[tuple[str,str]]
    ) -> set[tuple[int,int,float]]:
        """Match each registry outcome to its most similar article outcome.

        Every registry outcome is connected to its best article match; article
        outcomes left unmatched are then connected to their best registry match.
        Each connection tuple is (registry_index, article_index, cosine_score):
        the registry index is at i=0, the article index at i=1, and the cosine
        similarity score at i=2. Returns a set of such tuples (the previous
        ``list`` annotation was wrong).
        """
        connections = set()
        rembs = self._encode(registry_outcomes)
        aembs = self._encode(article_outcomes)
        cosines_scores = cos_sim(rembs, aembs)
        # Best article outcome for each registry outcome (row-wise argmax) ...
        lines_max = torch.argmax(cosines_scores, dim=1)
        # ... and best registry outcome for each article outcome (column-wise).
        col_max = torch.argmax(cosines_scores, dim=0)
        remaining_cols = set(range(len(col_max)))
        for i in range(len(lines_max)):
            connection = (i, lines_max[i].item(), cosines_scores[i, lines_max[i]].item())
            remaining_cols.discard(lines_max[i].item())
            connections.add(connection)
        # Article outcomes not already covered get their best registry match.
        for j in remaining_cols:
            connection = (col_max[j].item(), j, cosines_scores[col_max[j], j].item())
            connections.add(connection)
        return connections
|
outcome_switch/visual.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import plotly.graph_objects as go
|
3 |
+
from typing import List, Dict, Any, Tuple, Union
|
4 |
+
|
5 |
+
|
6 |
+
# URL templates linking back to the source article (formatted with article_id).
_PUBMED_LINK= "https://pubmed.ncbi.nlm.nih.gov/{article_id}/"
_PMC_LINK = "https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}/"
# Markdown skeleton: linked article title followed by the filtered sections.
_MARKDOWN_TEMPLATE = """# [{article_title}]({article_link})
# Filtered sections :

{sections_md}"""
|
12 |
+
|
13 |
+
# entities highlighted text
def get_highlighted_text(entities:List[Dict[str,Any]], original_text:str) -> List[Tuple[str,Union[str,None]]] :
    """Map model entity spans onto the original text for `gradio.HighlightedText`.

    Each entity becomes a (span_text, label) tuple where the label is None for
    the "O" group, "primary" for PrimaryOutcome and "secondary" for
    SecondaryOutcome; the span text is sliced from original_text by offsets.
    """
    label_of = {"PrimaryOutcome": "primary", "SecondaryOutcome": "secondary"}
    return [
        (
            original_text[ent["start"]:ent["end"]],
            None if ent["entity_group"] == "O" else label_of[ent["entity_group"]],
        )
        for ent in entities
    ]
|
27 |
+
|
28 |
+
# article filtered sections markdown output
def get_article_markdown(
    article_id:str,
    article_sections:dict[str,list[str]],
    filtered_sections:dict[str,list[str]]) -> str:
    """Render a markdown page: linked article title plus the filtered sections."""
    # PMC ids start with "PMC"; anything else is treated as a PubMed id.
    link_template = _PMC_LINK if article_id.startswith("PMC") else _PUBMED_LINK
    online_link = link_template.format(article_id=article_id)
    # get title, abstract, and filtered sections
    article_title = article_sections["Title"][0]
    sections_md = "".join(
        f"## {heading}\n" + " ".join(paragraphs) + "\n"
        for heading, paragraphs in filtered_sections.items()
    )
    return _MARKDOWN_TEMPLATE.format(
        article_link=online_link,
        article_title=article_title,
        sections_md=sections_md
    )
|
48 |
+
|
49 |
+
# registry dataframe display
def _highlight_df_rows(row):
    """Row styler: colour every cell by the row's outcome "type".

    primary -> lightcoral, secondary -> lightgreen, anything else -> lightgrey.
    """
    palette = {"primary": "lightcoral", "secondary": "lightgreen"}
    colour = palette.get(row["type"], "lightgrey")
    return [f"background-color: {colour}"] * len(row)
|
57 |
+
|
58 |
+
def get_registry_dataframe(registry_outcomes: list[dict[str,str]]):
    """Build a colour-coded table of registry outcome records.

    Args:
        registry_outcomes: one dict per outcome (including a "type" key used
            by ``_highlight_df_rows`` to pick the row colour).

    Returns:
        A pandas ``Styler`` with one highlighted row per outcome. (The
        previous ``-> str`` annotation was wrong: ``DataFrame.style.apply``
        returns a Styler, not a string.)
    """
    return pd.DataFrame(registry_outcomes).style.apply(_highlight_df_rows, axis=1)
|
60 |
+
|
61 |
+
# fcts for sankey diagram
def _sent_line_formatting(sentence:str, max_words:int=10) -> str:
    """Wrap a sentence for sankey-node display: at most `max_words` words per line.

    Lines are separated with HTML "<br>" tags, which plotly renders as breaks.
    """
    tokens = sentence.split()
    lines = []
    for start in range(0, len(tokens), max_words):
        lines.append(" ".join(tokens[start:start + max_words]))
    return "<br>".join(lines)
|
68 |
+
|
69 |
+
def _find_entity_score(entity_text, raw_entities):
    """Return the confidence score of the first raw entity whose "word" equals
    entity_text, or None when no entity matches."""
    return next(
        (tc_output["score"] for tc_output in raw_entities if tc_output["word"] == entity_text),
        None,
    )
|
73 |
+
|
74 |
+
def get_sankey_diagram(
    registry_outcomes: list[tuple[str,str]],
    article_outcomes: list[tuple[str,str]],
    connections: set[tuple[int,int,float]],
    raw_entities: list[Dict[str,Any]],
    cosine_threshold: float=0.44,
) -> go.Figure:
    """Build a Sankey figure linking registry outcomes (left) to article outcomes (right).

    Args:
        registry_outcomes: (type, sentence) tuples from the registry.
        article_outcomes: (type, sentence) tuples extracted from the article.
        connections: (registry_index, article_index, cosine_score) tuples.
        raw_entities: raw NER outputs, used to show per-entity confidence on hover.
        cosine_threshold: links scoring above it are coloured as "similar".
    """
    color_map = {
        "primary": "red",
        "secondary": "green",
        "other": "grey",
    }
    # Create lists of formatted sentences and colors for the nodes
    list1 = [(_sent_line_formatting(sent), color_map[typ]) for typ, sent in registry_outcomes]
    list2 = [(_sent_line_formatting(sent), color_map[typ]) for typ, sent in article_outcomes]
    display_connections = [
        (list1[i][0],list2[j][0],"mediumaquamarine") if cosine > cosine_threshold
        else (list1[i][0],list2[j][0],"lightgray") for i,j,cosine in connections
    ]
    # Create a list of labels and colors for the nodes
    labels = [x[0] for x in list1 + list2]
    colors = [x[1] for x in list1 + list2]
    # Create lists of sources and targets for the connections
    # NOTE(review): labels.index() returns the first occurrence — two outcomes
    # with identical formatted text would collapse onto one node; confirm
    # outcome sentences are unique in practice.
    sources = [labels.index(x[0]) for x in display_connections]
    targets = [labels.index(x[1]) for x in display_connections]
    # Create a list of values and colors for the connections
    values = [1] * len(display_connections)
    connection_colors = [x[2] for x in display_connections]

    # data appearing on hover of each node (outcome)
    node_customdata = [f"from: registry<br>type:{t}" for t,_ in registry_outcomes]
    node_customdata += [f"from: article<br>type: {t}<br>confidence: " + str(_find_entity_score(s, raw_entities)) for t,s in article_outcomes]
    node_hovertemplate = "outcome: %{label}<br>%{customdata} <extra></extra>"
    # data appearing on hover of each link (node connections)
    # NOTE: relies on `connections` (a set) iterating in the same order as when
    # display_connections was built above — true within a single run since the
    # set is not mutated in between.
    link_customdata = [cosine for _,_,cosine in connections]
    link_hovertemplate = "similarity: %{customdata} <extra></extra>"
    # sankey diagram data filling
    sankey = go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
            color=colors,
            customdata=node_customdata,
            hovertemplate=node_hovertemplate
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            customdata=link_customdata,
            color=connection_colors,
            hovertemplate=link_hovertemplate
        )
    )
    # conversion to figure
    fig = go.Figure(data=[sankey])
    fig.update_layout(
        title_text="Registry outcomes (left) connections with article outcomes (right), similarity threshold = " + str(cosine_threshold),
        font_size=10,
        width=1200,
        xaxis=dict(rangeslider=dict(visible=True),type="linear")
    )
    return fig
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch==2.4.0 --index-url https://download.pytorch.org/whl/cpu
|
2 |
+
pandas==2.2.2
|
3 |
+
gradio==4.44.0
|
4 |
+
plotly==5.24.0
|
5 |
+
transformers==4.44.2
|
6 |
+
sentence-transformers==3.0.1
|
test/parse_examples/36473651.xml
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test/parse_examples/PMC11102686.xml
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test/test_ctgov.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from outcome_switch.ctgov import find_nctid, get_registry_outcomes, reformat_outcomes
|
3 |
+
|
4 |
+
|
5 |
+
_EMPTY_STRING = ""
# NCT REGEX extraction tests
_TEXT_WITH_ONE_NCT = """blablabla nct id is NCT04647656 blabla"""
_TEXT_WITH_TWO_NCT = """blablabla nct id is NCT04647656 blabla NCT06562582 bla"""
_TEXT_WITHOUT_NCT = "blablabla blablabla"

# NCT REGISTRY API tests
_CORRECT_NCT = "NCT04647656"
# Well-formed-looking id with a non-NCT prefix: must not be accepted.
_INCORRECT_NCT = "PRN03216548"

# REGISTRY outcomes reformatting test
# Fixture presumably mirroring the ClinicalTrials.gov API outcomes payload
# (primaryOutcomes/secondaryOutcomes lists) — verify against the real API.
_CTGOV_OUTCOMES = {
    "primaryOutcomes": [
        {
            "measure": "Cognitive health assessment (NeuroTrax)",
            "description": "Memory, attention and information process will be evaluated using the NeuroTrax computerized cognitive evaluation battery.",
            "timeFrame": "Baseline, 2 months",
        }
    ],
    "secondaryOutcomes": [
        {
            "measure": "Brain perfusion",
            "description": "Cerebral blood volume and flow will be measured using perfusion MRI protocol Dynamic susceptibility contrast (DSC).",
            "timeFrame": "Baseline, 2 months",
        }
    ],
}
|
32 |
+
|
33 |
+
class NctidFinderTest(unittest.TestCase) :
    """find_nctid should return the first NCT id found in a text, else None."""

    def test_text_with_one_nct(self):
        self.assertEqual(find_nctid(_TEXT_WITH_ONE_NCT), "NCT04647656")

    def test_text_with_two_nct(self):
        # When several ids are present, only the first one is expected.
        self.assertEqual(find_nctid(_TEXT_WITH_TWO_NCT), "NCT04647656")

    def test_text_without_nct(self):
        self.assertIsNone(find_nctid(_TEXT_WITHOUT_NCT))

    def test_empty_string(self):
        self.assertIsNone(find_nctid(_EMPTY_STRING))

    def test_none_input(self):
        self.assertIsNone(find_nctid(None))
|
49 |
+
|
50 |
+
class CtgovExtractionTest(unittest.TestCase) :
    """get_registry_outcomes should return data for a valid NCT id, else None.

    NOTE(review): these tests query the live ClinicalTrials.gov API, so they
    need network access and may be slow or flaky offline.
    """

    def test_correct_nct(self):
        self.assertIsNotNone(get_registry_outcomes(_CORRECT_NCT))

    def test_incorrect_nct(self):
        self.assertIsNone(get_registry_outcomes(_INCORRECT_NCT))

    def test_empty_string(self):
        self.assertIsNone(get_registry_outcomes(_EMPTY_STRING))
|
60 |
+
|
61 |
+
class CtgovReformatTest(unittest.TestCase):
    """reformat_outcomes should flatten the ctgov payload into typed records."""

    def test_correct_reformat_outcomes(self):
        # Call once instead of six times: repeating the call per assertion
        # wastes work and obscures which invocation failed.
        reformatted = reformat_outcomes(_CTGOV_OUTCOMES)
        self.assertIsInstance(reformatted, list)
        self.assertEqual(len(reformatted), 2)
        self.assertIsInstance(reformatted[0], dict)
        self.assertIsInstance(reformatted[1], dict)
        # Primary outcomes must come before secondary ones.
        self.assertEqual(reformatted[0]["type"], "primary")
        self.assertEqual(reformatted[1]["type"], "secondary")
|
test/test_entrez.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from outcome_switch.entrez import _dl_article_xml, _parse_article, _reformat_article
|
3 |
+
|
4 |
+
# Efetch tests
_VALID_PMCID = "PMC6206648"
_VALID_PMID_1 = "29283904"
_VALID_PMID_2 = "29214975"
# Inputs that should be rejected: a DOI, and digits that are not a PubMed id.
_INVALID_1 = "10.1056/NEJMoa2110345"
_INVALID_2 = "0123456789"
_EMPTY = ""

# XML Parsing tests files
# TODO : tests for parsing XML files
|
14 |
+
|
15 |
+
|
16 |
+
class EntrezEfetchTest(unittest.TestCase):
    """_dl_article_xml should return (xml, source_db) for valid ids, (None, ...) otherwise.

    NOTE(review): these tests call the live NCBI Entrez efetch endpoint, so
    they need network access and may be slow or flaky offline.
    """

    def test_valid_pmcid(self):
        # Download once per test instead of twice: every call is a network request.
        result = _dl_article_xml(_VALID_PMCID)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pmc")

    def test_valid_pmid1(self):
        result = _dl_article_xml(_VALID_PMID_1)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pubmed")

    def test_valid_pmid2(self):
        result = _dl_article_xml(_VALID_PMID_2)
        self.assertIsNotNone(result[0])
        self.assertEqual(result[1], "pubmed")

    def test_invalid1(self):
        self.assertIsNone(_dl_article_xml(_INVALID_1)[0])

    def test_invalid2(self):
        self.assertIsNone(_dl_article_xml(_INVALID_2)[0])

    def test_empty(self):
        result = _dl_article_xml(_EMPTY)
        self.assertIsNone(result[0])
        self.assertIsNone(result[1])
|
test/test_filter.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from outcome_switch import filter_sections, filter_outcomes, get_sections_text
|
3 |
+
|
4 |
+
# Full-text-style section dict: two outcome sections plus one unrelated section.
_VALID_DICT_WITH_2_SECTIONS = {
    "Methods - Outcomes - Primary outcome": [
        "The FIRST primary outcome is pain at 12months as measured by the VAS. The primary analysis is to assess whether surgical correction of the impingement morphology (arthroscopic osteochondroplasty) with/without labral repair, in adults aged 1850 years diagnosed with FAI, provides decreased pain at 12months compared to arthroscopic lavage of the hip joint with/without labral repair, as measured by the VAS. The VAS is a validated unidimensional scale that is easy to use, requires no verbal or reading skills, and is sufficiently versatile to be employed in a variety of settings [ \n 2 4]."
    ],
    "Methods - Outcomes - Secondary outcomes": [
        "Secondary outcomes include:",
        "Hip function as measured by the Hip Outcome Score (HOS).",
        "Generic physical and mental health as measured by the Short Form-12 (SF-12).",
        "Impact of hip-specific disease on function and lifestyle in the young, active patient as measured by the International Hip Outcome Tool (iHOT-12).",
        "Health utility as measured by the EuroQol (EQ-5D).",
        "Complications, including additional surgery and other serious and non-serious adverse events. Reasons for re-operations for the randomized hip typically include, but are not limited to re-injury of the labrum/cartilage, hip dislocation, hip instability, infection (deep or superficial), wound healing problem, soft tissue problem, and unresolved hip pain. Other hip-related adverse events to be reported include, but are not limited to, hip instability, tendinopathy, re-injury of the labrum/cartilage, hip osteoarthritis post-surgery, and infection (superficial or deep).",
        "The HOS is a self-administered hip score that was designed to capture hip function and outcomes following surgical therapies such as arthroscopy [ \n 5]. The HOS has been shown to have the greatest clinimetric evidence for use in patients with FAI or labral tears [ 6, 7]. The SF-12 may be self-completed or interview-administered and will help document general health status and the burden of illness that FAI presents [ 8]. The iHOT-12 is a shorter version of the iHOT-33 designed to be easier to complete in routine clinical practice to measure both health-related quality of life and changes after treatment in young, active patients with hip disorders [ 9]. This questionnaire has been shown to be valid, reliable, and responsive to change [ 9]. The EQ-5D is a standardized instrument for use as a measure of health outcome [ 10]. The EQ-5D comprises five dimensions of health (mobility, self-care, usual activities, pain/discomfort, and anxiety/depression). The EQ-5D has been used in previous studies involving patients with hip pain and has been extensively validated [ 11, 12].",
    ],
    "Discussion - Analysis plan - Blinded analyses": [
        "All statistical analyses will first be completed using blinded treatment groups (i.e. treatment X and Y). Interpretations for the effect of the surgical interventions will be documented based upon blinded X versus Y treatment [ \n 14]."
    ],
}

# Expected filter_sections output. Derived from the fixture above instead of
# duplicating the long section texts verbatim, so the two cannot drift apart.
_FILTERED_SECTIONS = {
    "filtered_sections": {
        key: _VALID_DICT_WITH_2_SECTIONS[key]
        for key in (
            "Methods - Outcomes - Primary outcome",
            "Methods - Outcomes - Secondary outcomes",
        )
    },
    "regex_priority_index": 0,
    "regex_priority_name": "strict_method_and_prim_sec",
    "check_type": "title",
}


_EMPTY_DICT = {}
|