ljyflores committed on
Commit
2d00e5a
1 Parent(s): 5d4e803
Files changed (5) hide show
  1. app.py +40 -0
  2. requirements.txt +6 -0
  3. terms.json +1 -0
  4. utils_casemaker.py +250 -0
  5. utils_report_parser.py +20 -0
app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ from utils_casemaker import CaseMaker, format_casemaker_data
5
+
6
st.title("Juni Health Patient Casemaker")

# Columns the uploaded CSV must provide; they are passed to format_casemaker_data below.
PATIENT_ID_COLUMN = "patient_id"
DATE_COLUMN = "report_id"
TEXT_COLUMN = "text"


@st.cache_resource
def load_casemaker(terms_path: str = "terms.json") -> CaseMaker:
    """Build the CaseMaker once and reuse it across Streamlit reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the CaseMaker (and the NER model it loads) would be rebuilt each time.
    """
    return CaseMaker(terms_path)


casemaker = load_casemaker("terms.json")

uploaded_file = st.file_uploader("Choose a file")

if uploaded_file is not None:
    # The uploaded file can be used wherever a "file-like" object is accepted
    df = pd.read_csv(uploaded_file)

    # Fail with a readable message instead of a KeyError traceback
    missing_columns = [
        column
        for column in (PATIENT_ID_COLUMN, DATE_COLUMN, TEXT_COLUMN)
        if column not in df.columns
    ]
    if missing_columns:
        st.error(f"Uploaded file is missing required column(s): {missing_columns}")
        st.stop()

    reports = format_casemaker_data(
        df=df,
        patient_id_column=PATIENT_ID_COLUMN,
        date_column=DATE_COLUMN,
        text_column=TEXT_COLUMN,
    )

    # Map the label shown in the radio widget back to the patient ID
    patient_options = {
        f"Patient {patient_id}: {len(patient_reports)} reports": patient_id
        for patient_id, patient_reports in reports.items()
    }
    selected_patient_string = st.radio(
        "Select a Patient ID",
        list(patient_options.keys()),
        key="patient_select_button",
    )

    if st.button("Generate Case", key="task_begin_button"):
        selected_patient_id = patient_options[selected_patient_string]
        summary_by_organ = casemaker.parse_records(reports[selected_patient_id])
        summary_by_organ = casemaker.format_reports(summary_by_organ)

        # Only show organs for which some relevant text was kept
        for chosen_organ, organ_summary in summary_by_organ.items():
            if organ_summary:
                st.header(chosen_organ.capitalize())
                st.write(organ_summary)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ nltk
2
+ numpy==1.24.1
3
+ pandas
4
+ torch==2.0.1
5
+ transformers
6
+ streamlit
terms.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"brain": ["craniotomy", "corticopontine fibers", "lamina terminalis", "superior frontal sulcus", "other", "diagonal band of broca", "gigantocellular reticular nucleus", "cuneus", "intraparietal sulcus", "cross-section of the midbrain.", "retinohypothalamic tract", "parvocellular reticular nucleus", "rostromedial tegmental nucleus", "serotonergic pathways", "precuneus", "superior salivatory nucleus", "lateral spinothalamic tract", "hypoglossal", "forebrain (prosencephalon)", "nigrostriatal pathway", "tuberomammillary nucleus", "botzinger complex", "posterior cingulate", "cerebrum", "spinomesencephalic tract", "posterior part of lateral nucleus", "uncinate fasciculus", "cuneate nucleus", "dorsomedial hypothalamic nucleus", "nucleus ambiguus", "parietal lobe", "part of supraoptic nucleus", "subthalamus (hpa axis)", "pretectum", "vestibulocochlear nuclei (vestibular nuclei and cochlear nuclei) (viii)", "tegmentum", "cingulate cortex", "ventral striatum", "rubro-olivary tract", "third ventricle", "insula", "intralaminar nuclear group", "other nuclei of preoptic area", "caudal ventrolateral medulla", "crus cerebri", "supramarginal gyrus", "tectum", "interpeduncular nucleus", "entorhinal cortex", "anterior perforated substance", "rostral linear nucleus of the raphe", "adenohypophysis", "paracentral nucleus", "extrapyramidal system", "vagus", "perihypoglossal nuclei", "cerebral cortex", "rhomboidal nucleus", "inferior salivatory nucleus", "medial longitudinal fasciculus", "medial lemniscus", "metathalamus", "gyri", "bed nucleus of the stria terminalis", "tuberal region", "mesolimbic pathway", "anteroventral nucleus (a.k.a. 
ventral anterior nucleus)", "pyramidal tract", "ventrolateral prefrontal cortex", "lateral raphespinal tract", "pars intermedia (intermediate lobe)", "ventral pallidum", "habenular nuclei", "nucleus basalis", "stria terminalis", "primary motor cortex", "tuberal", "interstitiospinal tract", "laterodorsal tegmental nucleus", "vestibulospinal tract", "substantia innominata", "lateral corticospinal tract", "abducens", "metencephalon", "caudal linear nucleus", "habenular commissure", "ventral anterior nucleus", "claustrum", "rostral ventrolateral medulla", "primary somatosensory cortex", "superior frontal gyrus", "fourth ventricle", "facial nerve nucleus (vii)", "superior colliculi", "subinsular cortex", "fusiform gyrus", "paranigral nucleus", "parabrachial area", "perforant pathway", "fastigial nucleus", "inferior olivary nucleus", "accessory", "midline nuclear group", "dorsal raphe nucleus", "corticospinal tract or cerebrospinal fibers", "anterior hypothalamic nucleus", "inferior parietal lobule", "optic radiation", "medullary striae of fourth ventricle", "neurohypophysis", "ventral posterior nucleus", "pituitary", "midbrain reticular formation", "basal ganglia", "superior olivary complex", "mesencephalic duct (cerebral aqueduct, aqueduct of sylvius)", "medial nuclear group", "vestibulocochlear", "parts of preoptic area", "pineal body (pineal gland)", "superior parietal lobule", "lateral sulcus", "substantia nigra", "infundibulum", "lateral nuclear group", "inferior temporal gyrus", "inferior temporal cortex", "trochlear nucleus (iv)", "myoclonic triangle", "trochlear", "anterior corticospinal tract", "solitary tract", "corpora quadrigemina", "cerebellar peduncles", "oculomotor nucleus (iii)", "corpus callosum", "rhinencephalon", "globus pallidus", "corticobulbar tract", "pulvinar", "area postrema", "pars reticulata", "hypoglossal nucleus", "caudate nucleus", "diencephalon", "glossopharyngeal", "pars compacta", "anteromedial nucleus", "anterior nuclear group", 
"frontopontine fibers", "subcommissural organ", "lateral geniculate body", "prefrontal cortex", "olfactory tubercle", "occipital lobe", "gracile fasciculus", "tuberal part of lateral nucleus", "anterodorsal nucleus", "posterior inferior temporal cortex", "periventricular nucleus", "parabrachial pigmented nucleus", "anterior commissure", "parafascicular nucleus", "fornix", "medial geniculate body", "middle cerebellar peduncle", "secondary somatosensory cortex", "raphe nuclei", "parafacial zone", "tegmental pontine reticular nucleus", "edinger-westphal nucleus", "putamen", "oculomotor", "amygdala", "parahippocampal gyrus", "trapezoid body", "cerebral hemisphere", "hypothalamus", "ventral respiratory group or apneustic centre", "central sulcus", "pineal", "septal nuclei", "optic tract", "ventromedial prefrontal cortex", "epithalamus", "paraventricular nucleus", "olfactory bulb", "thalamocortical radiations", "retrotrapezoid nucleus", "dentate nucleus", "supplementary motor cortex", "medial superior olive", "posterior commissure", "anterior olfactory nucleus", "chemoreceptor trigger zone", "basal forebrain", "medial parabrachial nucleus", "caudate", "olivary body", "rostral interstitial nucleus of medial longitudinal fasciculus", "reuniens nucleus", "pontine nuclei", "amygdalofugal pathway", "angular gyrus", "nucleus retrofacialis", "mesencephalic cranial nerve nuclei", "olfactory tract", "middle frontal gyrus", "dorsal column\u2013medial lemniscus pathway", "lateral dorsal nucleus", "olfactory", "pedunculopontine nucleus", "precentral gyrus", "interthalamic adhesion", "cranial nerves", "subparabrachial nucleus", "centromedian nucleus", "emboliform nucleus", "vermis", "incertohypothalamic pathway", "cerebral peduncle", "perirhinal cortex", "abducens nucleus (vi)", "cingulate gyrus", "lateral posterior nucleus", "medulla", "superior longitudinal fasciculus", "lateral occipital gyrus", "dorsolateral prefrontal cortex", "spinocerebellar tract", "lateral tuberal nuclei", 
"suprachiasmatic nucleus", "interfascicular nucleus", "cortex", "middle frontal sulcus", "caudal pontine reticular nucleus", "circumventricular organs (also fourth ventricle)", "thalamic reticular nucleus", "lateral superior olive", "corona radiata", "tuber cinereum", "anterior spinothalamic tract", "lateral preoptic nucleus", "globose nucleus", "medial septal nuclei", "spinothalamic tract", "white matter", "olivocerebellar tract", "nucleus para-ambiguus", "mammillary nucleus", "supraoptic nucleus", "taenia thalami", "chief or pontine nucleus of the trigeminal nerve sensory nucleus (v)", "trigeminal", "olivospinal tract", "cerebellar hemispheres", "interhemispheric fissure", "temporal lobe", "premotor cortex", "prepositus nucleus", "internal capsule", "postcentral gyrus (primary somesthetic area)", "subthalamic nucleus", "medullary cranial nerve nuclei", "lateral area", "reticulospinal tract", "nucleus accumbens", "posterior parietal cortex", "frontal lobe", "rubrospinal tract", "posterior lobe", "midbrain (mesencephalon)", "periamygdaloid cortex", "tectospinal tract", "inferior frontal gyrus", "respiratory center-respiratory groups", "pons", "cerebellar nuclei", "subfornical organ", "medial dorsal nucleus", "stria medullaris", "tuberoinfundibular pathway", "medial nucleus of the trapezoid body", "thalamus", "superior cerebellar peduncle", "longitudinal cerebral fissure", "spinoreticular tract", "piriform cortex", "pontine cranial nerve nuclei", "pontine micturition center (barrington's nucleus)", "paramedian reticular nucleus", "motor cortex", "striatum", "motor nucleus for the trigeminal nerve (v)", "periaqueductal gray", "extreme capsule", "facial", "blood brain barrier", "mesocortical pathway", "paratenial nucleus", "orbitofrontal cortex", "anterior cingulate", "vascular organ of lamina terminalis", "periventricular preoptic nucleus", "red nucleus", "optic chiasm", "external capsule", "anterior lobe", "sublingual nucleus", "temporopontine fibers", "ventral 
posterior lateral nucleus", "corticomesencephalic tract", "precentral sulcus", "cuneate fasciculus", "dentate gyrus", "myelencephalon", "tuberal nucleus", "medial forebrain bundle", "ventral nuclear group", "cingulate sulcus", "centrum semiovale", "dorsomedial prefrontal cortex", "dorsal nucleus of vagus nerve", "inferior colliculi", "arcuate fasciculus", "cerebellum", "dorsal respiratory group", "gracile nucleus", "median eminence", "anterior part of lateral nucleus", "nucleus retroambiguus", "brain stem", "central lateral nucleus", "lateral lemniscus", "insular cortex", "interposed nucleus", "flocculonodular lobe", "ventromedial nucleus", "pontine respiratory group", "cerebrospinal fluid", "zona incerta", "postcentral sulcus", "intercalated nucleus", "medullary pyramids", "ventral posterior medial nucleus", "mammillotegmental fasciculus", "postcentral gyrus", "locus coeruleus", "nucleus incertus", "sylvian fissure", "median preoptic nucleus", "cerebellar vermis", "medial preoptic nucleus", "indusium griseum", "ventral lateral nucleus", "mammillary bodies", "pontine tegmentum", "lateral vestibulospinal tract", "superior temporal gyrus", "lateral parabrachial nucleus", "optic", "middle temporal gyrus", "subcortical", "major dopaminergic pathways from dopaminergic cell groups", "paramedian pontine reticular formation", "retrosplenial cortex", "medulla oblongata", "mammillary nuclei (part of mammillary bodies)", "inferior cerebellar peduncle", "spino-olivary tract", "uncus", "arcuate nucleus", "medial vestibulospinal tract", "solitary nucleus (nucleus of the solitary tract)", "ventral tegmental area"], "spine": [], "meninges": ["posterior horn", "leptomeningeal", "angular bundle", "foramen of magendie", "subdural", "fourth ventricle", "subarachnoid", "pia", "arachnoid septum", "ventricular system", "subarachnoid space", "interventricular foramina", "epidural space", "pia mater", "meningeal coverings", "pontine cistern", "calcar avis", "superior cistern", "dura 
mater", "arachnoid mater", "lateral ventricles", "arachnoid", "subdural space", "chiasmatic cistern", "cisterna magna", "cistern of lamina terminalis", "body of lateral ventricle", "foramina of luschka", "foramina", "inferior horn", "pachymeningeal", "subventricular zone", "interpeduncular cistern", "dura", "spinal subarachnoid space", "cerebral aqueduct", "anterior horn", "third ventricle"], "vascular": ["circle of willis", "circumventricular organs", "blood brain barrier", "middle cerebral artery", "basilar artery", "m4", "v2", "p2", "v4", "a1", "m2", "m3", "superior sagittal sinus", "p4", "p3", "v1", "m1", "anterior cerebral artery", "p1", "vertebral artery", "posterior cerebral artery", "a2", "glymphatic system", "v3", "a3"], "head": ["neck", "pharyngeal mucosal space", "masticator space", "carotid space", "oral cavity", "hypopharynx", "pharynx", "parotid space", "larynx", "parapharyngeal space", "nasopharynx", "perivertebral space", "oropharynx", "nasal cavity", "retropharyngeal space"], "liver": ["intrahepatic", "hepatic", "cirrhosis"], "biliary": ["right hepatic duct", "common bile duct", "gallbladder", "common hepatic duct", "left hepatic duct"], "spleen": ["splenic"], "pancreas": ["wirsung", "main pancreatic duct", "pancreatic "], "adrenal glands": ["adrenal", "pheochromocystoma", "adrenal adenoma"], "urinary system": ["renal cell carcinoma", "ureter", "renal angiomyolipoma", "renal", "kidneys", "oncocytoma", "bladder", "kidney", "urinary bladder", "rcc"], "gastrointestinal": ["small intestine", "duodenum", "ileum", "appendix", "esophagus", "descending colon", "transverse colon", "sigmoid", "ascending colon", "stomach", "cecum", "pylorus", "jejunum", "rectum", "colon", "antrum", "large intestine", "anus"], "peritoneum": ["pneumoperitoneum", "peritonitis", "ascites", "peritoneal carcinomatosis"], "retroperitoneum": [], "pelvis": ["ovary", "uterus", "uterine tubes", "fallopean tubes", "parametrium", "endometrium", "ovaries"], "lung": ["left lung", "right 
middle lobe", "right upper lobe", "right lung", "left upper lobe", "lingula", "right lower lobe", "left lower lobe"], "airway": ["bronchus intermedius", "bronchi", "main stem bronchus", "small airways", "trachea", "carina", "central airways"], "pleura": ["pneumothorax", "pleural effusion"], "mediastinum": ["mediastinal", "thymic", "thymus"], "heart": ["cardiac", "left atrium", "left ventricle", "right atrium", "right ventricle"], "breast": ["mammogram", "nipple", "mammary", "mammography", "subareolar"], "upper extremity": ["ulnar", "wrist", "radial", "carpal", "ulna", "metacarpal", "humerus", "radius", "hand", "shoulder", "elbow"], "lower extremity": ["femoral", "femur"], "vascular": ["superior mesenteric vein", "superior mesenteric artery", "aneurysm", "inferior mesenteric artery", "common femoral vein", "aorta", "sma", "common femoral artery", "iliac artery", "gda", "external iliac artery", "proper hepatic artery", "renal arteries", "ima", "renal artery", "common iliac artery", "common hepatic artery", "gastroduodenal artery", "splenic vein", "celiac axis", "internal iliac artery", "celiac artery", "portal vein", "celiac trunk"], "lymphatic": ["lymphatics", "lad", "lymphadenopathy", "lymph node", "lymph nodes"], "soft tissues": ["soft tissues", "dermis", "subcutaneous fat", "epidermis", "skin"]}
utils_casemaker.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ import re
4
+
5
+ from dataclasses import dataclass
6
+ from nltk.tokenize import sent_tokenize
7
+ from typing import Dict, List, Sequence
8
+ from utils_report_parser import get_section_from_report
9
+
10
+ from transformers import (
11
+ AutoModelForTokenClassification,
12
+ AutoTokenizer,
13
+ pipeline,
14
+ )
15
+
16
+ @dataclass
17
+ class Report:
18
+ patient_id: str|int
19
+ text: str
20
+ date: str
21
+ summary: str|None = None
22
+
23
def clean(s: str) -> str:
    """Flatten a text snippet into one tidy line.

    Newlines become spaces, underscores are dropped, bracketed ``[...]`` and
    parenthesised ``(...)`` spans are removed, and runs of whitespace are
    collapsed to a single space.
    """
    flattened = s.replace("\n", " ").replace("_", "")
    # Brackets are stripped before parentheses, one pattern at a time
    for pattern in (r"\[.*?\]", r"\(.*?\)"):
        flattened = re.sub(pattern, "", flattened)
    return " ".join(flattened.split())
30
+
31
+
32
def split_paragraphs(text: str) -> List[str]:
    """Split text on blank lines, clean each paragraph, and keep the long ones.

    A paragraph is kept only if, after cleaning, it has more than 10
    whitespace-separated words.
    """
    cleaned_paragraphs = (clean(chunk) for chunk in text.split("\n\n"))
    return [
        paragraph for paragraph in cleaned_paragraphs if len(paragraph.split()) > 10
    ]
37
+
38
+
39
def format_casemaker_data(
    df: pd.DataFrame, patient_id_column: str, text_column: str, date_column: str
) -> Dict[str, List["Report"]]:
    """Group report-level rows into per-patient lists of Report objects.

    Takes a pandas dataframe where each row corresponds to one report for a
    patient and returns a dictionary keyed by patient ID (as a string); each
    value is the list of that patient's reports sorted by the date column.

    Args:
        df (pd.DataFrame): Input dataframe on report level
        patient_id_column (str): Name of the column holding the patient ID
        text_column (str): Name of the column holding the report text
        date_column (str): Name of the column used to sort each patient's reports

    Returns:
        Dict[str, List[Report]]: Reports for each patient, in sorted order

    Raises:
        KeyError: If any of the three named columns is missing from ``df``
    """
    # Select the three columns first so that renaming cannot collide with an
    # unrelated column that already happens to be called "text" or "date".
    selected = df[[patient_id_column, text_column, date_column]].rename(
        columns={
            patient_id_column: "patient_id",
            text_column: "text",
            date_column: "date",
        }
    )
    selected = selected.sort_values(by=["patient_id", "date"])

    reports_by_patient: Dict[str, List["Report"]] = {}
    # Iterating the groups directly avoids groupby.apply and its special cases
    for patient_id, patient_rows in selected.groupby("patient_id"):
        reports_by_patient[str(patient_id)] = [
            Report(**record) for record in patient_rows.to_dict("records")
        ]
    return reports_by_patient
70
+
71
+
72
class CaseMaker:
    """Split patient reports by organ and keep only the clinically relevant sentences.

    Organs are recognised through a keyword dictionary loaded from JSON; relevant
    sentences are those in which the NER pipeline finds a sign/symptom or
    disease/disorder entity.
    """

    def __init__(self, organ_keywords_dict_path: str = "../assets/terms.json"):
        """Load the organ keyword dictionary and build the NER pipeline.

        Args:
            organ_keywords_dict_path (str): Path to a JSON file mapping each organ
                name to a list of keywords
        """
        # Context manager so the file handle is closed instead of leaked
        with open(organ_keywords_dict_path, "r", encoding="utf-8") as keyword_file:
            self.organ_keyword_dict = json.load(keyword_file)

        self.ner_pipe = pipeline(
            "ner",
            model=AutoModelForTokenClassification.from_pretrained(
                "d4data/biomedical-ner-all"
            ),
            tokenizer=AutoTokenizer.from_pretrained("d4data/biomedical-ner-all"),
            aggregation_strategy="simple",
            device_map="auto",
        )
        # NOTE: the summarization pipeline is not loaded here; summarize_report
        # needs self.summ_pipe to be assigned before it can be used.
        # self.summ_pipe = pipeline(
        #     "text2text-generation", model="starmpcc/Asclepius-7B", device_map="auto"
        # )

    def standardize_organ(self, organ_entity: Dict) -> Dict:
        """Given an entity, map its name to a set of recognized entities provided in
        organ_keyword_dict if it matches any of the keywords; otherwise set it as "Other"

        Args:
            organ_entity (Dict): Dictionary corresponding to entity; should contain "word" key
                which is the entity

        Returns:
            Dict: Same dictionary (mutated in place) where the "word" key has been updated to
                either a standard body organ or "Other"; unmatched entities also get
                "score" set to 0.0
        """
        word = organ_entity["word"].lower()
        # If the organ matches any of the keys or their synonyms, replace the name and return
        for organ, keywords in self.organ_keyword_dict.items():
            if word == organ.lower() or word in keywords:
                organ_entity["word"] = organ
                return organ_entity

        # Otherwise, it's a bad match so set the score to 0 and return other
        organ_entity["word"] = "Other"
        organ_entity["score"] = 0.0
        return organ_entity

    def pick_organ_by_keyword(self, s: str):
        """Return the first organ whose name or keywords occur in the text.

        Matching is a case-insensitive substring test, and organs are tried in
        the order they appear in organ_keyword_dict.

        Args:
            s (str): Text to inspect

        Returns:
            str: The matching organ name, or "other" if nothing matches
        """
        text = s.lower()
        for organ, keywords in self.organ_keyword_dict.items():
            if any(keyword.lower() in text for keyword in [organ, *keywords]):
                return organ
        return "other"

    def parse_report_by_organ(self, report: str):
        """Take in a text report and output a dictionary mapping each body organ
        to the text of all paragraphs assigned to that organ

        Args:
            report (str): Input report

        Returns:
            Dict[str, str]: For each organ, its paragraphs joined by a single space
        """
        paragraphs_by_organ: Dict[str, List[str]] = {}

        # Collect the paragraphs related to each organ
        for paragraph in split_paragraphs(report):
            selected_organ = self.pick_organ_by_keyword(paragraph)
            paragraphs_by_organ.setdefault(selected_organ, []).append(paragraph)

        # Join with a space so the last sentence of one paragraph is not fused
        # with the first word of the next
        return {
            organ: " ".join(organ_paragraphs)
            for organ, organ_paragraphs in paragraphs_by_organ.items()
        }

    def trim_to_relevant_portion(self, report: str):
        """Keep only the sentences of the findings section that mention a
        sign/symptom or disease/disorder entity.

        Args:
            report (str): Input report text

        Returns:
            str: The kept sentences joined by newlines (empty string if none)
        """
        # Cut the report to the findings
        report = get_section_from_report(report, "findings")

        # Only keep sentences with symptoms and disease descriptions
        relevant_groups = ("Sign_symptom", "Disease_disorder")
        relevant_sentences = [
            sentence
            for sentence in sent_tokenize(report)
            if any(
                ent["entity_group"] in relevant_groups
                for ent in self.ner_pipe(sentence)
            )
        ]
        return "\n".join(relevant_sentences)

    def summarize_report(self, text: str) -> str:
        """Format text into prompt and summarize clinical text

        Args:
            text (str): Input report

        Returns:
            str: Cleaned model output following the instruction block

        Raises:
            AttributeError: If no summarization pipeline has been assigned to
                ``self.summ_pipe``
        """
        summ_pipe = getattr(self, "summ_pipe", None)
        if summ_pipe is None:
            raise AttributeError(
                "CaseMaker.summ_pipe is not set; assign a text-generation "
                "pipeline to it before calling summarize_report"
            )

        question = (
            "Can you provide a succinct summary of the key clinical findings "
            "and treatment recommendations outlined in this discharge summary?"
        )

        # NOTE(review): the template text (including the spelling "languge") is kept
        # exactly as it was; presumably it mirrors the model's expected prompt - confirm
        # before editing. Lines are written without leading indentation.
        prompt = (
            "\n"
            "You are an intelligent clinical languge model.\n"
            "Below is a snippet of patient's discharge summary and a following instruction from healthcare professional.\n"
            "Write a response that appropriately completes the instruction.\n"
            "The response should provide the accurate answer to the instruction, while being concise.\n"
            "\n"
            "[Discharge Summary Begin]\n"
            "{note}\n"
            "[Discharge Summary End]\n"
            "\n"
            "[Instruction Begin]\n"
            "{question}\n"
            "[Instruction End]\n"
        ).format(question=question, note=text)

        # Allow at least one new token so very short inputs do not request zero
        max_new_tokens = max(1, len(text.split()) // 2)
        output = summ_pipe(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
        # Keep only what follows the echoed prompt
        answer = output.split("[Instruction End]")[-1]
        return clean(answer)

    def parse_records(
        self,
        reports: Sequence["Report"],
    ):
        """Given a list of reports, split each of them by body part using
        parse_report_by_organ, then compile all the text for the same organ
        across different reports and trim it to the relevant sentences

        Args:
            reports (Sequence[Report]): Reports for one patient; each must provide
                "text", "date" and "patient_id"

        Returns:
            Dict[str, List[Report]]: For each organ, the organ-level reports whose
                trimmed text is non-empty, with that text stored in "summary".
                Organs with no relevant text map to an empty list.
        """
        # For each organ, collect a list of relevant records containing the text and date
        reports_by_organ: Dict[str, List["Report"]] = {}
        for report in reports:
            for organ, report_text in self.parse_report_by_organ(report.text).items():
                organ_level_record = Report(
                    text=report_text, date=report.date, patient_id=report.patient_id
                )
                reports_by_organ.setdefault(organ, []).append(organ_level_record)

        # For each organ, then filter only to the relevant reports and summarize them
        summarized_reports_by_organ: Dict[str, List["Report"]] = {}
        for organ, organ_reports in reports_by_organ.items():
            cleaned_reports = []
            for organ_report in organ_reports:
                # Trim the report
                report_text = self.trim_to_relevant_portion(organ_report.text)
                if report_text:
                    organ_report.summary = report_text
                    cleaned_reports.append(organ_report)
            summarized_reports_by_organ[organ] = cleaned_reports

        return summarized_reports_by_organ

    def format_reports(self, all_reports: Dict[str, Sequence["Report"]]) -> Dict[str, str]:
        """Render each organ's reports as one Markdown string.

        Args:
            all_reports (Dict[str, Sequence[Report]]): Reports per organ; each report
                must provide "date" and "summary"

        Returns:
            Dict[str, str]: For each organ, a "**Report <date>**" heading followed by
                the summary for every report, separated by blank lines
        """
        return {
            organ: "\n\n".join(
                f"**Report {str(r.date)}**\n\n{str(r.summary)}" for r in organ_reports
            )
            for organ, organ_reports in all_reports.items()
        }
utils_report_parser.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def get_section_from_report(report: str, section: str):
    """Return the part of a report that follows a section header.

    The header is looked up in two steps:
      1. "<section>:" matched case-insensitively;
      2. the upper-cased section name without a colon (e.g. "FINDINGS").
    If neither is present the whole report is returned unchanged.

    Args:
        report (str): Full report text
        section (str): Section name, e.g. "findings"

    Returns:
        str: Text after the first matching header, or the full report when no
            header is found
    """
    # Check the result of find() BEFORE adding the header length; adding first
    # hides the -1 "not found" sentinel and chops the start of the report.
    header_with_colon = f"{section.lower()}:"
    start_idx = report.lower().find(header_with_colon)
    if start_idx != -1:
        return report[start_idx + len(header_with_colon):]

    header_upper = section.upper()
    start_idx = report.find(header_upper)
    if start_idx != -1:
        return report[start_idx + len(header_upper):]

    return report