danielhajialigol committed · Commit 9901139
1 Parent(s): fa32459

normalized diseases

utils.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 import ssl
 import torch
 import re
+import difflib
 from pprint import pprint
 from captum.attr import visualization
 
@@ -21,6 +22,39 @@ class PyTMinMaxScalerVectorized(object):
         tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
         return tensor
 
+def _normalized_diseases(text_list, disease):
+    candidates = difflib.get_close_matches(disease, text_list)
+    if len(candidates) > 0:
+        return candidates[0]
+    return ''
+
+def clean_disease_string(disease):
+    disease = disease.strip().lower()
+    disease = re.sub(r'[^\w\s]','',disease)
+    return disease
+
+def normalized_diseases(text, disease_list):
+    disease_list = list(set(disease_list))
+    text_split = text.split()
+    normalized = []
+    for disease in disease_list:
+        # case when the disease is one word
+        if ' ' not in disease:
+            candidate = _normalized_diseases(disease=disease, text_list=text_split)
+            if len(candidate) > 0:
+                candidate = clean_disease_string(candidate)
+                normalized.append(candidate)
+        else:
+            concept = ''
+            for disease_word in disease.split():
+                candidate = _normalized_diseases(text_list=text_split, disease=disease_word)
+                if len(candidate) > 0:
+                    concept += (candidate + ' ')
+            if len(concept.split()) == len(disease.split()):
+                concept = clean_disease_string(concept)
+                normalized.append(concept)
+    return list(set(normalized))
+
 def get_diseases(text, pipe):
     results = pipe(text)
     diseases = []
@@ -44,7 +78,8 @@ def get_diseases(text, pipe):
         if len(disease_span) > 1:
            disease = text[disease_span[0]: disease_span[1]]
            diseases.append(disease)
-
+    normalized = normalized_diseases(text, diseases)
+    return normalized
 
 def find_end(text):
     """Find the end of the report."""
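
For review, a minimal, self-contained sketch of the matching primitive behind the new _normalized_diseases helper. The report sentence and disease strings below are hypothetical examples, not data from this Space; the snippet needs only the Python standard library and uses difflib.get_close_matches with its defaults (n=3, cutoff=0.6), as the helper does.

# Hypothetical report text; word-level matching as done by _normalized_diseases.
import difflib

text_split = "Patient has a history of diabetes mellitus and mild hypertension.".split()

# Case and trailing punctuation differences still clear the default 0.6 cutoff ...
print(difflib.get_close_matches("Diabetes", text_split))      # ['diabetes']
print(difflib.get_close_matches("hypertension", text_split))  # ['hypertension.']
# ... while a disease that never appears in the report yields no candidate.
print(difflib.get_close_matches("asthma", text_split))        # []

On top of this, normalized_diseases(text, ['Diabetes Mellitus', 'hypertension', 'asthma']) should come out as roughly ['diabetes mellitus', 'hypertension'] (order not guaranteed, since duplicates are dropped via set()): multi-word diseases are matched word by word, clean_disease_string lowercases and strips punctuation, and terms with no close match in the report are discarded. get_diseases now returns this normalized list. Relying on difflib keeps the normalization dependency-free, though short words can be sensitive to the 0.6 cutoff.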