danielhajialigol committed on
Commit
9901139
1 Parent(s): fa32459

normalized diseases

Browse files
Files changed (1) hide show
  1. utils.py +36 -1
utils.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  import ssl
5
  import torch
6
  import re
 
7
  from pprint import pprint
8
  from captum.attr import visualization
9
 
@@ -21,6 +22,39 @@ class PyTMinMaxScalerVectorized(object):
21
  tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
22
  return tensor
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def get_diseases(text, pipe):
25
  results = pipe(text)
26
  diseases = []
@@ -44,7 +78,8 @@ def get_diseases(text, pipe):
44
  if len(disease_span) > 1:
45
  disease = text[disease_span[0]: disease_span[1]]
46
  diseases.append(disease)
47
- return diseases
 
48
 
49
  def find_end(text):
50
  """Find the end of the report."""
 
4
  import ssl
5
  import torch
6
  import re
7
+ import difflib
8
  from pprint import pprint
9
  from captum.attr import visualization
10
 
 
22
  tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
23
  return tensor
24
 
25
+ def _normalized_diseases(text_list, disease):
26
+ candidates = difflib.get_close_matches(disease, text_list)
27
+ if len(candidates) > 0:
28
+ return candidates[0]
29
+ return ''
30
+
31
def clean_disease_string(disease):
    """Return a lower-cased, punctuation-free, whitespace-normalized string.

    Every character that is neither a word character (``\\w``) nor whitespace
    is removed. Whitespace is normalized *after* that removal, so blanks that
    sat next to stripped punctuation (``"diabetes ."``, ``"heart - disease"``)
    do not survive as trailing or doubled spaces.

    Args:
        disease: Raw disease mention.

    Returns:
        The cleaned string; ``''`` when nothing but punctuation/whitespace
        was supplied.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', disease.lower())
    # split()/join trims both ends and collapses internal whitespace runs.
    return ' '.join(without_punctuation.split())
35
+
36
def normalized_diseases(text, disease_list):
    """Normalize disease mentions against the wording found in ``text``.

    Each mention is split on whitespace and every word is fuzzy-matched
    against the whitespace-separated tokens of ``text`` via
    ``_normalized_diseases``. A mention is kept only when all of its words
    find a match; the matched tokens are joined with single spaces and passed
    through ``clean_disease_string``.

    Args:
        text: The full text the mentions are matched against.
        disease_list: Iterable of disease mention strings (duplicates allowed).

    Returns:
        A list of unique normalized mentions in first-seen order. Mentions
        that cannot be fully matched, or that clean down to an empty string,
        are omitted.
    """
    text_split = text.split()
    # A dict is used as an ordered set: de-duplicating through ``set`` made
    # the output order vary with per-process string hash randomization.
    normalized = {}
    for disease in dict.fromkeys(disease_list):
        disease_words = disease.split()
        if not disease_words:
            continue
        candidates = [
            _normalized_diseases(text_list=text_split, disease=disease_word)
            for disease_word in disease_words
        ]
        # '' is the helper's "no match" sentinel; every word must be matched.
        if not all(candidates):
            continue
        concept = clean_disease_string(' '.join(candidates))
        # Cleaning can strip a punctuation-only match down to nothing.
        if concept:
            normalized[concept] = None
    return list(normalized)
57
+
58
  def get_diseases(text, pipe):
59
  results = pipe(text)
60
  diseases = []
 
78
  if len(disease_span) > 1:
79
  disease = text[disease_span[0]: disease_span[1]]
80
  diseases.append(disease)
81
+ normalized = normalized_diseases(text, diseases)
82
+ return normalized
83
 
84
  def find_end(text):
85
  """Find the end of the report."""