Spaces:
Running
Running
Upload 3 files
Browse files
transformers_rec/configuration.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
STANFORD_COFIGURATION = {
|
2 |
"DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
|
3 |
"PRESIDIO_SUPPORTED_ENTITIES": [
|
@@ -11,7 +13,8 @@ STANFORD_COFIGURATION = {
|
|
11 |
"DEVICE",
|
12 |
"ZIP",
|
13 |
"PROFESSION",
|
14 |
-
"USERNAME"
|
|
|
15 |
|
16 |
],
|
17 |
"LABELS_TO_IGNORE": ["O"],
|
@@ -22,8 +25,8 @@ STANFORD_COFIGURATION = {
|
|
22 |
"DOCTOR": "PERSON",
|
23 |
"PATIENT": "PERSON",
|
24 |
"HOSPITAL": "LOCATION",
|
25 |
-
"MEDICALRECORD": "
|
26 |
-
"IDNUM": "
|
27 |
"ORGANIZATION": "ORGANIZATION",
|
28 |
"ZIP": "ZIP",
|
29 |
"PHONE": "PHONE_NUMBER",
|
@@ -55,6 +58,8 @@ STANFORD_COFIGURATION = {
|
|
55 |
},
|
56 |
"CHUNK_OVERLAP_SIZE": 40,
|
57 |
"CHUNK_SIZE": 600,
|
|
|
|
|
58 |
}
|
59 |
|
60 |
|
@@ -70,6 +75,7 @@ BERT_DEID_CONFIGURATION = {
|
|
70 |
"ZIP",
|
71 |
"PROFESSION",
|
72 |
"USERNAME",
|
|
|
73 |
],
|
74 |
"DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
|
75 |
"LABELS_TO_IGNORE": ["O"],
|
@@ -102,7 +108,7 @@ BERT_DEID_CONFIGURATION = {
|
|
102 |
"LOC": "LOCATION",
|
103 |
"ORG": "ORGANIZATION",
|
104 |
"AGE": "AGE",
|
105 |
-
"ID": "
|
106 |
"EMAIL": "EMAIL",
|
107 |
"PATIENT": "PERSON",
|
108 |
"STAFF": "PERSON",
|
@@ -113,4 +119,6 @@ BERT_DEID_CONFIGURATION = {
|
|
113 |
},
|
114 |
"CHUNK_OVERLAP_SIZE": 40,
|
115 |
"CHUNK_SIZE": 600,
|
|
|
|
|
116 |
}
|
|
|
1 |
+
## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py
|
2 |
+
|
3 |
STANFORD_COFIGURATION = {
|
4 |
"DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
|
5 |
"PRESIDIO_SUPPORTED_ENTITIES": [
|
|
|
13 |
"DEVICE",
|
14 |
"ZIP",
|
15 |
"PROFESSION",
|
16 |
+
"USERNAME",
|
17 |
+
"ID"
|
18 |
|
19 |
],
|
20 |
"LABELS_TO_IGNORE": ["O"],
|
|
|
25 |
"DOCTOR": "PERSON",
|
26 |
"PATIENT": "PERSON",
|
27 |
"HOSPITAL": "LOCATION",
|
28 |
+
"MEDICALRECORD": "ID",
|
29 |
+
"IDNUM": "ID",
|
30 |
"ORGANIZATION": "ORGANIZATION",
|
31 |
"ZIP": "ZIP",
|
32 |
"PHONE": "PHONE_NUMBER",
|
|
|
58 |
},
|
59 |
"CHUNK_OVERLAP_SIZE": 40,
|
60 |
"CHUNK_SIZE": 600,
|
61 |
+
"ID_SCORE_MULTIPLIER": 0.4,
|
62 |
+
"ID_ENTITY_NAME": "ID"
|
63 |
}
|
64 |
|
65 |
|
|
|
75 |
"ZIP",
|
76 |
"PROFESSION",
|
77 |
"USERNAME",
|
78 |
+
"ID"
|
79 |
],
|
80 |
"DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
|
81 |
"LABELS_TO_IGNORE": ["O"],
|
|
|
108 |
"LOC": "LOCATION",
|
109 |
"ORG": "ORGANIZATION",
|
110 |
"AGE": "AGE",
|
111 |
+
"ID": "ID",
|
112 |
"EMAIL": "EMAIL",
|
113 |
"PATIENT": "PERSON",
|
114 |
"STAFF": "PERSON",
|
|
|
119 |
},
|
120 |
"CHUNK_OVERLAP_SIZE": 40,
|
121 |
"CHUNK_SIZE": 600,
|
122 |
+
"ID_SCORE_MULTIPLIER": 0.4,
|
123 |
+
"ID_ENTITY_NAME": "ID"
|
124 |
}
|
transformers_rec/transformers_recognizer.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import copy
|
2 |
import logging
|
3 |
from typing import Optional, List
|
@@ -90,6 +92,8 @@ class TransformersRecognizer(EntityRecognizer):
|
|
90 |
self.default_explanation = None
|
91 |
self.text_overlap_length = None
|
92 |
self.chunk_length = None
|
|
|
|
|
93 |
|
94 |
def load_transformer(self, **kwargs) -> None:
|
95 |
"""Load external configuration parameters and set default values.
|
@@ -104,6 +108,8 @@ class TransformersRecognizer(EntityRecognizer):
|
|
104 |
**CHUNK_SIZE (int) - number of characters in each chunk of text
|
105 |
**LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
|
106 |
**DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
|
|
|
|
|
107 |
"""
|
108 |
|
109 |
self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
|
@@ -113,6 +119,9 @@ class TransformersRecognizer(EntityRecognizer):
|
|
113 |
self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
|
114 |
self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
|
115 |
self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
|
|
|
|
|
|
|
116 |
if not self.pipeline:
|
117 |
if not self.model_path:
|
118 |
self.model_path = "obi/deid_roberta_i2b2"
|
@@ -165,11 +174,14 @@ class TransformersRecognizer(EntityRecognizer):
|
|
165 |
ner_results = self._get_ner_results_for_text(text)
|
166 |
|
167 |
for res in ner_results:
|
168 |
-
|
169 |
-
if not
|
170 |
continue
|
171 |
|
172 |
-
res["entity_group"]
|
|
|
|
|
|
|
173 |
textual_explanation = self.default_explanation.format(res["entity_group"])
|
174 |
explanation = self.build_transformers_explanation(
|
175 |
float(round(res["score"], 2)), textual_explanation, res["word"]
|
@@ -224,33 +236,32 @@ class TransformersRecognizer(EntityRecognizer):
|
|
224 |
model_max_length = self.pipeline.tokenizer.model_max_length
|
225 |
# calculate inputs based on the text
|
226 |
text_length = len(text)
|
227 |
-
|
228 |
-
if text_length
|
229 |
-
|
|
|
230 |
logger.info(
|
231 |
-
f"splitting the text into chunks, length {text_length} > {model_max_length
|
232 |
)
|
233 |
-
|
234 |
chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
|
235 |
text_length, self.chunk_length, self.text_overlap_length
|
236 |
-
|
237 |
-
else:
|
238 |
-
chunk_indexes = [[0, text_length]]
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
|
253 |
-
|
254 |
|
255 |
# remove duplicates
|
256 |
predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
|
@@ -302,27 +313,24 @@ class TransformersRecognizer(EntityRecognizer):
|
|
302 |
)
|
303 |
return explanation
|
304 |
|
305 |
-
def __check_label_transformer(self, label: str) -> str:
|
306 |
"""The function validates the predicted label is identified by Presidio
|
307 |
and maps the string into a Presidio representation
|
308 |
:param label: Predicted label by the model
|
309 |
-
:
|
310 |
-
:return: Returns the predicted entity if the label is found in model_to_presidio mapping dictionary
|
311 |
-
and is supported by Presidio entities
|
312 |
-
:rtype: str
|
313 |
"""
|
314 |
|
315 |
-
if label == "O":
|
316 |
-
return label
|
317 |
-
|
318 |
# convert model label to presidio label
|
319 |
entity = self.model_to_presidio_mapping.get(label, None)
|
320 |
|
|
|
|
|
|
|
321 |
if entity is None:
|
322 |
-
logger.warning(f"Found unrecognized label {label}, returning entity as
|
323 |
-
return
|
324 |
|
325 |
if entity not in self.supported_entities:
|
326 |
logger.warning(f"Found entity {entity} which is not supported by Presidio")
|
327 |
-
return
|
328 |
return entity
|
|
|
1 |
+
# Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py
|
2 |
+
|
3 |
import copy
|
4 |
import logging
|
5 |
from typing import Optional, List
|
|
|
92 |
self.default_explanation = None
|
93 |
self.text_overlap_length = None
|
94 |
self.chunk_length = None
|
95 |
+
self.id_entity_name = None
|
96 |
+
self.id_score_reduction = None
|
97 |
|
98 |
def load_transformer(self, **kwargs) -> None:
|
99 |
"""Load external configuration parameters and set default values.
|
|
|
108 |
**CHUNK_SIZE (int) - number of characters in each chunk of text
|
109 |
**LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
|
110 |
**DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
|
111 |
+
**ID_ENTITY_NAME (str) - name of the ID entity
|
112 |
+
**ID_SCORE_REDUCTION (float) - multiplier applied to the score of ID entities. Defaults to 0.5
|
113 |
"""
|
114 |
|
115 |
self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
|
|
|
119 |
self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
|
120 |
self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
|
121 |
self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
|
122 |
+
self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
|
123 |
+
self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
|
124 |
+
|
125 |
if not self.pipeline:
|
126 |
if not self.model_path:
|
127 |
self.model_path = "obi/deid_roberta_i2b2"
|
|
|
174 |
ner_results = self._get_ner_results_for_text(text)
|
175 |
|
176 |
for res in ner_results:
|
177 |
+
res["entity_group"] = self.__check_label_transformer(res["entity_group"])
|
178 |
+
if not res["entity_group"]:
|
179 |
continue
|
180 |
|
181 |
+
if res["entity_group"] == self.id_entity_name:
|
182 |
+
print(f"ID entity found, multiplying score by {self.id_score_reduction}")
|
183 |
+
res["score"] = res["score"] * self.id_score_reduction
|
184 |
+
|
185 |
textual_explanation = self.default_explanation.format(res["entity_group"])
|
186 |
explanation = self.build_transformers_explanation(
|
187 |
float(round(res["score"], 2)), textual_explanation, res["word"]
|
|
|
236 |
model_max_length = self.pipeline.tokenizer.model_max_length
|
237 |
# calculate inputs based on the text
|
238 |
text_length = len(text)
|
239 |
+
# split text into chunks
|
240 |
+
if text_length <= model_max_length:
|
241 |
+
predictions = self.pipeline(text)
|
242 |
+
else:
|
243 |
logger.info(
|
244 |
+
f"splitting the text into chunks, length {text_length} > {model_max_length}"
|
245 |
)
|
246 |
+
predictions = list()
|
247 |
chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
|
248 |
text_length, self.chunk_length, self.text_overlap_length
|
249 |
+
)
|
|
|
|
|
250 |
|
251 |
+
# iterate over text chunks and run inference
|
252 |
+
for chunk_start, chunk_end in chunk_indexes:
|
253 |
+
chunk_text = text[chunk_start:chunk_end]
|
254 |
+
chunk_preds = self.pipeline(chunk_text)
|
255 |
|
256 |
+
# align indexes to match the original text - add to each position the value of chunk_start
|
257 |
+
aligned_predictions = list()
|
258 |
+
for prediction in chunk_preds:
|
259 |
+
prediction_tmp = copy.deepcopy(prediction)
|
260 |
+
prediction_tmp["start"] += chunk_start
|
261 |
+
prediction_tmp["end"] += chunk_start
|
262 |
+
aligned_predictions.append(prediction_tmp)
|
263 |
|
264 |
+
predictions.extend(aligned_predictions)
|
265 |
|
266 |
# remove duplicates
|
267 |
predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
|
|
|
313 |
)
|
314 |
return explanation
|
315 |
|
316 |
+
def __check_label_transformer(self, label: str) -> Optional[str]:
    """Map a model-predicted label to its Presidio entity name.

    :param label: Predicted label by the model
    :return: ``None`` when the label, or the entity it maps to, is listed in
        ``self.ignore_labels``; the label unchanged when it has no entry in
        ``self.model_to_presidio_mapping``; otherwise the mapped entity name,
        even when it is missing from ``self.supported_entities`` (a warning
        is logged in that case)
    """

    # An ignored raw label must be dropped even when it has no mapping entry;
    # otherwise it would fall through to the "unrecognized label" branch
    # below and be handed back to the caller as a real entity.
    if label in self.ignore_labels:
        return None

    # convert model label to presidio label
    entity = self.model_to_presidio_mapping.get(label, None)

    if entity in self.ignore_labels:
        return None

    if entity is None:
        logger.warning(f"Found unrecognized label {label}, returning entity as is")
        return label

    if entity not in self.supported_entities:
        # Unsupported entities are still returned; the warning is informational.
        logger.warning(f"Found entity {entity} which is not supported by Presidio")
    return entity
|