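# Spaces: Sleeping
# NOTE(review): the line above appears to be page-status text captured together
# with this file from a web view; it is not part of the program.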
import logging
from typing import List, Dict, Optional, Tuple, Type

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import (
    NerModelConfiguration,
    NlpEngine,
    NlpEngineProvider,
    TransformersNlpEngine,
)
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import (
    OperatorConfig,
)
logger = logging.getLogger(__name__)


class PiiMaskingService:
    """Detect and anonymize PII in English text with Presidio.

    ``analyze`` builds a Presidio ``AnalyzerEngine`` backed by either a
    HuggingFace Transformers model or a Flair model and returns the detected
    entities; ``anonymize`` runs that analysis and then applies a single
    anonymization operator to every detected entity.
    """

    # Entity types requested from the analyzer on every call.
    ENTITIES_TO_RECOGNIZE = [
        'UK_NHS', 'EMAIL', 'AU_ABN', 'CRYPTO', 'ID', 'URL',
        'AU_MEDICARE', 'IN_PAN', 'ORGANIZATION', 'IN_AADHAAR',
        'SG_NRIC_FIN', 'EMAIL_ADDRESS', 'AU_ACN', 'US_DRIVER_LICENSE',
        'IP_ADDRESS', 'DATE_TIME', 'LOCATION', 'PERSON', 'CREDIT_CARD',
        'IBAN_CODE', 'US_BANK_NUMBER', 'PHONE_NUMBER', 'MEDICAL_LICENSE',
        'US_SSN', 'AU_TFN', 'US_PASSPORT', 'US_ITIN', 'NRP', 'AGE',
        'GENERIC_PII',
    ]

    # NOTE(security): this key is hard-coded in source, so anything encrypted
    # with the "encrypt" operator can be decrypted by anyone who can read this
    # file. It is kept only for backward compatibility; it should be supplied
    # from secret configuration instead.
    ENCRYPT_KEY = "WmZq4t7w!z%C&F)J"

    # Supported ``model`` values -> (name of the factory method that builds
    # the NLP engine and registry, model path handed to that factory).
    _MODEL_LOADERS = {
        "HuggingFace/obi/deid_roberta_i2b2": (
            "create_nlp_engine_with_transformers",
            "obi/deid_roberta_i2b2",
        ),
        "flair/ner-english-large": (
            "create_nlp_engine_with_flair",
            "flair/ner-english-large",
        ),
    }

    def analyze(self, text: str, model: str) -> list:
        """Run PII detection on ``text`` and return the analyzer results.

        :param text: English text to scan.
        :param model: One of the keys of ``_MODEL_LOADERS``.
        :return: The list returned by ``AnalyzerEngine.analyze``.
        :raises ValueError: If ``model`` is not a supported model name.
        """
        loader = self._MODEL_LOADERS.get(model)
        if loader is None:
            # Fail with a clear message instead of hitting unbound locals.
            supported = ", ".join(sorted(self._MODEL_LOADERS))
            raise ValueError(
                f"Unsupported model {model!r}; expected one of: {supported}"
            )

        factory_name, model_path = loader
        # NOTE: the engine and registry are rebuilt on every call, which
        # reloads the underlying model each time.
        nlp_engine, registry = getattr(self, factory_name)(model_path)
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
        results = analyzer.analyze(
            text=text,
            entities=list(self.ENTITIES_TO_RECOGNIZE),
            language='en',
        )
        logger.debug("analyzer results: %s", results)
        return results

    def anonymize(self, text: str, operator: str, model: str) -> str:
        """Detect PII in ``text`` and anonymize it with a single operator.

        :param text: English text to anonymize.
        :param operator: Anonymization operator name. ``"mask"``,
            ``"encrypt"`` and ``"highlight"`` get a predefined configuration
            (see ``_resolve_operator``); any other name is passed to Presidio
            unchanged, with no parameters.
        :param model: Model name, as accepted by ``analyze``.
        :return: The anonymized text.
        :raises ValueError: If ``model`` is not a supported model name.
        """
        operator_name, operator_params = self._resolve_operator(operator)
        analyzer_results = self.analyze(text, model)

        engine = AnonymizerEngine()
        # "DEFAULT" applies the same operator to every detected entity type.
        result = engine.anonymize(
            text=text,
            operators={
                "DEFAULT": OperatorConfig(operator_name, operator_params)
            },
            analyzer_results=analyzer_results,
        )
        logger.debug("anonymizer result: %s", result)
        return result.text

    def _resolve_operator(self, operator: str) -> tuple:
        """Map a requested operator to ``(presidio_operator_name, params)``."""
        if operator == "mask":
            params = {
                "type": "mask",
                "masking_char": "*",
                "chars_to_mask": 15,
                "from_end": False,
            }
            return "mask", params
        if operator == "encrypt":
            return "encrypt", {"key": self.ENCRYPT_KEY}
        if operator == "highlight":
            # "highlight" is implemented as Presidio's "custom" operator with
            # an identity function.
            return "custom", {"lambda": lambda x: x}
        return operator, None

    def create_nlp_engine_with_flair(
        self,
        model_path: str,
    ) -> "Tuple[NlpEngine, RecognizerRegistry]":
        """
        Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
        The FlairRecognizer would return results from Flair models, the spaCy model
        would return NlpArtifacts such as POS and lemmas.
        :param model_path: Flair model path.
        :return: The spaCy-based NLP engine and a registry holding the
            predefined recognizers plus the Flair recognizer, with
            "SpacyRecognizer" removed.
        """
        # Imported here, so flair_recognizer is only needed when this
        # factory is actually called.
        from flair_recognizer import FlairRecognizer

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers()

        # There is no official Flair NlpEngine, hence Flair is loaded as an
        # additional recognizer next to a small spaCy model.
        flair_recognizer = FlairRecognizer(model_path=model_path)
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        }
        registry.add_recognizer(flair_recognizer)
        registry.remove_recognizer("SpacyRecognizer")

        provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
        nlp_engine = provider.create_engine()
        return nlp_engine, registry

    def create_nlp_engine_with_transformers(
        self,
        model_path: str,
    ) -> "Tuple[NlpEngine, RecognizerRegistry]":
        """
        Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
        The TransformersRecognizer would return results from Transformers models, the spaCy model
        would return NlpArtifacts such as POS and lemmas.
        :param model_path: HuggingFace model path.
        :return: The transformers NLP engine and a registry of predefined
            recognizers loaded with that engine.
        """
        logger.info(
            "Loading Transformers model: %s of type %s",
            model_path,
            type(model_path),
        )

        nlp_configuration = {
            "nlp_engine_name": "transformers",
            "models": [
                {
                    "lang_code": "en",
                    "model_name": {
                        "spacy": "en_core_web_sm",
                        "transformers": model_path,
                    },
                }
            ],
            "ner_model_configuration": {
                # Model label -> Presidio entity name.
                "model_to_presidio_entity_mapping": {
                    "PER": "PERSON",
                    "PERSON": "PERSON",
                    "LOC": "LOCATION",
                    "LOCATION": "LOCATION",
                    "GPE": "LOCATION",
                    "ORG": "ORGANIZATION",
                    "ORGANIZATION": "ORGANIZATION",
                    "NORP": "NRP",
                    "AGE": "AGE",
                    "ID": "ID",
                    "EMAIL": "EMAIL",
                    "PATIENT": "PERSON",
                    "STAFF": "PERSON",
                    "HOSP": "ORGANIZATION",
                    "PATORG": "ORGANIZATION",
                    "DATE": "DATE_TIME",
                    "TIME": "DATE_TIME",
                    "PHONE": "PHONE_NUMBER",
                    "HCW": "PERSON",
                    "HOSPITAL": "ORGANIZATION",
                    "FACILITY": "LOCATION",
                },
                "low_confidence_score_multiplier": 0.4,
                "low_score_entity_names": ["ID"],
                "labels_to_ignore": [
                    "CARDINAL",
                    "EVENT",
                    "LANGUAGE",
                    "LAW",
                    "MONEY",
                    "ORDINAL",
                    "PERCENT",
                    "PRODUCT",
                    "QUANTITY",
                    "WORK_OF_ART",
                ],
            },
        }

        provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
        nlp_engine = provider.create_engine()

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)
        return nlp_engine, registry