PIIMasking / PiiMaskingService.py
KarishmaShirsath's picture
Upload 2 files
46dbc0f verified
raw
history blame
6.7 kB
from typing import List, Dict, Optional, Tuple, Type
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_anonymizer.entities import (
OperatorConfig,
)
from presidio_analyzer.nlp_engine import (
NlpEngine,
NlpEngineProvider,
)
from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration
class PiiMaskingService:
    """Detect and anonymize PII in English text with Microsoft Presidio.

    The service builds a Presidio ``AnalyzerEngine`` backed by either a
    HuggingFace Transformers NER model or a Flair NER model, runs it over the
    input text, and then applies a single anonymization operator to every
    detected entity.

    NOTE(review): a fresh NLP engine (and therefore the underlying NER model)
    is created on every ``analyze`` / ``anonymize`` call. That is kept as-is
    here; cache the engines at the call site if latency matters and memory
    allows.
    """

    # Model identifiers accepted by ``analyze`` and ``anonymize``.
    TRANSFORMERS_MODEL = "HuggingFace/obi/deid_roberta_i2b2"
    FLAIR_MODEL = "flair/ner-english-large"
    SUPPORTED_MODELS = (TRANSFORMERS_MODEL, FLAIR_MODEL)

    # Entity types requested from the analyzer on every call.
    ENTITIES_TO_RECOGNIZE = (
        'UK_NHS', 'EMAIL', 'AU_ABN', 'CRYPTO', 'ID', 'URL',
        'AU_MEDICARE', 'IN_PAN', 'ORGANIZATION', 'IN_AADHAAR',
        'SG_NRIC_FIN', 'EMAIL_ADDRESS', 'AU_ACN', 'US_DRIVER_LICENSE',
        'IP_ADDRESS', 'DATE_TIME', 'LOCATION', 'PERSON', 'CREDIT_CARD',
        'IBAN_CODE', 'US_BANK_NUMBER', 'PHONE_NUMBER', 'MEDICAL_LICENSE',
        'US_SSN', 'AU_TFN', 'US_PASSPORT', 'US_ITIN', 'NRP', 'AGE', 'GENERIC_PII',
    )

    # NOTE(review): this symmetric key is committed in source, so anyone with
    # access to the code can decrypt values produced by the "encrypt"
    # operator. Consider loading it from secret configuration instead.
    ENCRYPT_KEY = "WmZq4t7w!z%C&F)J"

    def analyze(self, text: str, model: str):
        """Run PII detection over ``text`` and return the analyzer results.

        :param text: Text to scan.
        :param model: One of ``SUPPORTED_MODELS``.
        :return: Whatever ``AnalyzerEngine.analyze`` returns for ``text``,
            restricted to ``ENTITIES_TO_RECOGNIZE`` and language ``'en'``.
        :raises ValueError: If ``model`` is not a supported identifier.
        """
        nlp_engine, registry = self._create_engine_for_model(model)
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
        results = analyzer.analyze(
            text=text,
            entities=list(self.ENTITIES_TO_RECOGNIZE),
            language='en',
        )
        # Debug output, kept for parity with existing logs.
        print("analyzer results:")
        print(results)
        return results

    def anonymize(
        self,
        text: str,
        operator: str,
        model: str,
    ):
        """Detect PII in ``text`` and return the anonymized string.

        :param text: Text to anonymize.
        :param operator: Anonymization operator name. ``"mask"``,
            ``"encrypt"`` and ``"highlight"`` receive dedicated parameters;
            any other name is handed to Presidio without parameters.
        :param model: One of ``SUPPORTED_MODELS``; forwarded to ``analyze``.
        :return: The ``text`` attribute of the anonymizer result.
        :raises ValueError: If ``model`` is not a supported identifier.
        """
        operator_name, operator_params = self._resolve_operator(operator)
        analyzer_results = self.analyze(text, model)

        engine = AnonymizerEngine()
        # "DEFAULT" applies the same operator to every detected entity type.
        result = engine.anonymize(
            text=text,
            operators={"DEFAULT": OperatorConfig(operator_name, operator_params)},
            analyzer_results=analyzer_results,
        )
        # Debug output, kept for parity with existing logs.
        # NOTE(review): with the "highlight" operator the identity lambda
        # leaves entity text unchanged, so these lines print the raw input.
        print("res:")
        print(result)
        print(result.text)
        print(type(result.text))
        return result.text

    def _resolve_operator(self, operator: str) -> Tuple[str, Optional[Dict]]:
        """Translate a user-facing operator name into Presidio name + params."""
        if operator == 'mask':
            # Replace up to 15 characters of each entity with "*", counted
            # from the start of the entity (``from_end`` is False).
            return operator, {
                "type": "mask",
                "masking_char": "*",
                "chars_to_mask": 15,
                "from_end": False,
            }
        if operator == "encrypt":
            return operator, {"key": self.ENCRYPT_KEY}
        if operator == "highlight":
            # "highlight" is not passed through by name: it is sent as the
            # "custom" operator with an identity function.
            return "custom", {"lambda": lambda x: x}
        # Any other operator name is passed through with no parameters.
        return operator, None

    def _create_engine_for_model(
        self,
        model: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """Build the NLP engine and recognizer registry for ``model``.

        :raises ValueError: If ``model`` is not one of ``SUPPORTED_MODELS``.
            Without this check an unknown model surfaced as an
            ``UnboundLocalError`` further down.
        """
        if model == self.TRANSFORMERS_MODEL:
            return self.create_nlp_engine_with_transformers("obi/deid_roberta_i2b2")
        if model == self.FLAIR_MODEL:
            return self.create_nlp_engine_with_flair("flair/ner-english-large")
        supported = ", ".join(self.SUPPORTED_MODELS)
        raise ValueError(
            f"Unsupported model {model!r}; expected one of: {supported}"
        )

    def create_nlp_engine_with_flair(
        self,
        model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
        The FlairRecognizer would return results from Flair models, the spaCy model
        would return NlpArtifacts such as POS and lemmas.

        :param model_path: Flair model path.
        :return: Tuple of (spaCy-based NlpEngine, registry containing the
            predefined recognizers plus the Flair recognizer, with
            "SpacyRecognizer" removed).
        """
        # Imported lazily so it is only required when the Flair backend is
        # selected. Presumably a project-local module — verify it is shipped
        # alongside this file.
        from flair_recognizer import FlairRecognizer

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers()

        # there is no official Flair NlpEngine, hence we load it as an additional recognizer
        # Using a small spaCy model + a Flair NER model
        flair_recognizer = FlairRecognizer(model_path=model_path)
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        }
        registry.add_recognizer(flair_recognizer)
        # Drop the spaCy recognizer from the registry now that the Flair
        # recognizer has been added.
        registry.remove_recognizer("SpacyRecognizer")

        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
        return nlp_engine, registry

    def create_nlp_engine_with_transformers(
        self,
        model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
        The TransformersRecognizer would return results from Transformers models, the spaCy model
        would return NlpArtifacts such as POS and lemmas.

        :param model_path: HuggingFace model path.
        :return: Tuple of (transformers-based NlpEngine, registry loaded with
            the predefined recognizers for that engine).
        """
        print(f"Loading Transformers model: {model_path} of type {type(model_path)}")

        nlp_configuration = {
            "nlp_engine_name": "transformers",
            "models": [
                {
                    "lang_code": "en",
                    "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
                }
            ],
            "ner_model_configuration": {
                # Maps labels emitted by the NER model to Presidio entity names.
                "model_to_presidio_entity_mapping": {
                    "PER": "PERSON",
                    "PERSON": "PERSON",
                    "LOC": "LOCATION",
                    "LOCATION": "LOCATION",
                    "GPE": "LOCATION",
                    "ORG": "ORGANIZATION",
                    "ORGANIZATION": "ORGANIZATION",
                    "NORP": "NRP",
                    "AGE": "AGE",
                    "ID": "ID",
                    "EMAIL": "EMAIL",
                    "PATIENT": "PERSON",
                    "STAFF": "PERSON",
                    "HOSP": "ORGANIZATION",
                    "PATORG": "ORGANIZATION",
                    "DATE": "DATE_TIME",
                    "TIME": "DATE_TIME",
                    "PHONE": "PHONE_NUMBER",
                    "HCW": "PERSON",
                    "HOSPITAL": "ORGANIZATION",
                    "FACILITY": "LOCATION",
                },
                "low_confidence_score_multiplier": 0.4,
                "low_score_entity_names": ["ID"],
                "labels_to_ignore": [
                    "CARDINAL",
                    "EVENT",
                    "LANGUAGE",
                    "LAW",
                    "MONEY",
                    "ORDINAL",
                    "PERCENT",
                    "PRODUCT",
                    "QUANTITY",
                    "WORK_OF_ART",
                ],
            },
        }

        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)
        return nlp_engine, registry