|
import re |
|
import torch |
|
import gradio as gr |
|
|
|
from presidio_anonymizer import AnonymizerEngine |
|
from presidio_analyzer import AnalyzerEngine |
|
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline |
|
|
|
|
|
|
|
# Presidio engines: `analyzer` is used by anonymize() to detect the entity
# types in ENT_TYPES, `anonymizer` substitutes the detected spans.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Token-classification model used by mask_names_hf() to label person names.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
tokenizer.add_tokens('<person>')
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
# add_tokens() enlarged the vocabulary, so the embedding matrix has to grow
# with it; otherwise an input containing the added token would be mapped to
# an id beyond the model's embedding table.
model.resize_token_embeddings(len(tokenizer))
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')

# Entity types requested from the Presidio analyzer in anonymize().
ENT_TYPES = [
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]
|
|
|
def mask_names_hf(text):
    """Replace person names in *text* with ``<PERSON>`` using the NER model.

    The text is tokenized with the module-level ``tokenizer`` and every token
    is labelled by the module-level ``model``. Each run of tokens labelled
    ``B-PER`` or ``I-PER`` is collapsed into a single ``<PERSON>`` marker;
    all other tokens are kept and joined back with
    ``tokenizer.convert_tokens_to_string``.

    Note:
        The result is rebuilt from tokenizer tokens rather than from the
        original characters, and because ``truncation=True`` is passed,
        tokens cut off by the tokenizer are missing from the result too.

    Args:
        text: Input string to scan for person names.

    Returns:
        The re-joined token string with person names masked.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits
    label_ids = torch.argmax(logits, dim=2)[0].tolist()

    words = []
    # Skip the first and last positions (the [CLS]/[SEP] markers a BERT
    # tokenizer wraps the sequence in) before labelling, so that dropping a
    # person token at the end can never cause a real token to be trimmed.
    for token, label_id in zip(tokens[1:-1], label_ids[1:-1]):
        label = model.config.id2label[label_id]
        if label not in ('B-PER', 'I-PER'):
            words.append(token)
        elif not words or words[-1] != '<PERSON>':
            # First token of a person span. This also covers an I-PER that
            # does not follow a B-PER, which would otherwise be removed
            # without leaving a marker behind.
            words.append('<PERSON>')
        # Otherwise: continuation of a span that is already masked.

    return tokenizer.convert_tokens_to_string(words)
|
|
|
def anonymize(text, min_len=3):
    """Mask structured PII and person names in *text*.

    Steps:
      1. Presidio detects the entity types listed in ``ENT_TYPES`` and
         replaces them (with the default operator, ``<ENTITY_TYPE>``).
      2. ``mask_names_hf`` replaces person names with ``<PERSON>``.
      3. Placeholders that came back from the tokenizer round-trip in a
         lower-cased, space-separated form (e.g. ``< email _ address >``)
         are restored to their canonical ``<EMAIL_ADDRESS>`` spelling.

    Args:
        text: Input string.
        min_len: Currently unused; kept so existing callers keep working.

    Returns:
        The anonymized string.
    """
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    masked = anonymizer.anonymize(text, analyzer_results=ents).text

    masked = mask_names_hf(masked)

    # Only spans that normalise to one of these are rewritten, so ordinary
    # angle-bracketed user text (e.g. "a < b > c") is left untouched.
    known = {'<' + name + '>' for name in ENT_TYPES} | {'<PERSON>'}

    def _restore(match):
        candidate = match.group(0).upper().replace(' ', '')
        return candidate if candidate in known else match.group(0)

    # [^<>] keeps a match from starting at an earlier stray '<' and
    # swallowing a real placeholder.
    masked = re.sub(r'<[^<>]+>', _restore, masked)

    masked = masked.replace('<PERSON><PERSON>', '<PERSON>')
    return masked
|
|
|
title = "Personal Info Remover"
description = """Personal Info Remover"""

# Build the Gradio text-in/text-out UI around anonymize() and start it.
# The example holds one instance of each kind of data the app masks: a
# person name, a phone number, an email address and an IP address.
gr.Interface(
    anonymize,
    inputs='text',
    outputs='text',
    title=title,
    description=description,
    examples=["My name is Yuriy, contacts info: 0-800-123-456, yuriy@example.com, IP address is 1.0.0.1"],
).launch(debug=True)