# -*- coding: utf-8 -*-
"""pii_redaction_app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rE0OCygUnXrD34qaq12QZKp7_ixkYXN2
"""
from spacy.cli import download

# Presidio's default NLP engine relies on this spaCy model; fetch it before
# the AnalyzerEngine is constructed.
download('en_core_web_lg')

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
import re
import gradio as gr
# Initialize the Presidio engines: the analyzer detects PII spans and the
# anonymizer replaces them with <ENTITY_TYPE> placeholders
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
# Create the NER tokenizer/model. The tokenizer is uncased, so the placeholder
# token is added in lowercase; resize the embeddings so the new token id stays
# in range if it ever reaches the model.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
tokenizer.add_tokens(['<person>'])
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
model.resize_token_embeddings(len(tokenizer))
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')
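# Quick sanity-check for the HF pipeline (illustrative sketch only; the words,
# scores, and indices below are made up, not real model output):
#
#   >>> pipe("my name is john smith")
#   [{'entity': 'B-PER', 'word': 'john', 'score': 0.99, ...},
#    {'entity': 'I-PER', 'word': 'smith', 'score': 0.99, ...}]
#
# Note that `pipe` is only used by the commented-out variant of mask_names_hf
# further down; the active version calls the model directly.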
# Entity types Presidio should redact; the full list is at
# https://microsoft.github.io/presidio/supported_entities/
ENT_TYPES = [
    # 'PERSON',  # person names are handled by the HF model instead
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]
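# Illustrative sketch of the two-step Presidio flow used in anonymize() below
# (the sample sentence and score are invented for illustration):
#
#   >>> ents = analyzer.analyze("Call me at 212-555-0123",
#   ...                         language='en', entities=ENT_TYPES)
#   >>> ents
#   [type: PHONE_NUMBER, start: 11, end: 23, score: 0.75]
#   >>> anonymizer.anonymize("Call me at 212-555-0123", analyzer_results=ents).text
#   'Call me at <PHONE_NUMBER>'
#
# The default anonymizer operator replaces each span with an <ENTITY_TYPE>
# placeholder, which the regex cleanup in anonymize() relies on.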
def mask_names_hf(text):
    """Replace person name tokens with a single <PERSON> placeholder."""
    # Tokenize the input
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Run inference (no gradients needed)
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)

    # Rebuild the token sequence, collapsing each run of person tokens
    # (a B-PER plus any I-PER continuations) into one <PERSON>
    words = []
    for token, prediction in zip(tokens, predictions[0].numpy()):
        label = model.config.id2label[prediction]
        if label not in ('I-PER', 'B-PER'):
            words.append(token)
        elif label == 'B-PER':
            if not words or words[-1] != '<PERSON>':
                words.append('<PERSON>')
        # I-PER tokens are dropped: the name is already covered by <PERSON>

    # Drop [CLS]/[SEP] and detokenize back to a string
    return tokenizer.convert_tokens_to_string(words[1:-1])
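# Illustrative usage (the expected output is a sketch; exact spacing around
# punctuation depends on the uncased tokenizer's detokenization):
#
#   >>> mask_names_hf("My name is John Smith and I live in Paris.")
#   'my name is <PERSON> and i live in paris .'
#
# The output is lowercased because the model and tokenizer are uncased; only
# the inserted <PERSON> placeholder keeps its casing.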
# Alternative implementation using the `pipe` wrapper (kept for reference;
# note that the bare 'ner' pipeline returns only entity tokens, so
# non-entity text would be dropped from the output):
# def mask_names_hf(text):
#     outputs = pipe(text)
#     tokens = []
#     for token in outputs:
#         if 'PER' in token['entity']:
#             if not tokens or tokens[-1] != '<PERSON>':
#                 tokens.append('<PERSON>')
#         else:
#             tokens.append(token['word'])
#     return tokenizer.convert_tokens_to_string(tokens)
def anonymize(text):
    # Step 1: find and replace non-name PII with Presidio
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    results = anonymizer.anonymize(text, analyzer_results=ents)
    t = results.text

    # Step 2: find and replace names with the HF NER model
    t = mask_names_hf(t)

    # The uncased tokenizer mangles the Presidio placeholders (e.g.
    # '<EMAIL_ADDRESS>' comes back as '< email _ address >'), so restore
    # each one to its canonical form
    pats = re.findall('<.+?>', t)
    for p in pats:
        t = t.replace(p, p.upper().replace(' ', ''))

    # Collapse adjacent person placeholders left by separate name spans
    t = t.replace('<PERSON><PERSON>', '<PERSON>')
    return t
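# Illustrative end-to-end sketch (made-up input; output casing and spacing
# will vary with the uncased detokenization):
#
#   >>> anonymize("I'm Jane Doe, reach me at jane@doe.com or 212-555-0123.")
#   "i ' m <PERSON> , reach me at <EMAIL_ADDRESS> or <PHONE_NUMBER> ."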
# Launch the Gradio demo
gr.Interface(anonymize, inputs='text', outputs='text').launch(debug=True)