Spaces:
Runtime error
Runtime error
File size: 2,820 Bytes
c5f3a07 b1f2899 c5f3a07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# -*- coding: utf-8 -*-
"""pii_redaction_app.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1rE0OCygUnXrD34qaq12QZKp7_ixkYXN2
"""
from spacy.cli import download
download('en_core_web_lg')
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
import re
import gradio as gr
# Initialize the Presidio engines used by anonymize(): `analyzer` finds the
# entity spans and `anonymizer` rewrites the text for those spans.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create the NER pipeline
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
tokenizer.add_tokens('<person>')
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
# add_tokens() may have grown the vocabulary, so the embedding matrix has to
# grow with it. Without this, an input containing '<person>' is mapped to a
# token id that has no embedding row and the forward pass fails.
# NOTE(review): the new row is freshly initialised (untrained) — confirm this
# is acceptable for the placeholder token.
model.resize_token_embeddings(len(tokenizer))
# `pipe` is only referenced by the commented-out mask_names_hf variant below;
# it is kept so that module-level name stays available.
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')
# Presidio entity types requested from the analyzer in anonymize().
# Full list: https://microsoft.github.io/presidio/supported_entities/
# 'PERSON' is deliberately left out here; names are masked separately by
# mask_names_hf().
ENT_TYPES = [
    # 'PERSON',
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]
def mask_names_hf(text):
    """Replace person names in *text* with a ``<PERSON>`` placeholder.

    The text is tokenized with the module-level ``tokenizer``, each token is
    classified by the module-level ``model``, and every run of consecutive
    tokens labelled ``B-PER`` or ``I-PER`` is collapsed into a single
    ``<PERSON>`` marker.

    Notes:
        * The result is rebuilt from the tokenizer's tokens with
          ``convert_tokens_to_string`` rather than spliced into the original
          string, so spacing (and presumably casing, given the "uncased"
          checkpoint — TODO confirm) may differ from the input.
        * The input is tokenized with ``truncation=True``; anything past the
          tokenizer's length limit is not part of the output.

    Args:
        text: The string to scan for person names.

    Returns:
        The detokenized text with person names replaced by ``<PERSON>``.
    """
    # Tokenize inputs
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Make inferences; gradients are never used, so don't track them.
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)[0].tolist()

    # Drop the first and last positions (the special tokens wrapping the
    # sequence) *before* masking. Trimming the output list afterwards would
    # cut real tokens whenever a special position happened to be tagged as a
    # person.
    words = []
    for token, label_id in zip(tokens[1:-1], predictions[1:-1]):
        label = model.config.id2label[label_id]
        if label in ('B-PER', 'I-PER'):
            # One marker per run of person tokens. The `not words` guard
            # covers a name at the very start of the text, and treating
            # I-PER like B-PER keeps a marker even when the model skips the
            # B- tag.
            if not words or words[-1] != '<PERSON>':
                words.append('<PERSON>')
        else:
            words.append(token)

    # Convert those tokens to a string
    return tokenizer.convert_tokens_to_string(words)
# def mask_names_hf(text):
# outputs = pipe(text)
# tokens = []
# for token in outputs:
# if 'PER' in token['entity']:
# if tokens[-1] != '<PERSON>':
# tokens.append('<PERSON>')
# else:
# tokens.append(token['word'])
# t = tokenizer.convert_tokens_to_string(tokens)
# return t
def anonymize(text, min_len=3):
    """Redact PII from *text* and return the redacted string.

    Two passes are applied:

    1. Presidio analyzes the text for the entity types in ``ENT_TYPES`` and
       the anonymizer rewrites those spans (presumably as ``<ENTITY_TYPE>``
       placeholders, Presidio's default — confirm against the Presidio docs).
    2. ``mask_names_hf`` replaces person names with ``<PERSON>``.

    Because ``mask_names_hf`` rebuilds the text from model tokens, the
    placeholders from step 1 can come back split up and lower-cased, so every
    ``<...>`` span is upper-cased and stripped of spaces afterwards.

    Args:
        text: The input string to redact.
        min_len: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        The redacted text.
    """
    # Find and replace other stuff (Presidio NER)
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    results = anonymizer.anonymize(text, analyzer_results=ents)

    # Find and replace names (HF NER)
    t = mask_names_hf(results.text)

    # Normalise every <...> span back to its compact upper-case form.
    t = re.sub(r'<.+?>', lambda m: m.group(0).upper().replace(' ', ''), t)

    # Collapse any run of adjacent <PERSON> markers into one. A single
    # str.replace pass would leave two markers for a run of three or more.
    t = re.sub(r'(?:<PERSON>){2,}', '<PERSON>', t)
    return t
# Serve anonymize() through a Gradio text-in / text-out interface.
gr.Interface(
    fn=anonymize,
    inputs='text',
    outputs='text',
).launch(debug=True)
|