1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
import re

import gradio as gr

# Initialize the Presidio analyzer and anonymizer engines
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create the NER pipeline. The tokenizer is uncased, so the mask is registered
# in lowercase; adding it keeps '<person>' intact as a single token when
# already-masked text is re-tokenized.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
tokenizer.add_tokens('<person>')
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
model.resize_token_embeddings(len(tokenizer))  # make room for the added token
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')  # used by the alternative masker below
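
# Quick sanity check of the raw pipeline output (illustrative only; scores,
# indices, and extra fields will vary by transformers version):
#   pipe("my name is mike")
#   -> [{'entity': 'B-PER', 'score': 0.99, 'index': 4, 'word': 'mike', ...}]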

# https://microsoft.github.io/presidio/supported_entities/
# PERSON is intentionally excluded here: names are handled by the HF model below.
ENT_TYPES = [
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]
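
# The RecognizerResult/OperatorConfig imports above are not wired into this
# app, but they allow customizing Presidio. A hedged sketch: replace phone
# numbers with a fixed string instead of the default <PHONE_NUMBER> tag, and
# inject a manually located entity (offsets computed for this sample text):
#
#   custom = anonymizer.anonymize(
#       text="Call me at 1-234-567-9000",
#       analyzer_results=[RecognizerResult("PHONE_NUMBER", start=11, end=25, score=0.9)],
#       operators={"PHONE_NUMBER": OperatorConfig("replace", {"new_value": "[REDACTED]"})},
#   )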

def mask_names_hf(text):
    # Tokenize the input
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Run inference without building a gradient graph
    with torch.no_grad():
        outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)

    # Replace person tokens with a single <PERSON> mask
    words = []
    for token, prediction in zip(tokens, predictions[0].numpy()):
        label = model.config.id2label[prediction]
        if label not in ('I-PER', 'B-PER'):
            words.append(token)
        elif label == 'B-PER':
            # Start of a name: emit one mask, collapsing consecutive names
            if not words or words[-1] != '<PERSON>':
                words.append('<PERSON>')
        # I-PER continuation tokens are dropped; they belong to the mask above

    # Drop [CLS]/[SEP] and convert the remaining tokens back to a string
    return tokenizer.convert_tokens_to_string(words[1:-1])
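
# Illustrative usage (the uncased tokenizer lower-cases the output, and
# spacing around punctuation may differ slightly):
#   mask_names_hf("Hi, my name is Mike")  ->  "hi, my name is <PERSON>"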

# Alternative masker built directly on the HF `pipe` (kept for reference):
# def mask_names_hf(text):
#     outputs = pipe(text)
#     tokens = []
#     for token in outputs:
#         if 'PER' in token['entity']:
#             if not tokens or tokens[-1] != '<PERSON>':
#                 tokens.append('<PERSON>')
#         else:
#             tokens.append(token['word'])
#     return tokenizer.convert_tokens_to_string(tokens)

def anonymize(text):
    # Find and replace non-name PII (Presidio)
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    results = anonymizer.anonymize(text, analyzer_results=ents)
    t = results.text

    # Find and replace names (HF NER)
    t = mask_names_hf(t)

    # Re-tokenization splits Presidio placeholders such as '<PHONE_NUMBER>'
    # into pieces; collapse them back into single upper-case tags
    pats = re.findall(r'<.+?>', t)
    for p in pats:
        t = t.replace(p, p.upper().replace(' ', ''))

    # Collapse doubled person masks produced by adjacent name tokens
    t = t.replace('<PERSON><PERSON>', '<PERSON>')
    return t
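
# Illustrative end-to-end example (output is lower-cased by the uncased
# tokenizer; exact spacing may vary):
#   anonymize("Hi, my name is Mike and my IP address is 127.0.0.1")
#   -> "hi, my name is <PERSON> and my ip address is <IP_ADDRESS>"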

title = "PII Masking"
description = """
In many applications, PII is easy to remove from structured databases.
It is less straightforward to remove from unstructured text, however.
This app accepts free text and masks names, phone numbers, email addresses,
IP addresses, and credit card numbers.
"""

gr.Interface(
    anonymize,
    inputs='text',
    outputs='text',
    title=title,
    description=description,
    examples=[
        "Hi, my name is Mike and my phone number is 1-234-567-9000",
        "Hi, my name is Mike and my email address is my_name@my_domain.com",
        "Hi, my name is Mike and my IP address is 127.0.0.1",
        # "Hi, my name is Mike and my credit card is 1200 3859 8281 0593",
    ],
).launch(debug=True)