Mike Frantz committed
Commit c5f3a07
1 Parent(s): 8eff136

initial commit

Files changed (2)
  1. app.py +98 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,98 @@
+ # -*- coding: utf-8 -*-
+ """pii_redaction_app.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1rE0OCygUnXrD34qaq12QZKp7_ixkYXN2
+ """
+
+ from spacy.cli import download
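+ # Presidio's AnalyzerEngine runs on spaCy by default, so fetch the large English model before the engines are created.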
+ download('en_core_web_lg')
+ from presidio_anonymizer import AnonymizerEngine
+ from presidio_analyzer import AnalyzerEngine
+ from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
+
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+ import torch
+ import re
+
+ import gradio as gr
+
+ # Initialize the engines:
+ analyzer = AnalyzerEngine()
+ anonymizer = AnonymizerEngine()
+
+ # Create the NER pipeline
+ tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
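+ # Register '<person>' as one token, presumably so that (after lowercasing) placeholder tags like '<PERSON>' survive tokenization in one piece if they appear in the input.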
+ tokenizer.add_tokens('<person>')
+ model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
+ pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')
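+ # NOTE: pipe is only used by the commented-out alternative below; the active path calls the model directly.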
+
+ # https://microsoft.github.io/presidio/supported_entities/
+ ENT_TYPES = [
+     # 'PERSON',
+     'CREDIT_CARD',
+     'EMAIL_ADDRESS',
+     'IP_ADDRESS',
+     'PHONE_NUMBER'
+ ]
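+ # PERSON is left out above: person names are masked by the HF NER model instead of Presidio.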
+
+ def mask_names_hf(text):
+     # Tokenize inputs
+     inputs = tokenizer(text, return_tensors='pt', truncation=True)
+     tokens = inputs.tokens()
+
+     # Make inferences (no gradients needed)
+     with torch.no_grad():
+         outputs = model(**inputs).logits
+     predictions = torch.argmax(outputs, dim=2)
+
+     # Replace tokens that are people with <PERSON>
+     words = []
+     for token, prediction in zip(tokens, predictions[0].numpy()):
+         prediction = model.config.id2label[prediction]
+         if prediction not in ('I-PER', 'B-PER'):
+             words.append(token)
+         elif prediction == 'B-PER':
+             if not words or words[-1] != '<PERSON>':
+                 words.append('<PERSON>')
+         else:
+             pass  # I-PER continuation pieces are dropped
+     # Convert those tokens to a string
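+     # (words[1:-1] drops the [CLS]/[SEP] special tokens at either end)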
+     return tokenizer.convert_tokens_to_string(words[1:-1])
+
+ # def mask_names_hf(text):
+ #     outputs = pipe(text)
+ #     tokens = []
+ #     for token in outputs:
+ #         if 'PER' in token['entity']:
+ #             if tokens[-1] != '<PERSON>':
+ #                 tokens.append('<PERSON>')
+ #         else:
+ #             tokens.append(token['word'])
+
+ #     t = tokenizer.convert_tokens_to_string(tokens)
+ #     return t
+
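+ # Two-stage redaction: Presidio first replaces structured PII, then the HF model masks person names.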
+ def anonymize(text, min_len=3):
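+     # NOTE: min_len is currently unused.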
+
+     # Find and replace non-name PII (Presidio NER)
+     ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
+     results = anonymizer.anonymize(text, analyzer_results=ents)
+     t = results.text
+
+     # t = copy(text)
+     # Find and replace names (HF NER)
+     t = mask_names_hf(t)
+
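+     # The tokenizer splits placeholders such as <PHONE_NUMBER> into pieces like '< phone _ number >';
+     # rejoin them and restore the uppercase form.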
+     pats = re.findall('<.+?>', t)
+     for p in pats:
+         t = t.replace(p, p.upper().replace(' ', ''))
+
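+     # Collapse any back-to-back person tags into a single placeholder.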
+     t = t.replace('<PERSON><PERSON>', '<PERSON>')
+     return t
+
+
+ gr.Interface(anonymize, inputs='text', outputs='text').launch()
+
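As a quick smoke test (not part of the commit; the names and numbers below are illustrative), anonymize can be called directly once the module-level setup has run. Because the tokenizer is uncased, everything outside the placeholders comes back lowercased:

    sample = "Contact Jane Doe at jane.doe@example.com or call 555-123-4567."
    print(anonymize(sample))
    # e.g. "contact <PERSON> at <EMAIL_ADDRESS> or call <PHONE_NUMBER>."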
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ presidio_anonymizer
+ presidio_analyzer
+ spacy
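Note that app.py also imports transformers, torch, and gradio, none of which are pinned here. On a host that does not preinstall them, this file would presumably need to grow to something like:

    presidio_anonymizer
    presidio_analyzer
    spacy
    transformers
    torch
    gradio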