presidio committed on
Commit
7be225d
1 Parent(s): 384d9d6

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -292
app.py DELETED
@@ -1,292 +0,0 @@
1
- """Streamlit app for Presidio."""
2
-
3
- from json import JSONEncoder
4
- from typing import List
5
-
6
- import pandas as pd
7
- import spacy
8
- import streamlit as st
9
- from annotated_text import annotated_text
10
- from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
11
- from presidio_analyzer.nlp_engine import NlpEngineProvider
12
- from presidio_anonymizer import AnonymizerEngine
13
- from presidio_anonymizer.entities import OperatorConfig
14
-
15
- from transformers_rec import (
16
- STANFORD_COFIGURATION,
17
- TransformersRecognizer,
18
- BERT_DEID_CONFIGURATION,
19
- )
20
-
21
-
22
- # Helper methods
23
@st.cache(allow_output_mutation=True)
def analyzer_engine(model_path: str):
    """Build the Presidio AnalyzerEngine for the chosen NER model.

    The predefined recognizers are always loaded into the registry. When
    ``model_path`` is ``"en_core_web_lg"`` the large spaCy model is used as
    the NLP engine. For any other value a ``TransformersRecognizer`` is
    created for ``model_path`` and added to the registry, and the small
    spaCy model is used as the NLP engine. The spaCy model is downloaded
    first if it is not installed as a package.

    :param model_path: Which model to use for NER:
        "StanfordAIMI/stanford-deidentifier-base",
        "obi/deid_roberta_i2b2",
        "en_core_web_lg"
    :return: the configured AnalyzerEngine.
    """
    recognizers = RecognizerRegistry()
    recognizers.load_predefined_recognizers()

    use_transformers = model_path != "en_core_web_lg"
    spacy_model = "en_core_web_sm" if use_transformers else "en_core_web_lg"

    if not spacy.util.is_package(spacy_model):
        spacy.cli.download(spacy_model)

    if use_transformers:
        # The transformers model acts as a recognizer, not as an NlpEngine,
        # so a small spaCy model is enough for the NLP engine itself.
        hf_recognizer = TransformersRecognizer(model_path=model_path)
        if model_path == "StanfordAIMI/stanford-deidentifier-base":
            hf_recognizer.load_transformer(**STANFORD_COFIGURATION)
        elif model_path == "obi/deid_roberta_i2b2":
            hf_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
        recognizers.add_recognizer(hf_recognizer)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": spacy_model}],
    }
    provider = NlpEngineProvider(nlp_configuration=nlp_configuration)

    return AnalyzerEngine(nlp_engine=provider.create_engine(), registry=recognizers)
68
-
69
-
70
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Create the AnonymizerEngine used to de-identify text."""
    engine = AnonymizerEngine()
    return engine
74
-
75
-
76
@st.cache
def get_supported_entities():
    """List the entities supported by the analyzer of the selected model.

    Reads the module-level ``st_model`` selection to pick the analyzer.
    """
    selected_analyzer = analyzer_engine(st_model)
    return selected_analyzer.get_supported_entities()
80
-
81
-
82
def analyze(**kwargs):
    """Analyze input using the Analyzer engine and input arguments (kwargs).

    All keyword arguments are forwarded to the ``analyze`` method of the
    analyzer built for the module-level ``st_model`` selection.

    ``entities`` is normalised to ``None`` when it is missing, explicitly
    ``None``, or contains the value "All".
    NOTE(review): ``None`` presumably means "all entities" to the analyzer;
    confirm against the Presidio API.

    :param kwargs: arguments for the analyzer's ``analyze`` method.
    :return: whatever the analyzer's ``analyze`` method returns.
    """
    requested_entities = kwargs.get("entities")
    # Guard against an explicit entities=None, which would otherwise make the
    # membership test below raise a TypeError.
    if requested_entities is None or "All" in requested_entities:
        kwargs["entities"] = None
    return analyzer_engine(st_model).analyze(**kwargs)
87
-
88
-
89
def anonymize(text: str, analyze_results: List[RecognizerResult]):
    """De-identify the detected entities with the Presidio Anonymizer.

    The operator is taken from the module-level ``st_operator`` selection and
    applied as the "DEFAULT" operator. "highlight" is mapped to the "custom"
    operator with an identity lambda.

    :param text: Full text
    :param analyze_results: list of results from presidio analyzer engine
    :return: the result of ``AnonymizerEngine.anonymize``.
    """
    if st_operator == "highlight":
        operator_name = "custom"
        operator_params = {"lambda": lambda x: x}
    else:
        operator_name = st_operator
        # The mask/encrypt sidebar values only exist when the matching
        # operator is selected, so they are read only inside their branch.
        if st_operator == "mask":
            operator_params = {
                "type": "mask",
                "masking_char": st_mask_char,
                "chars_to_mask": st_number_of_chars,
                "from_end": False,
            }
        elif st_operator == "encrypt":
            operator_params = {"key": st_encrypt_key}
        else:
            operator_params = None

    engine = anonymizer_engine()
    operators = {"DEFAULT": OperatorConfig(operator_name, operator_params)}
    return engine.anonymize(text, analyze_results, operators=operators)
122
-
123
-
124
def annotate(text: str, analyze_results: List[RecognizerResult]):
    """Split the text into plain and highlighted tokens.

    :param text: full text
    :param analyze_results: list of analyzer results.
    :return: list of tokens in text order; plain spans are ``str`` and each
        entity is an ``(entity_text, entity_type)`` tuple. When no entity is
        found the list holds the whole text as a single plain token.
    """
    # Use the anonymizer to resolve overlaps
    resolved = anonymize(text, analyze_results)

    # sort by start index
    spans = sorted(resolved.items, key=lambda item: item.start)

    # Without entities there is nothing to highlight; return the text itself
    # so the caller still has something to display instead of an empty list.
    if not spans:
        return [text]

    # leading plain text before the first entity
    tokens = [text[: spans[0].start]]
    for i, span in enumerate(spans):
        # entity text together with its entity type
        tokens.append((text[span.start: span.end], span.entity_type))

        # plain text up to the next entity, or to the end of the text
        is_last = i == len(spans) - 1
        next_start = len(text) if is_last else spans[i + 1].start
        tokens.append(text[span.end: next_start])
    return tokens
151
-
152
-
153
st.set_page_config(layout="wide", page_title="Presidio demo")

# Sidebar: title, project links and badges
st.sidebar.header(
    """
PII De-Identification with Microsoft Presidio
"""
)

st.sidebar.info(
    "Presidio is an open source customizable framework for PII detection and de-identification\n"
    "[Code](https://aka.ms/presidio) | "
    "[Tutorial](https://microsoft.github.io/presidio/tutorial/) | "
    "[Installation](https://microsoft.github.io/presidio/installation/) | "
    "[FAQ](https://microsoft.github.io/presidio/faq/)",
    icon="ℹ️",
)

st.sidebar.markdown(
    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"
    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT)"
    "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
)

# Sidebar: NER model selection (second entry is the default)
ner_model_choices = [
    "StanfordAIMI/stanford-deidentifier-base",
    "obi/deid_roberta_i2b2",
    "en_core_web_lg",
]
st_model = st.sidebar.selectbox("NER model", ner_model_choices, index=1)
st.sidebar.markdown("> Note: Models might take some time to download. ")

# Sidebar: de-identification operator ("replace" is the default)
operator_choices = ["redact", "replace", "mask", "hash", "encrypt", "highlight"]
st_operator = st.sidebar.selectbox(
    "De-identification approach", operator_choices, index=1
)

# Operator-specific inputs are only created for the operator that needs them
if st_operator == "mask":
    st_number_of_chars = st.sidebar.number_input(
        "number of chars", min_value=0, max_value=100, value=15
    )
    st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
elif st_operator == "encrypt":
    st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")

# Sidebar: analysis options
st_threshold = st.sidebar.slider(
    label="Acceptance threshold", value=0.35, min_value=0.0, max_value=1.0
)

st_return_decision_process = st.sidebar.checkbox(
    "Add analysis explanations to findings", value=False
)

# Every supported entity is offered and pre-selected
st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=get_supported_entities(),
    default=list(get_supported_entities()),
)
215
-
216
# Main panel
analyzer_load_state = st.info("Starting Presidio analyzer...")
# Pass the model positionally, exactly like the helper functions do, so every
# call site uses the same call form for the cached analyzer_engine.
# NOTE(review): the legacy st.cache key appears to depend on how arguments are
# passed; a keyword call here could build a second analyzer - confirm.
engine = analyzer_engine(st_model)
analyzer_load_state.empty()

# Read default text in one go; explicit encoding keeps this independent of
# the platform's default locale.
with open("demo_text.txt", encoding="utf-8") as f:
    demo_text = f.read()

# Create two columns for before and after
col1, col2 = st.columns(2)

# Before: editable input text, pre-filled with the demo text
col1.subheader("Input string:")
st_text = col1.text_area(
    label="Enter text",
    value=demo_text,
    height=400,
)

# Run the analyzer with the options chosen in the sidebar
st_analyze_results = analyze(
    text=st_text,
    entities=st_entities,
    language="en",
    score_threshold=st_threshold,
    return_decision_process=st_return_decision_process,
)

# After: de-identified text in the second column, or highlighted entities
if st_operator != "highlight":
    with col2:
        st.subheader("Output")
        st_anonymize_results = anonymize(st_text, st_analyze_results)
        st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
else:
    st.subheader("Highlighted")
    annotated_tokens = annotate(st_text, st_analyze_results)
    annotated_text(*annotated_tokens)
255
-
256
-
257
- # json result
258
class ToDictEncoder(JSONEncoder):
    """JSON encoder for objects that expose a ``to_dict()`` method."""

    def default(self, o):
        """Encode to JSON using to_dict.

        :param o: object the standard encoder could not serialize.
        :return: the result of ``o.to_dict()``.
        :raises TypeError: if ``o`` has no callable ``to_dict`` attribute
            (raised by the base ``JSONEncoder.default``).
        """
        to_dict = getattr(o, "to_dict", None)
        if callable(to_dict):
            return to_dict()
        # Defer to the base class so unsupported objects fail with the
        # TypeError that callers of the json module expect.
        return super().default(o)
264
-
265
-
266
# table result
st.subheader(
    "Findings" if not st_return_decision_process else "Findings with decision factors"
)
if st_analyze_results:
    # One row per finding, plus the matched text sliced from the input
    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
    df["text"] = [st_text[res.start: res.end] for res in st_analyze_results]

    # Keep the display columns and give them readable headers. The "Text"
    # column already holds the matched text, so it is not computed again.
    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
        columns={
            "entity_type": "Entity type",
            "text": "Text",
            "start": "Start",
            "end": "End",
            "score": "Confidence",
        }
    )
    if st_return_decision_process:
        # Append the explanation fields of each finding as extra columns
        analysis_explanation_df = pd.DataFrame.from_records(
            [r.analysis_explanation.to_dict() for r in st_analyze_results]
        )
        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
else:
    st.text("No findings")