Spaces:

presidio
/

presidio_demo

Running

App Files Files Community

presidio committed on Apr 16, 2023

Commit

d6241cc

•

1 Parent(s): 3afd122

Upload 7 files

Browse files

Files changed (2) hide show

presidio_helpers.py +203 -0
presidio_streamlit.py +44 -200

presidio_helpers.py ADDED Viewed

	@@ -0,0 +1,203 @@

+"""
+Helper methods for the Presidio Streamlit app
+"""
+from typing import List, Optional
+import spacy
+import streamlit as st
+from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+from flair_recognizer import FlairRecognizer
+from openai_fake_data_generator import (
+    set_openai_key,
+    call_completion_model,
+    create_prompt,
+)
+from transformers_rec import (
+    STANFORD_COFIGURATION,
+    TransformersRecognizer,
+    BERT_DEID_CONFIGURATION,
+)
+@st.cache_resource
+def analyzer_engine(model_path: str):
+    """Return AnalyzerEngine.
+    :param model_path: Which model to use for NER:
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg"
+    """
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    # Set up NLP Engine according to the model of choice
+    if model_path == "en_core_web_lg":
+        if not spacy.util.is_package("en_core_web_lg"):
+            spacy.cli.download("en_core_web_lg")
+        nlp_configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
+        }
+    elif model_path == "flair/ner-english-large":
+        flair_recognizer = FlairRecognizer()
+        nlp_configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+        }
+        registry.add_recognizer(flair_recognizer)
+        registry.remove_recognizer("SpacyRecognizer")
+    else:
+        if not spacy.util.is_package("en_core_web_sm"):
+            spacy.cli.download("en_core_web_sm")
+        # Using a small spaCy model + a HF NER model
+        transformers_recognizer = TransformersRecognizer(model_path=model_path)
+        registry.remove_recognizer("SpacyRecognizer")
+        if model_path == "StanfordAIMI/stanford-deidentifier-base":
+            transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
+        elif model_path == "obi/deid_roberta_i2b2":
+            transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+        # Use small spaCy model, no need for both spacy and HF models
+        # The transformers model is used here as a recognizer, not as an NlpEngine
+        nlp_configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+        }
+        registry.add_recognizer(transformers_recognizer)
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
+    return analyzer
+@st.cache_resource
+def anonymizer_engine():
+    """Return AnonymizerEngine."""
+    return AnonymizerEngine()
+@st.cache_data
+def get_supported_entities(st_model: str):
+    """Return supported entities from the Analyzer Engine."""
+    return analyzer_engine(st_model).get_supported_entities()
+@st.cache_data
+def analyze(st_model: str, **kwargs):
+    """Analyze input using Analyzer engine and input arguments (kwargs)."""
+    if "entities" not in kwargs or "All" in kwargs["entities"]:
+        kwargs["entities"] = None
+    return analyzer_engine(st_model).analyze(**kwargs)
+def anonymize(
+    text: str,
+    operator: str,
+    analyze_results: List[RecognizerResult],
+    mask_char: Optional[str] = None,
+    number_of_chars: Optional[str] = None,
+    encrypt_key: Optional[str] = None,
+):
+    """Anonymize identified input using Presidio Anonymizer.
+    :param text: Full text
+    :param operator: Operator name
+    :param mask_char: Mask char (for mask operator)
+    :param number_of_chars: Number of characters to mask (for mask operator)
+    :param encrypt_key: Encryption key (for encrypt operator)
+    :param analyze_results: list of results from presidio analyzer engine
+    """
+    if operator == "mask":
+        operator_config = {
+            "type": "mask",
+            "masking_char": mask_char,
+            "chars_to_mask": number_of_chars,
+            "from_end": False,
+        }
+    # Define operator config
+    elif operator == "encrypt":
+        operator_config = {"key": encrypt_key}
+    elif operator == "highlight":
+        operator_config = {"lambda": lambda x: x}
+    else:
+        operator_config = None
+    # Change operator if needed as intermediate step
+    if operator == "highlight":
+        operator = "custom"
+    elif operator == "synthesize":
+        operator = "replace"
+    else:
+        operator = operator
+    res = anonymizer_engine().anonymize(
+        text,
+        analyze_results,
+        operators={"DEFAULT": OperatorConfig(operator, operator_config)},
+    )
+    return res
+def annotate(text: str, analyze_results: List[RecognizerResult]):
+    """Highlight the identified PII entities on the original text
+    :param text: Full text
+    :param analyze_results: list of results from presidio analyzer engine
+    """
+    tokens = []
+    # Use the anonymizer to resolve overlaps
+    results = anonymize(
+        text=text,
+        operator="highlight",
+        analyze_results=analyze_results,
+    )
+    # sort by start index
+    results = sorted(results.items, key=lambda x: x.start)
+    for i, res in enumerate(results):
+        if i == 0:
+            tokens.append(text[: res.start])
+        # append entity text and entity type
+        tokens.append((text[res.start : res.end], res.entity_type))
+        # if another entity coming i.e. we're not at the last results element, add text up to next entity
+        if i != len(results) - 1:
+            tokens.append(text[res.end : results[i + 1].start])
+        # if no more entities coming, add all remaining text
+        else:
+            tokens.append(text[res.end :])
+    return tokens
+def create_fake_data(
+    text: str,
+    analyze_results: List[RecognizerResult],
+    openai_key: str,
+    openai_model_name: str,
+):
+    """Creates a synthetic version of the text using OpenAI APIs"""
+    if not openai_key:
+        return "Please provide your OpenAI key"
+    results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
+    set_openai_key(openai_key)
+    prompt = create_prompt(results.text)
+    fake = call_openai_api(prompt, openai_model_name)
+    return fake
+@st.cache_data
+def call_openai_api(prompt: str, openai_model_name: str) -> str:
+    fake_data = call_completion_model(prompt, model=openai_model_name)
+    return fake_data

presidio_streamlit.py CHANGED Viewed

@@ -1,197 +1,20 @@
 """Streamlit app for Presidio."""
 import os
 from json import JSONEncoder
-from typing import List
 import pandas as pd
-import spacy
 import streamlit as st
 from annotated_text import annotated_text
-from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
-from presidio_analyzer.nlp_engine import NlpEngineProvider
-from presidio_anonymizer import AnonymizerEngine
-from presidio_anonymizer.entities import OperatorConfig
-from flair_recognizer import FlairRecognizer
-from transformers_rec import (
-    STANFORD_COFIGURATION,
-    TransformersRecognizer,
-    BERT_DEID_CONFIGURATION,
 )
-from openai_fake_data_generator import (
-    set_openai_key,
-    call_completion_model,
-    create_prompt,
-)
-# Helper methods
-@st.cache_resource
-def analyzer_engine(model_path: str):
-    """Return AnalyzerEngine.
-    :param model_path: Which model to use for NER:
-        "StanfordAIMI/stanford-deidentifier-base",
-        "obi/deid_roberta_i2b2",
-        "en_core_web_lg"
-    """
-    registry = RecognizerRegistry()
-    registry.load_predefined_recognizers()
-    # Set up NLP Engine according to the model of choice
-    if model_path == "en_core_web_lg":
-        if not spacy.util.is_package("en_core_web_lg"):
-            spacy.cli.download("en_core_web_lg")
-        nlp_configuration = {
-            "nlp_engine_name": "spacy",
-            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
-        }
-    elif model_path == "flair/ner-english-large":
-        flair_recognizer = FlairRecognizer()
-        nlp_configuration = {
-            "nlp_engine_name": "spacy",
-            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-        }
-        registry.add_recognizer(flair_recognizer)
-        registry.remove_recognizer("SpacyRecognizer")
-    else:
-        if not spacy.util.is_package("en_core_web_sm"):
-            spacy.cli.download("en_core_web_sm")
-        # Using a small spaCy model + a HF NER model
-        transformers_recognizer = TransformersRecognizer(model_path=model_path)
-        registry.remove_recognizer("SpacyRecognizer")
-        if model_path == "StanfordAIMI/stanford-deidentifier-base":
-            transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
-        elif model_path == "obi/deid_roberta_i2b2":
-            transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-        # Use small spaCy model, no need for both spacy and HF models
-        # The transformers model is used here as a recognizer, not as an NlpEngine
-        nlp_configuration = {
-            "nlp_engine_name": "spacy",
-            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-        }
-        registry.add_recognizer(transformers_recognizer)
-    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
-    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
-    return analyzer
-@st.cache_resource
-def anonymizer_engine():
-    """Return AnonymizerEngine."""
-    return AnonymizerEngine()
-@st.cache_data
-def get_supported_entities():
-    """Return supported entities from the Analyzer Engine."""
-    return analyzer_engine(st_model).get_supported_entities()
-@st.cache_data
-def analyze(**kwargs):
-    """Analyze input using Analyzer engine and input arguments (kwargs)."""
-    if "entities" not in kwargs or "All" in kwargs["entities"]:
-        kwargs["entities"] = None
-    return analyzer_engine(st_model).analyze(**kwargs)
-def anonymize(text: str, analyze_results: List[RecognizerResult]):
-    """Anonymize identified input using Presidio Anonymizer.
-    :param text: Full text
-    :param analyze_results: list of results from presidio analyzer engine
-    """
-    if st_operator == "mask":
-        operator_config = {
-            "type": "mask",
-            "masking_char": st_mask_char,
-            "chars_to_mask": st_number_of_chars,
-            "from_end": False,
-        }
-    # Define operator config
-    elif st_operator == "encrypt":
-        operator_config = {"key": st_encrypt_key}
-    elif st_operator == "highlight":
-        operator_config = {"lambda": lambda x: x}
-    else:
-        operator_config = None
-    # Change operator if needed as intermediate step
-    if st_operator == "highlight":
-        operator = "custom"
-    elif st_operator == "synthesize":
-        operator = "replace"
-    else:
-        operator = st_operator
-    res = anonymizer_engine().anonymize(
-        text,
-        analyze_results,
-        operators={"DEFAULT": OperatorConfig(operator, operator_config)},
-    )
-    return res
-def annotate(text: str, analyze_results: List[RecognizerResult]):
-    """
-    Highlights every identified entity on top of the text.
-    :param text: full text
-    :param analyze_results: list of analyzer results.
-    """
-    tokens = []
-    # Use the anonymizer to resolve overlaps
-    results = anonymize(text, analyze_results)
-    # sort by start index
-    results = sorted(results.items, key=lambda x: x.start)
-    for i, res in enumerate(results):
-        if i == 0:
-            tokens.append(text[: res.start])
-        # append entity text and entity type
-        tokens.append((text[res.start : res.end], res.entity_type))
-        # if another entity coming i.e. we're not at the last results element, add text up to next entity
-        if i != len(results) - 1:
-            tokens.append(text[res.end : results[i + 1].start])
-        # if no more entities coming, add all remaining text
-        else:
-            tokens.append(text[res.end :])
-    return tokens
-def create_fake_data(
-    text: str,
-    analyze_results: List[RecognizerResult],
-    openai_key: str,
-    openai_model_name: str,
-):
-    """Creates a synthetic version of the text using OpenAI APIs"""
-    if not openai_key:
-        return "Please provide your OpenAI key"
-    results = anonymize(text, analyze_results)
-    set_openai_key(openai_key)
-    prompt = create_prompt(results.text)
-    fake = call_openai_api(prompt, openai_model_name)
-    return fake
-@st.cache_data
-def call_openai_api(prompt: str, openai_model_name: str) -> str:
-    fake_data = call_completion_model(prompt, model=openai_model_name)
-    return fake_data
 st.set_page_config(page_title="Presidio demo", layout="wide")
 # Sidebar
@@ -211,8 +34,8 @@ st.sidebar.info(
 )
 st.sidebar.markdown(
-    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"
-    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT)"
     "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
 )
@@ -247,14 +70,20 @@ st_operator = st.sidebar.selectbox(
     - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
          """,
 )
 if st_operator == "mask":
     st_number_of_chars = st.sidebar.number_input(
-        "number of chars", value=15, min_value=0, max_value=100
     )
-    st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
 elif st_operator == "encrypt":
-    st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")
 elif st_operator == "synthesize":
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
@@ -264,7 +93,7 @@ elif st_operator == "synthesize":
     )
     st_openai_model = st.sidebar.text_input(
         "OpenAI model for text synthesis",
-        value="text-davinci-003",
         help="See more here: https://platform.openai.com/docs/models/",
     )
 st_threshold = st.sidebar.slider(
@@ -276,15 +105,19 @@ st_threshold = st.sidebar.slider(
 )
 st_return_decision_process = st.sidebar.checkbox(
-    "Add analysis explanations to findings", value=False,
-    help="Add the decision process to the output table. More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/"
 )
 st_entities = st.sidebar.multiselect(
     label="Which entities to look for?",
-    options=get_supported_entities(),
-    default=list(get_supported_entities()),
-    help="Limit the list of PII entities detected. This list is dynamic and based on the NER model and registered recognizers. More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/"
 )
 # Main panel
@@ -308,6 +141,7 @@ st_text = col1.text_area(
 )
 st_analyze_results = analyze(
     text=st_text,
     entities=st_entities,
     language="en",
@@ -319,7 +153,14 @@ st_analyze_results = analyze(
 if st_operator not in ("highlight", "synthesize"):
     with col2:
         st.subheader(f"Output")
-        st_anonymize_results = anonymize(st_text, st_analyze_results)
         st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
 elif st_operator == "synthesize":
     with col2:
@@ -333,7 +174,10 @@ elif st_operator == "synthesize":
         st.text_area(label="Synthetic data", value=fake_data, height=400)
 else:
     st.subheader("Highlighted")
-    annotated_tokens = annotate(st_text, st_analyze_results)
     # annotated_tokens
     annotated_text(*annotated_tokens)
@@ -353,7 +197,7 @@ st.subheader(
 )
 if st_analyze_results:
     df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
-    df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
     df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
         {
@@ -365,7 +209,7 @@ if st_analyze_results:
         },
         axis=1,
     )
-    df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
     if st_return_decision_process:
         analysis_explanation_df = pd.DataFrame.from_records(
             [r.analysis_explanation.to_dict() for r in st_analyze_results]

 """Streamlit app for Presidio."""
 import os
 from json import JSONEncoder
 import pandas as pd
 import streamlit as st
 from annotated_text import annotated_text
+from presidio_helpers import (
+    get_supported_entities,
+    analyze,
+    anonymize,
+    annotate,
+    create_fake_data,
+    analyzer_engine,
 )
 st.set_page_config(page_title="Presidio demo", layout="wide")
 # Sidebar
 )
 st.sidebar.markdown(
+    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)" # noqa
+    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
     "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
 )
     - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
          """,
 )
+st_mask_char = "*"
+st_number_of_chars = 15
+st_encrypt_key = "WmZq4t7w!z%C&F)J"
+st_openai_key = ""
+st_openai_model = "text-davinci-003"
 if st_operator == "mask":
     st_number_of_chars = st.sidebar.number_input(
+        "number of chars", value=st_number_of_chars, min_value=0, max_value=100
+    )
+    st_mask_char = st.sidebar.text_input(
+        "Mask character", value=st_mask_char, max_chars=1
     )
 elif st_operator == "encrypt":
+    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
 elif st_operator == "synthesize":
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
     )
     st_openai_model = st.sidebar.text_input(
         "OpenAI model for text synthesis",
+        value=st_openai_model,
         help="See more here: https://platform.openai.com/docs/models/",
     )
 st_threshold = st.sidebar.slider(
 )
 st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations to findings",
+    value=False,
+    help="Add the decision process to the output table. "
+         "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
 )
 st_entities = st.sidebar.multiselect(
     label="Which entities to look for?",
+    options=get_supported_entities(st_model),
+    default=list(get_supported_entities(st_model)),
+    help="Limit the list of PII entities detected. "
+         "This list is dynamic and based on the NER model and registered recognizers. "
+         "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
 )
 # Main panel
 )
 st_analyze_results = analyze(
+    st_model=st_model,
     text=st_text,
     entities=st_entities,
     language="en",
 if st_operator not in ("highlight", "synthesize"):
     with col2:
         st.subheader(f"Output")
+        st_anonymize_results = anonymize(
+            text=st_text,
+            operator=st_operator,
+            mask_char=st_mask_char,
+            number_of_chars=st_number_of_chars,
+            encrypt_key=st_encrypt_key,
+            analyze_results=st_analyze_results,
+        )
         st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
 elif st_operator == "synthesize":
     with col2:
         st.text_area(label="Synthetic data", value=fake_data, height=400)
 else:
     st.subheader("Highlighted")
+    annotated_tokens = annotate(
+        text=st_text,
+        analyze_results=st_analyze_results
+    )
     # annotated_tokens
     annotated_text(*annotated_tokens)
 )
 if st_analyze_results:
     df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+    df["text"] = [st_text[res.start: res.end] for res in st_analyze_results]
     df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
         {
         },
         axis=1,
     )
+    df_subset["Text"] = [st_text[res.start: res.end] for res in st_analyze_results]
     if st_return_decision_process:
         analysis_explanation_df = pd.DataFrame.from_records(
             [r.analysis_explanation.to_dict() for r in st_analyze_results]