Spaces:

presidio
/

presidio_demo

Running

App Files Files Community

presidio committed on Dec 10, 2023

Commit

17f243f

•

1 Parent(s): b7be871

Upload 12 files

Browse files

Files changed (10) hide show

azure_ai_language_wrapper.py +126 -0
flair_recognizer.py +5 -5
flair_test.py +25 -0
index.md +15 -5
openai_fake_data_generator.py +28 -33
presidio_helpers.py +11 -14
presidio_nlp_engine_config.py +118 -40
presidio_streamlit.py +49 -22
requirements.txt +2 -3
test_streamlit.py +43 -0

azure_ai_language_wrapper.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import os
+from typing import List, Optional
+import logging
+import dotenv
+from azure.ai.textanalytics import TextAnalyticsClient
+from azure.core.credentials import AzureKeyCredential
+from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
+from presidio_analyzer.nlp_engine import NlpArtifacts
+logger = logging.getLogger("presidio-streamlit")
+class AzureAIServiceWrapper(EntityRecognizer):
+    """
+    Presidio recognizer that delegates PII detection to Azure AI Language.
+    `analyze` sends the text to the service through a TextAnalyticsClient and
+    converts the returned PII entities into RecognizerResult objects.
+    """
+    from azure.ai.textanalytics._models import PiiEntityCategory
+    # Every category value of the SDK's PiiEntityCategory enum; used when the
+    # caller does not restrict the supported entities.
+    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
+    def __init__(
+        self,
+        supported_entities: Optional[List[str]] = None,
+        supported_language: str = "en",
+        ta_client: Optional[TextAnalyticsClient] = None,
+        ta_key: Optional[str] = None,
+        ta_endpoint: Optional[str] = None,
+    ):
+        """
+        Wrapper for the Azure Text Analytics client
+        :param supported_entities: entity categories this recognizer reports;
+        falls back to TA_SUPPORTED_ENTITIES when empty or None
+        :param supported_language: language code handed to the base recognizer
+        :param ta_client: object of type TextAnalyticsClient; when omitted, a
+        client is created from ta_key and ta_endpoint
+        :param ta_key: Azure cognitive Services for Language key
+        :param ta_endpoint: Azure cognitive Services for Language endpoint
+        """
+        if not supported_entities:
+            supported_entities = self.TA_SUPPORTED_ENTITIES
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Azure AI Language PII",
+        )
+        self.ta_key = ta_key
+        self.ta_endpoint = ta_endpoint
+        if not ta_client:
+            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
+        self.ta_client = ta_client
+    @staticmethod
+    def __authenticate_client(key: str, endpoint: str):
+        """
+        Create a TextAnalyticsClient from a key and an endpoint
+        :param key: key wrapped in an AzureKeyCredential
+        :param endpoint: endpoint passed to the client
+        :return: the new TextAnalyticsClient
+        """
+        ta_credential = AzureKeyCredential(key)
+        text_analytics_client = TextAnalyticsClient(
+            endpoint=endpoint, credential=ta_credential
+        )
+        return text_analytics_client
+    def analyze(
+        self,
+        text: str,
+        entities: Optional[List[str]] = None,
+        nlp_artifacts: Optional[NlpArtifacts] = None,
+    ) -> List[RecognizerResult]:
+        """
+        Send the text to the service and convert its PII entities to results
+        :param text: text to analyze, sent as a single document
+        :param entities: accepted for interface compatibility; results are
+        filtered by self.supported_entities, not by this argument
+        :param nlp_artifacts: not used by this recognizer
+        :return: one RecognizerResult per returned entity whose category is in
+        self.supported_entities
+        """
+        response = self.ta_client.recognize_pii_entities(
+            [text], language=self.supported_language
+        )
+        recognizer_results = []
+        for doc in response:
+            # Skip documents the service flagged as errors.
+            if doc.is_error:
+                continue
+            for entity in doc.entities:
+                if entity.category not in self.supported_entities:
+                    continue
+                analysis_explanation = AzureAIServiceWrapper._build_explanation(
+                    original_score=entity.confidence_score,
+                    entity_type=entity.category,
+                )
+                recognizer_results.append(
+                    RecognizerResult(
+                        entity_type=entity.category,
+                        start=entity.offset,
+                        end=entity.offset + len(entity.text),
+                        score=entity.confidence_score,
+                        analysis_explanation=analysis_explanation,
+                    )
+                )
+        return recognizer_results
+    @staticmethod
+    def _build_explanation(
+        original_score: float, entity_type: str
+    ) -> AnalysisExplanation:
+        """
+        Create the explanation attached to a result
+        :param original_score: confidence score reported by the service
+        :param entity_type: category reported by the service
+        :return: AnalysisExplanation naming this recognizer class
+        """
+        explanation = AnalysisExplanation(
+            # `__name__` on the class itself is "AzureAIServiceWrapper"; going
+            # through `__class__` would name the metaclass instead.
+            recognizer=AzureAIServiceWrapper.__name__,
+            original_score=original_score,
+            textual_explanation=f"Identified as {entity_type} by Text Analytics",
+        )
+        return explanation
+    def load(self) -> None:
+        """Nothing to load here; the client is set up in __init__."""
+        pass
+if __name__ == "__main__":
+    # Manual smoke test: build an analyzer through presidio_helpers and run it
+    # on a sample text. TA_KEY and TA_ENDPOINT must be set in the environment
+    # (dotenv.load_dotenv() is called first, so a .env file can supply them).
+    import presidio_helpers
+    dotenv.load_dotenv()
+    text = """
+    Here are a few example sentences we currently support:
+    Hello, my name is David Johnson and I live in Maine.
+    My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+    On September 18 I visited microsoft.com and sent an email to [email protected],  from the IP 192.168.0.1.
+    My passport: 191280342 and my phone number: (212) 555-1234.
+    This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
+    Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.
+    """
+    # NOTE(review): only `model_path` is passed, and its value is the
+    # "Azure Text Analytics PII" label rather than "Azure AI Language" — confirm
+    # this matches what presidio_helpers.analyzer_engine expects (its
+    # signature is not visible from this file).
+    analyzer = presidio_helpers.analyzer_engine(
+        model_path="Azure Text Analytics PII",
+        ta_key=os.environ["TA_KEY"],
+        ta_endpoint=os.environ["TA_ENDPOINT"],
+    )
+    # The returned results are neither printed nor stored.
+    analyzer.analyze(text=text, language="en")

flair_recognizer.py CHANGED Viewed

@@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer):
         # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
     ]
-    MODEL_LANGUAGES = {
-        "en": "flair/ner-english-large"
-    }
     PRESIDIO_EQUIVALENCES = {
         "PER": "PERSON",
@@ -76,7 +74,7 @@ class FlairRecognizer(EntityRecognizer):
         supported_entities: Optional[List[str]] = None,
         check_label_groups: Optional[Tuple[Set, Set]] = None,
         model: SequenceTagger = None,
-        model_path: Optional[str] = None
     ):
         self.check_label_groups = (
             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
@@ -93,7 +91,9 @@ class FlairRecognizer(EntityRecognizer):
             self.model = SequenceTagger.load(model_path)
         else:
             print(f"Loading model for language {supported_language}")
-            self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
         super().__init__(
             supported_entities=supported_entities,

         # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
     ]
+    MODEL_LANGUAGES = {"en": "flair/ner-english-large"}
     PRESIDIO_EQUIVALENCES = {
         "PER": "PERSON",
         supported_entities: Optional[List[str]] = None,
         check_label_groups: Optional[Tuple[Set, Set]] = None,
         model: SequenceTagger = None,
+        model_path: Optional[str] = None,
     ):
         self.check_label_groups = (
             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
             self.model = SequenceTagger.load(model_path)
         else:
             print(f"Loading model for language {supported_language}")
+            self.model = SequenceTagger.load(
+                self.MODEL_LANGUAGES.get(supported_language)
+            )
         super().__init__(
             supported_entities=supported_entities,

flair_test.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Import generic wrappers
+from transformers import AutoModel, AutoTokenizer
+if __name__ == "__main__":
+    # Smoke test: tag one sentence with the Flair English NER model, then print
+    # the sentence followed by each predicted span.
+    from flair.data import Sentence
+    from flair.models import SequenceTagger
+    ner_model = SequenceTagger.load("flair/ner-english-large")
+    example = Sentence("George Washington went to Washington")
+    # The prediction result is read back from the sentence object below.
+    ner_model.predict(example)
+    print(example)
+    print("The following NER tags are found:")
+    for span in example.get_spans("ner"):
+        print(span)

index.md CHANGED Viewed

@@ -5,22 +5,32 @@ The app is based on the [streamlit](https://streamlit.io/) package.
 A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
 ## Requirements
-1. Clone the repo and move to the `docs/samples/python/streamlit ` folder
-1. Install dependencies (preferably in a virtual environment)
 ```sh
 pip install -r requirements.txt
 ```
 > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
-2.
 3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
-3. Start the app:
 ```sh
 streamlit run presidio_streamlit.py
 ```
 ## Output
 Output should be similar to this screenshot:
-![image](https://user-images.githubusercontent.com/3776619/232289541-d59992e1-52a4-44c1-b904-b22c72c02a5b.png)

 A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
 ## Requirements
+1. Clone the repo and move to the `docs/samples/python/streamlit` folder
+2. Install dependencies (preferably in a virtual environment)
 ```sh
 pip install -r requirements.txt
 ```
 > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
 3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
+4. Start the app:
 ```sh
 streamlit run presidio_streamlit.py
 ```
+5. Consider adding an `.env` file with the following environment variables, for further customizability:
+```sh
+TA_KEY=YOUR_TEXT_ANALYTICS_KEY
+TA_ENDPOINT=YOUR_TEXT_ANALYTICS_ENDPOINT
+OPENAI_TYPE="Azure" #or "openai"
+OPENAI_KEY=YOUR_OPENAI_KEY
+AZURE_OPENAI_KEY=YOUR_AZURE_OPENAI_KEY #read by the app when OPENAI_TYPE="Azure"
+OPENAI_API_VERSION="2023-05-15"
+AZURE_OPENAI_ENDPOINT=YOUR_AZURE_OPENAI_ENDPOINT
+AZURE_OPENAI_DEPLOYMENT=text-davinci-003
+ALLOW_OTHER_MODELS=true #true if the user could download new models
+```
 ## Output
 Output should be similar to this screenshot:
+![image](https://github.com/microsoft/presidio/assets/3776619/7d0eadf1-e750-4747-8b59-8203aa43cac8)

openai_fake_data_generator.py CHANGED Viewed

@@ -2,51 +2,45 @@ from collections import namedtuple
 from typing import Optional
 import openai
 import logging
 logger = logging.getLogger("presidio-streamlit")
 OpenAIParams = namedtuple(
     "open_ai_params",
-    ["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"],
 )
-def set_openai_params(openai_params: OpenAIParams):
-    """Set the OpenAI API key.
-    :param openai_params: OpenAIParams object with the following fields: key, model, api version, deployment_name,
-    The latter only relate to Azure OpenAI deployments.
-    """
-    openai.api_key = openai_params.openai_key
-    openai.api_version = openai_params.api_version
-    if openai_params.api_base:
-        openai.api_base = openai_params.api_base
-        openai.api_type = openai_params.api_type
 def call_completion_model(
     prompt: str,
-    model: str = "text-davinci-003",
-    max_tokens: int = 512,
-    deployment_id: Optional[str] = None,
 ) -> str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
-    :param model: OpenAI model name
-    :param max_tokens: Model's max_tokens parameter
-    :param deployment_id: Azure OpenAI deployment ID
     """
-    if deployment_id:
-        response = openai.Completion.create(
-            deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens
         )
     else:
-        response = openai.Completion.create(
-            model=model, prompt=prompt, max_tokens=max_tokens
-        )
-    return response["choices"][0].text
 def create_prompt(anonymized_text: str) -> str:
@@ -64,17 +58,18 @@ def create_prompt(anonymized_text: str) -> str:
     a. Use completely random numbers, so every digit is drawn between 0 and 9.
     b. Use realistic names that come from diverse genders, ethnicities and countries.
-    c. If there are no placeholders, return the text as is and provide an answer.
     d. Keep the formatting as close to the original as possible.
     e. If PII exists in the input, replace it with fake values in the output.
-    input: How do I change the limit on my credit card {{credit_card_number}}?
     output: How do I change the limit on my credit card 2539 3519 2345 1555?
-    input: <PERSON> was the chief science officer at <ORGANIZATION>.
     output: Katherine Buckjov was the chief science officer at NASA.
-    input: Cameroon lives in <LOCATION>.
     output: Vladimir lives in Moscow.
-    input: {anonymized_text}
-    output:
-    """
     return prompt

 from typing import Optional
 import openai
+from openai import OpenAI, AzureOpenAI
 import logging
 logger = logging.getLogger("presidio-streamlit")
 OpenAIParams = namedtuple(
     "open_ai_params",
+    ["openai_key", "model", "api_base", "deployment_id", "api_version", "api_type"],
 )
 def call_completion_model(
     prompt: str,
+    openai_params: OpenAIParams,
+    max_tokens: Optional[int] = 256,
 ) -> str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
+    :param openai_params: OpenAI parameters for the completion model. An
+    api_type of "azure" (case-insensitive) selects the AzureOpenAI client;
+    any other value, including None, selects the OpenAI client.
+    :param max_tokens: The maximum number of tokens to generate.
+    :return: Text of the first completion choice, stripped of surrounding whitespace.
     """
+    # api_type may be unset (None); treat that as "not Azure" instead of
+    # failing on None.lower().
+    api_type = (openai_params.api_type or "").lower()
+    if api_type == "azure":
+        client = AzureOpenAI(
+            api_version=openai_params.api_version,
+            api_key=openai_params.openai_key,
+            azure_endpoint=openai_params.api_base,
+            azure_deployment=openai_params.deployment_id,
         )
     else:
+        client = OpenAI(api_key=openai_params.openai_key)
+    response = client.completions.create(
+        model=openai_params.model,
+        prompt=prompt,
+        max_tokens=max_tokens,
+    )
+    return response.choices[0].text.strip()
 def create_prompt(anonymized_text: str) -> str:
     a. Use completely random numbers, so every digit is drawn between 0 and 9.
     b. Use realistic names that come from diverse genders, ethnicities and countries.
+    c. If there are no placeholders, return the text as is.
     d. Keep the formatting as close to the original as possible.
     e. If PII exists in the input, replace it with fake values in the output.
+    f. Remove whitespace before and after the generated text
+    input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
     output: How do I change the limit on my credit card 2539 3519 2345 1555?
+    input: [[TEXT STARTS]]<PERSON> was the chief science officer at <ORGANIZATION>.[[TEXT ENDS]]
     output: Katherine Buckjov was the chief science officer at NASA.
+    input: [[TEXT STARTS]]Cameroon lives in <LOCATION>.[[TEXT ENDS]]
     output: Vladimir lives in Moscow.
+    input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
+    output:"""
     return prompt

presidio_helpers.py CHANGED Viewed

@@ -16,16 +16,16 @@ from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
 from openai_fake_data_generator import (
-    set_openai_params,
     call_completion_model,
-    create_prompt,
     OpenAIParams,
 )
 from presidio_nlp_engine_config import (
     create_nlp_engine_with_spacy,
     create_nlp_engine_with_flair,
     create_nlp_engine_with_transformers,
-    create_nlp_engine_with_azure_text_analytics,
 )
 logger = logging.getLogger("presidio-streamlit")
@@ -49,14 +49,16 @@ def nlp_engine_and_registry(
     """
     # Set up NLP Engine according to the model of choice
-    if "spaCy" in model_family:
         return create_nlp_engine_with_spacy(model_path)
-    elif "flair" in model_family:
         return create_nlp_engine_with_flair(model_path)
-    elif "HuggingFace" in model_family:
         return create_nlp_engine_with_transformers(model_path)
-    elif "Azure Text Analytics" in model_family:
-        return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
     else:
         raise ValueError(f"Model family {model_family} not supported")
@@ -215,14 +217,9 @@ def create_fake_data(
     if not openai_params.openai_key:
         return "Please provide your OpenAI key"
     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
-    set_openai_params(openai_params)
     prompt = create_prompt(results.text)
     print(f"Prompt: {prompt}")
-    fake = call_openai_api(
-        prompt=prompt,
-        openai_model_name=openai_params.model,
-        openai_deployment_name=openai_params.deployment_name,
-    )
     return fake

 from presidio_anonymizer.entities import OperatorConfig
 from openai_fake_data_generator import (
     call_completion_model,
     OpenAIParams,
+    create_prompt,
 )
 from presidio_nlp_engine_config import (
     create_nlp_engine_with_spacy,
     create_nlp_engine_with_flair,
     create_nlp_engine_with_transformers,
+    create_nlp_engine_with_azure_ai_language,
+    create_nlp_engine_with_stanza,
 )
 logger = logging.getLogger("presidio-streamlit")
     """
     # Set up NLP Engine according to the model of choice
+    if "spacy" in model_family.lower():
         return create_nlp_engine_with_spacy(model_path)
+    if "stanza" in model_family.lower():
+        return create_nlp_engine_with_stanza(model_path)
+    elif "flair" in model_family.lower():
         return create_nlp_engine_with_flair(model_path)
+    elif "huggingface" in model_family.lower():
         return create_nlp_engine_with_transformers(model_path)
+    elif "azure ai language" in model_family.lower():
+        return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint)
     else:
         raise ValueError(f"Model family {model_family} not supported")
     if not openai_params.openai_key:
         return "Please provide your OpenAI key"
     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
     prompt = create_prompt(results.text)
     print(f"Prompt: {prompt}")
+    fake = call_completion_model(prompt=prompt, openai_params=openai_params)
     return fake

presidio_nlp_engine_config.py CHANGED Viewed

@@ -1,8 +1,12 @@
-from typing import Tuple
 import logging
 import spacy
 from presidio_analyzer import RecognizerRegistry
-from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
 logger = logging.getLogger("presidio-streamlit")
@@ -12,21 +16,70 @@ def create_nlp_engine_with_spacy(
 ) -> Tuple[NlpEngine, RecognizerRegistry]:
     """
     Instantiate an NlpEngine with a spaCy model
-    :param model_path: spaCy model path.
     """
     registry = RecognizerRegistry()
-    registry.load_predefined_recognizers()
-    if not spacy.util.is_package(model_path):
-        spacy.cli.download(model_path)
     nlp_configuration = {
-        "nlp_engine_name": "spacy",
         "models": [{"lang_code": "en", "model_name": model_path}],
     }
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
     return nlp_engine, registry
@@ -39,41 +92,62 @@ def create_nlp_engine_with_transformers(
     would return NlpArtifacts such as POS and lemmas.
     :param model_path: HuggingFace model path.
     """
-    from transformers_rec import (
-        STANFORD_COFIGURATION,
-        BERT_DEID_CONFIGURATION,
-        TransformersRecognizer,
-    )
-    registry = RecognizerRegistry()
-    registry.load_predefined_recognizers()
-    if not spacy.util.is_package("en_core_web_sm"):
-        spacy.cli.download("en_core_web_sm")
-    # Using a small spaCy model + a HF NER model
-    transformers_recognizer = TransformersRecognizer(model_path=model_path)
-    if model_path == "StanfordAIMI/stanford-deidentifier-base":
-        transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
-    elif model_path == "obi/deid_roberta_i2b2":
-        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-    else:
-        print(f"Warning: Model has no configuration, loading default.")
-        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-    # Use small spaCy model, no need for both spacy and HF models
-    # The transformers model is used here as a recognizer, not as an NlpEngine
     nlp_configuration = {
-        "nlp_engine_name": "spacy",
-        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
     }
-    registry.add_recognizer(transformers_recognizer)
-    registry.remove_recognizer("SpacyRecognizer")
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
     return nlp_engine, registry
@@ -91,6 +165,8 @@ def create_nlp_engine_with_flair(
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
     if not spacy.util.is_package("en_core_web_sm"):
         spacy.cli.download("en_core_web_sm")
     # Using a small spaCy model + a Flair NER model
@@ -107,7 +183,7 @@ def create_nlp_engine_with_flair(
     return nlp_engine, registry
-def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     """
     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
@@ -115,7 +191,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     :param ta_key: Azure Text Analytics key.
     :param ta_endpoint: Azure Text Analytics endpoint.
     """
-    from text_analytics_wrapper import TextAnalyticsWrapper
     if not ta_key or not ta_endpoint:
         raise RuntimeError("Please fill in the Text Analytics endpoint details")
@@ -123,7 +199,9 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
-    ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
     nlp_configuration = {
         "nlp_engine_name": "spacy",
         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
@@ -131,7 +209,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
-    registry.add_recognizer(ta_recognizer)
     registry.remove_recognizer("SpacyRecognizer")
     return nlp_engine, registry

 import logging
+from typing import Tuple
 import spacy
 from presidio_analyzer import RecognizerRegistry
+from presidio_analyzer.nlp_engine import (
+    NlpEngine,
+    NlpEngineProvider,
+)
 logger = logging.getLogger("presidio-streamlit")
 ) -> Tuple[NlpEngine, RecognizerRegistry]:
     """
     Instantiate an NlpEngine with a spaCy model
+    :param model_path: path to model / model name.
     """
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": model_path}],
+        "ner_model_configuration": {
+            "model_to_presidio_entity_mapping": {
+                "PER": "PERSON",
+                "PERSON": "PERSON",
+                "NORP": "NRP",
+                "FAC": "FACILITY",
+                "LOC": "LOCATION",
+                "GPE": "LOCATION",
+                "LOCATION": "LOCATION",
+                "ORG": "ORGANIZATION",
+                "ORGANIZATION": "ORGANIZATION",
+                "DATE": "DATE_TIME",
+                "TIME": "DATE_TIME",
+            },
+            "low_confidence_score_multiplier": 0.4,
+            "low_score_entity_names": ["ORG", "ORGANIZATION"],
+        },
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
     registry = RecognizerRegistry()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
+    return nlp_engine, registry
+def create_nlp_engine_with_stanza(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Build a stanza NlpEngine and a registry of predefined recognizers
+    :param model_path: path to model / model name, used for the "en" model.
+    :return: the created engine and the registry loaded with that engine.
+    """
+    # NER label emitted by the model -> Presidio entity name
+    label_to_entity = {
+        "PER": "PERSON",
+        "PERSON": "PERSON",
+        "NORP": "NRP",
+        "FAC": "FACILITY",
+        "LOC": "LOCATION",
+        "GPE": "LOCATION",
+        "LOCATION": "LOCATION",
+        "ORG": "ORGANIZATION",
+        "ORGANIZATION": "ORGANIZATION",
+        "DATE": "DATE_TIME",
+        "TIME": "DATE_TIME",
+    }
+    english_model = {"lang_code": "en", "model_name": model_path}
+    provider = NlpEngineProvider(
+        nlp_configuration={
+            "nlp_engine_name": "stanza",
+            "models": [english_model],
+            "ner_model_configuration": {
+                "model_to_presidio_entity_mapping": label_to_entity,
+            },
+        }
+    )
+    nlp_engine = provider.create_engine()
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
     return nlp_engine, registry
     would return NlpArtifacts such as POS and lemmas.
     :param model_path: HuggingFace model path.
     """
+    print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
     nlp_configuration = {
+        "nlp_engine_name": "transformers",
+        "models": [
+            {
+                "lang_code": "en",
+                "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
+            }
+        ],
+        "ner_model_configuration": {
+            "model_to_presidio_entity_mapping": {
+                "PER": "PERSON",
+                "PERSON": "PERSON",
+                "LOC": "LOCATION",
+                "LOCATION": "LOCATION",
+                "GPE": "LOCATION",
+                "ORG": "ORGANIZATION",
+                "ORGANIZATION": "ORGANIZATION",
+                "NORP": "NRP",
+                "AGE": "AGE",
+                "ID": "ID",
+                "EMAIL": "EMAIL",
+                "PATIENT": "PERSON",
+                "STAFF": "PERSON",
+                "HOSP": "ORGANIZATION",
+                "PATORG": "ORGANIZATION",
+                "DATE": "DATE_TIME",
+                "TIME": "DATE_TIME",
+                "PHONE": "PHONE_NUMBER",
+                "HCW": "PERSON",
+                "HOSPITAL": "ORGANIZATION",
+                "FACILITY": "LOCATION",
+            },
+            "low_confidence_score_multiplier": 0.4,
+            "low_score_entity_names": ["ID"],
+            "labels_to_ignore": [
+                "CARDINAL",
+                "EVENT",
+                "LANGUAGE",
+                "LAW",
+                "MONEY",
+                "ORDINAL",
+                "PERCENT",
+                "PRODUCT",
+                "QUANTITY",
+                "WORK_OF_ART",
+            ],
+        },
     }
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
     return nlp_engine, registry
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
+    # there is no official Flair NlpEngine, hence we load it as an additional recognizer
     if not spacy.util.is_package("en_core_web_sm"):
         spacy.cli.download("en_core_web_sm")
     # Using a small spaCy model + a Flair NER model
     return nlp_engine, registry
+def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
     """
     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
     :param ta_key: Azure Text Analytics key.
     :param ta_endpoint: Azure Text Analytics endpoint.
     """
+    from azure_ai_language_wrapper import AzureAIServiceWrapper
     if not ta_key or not ta_endpoint:
         raise RuntimeError("Please fill in the Text Analytics endpoint details")
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
+    azure_ai_language_recognizer = AzureAIServiceWrapper(
+        ta_endpoint=ta_endpoint, ta_key=ta_key
+    )
     nlp_configuration = {
         "nlp_engine_name": "spacy",
         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry.add_recognizer(azure_ai_language_recognizer)
     registry.remove_recognizer("SpacyRecognizer")
     return nlp_engine, registry

presidio_streamlit.py CHANGED Viewed

@@ -56,7 +56,8 @@ model_list = [
     "flair/ner-english-large",
     "HuggingFace/obi/deid_roberta_i2b2",
     "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
-    "Azure Text Analytics PII",
     "Other",
 ]
 if not allow_other_models:
@@ -75,22 +76,22 @@ st_model_package = st_model.split("/")[0]
 # Remove package prefix (if needed)
 st_model = (
     st_model
-    if st_model_package not in ("spaCy", "HuggingFace")
     else "/".join(st_model.split("/")[1:])
 )
 if st_model == "Other":
     st_model_package = st.sidebar.selectbox(
-        "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
     )
     st_model = st.sidebar.text_input(f"NER model name", value="")
-if st_model == "Azure Text Analytics PII":
     st_ta_key = st.sidebar.text_input(
-        f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
     )
     st_ta_endpoint = st.sidebar.text_input(
-        f"Text Analytics endpoint",
         value=os.getenv("TA_ENDPOINT", default=""),
         help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
     )
@@ -124,23 +125,18 @@ open_ai_params = None
 logger.debug(f"st_operator: {st_operator}")
-if st_operator == "mask":
-    st_number_of_chars = st.sidebar.number_input(
-        "number of chars", value=st_number_of_chars, min_value=0, max_value=100
-    )
-    st_mask_char = st.sidebar.text_input(
-        "Mask character", value=st_mask_char, max_chars=1
-    )
-elif st_operator == "encrypt":
-    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
-elif st_operator == "synthesize":
     if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
         openai_api_type = "azure"
         st_openai_api_base = st.sidebar.text_input(
             "Azure OpenAI base URL",
             value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
         )
-        st_deployment_name = st.sidebar.text_input(
             "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
         )
         st_openai_version = st.sidebar.text_input(
@@ -148,11 +144,13 @@ elif st_operator == "synthesize":
             value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
         )
     else:
-        st_openai_version = openai_api_type = st_openai_api_base = None
-        st_deployment_name = ""
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
-        value=os.getenv("OPENAI_KEY", default=""),
         help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
         type="password",
     )
@@ -161,12 +159,40 @@ elif st_operator == "synthesize":
         value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         help="See more here: https://platform.openai.com/docs/models/",
     )
     open_ai_params = OpenAIParams(
         openai_key=st_openai_key,
         model=st_openai_model,
         api_base=st_openai_api_base,
-        deployment_name=st_deployment_name,
         api_version=st_openai_version,
         api_type=openai_api_type,
     )
@@ -214,7 +240,8 @@ with st.expander("About this demo", expanded=False):
         \n\n[Code](https://aka.ms/presidio) |
         [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
         [Installation](https://microsoft.github.io/presidio/installation/) |
-        [FAQ](https://microsoft.github.io/presidio/faq/) |"""
     )
     st.info(

     "flair/ner-english-large",
     "HuggingFace/obi/deid_roberta_i2b2",
     "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
+    "stanza/en",
+    "Azure AI Language",
     "Other",
 ]
 if not allow_other_models:
 # Remove package prefix (if needed)
 st_model = (
     st_model
+    if st_model_package.lower() not in ("spacy", "stanza", "huggingface")
     else "/".join(st_model.split("/")[1:])
 )
 if st_model == "Other":
     st_model_package = st.sidebar.selectbox(
+        "NER model OSS package", options=["spaCy", "stanza", "Flair", "HuggingFace"]
     )
     st_model = st.sidebar.text_input(f"NER model name", value="")
+if st_model == "Azure AI Language":
     st_ta_key = st.sidebar.text_input(
+        f"Azure AI Language key", value=os.getenv("TA_KEY", ""), type="password"
     )
     st_ta_endpoint = st.sidebar.text_input(
+        f"Azure AI Language endpoint",
         value=os.getenv("TA_ENDPOINT", default=""),
         help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
     )
 logger.debug(f"st_operator: {st_operator}")
+def set_up_openai_synthesis():
+    """Set up the OpenAI API key and model for text synthesis."""
     if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
         openai_api_type = "azure"
         st_openai_api_base = st.sidebar.text_input(
             "Azure OpenAI base URL",
             value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
         )
+        openai_key = os.getenv("AZURE_OPENAI_KEY", default="")
+        st_deployment_id = st.sidebar.text_input(
             "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
         )
         st_openai_version = st.sidebar.text_input(
             value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
         )
     else:
+        openai_api_type = "openai"
+        st_openai_version = st_openai_api_base = None
+        st_deployment_id = ""
+        openai_key = os.getenv("OPENAI_KEY", default="")
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
+        value=openai_key,
         help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
         type="password",
     )
         value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         help="See more here: https://platform.openai.com/docs/models/",
     )
+    return (
+        openai_api_type,
+        st_openai_api_base,
+        st_deployment_id,
+        st_openai_version,
+        st_openai_key,
+        st_openai_model,
+    )
+if st_operator == "mask":
+    st_number_of_chars = st.sidebar.number_input(
+        "number of chars", value=st_number_of_chars, min_value=0, max_value=100
+    )
+    st_mask_char = st.sidebar.text_input(
+        "Mask character", value=st_mask_char, max_chars=1
+    )
+elif st_operator == "encrypt":
+    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
+elif st_operator == "synthesize":
+    (
+        openai_api_type,
+        st_openai_api_base,
+        st_deployment_id,
+        st_openai_version,
+        st_openai_key,
+        st_openai_model,
+    ) = set_up_openai_synthesis()
     open_ai_params = OpenAIParams(
         openai_key=st_openai_key,
         model=st_openai_model,
         api_base=st_openai_api_base,
+        deployment_id=st_deployment_id,
         api_version=st_openai_version,
         api_type=openai_api_type,
     )
         \n\n[Code](https://aka.ms/presidio) |
         [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
         [Installation](https://microsoft.github.io/presidio/installation/) |
+        [FAQ](https://microsoft.github.io/presidio/faq/) |
+        [Feedback](https://forms.office.com/r/9ufyYjfDaY) |"""
     )
     st.info(

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
-presidio-analyzer
 presidio-anonymizer
 streamlit
 streamlit-tags
@@ -6,8 +7,6 @@ pandas
 python-dotenv
 st-annotated-text
 torch
-transformers
 flair
 openai
-spacy
 azure-ai-textanalytics

+presidio-analyzer[transformers]
+presidio-analyzer[stanza]
 presidio-anonymizer
 streamlit
 streamlit-tags
 python-dotenv
 st-annotated-text
 torch
 flair
 openai
 azure-ai-textanalytics

test_streamlit.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from presidio_helpers import analyzer_engine, analyze, anonymize
+def test_streamlit_logic():
+    st_model = "en"  # st_model = "StanfordAIMI/stanford-deidentifier-base"
+    st_model_package = "stanza"  ##st_model_package = "HuggingFace"
+    st_ta_key = None
+    st_ta_endpoint = None
+    analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
+    # Read default text
+    with open("demo_text.txt") as f:
+        demo_text = f.readlines()
+    st_text = "".join(demo_text)
+    # instantiate and cache AnalyzerEngine
+    analyzer_engine(*analyzer_params)
+    # Analyze
+    st_analyze_results = analyze(
+        *analyzer_params,
+        text=st_text,
+        entities="All",
+        language="en",
+        score_threshold=0.35,
+        return_decision_process=True,
+        allow_list=[],
+        deny_list=[],
+    )
+    # Anonymize
+    st_anonymize_results = anonymize(
+        text=st_text,
+        operator="replace",
+        mask_char=None,
+        number_of_chars=None,
+        encrypt_key=None,
+        analyze_results=st_analyze_results,
+    )
+    assert st_anonymize_results.text != ""