KarishmaShirsath committed on
Commit
da8438c
1 Parent(s): ab73abb

Add de-identification options

Browse files
Files changed (3) hide show
  1. Final_file.py +124 -16
  2. app.py +22 -2
  3. requirements.txt +0 -0
Final_file.py CHANGED
@@ -347,7 +347,7 @@ val_dataset = (
347
  .padded_batch(batch_size)
348
  )
349
 
350
- ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
351
 
352
 
353
  # In[15]:
@@ -367,14 +367,14 @@ class CustomNonPaddingTokenLoss(keras.losses.Loss):
367
  return tf.reduce_sum(loss) / tf.reduce_sum(mask)
368
 
369
 
370
- loss = CustomNonPaddingTokenLoss()
371
 
372
 
373
  # In[16]:
374
 
375
 
376
- ner_model.compile(optimizer="adam", loss=loss)
377
- ner_model.fit(train_dataset, epochs=10)
378
 
379
 
380
  def tokenize_and_convert_to_ids(text):
@@ -383,18 +383,18 @@ def tokenize_and_convert_to_ids(text):
383
 
384
 
385
  # Sample inference using the trained model
386
- sample_input = tokenize_and_convert_to_ids(
387
- "eu rejects german call to boycott british lamb"
388
- )
389
- sample_input = tf.reshape(sample_input, shape=[1, -1])
390
- print(sample_input)
391
 
392
- output = ner_model.predict(sample_input)
393
- prediction = np.argmax(output, axis=-1)[0]
394
- prediction = [mapping[i] for i in prediction]
395
 
396
  # eu -> B-ORG, german -> B-MISC, british -> B-MISC
397
- print(prediction)
398
 
399
 
400
  # In[17]:
@@ -426,7 +426,7 @@ def calculate_metrics(_dataset):
426
  evaluate(real_tags, predicted_tags)
427
 
428
 
429
- calculate_metrics(val_dataset)
430
 
431
 
432
  # In[18]:
@@ -450,7 +450,7 @@ def test_model_with_input(_ner_model, mapping):
450
  print("Predicted tags:", predicted_tags)
451
 
452
  # Test the model with user input
453
- test_model_with_input(ner_model, mapping)
454
 
455
 
456
  # In[20]:
@@ -693,7 +693,115 @@ class FlairRecognizer(EntityRecognizer):
693
  anonymized_text = "*" * len(entity_text)
694
  anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
695
 
 
 
 
 
 
 
 
 
696
  # print anonymized sentence
697
  print("Anonymized sentence:")
698
  print(anonymized_sentence)
699
- return anonymized_sentence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  .padded_batch(batch_size)
348
  )
349
 
350
+ # ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
351
 
352
 
353
  # In[15]:
 
367
  return tf.reduce_sum(loss) / tf.reduce_sum(mask)
368
 
369
 
370
+ # loss = CustomNonPaddingTokenLoss()
371
 
372
 
373
  # In[16]:
374
 
375
 
376
+ # ner_model.compile(optimizer="adam", loss=loss)
377
+ # ner_model.fit(train_dataset, epochs=10)
378
 
379
 
380
  def tokenize_and_convert_to_ids(text):
 
383
 
384
 
385
  # Sample inference using the trained model
386
+ # sample_input = tokenize_and_convert_to_ids(
387
+ # "eu rejects german call to boycott british lamb"
388
+ # )
389
+ # sample_input = tf.reshape(sample_input, shape=[1, -1])
390
+ # print(sample_input)
391
 
392
+ # output = ner_model.predict(sample_input)
393
+ # prediction = np.argmax(output, axis=-1)[0]
394
+ # prediction = [mapping[i] for i in prediction]
395
 
396
  # eu -> B-ORG, german -> B-MISC, british -> B-MISC
397
+ # print(prediction)
398
 
399
 
400
  # In[17]:
 
426
  evaluate(real_tags, predicted_tags)
427
 
428
 
429
+ # calculate_metrics(val_dataset)
430
 
431
 
432
  # In[18]:
 
450
  print("Predicted tags:", predicted_tags)
451
 
452
  # Test the model with user input
453
+ # test_model_with_input(ner_model, mapping)
454
 
455
 
456
  # In[20]:
 
693
  anonymized_text = "*" * len(entity_text)
694
  anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
695
 
696
+ # remove the part that includes named entity annotations
697
+ anonymized_sentence = anonymized_sentence.split("→")[0].strip()
698
+ anonymized_sentence = anonymized_sentence.split(":")[1].strip()
699
+
700
+ a = anonymize(input_text, "", anonymized_sentence)
701
+ print("a sentence:")
702
+ print(a)
703
+
704
  # print anonymized sentence
705
  print("Anonymized sentence:")
706
  print(anonymized_sentence)
707
+ return anonymized_sentence
708
+
709
+
710
+
711
+
712
+
713
+
714
+
715
+
716
+
717
+
718
+ from presidio_anonymizer import AnonymizerEngine
719
+ from presidio_analyzer import AnalyzerEngine
720
+ from presidio_anonymizer.entities import (
721
+ OperatorConfig,
722
+ RecognizerResult,
723
+ EngineResult,
724
+ ConflictResolutionStrategy,
725
+ )
726
+ from typing import List, Dict, Optional, Type
727
+
728
+
729
class FlairRecognizer2():
    """De-identify free text with Presidio's analyzer and anonymizer engines."""

    # Entity types requested from the Presidio analyzer.
    ENTITIES = ['PHONE_NUMBER', 'PERSON', 'ID', 'LOCATION']

    # Constant substituted for detected entities by the "replace" operator.
    REPLACEMENT_VALUE = "BIP"

    @staticmethod
    def _operator_config(
        operator: str,
        mask_char: Optional[str] = None,
        number_of_chars: Optional[int] = None,
        encrypt_key: Optional[str] = None,
    ) -> "OperatorConfig":
        """Translate a UI operator name into a Presidio ``OperatorConfig``."""
        if operator == "mask":
            return OperatorConfig(
                "mask",
                {
                    "type": "mask",
                    "masking_char": mask_char,
                    "chars_to_mask": number_of_chars,
                    "from_end": False,
                },
            )
        if operator == "encrypt":
            return OperatorConfig("encrypt", {"key": encrypt_key})
        if operator == "highlight":
            # "highlight" is a UI notion: run it as a custom operator whose
            # lambda returns the entity text unchanged.
            return OperatorConfig("custom", {"lambda": lambda x: x})
        if operator in ("replace", "synthesize"):
            # "synthesize" is mapped onto the plain "replace" operator.
            return OperatorConfig(
                "replace", {"new_value": FlairRecognizer2.REPLACEMENT_VALUE}
            )
        # Any other operator name (e.g. "redact", "hash") is passed through
        # without extra parameters.
        return OperatorConfig(operator)

    @staticmethod
    def anonymize(
        text: str,
        operator: str,
        mask_char: Optional[str] = None,
        number_of_chars: Optional[int] = None,
        encrypt_key: Optional[str] = None,
    ):
        """Anonymize identified input using Presidio Anonymizer.

        :param text: Full text
        :param operator: Operator name
        :param mask_char: Mask char (for mask operator)
        :param number_of_chars: Number of characters to mask (for mask operator)
        :param encrypt_key: Encryption key (for encrypt operator)
        :returns: The text with every detected entity transformed by the
            chosen operator; an empty string when ``text`` is empty.
        """
        # Nothing to analyze: skip building the Presidio engines entirely.
        if not text:
            return ""

        analyzer_results = AnalyzerEngine().analyze(
            text=text,
            entities=FlairRecognizer2.ENTITIES,
            language='en',
        )

        operator_config = FlairRecognizer2._operator_config(
            operator,
            mask_char=mask_char,
            number_of_chars=number_of_chars,
            encrypt_key=encrypt_key,
        )

        # "DEFAULT" applies the same operator to every detected entity type.
        result = AnonymizerEngine().anonymize(
            text=text,
            analyzer_results=analyzer_results,
            operators={"DEFAULT": operator_config},
        )
        return result.text
803
+
804
+
805
+
806
+
807
+
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import streamlit as st
2
  from Final_file import FlairRecognizer
 
3
  import os
4
  import PyPDF2
5
  import docx
@@ -15,9 +16,13 @@ def cached_predict_ner_tags(text):
15
 
16
  # Cache the text analysis function
17
  @st.cache_resource
18
- def cached_analyze_text(text):
19
  return FlairRecognizer.analyze_text(text)
20
 
 
 
 
 
21
  def download_masked_file(masked_text, file_extension):
22
 
23
  # Create a temporary file to store the masked text
@@ -66,6 +71,21 @@ def main():
66
  st.sidebar.header('Upload Options')
67
  upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # # Dropdown menu with four choices
70
  # st.sidebar.header('Masking Options')
71
  # choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
@@ -75,7 +95,7 @@ def main():
75
  if st.button('Analyze'):
76
  with st.spinner('Wait for it... the model is loading'):
77
  cached_predict_ner_tags(input_text)
78
- masked_text = cached_analyze_text(input_text)
79
  st.text_area("Masked text:", value=masked_text, height=200)
80
  elif upload_option == 'File Upload':
81
  uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
 
1
  import streamlit as st
2
  from Final_file import FlairRecognizer
3
+ from Final_file import FlairRecognizer2
4
  import os
5
  import PyPDF2
6
  import docx
 
16
 
17
  # Cache the text analysis function
18
  @st.cache_resource
19
+ def cached_analyze_text(text, operator):
20
  return FlairRecognizer.analyze_text(text)
21
 
22
@st.cache_data
def cached_anonimize_text(text, operator):
    """Return ``FlairRecognizer2.anonymize(text, operator)``, cached by Streamlit.

    :param text: Raw input text to de-identify.
    :param operator: De-identification operator name chosen in the sidebar.
    :returns: The value returned by ``FlairRecognizer2.anonymize``.
    """
    # The result is plain data rather than a shared resource, so it is
    # cached with st.cache_data instead of st.cache_resource.
    return FlairRecognizer2.anonymize(text, operator)
25
+
26
  def download_masked_file(masked_text, file_extension):
27
 
28
  # Create a temporary file to store the masked text
 
71
  st.sidebar.header('Upload Options')
72
  upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
73
 
74
+ st_operator = st.sidebar.selectbox(
75
+ "De-identification approach",
76
+ ["redact", "replace", "hash"],
77
+ index=1,
78
+ help="""
79
+ Select which manipulation to the text is requested after PII has been identified.\n
80
+ - Redact: Completely remove the PII text\n
81
+ - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
82
+ - Synthesize: Replace with fake values (requires an OpenAI key)\n
83
+ - Highlight: Shows the original text with PII highlighted in colors\n
84
+ - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
85
+ - Hash: Replaces with the hash of the PII string\n
86
+ - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
87
+ """,
88
+ )
89
  # # Dropdown menu with four choices
90
  # st.sidebar.header('Masking Options')
91
  # choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
 
95
  if st.button('Analyze'):
96
  with st.spinner('Wait for it... the model is loading'):
97
  cached_predict_ner_tags(input_text)
98
+ masked_text = cached_anonimize_text(input_text, st_operator)
99
  st.text_area("Masked text:", value=masked_text, height=200)
100
  elif upload_option == 'File Upload':
101
  uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ