Spaces:
Running
Running
KarishmaShirsath
committed on
Commit
•
da8438c
1
Parent(s):
ab73abb
Add de-identification options
Browse files- Final_file.py +124 -16
- app.py +22 -2
- requirements.txt +0 -0
Final_file.py
CHANGED
@@ -347,7 +347,7 @@ val_dataset = (
|
|
347 |
.padded_batch(batch_size)
|
348 |
)
|
349 |
|
350 |
-
ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
|
351 |
|
352 |
|
353 |
# In[15]:
|
@@ -367,14 +367,14 @@ class CustomNonPaddingTokenLoss(keras.losses.Loss):
|
|
367 |
return tf.reduce_sum(loss) / tf.reduce_sum(mask)
|
368 |
|
369 |
|
370 |
-
loss = CustomNonPaddingTokenLoss()
|
371 |
|
372 |
|
373 |
# In[16]:
|
374 |
|
375 |
|
376 |
-
ner_model.compile(optimizer="adam", loss=loss)
|
377 |
-
ner_model.fit(train_dataset, epochs=10)
|
378 |
|
379 |
|
380 |
def tokenize_and_convert_to_ids(text):
|
@@ -383,18 +383,18 @@ def tokenize_and_convert_to_ids(text):
|
|
383 |
|
384 |
|
385 |
# Sample inference using the trained model
|
386 |
-
sample_input = tokenize_and_convert_to_ids(
|
387 |
-
|
388 |
-
)
|
389 |
-
sample_input = tf.reshape(sample_input, shape=[1, -1])
|
390 |
-
print(sample_input)
|
391 |
|
392 |
-
output = ner_model.predict(sample_input)
|
393 |
-
prediction = np.argmax(output, axis=-1)[0]
|
394 |
-
prediction = [mapping[i] for i in prediction]
|
395 |
|
396 |
# eu -> B-ORG, german -> B-MISC, british -> B-MISC
|
397 |
-
print(prediction)
|
398 |
|
399 |
|
400 |
# In[17]:
|
@@ -426,7 +426,7 @@ def calculate_metrics(_dataset):
|
|
426 |
evaluate(real_tags, predicted_tags)
|
427 |
|
428 |
|
429 |
-
calculate_metrics(val_dataset)
|
430 |
|
431 |
|
432 |
# In[18]:
|
@@ -450,7 +450,7 @@ def test_model_with_input(_ner_model, mapping):
|
|
450 |
print("Predicted tags:", predicted_tags)
|
451 |
|
452 |
# Test the model with user input
|
453 |
-
test_model_with_input(ner_model, mapping)
|
454 |
|
455 |
|
456 |
# In[20]:
|
@@ -693,7 +693,115 @@ class FlairRecognizer(EntityRecognizer):
|
|
693 |
anonymized_text = "*" * len(entity_text)
|
694 |
anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
|
695 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
696 |
# print anonymized sentence
|
697 |
print("Anonymized sentence:")
|
698 |
print(anonymized_sentence)
|
699 |
-
return anonymized_sentence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
.padded_batch(batch_size)
|
348 |
)
|
349 |
|
350 |
+
# ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
|
351 |
|
352 |
|
353 |
# In[15]:
|
|
|
367 |
return tf.reduce_sum(loss) / tf.reduce_sum(mask)
|
368 |
|
369 |
|
370 |
+
# loss = CustomNonPaddingTokenLoss()
|
371 |
|
372 |
|
373 |
# In[16]:
|
374 |
|
375 |
|
376 |
+
# ner_model.compile(optimizer="adam", loss=loss)
|
377 |
+
# ner_model.fit(train_dataset, epochs=10)
|
378 |
|
379 |
|
380 |
def tokenize_and_convert_to_ids(text):
|
|
|
383 |
|
384 |
|
385 |
# Sample inference using the trained model
|
386 |
+
# sample_input = tokenize_and_convert_to_ids(
|
387 |
+
# "eu rejects german call to boycott british lamb"
|
388 |
+
# )
|
389 |
+
# sample_input = tf.reshape(sample_input, shape=[1, -1])
|
390 |
+
# print(sample_input)
|
391 |
|
392 |
+
# output = ner_model.predict(sample_input)
|
393 |
+
# prediction = np.argmax(output, axis=-1)[0]
|
394 |
+
# prediction = [mapping[i] for i in prediction]
|
395 |
|
396 |
# eu -> B-ORG, german -> B-MISC, british -> B-MISC
|
397 |
+
# print(prediction)
|
398 |
|
399 |
|
400 |
# In[17]:
|
|
|
426 |
evaluate(real_tags, predicted_tags)
|
427 |
|
428 |
|
429 |
+
# calculate_metrics(val_dataset)
|
430 |
|
431 |
|
432 |
# In[18]:
|
|
|
450 |
print("Predicted tags:", predicted_tags)
|
451 |
|
452 |
# Test the model with user input
|
453 |
+
# test_model_with_input(ner_model, mapping)
|
454 |
|
455 |
|
456 |
# In[20]:
|
|
|
693 |
anonymized_text = "*" * len(entity_text)
|
694 |
anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)
|
695 |
|
696 |
+
# remove the part that includes named entity annotations
|
697 |
+
anonymized_sentence = anonymized_sentence.split("→")[0].strip()
|
698 |
+
anonymized_sentence = anonymized_sentence.split(":")[1].strip()
|
699 |
+
|
700 |
+
a = anonymize(input_text, "", anonymized_sentence)
|
701 |
+
print("a sentence:")
|
702 |
+
print(a)
|
703 |
+
|
704 |
# print anonymized sentence
|
705 |
print("Anonymized sentence:")
|
706 |
print(anonymized_sentence)
|
707 |
+
return anonymized_sentence
|
708 |
+
|
709 |
+
|
710 |
+
|
711 |
+
|
712 |
+
|
713 |
+
|
714 |
+
|
715 |
+
|
716 |
+
|
717 |
+
|
718 |
+
from presidio_anonymizer import AnonymizerEngine
|
719 |
+
from presidio_analyzer import AnalyzerEngine
|
720 |
+
from presidio_anonymizer.entities import (
|
721 |
+
OperatorConfig,
|
722 |
+
RecognizerResult,
|
723 |
+
EngineResult,
|
724 |
+
ConflictResolutionStrategy,
|
725 |
+
)
|
726 |
+
from typing import List, Dict, Optional, Type
|
727 |
+
|
728 |
+
|
729 |
+
class FlairRecognizer2():
    """De-identification helper built on Presidio's analyzer and anonymizer.

    Despite the name, nothing in this class uses Flair: entities are found
    with ``AnalyzerEngine`` and rewritten with ``AnonymizerEngine``.
    """

    @staticmethod
    def anonymize(
        text: str,
        operator: str,
        mask_char: Optional[str] = None,
        number_of_chars: Optional[int] = None,
        encrypt_key: Optional[str] = None,
    ):
        """Detect PII in ``text`` and anonymize it with the chosen operator.

        :param text: Full text
        :param operator: Operator name. ``"highlight"`` is translated to
            Presidio's ``"custom"`` operator and ``"synthesize"`` to
            ``"replace"``; any other value is passed to Presidio unchanged.
        :param mask_char: Mask char (for mask operator)
        :param number_of_chars: Number of characters to mask (for mask operator)
        :param encrypt_key: Encryption key (for encrypt operator)
        :return: The anonymized text (``result.text`` of the anonymizer).
        """
        # Build the parameters for the selected operator. Previously this
        # dict was computed and then ignored, so "mask", "encrypt" and
        # "highlight" never received the parameters they need.
        if operator == "mask":
            operator_config = {
                "type": "mask",
                "masking_char": mask_char,
                "chars_to_mask": number_of_chars,
                "from_end": False,
            }
        elif operator == "encrypt":
            operator_config = {"key": encrypt_key}
        elif operator == "highlight":
            # Identity lambda: the custom operator leaves the matched text as is.
            operator_config = {"lambda": lambda x: x}
        elif operator in ("replace", "synthesize"):
            # Keeps the replacement value this function has always used.
            # NOTE(review): "BIP" looks like a placeholder; omitting
            # "new_value" should make Presidio emit <ENTITY_TYPE> instead
            # -- confirm which output is wanted.
            operator_config = {"new_value": "BIP"}
        else:
            operator_config = None

        # Translate UI-level operator names to the names Presidio knows.
        operator_name = {
            "highlight": "custom",
            "synthesize": "replace",
        }.get(operator, operator)

        analyzer = AnalyzerEngine()
        results = analyzer.analyze(
            text=text,
            entities=["PHONE_NUMBER", "PERSON", "ID", "LOCATION"],
            language="en",
        )

        engine = AnonymizerEngine()
        # "DEFAULT" applies the same operator to every detected entity type.
        result = engine.anonymize(
            text=text,
            analyzer_results=results,
            operators={"DEFAULT": OperatorConfig(operator_name, operator_config)},
        )

        return result.text
|
803 |
+
|
804 |
+
|
805 |
+
|
806 |
+
|
807 |
+
|
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import streamlit as st
|
2 |
from Final_file import FlairRecognizer
|
|
|
3 |
import os
|
4 |
import PyPDF2
|
5 |
import docx
|
@@ -15,9 +16,13 @@ def cached_predict_ner_tags(text):
|
|
15 |
|
16 |
# Cache the text analysis function
|
17 |
@st.cache_resource
|
18 |
-
def cached_analyze_text(text):
|
19 |
return FlairRecognizer.analyze_text(text)
|
20 |
|
|
|
|
|
|
|
|
|
21 |
def download_masked_file(masked_text, file_extension):
|
22 |
|
23 |
# Create a temporary file to store the masked text
|
@@ -66,6 +71,21 @@ def main():
|
|
66 |
st.sidebar.header('Upload Options')
|
67 |
upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
# # Dropdown menu with four choices
|
70 |
# st.sidebar.header('Masking Options')
|
71 |
# choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
|
@@ -75,7 +95,7 @@ def main():
|
|
75 |
if st.button('Analyze'):
|
76 |
with st.spinner('Wait for it... the model is loading'):
|
77 |
cached_predict_ner_tags(input_text)
|
78 |
-
masked_text =
|
79 |
st.text_area("Masked text:", value=masked_text, height=200)
|
80 |
elif upload_option == 'File Upload':
|
81 |
uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
|
|
|
1 |
import streamlit as st
|
2 |
from Final_file import FlairRecognizer
|
3 |
+
from Final_file import FlairRecognizer2
|
4 |
import os
|
5 |
import PyPDF2
|
6 |
import docx
|
|
|
16 |
|
17 |
# Cache the text analysis function
|
18 |
@st.cache_resource
def cached_analyze_text(text, operator=None):
    """Return the Streamlit-cached result of ``FlairRecognizer.analyze_text``.

    :param text: Input text forwarded to ``FlairRecognizer.analyze_text``.
    :param operator: Not forwarded to the recognizer. It is optional so that
        calls passing only ``text`` (the previous signature) keep working.
    :return: Whatever ``FlairRecognizer.analyze_text(text)`` returns.
    """
    return FlairRecognizer.analyze_text(text)
|
21 |
|
22 |
+
@st.cache_resource
def cached_anonimize_text(text, operator):
    """Anonymize ``text`` with ``operator`` via ``FlairRecognizer2.anonymize``.

    The call is wrapped in ``st.cache_resource`` so Streamlit caches the result.
    """
    anonymized = FlairRecognizer2.anonymize(text, operator)
    return anonymized
|
25 |
+
|
26 |
def download_masked_file(masked_text, file_extension):
|
27 |
|
28 |
# Create a temporary file to store the masked text
|
|
|
71 |
st.sidebar.header('Upload Options')
|
72 |
upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))
|
73 |
|
74 |
+
st_operator = st.sidebar.selectbox(
|
75 |
+
"De-identification approach",
|
76 |
+
["redact", "replace", "hash"],
|
77 |
+
index=1,
|
78 |
+
help="""
|
79 |
+
Select which manipulation to the text is requested after PII has been identified.\n
|
80 |
+
- Redact: Completely remove the PII text\n
|
81 |
+
- Replace: Replace the PII text with a constant, e.g. <PERSON>\n
|
82 |
+
- Synthesize: Replace with fake values (requires an OpenAI key)\n
|
83 |
+
- Highlight: Shows the original text with PII highlighted in colors\n
|
84 |
+
- Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
|
85 |
+
- Hash: Replaces with the hash of the PII string\n
|
86 |
+
- Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
|
87 |
+
""",
|
88 |
+
)
|
89 |
# # Dropdown menu with four choices
|
90 |
# st.sidebar.header('Masking Options')
|
91 |
# choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
|
|
|
95 |
if st.button('Analyze'):
|
96 |
with st.spinner('Wait for it... the model is loading'):
|
97 |
cached_predict_ner_tags(input_text)
|
98 |
+
masked_text = cached_anonimize_text(input_text, st_operator)
|
99 |
st.text_area("Masked text:", value=masked_text, height=200)
|
100 |
elif upload_option == 'File Upload':
|
101 |
uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|