File size: 6,853 Bytes
8850a9d
 
da8438c
8850a9d
 
 
 
 
 
 
1e7dab8
8850a9d
 
 
 
 
 
 
 
da8438c
8850a9d
 
da8438c
 
 
 
1e7dab8
a73656c
 
1e7dab8
8850a9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da8438c
 
1e7dab8
da8438c
 
 
 
 
 
 
 
 
 
 
1e7dab8
a73656c
 
 
 
 
 
6313384
a73656c
1e7dab8
8850a9d
 
 
 
 
1e7dab8
a73656c
1e7dab8
8850a9d
 
 
 
 
 
 
 
 
 
1e7dab8
3c7fa1a
1e7dab8
8850a9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e7dab8
3c7fa1a
1e7dab8
8850a9d
 
 
 
 
 
 
 
1e7dab8
 
3c7fa1a
8850a9d
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import streamlit as st
from Final_file import FlairRecognizer
from Final_file import FlairRecognizer2
import os
import PyPDF2
import docx
# from io import BytesIO
from fpdf import FPDF
import io
from docx import Document
from PiiMaskingService import PiiMaskingService

# Memoise NER tag prediction through Streamlit's resource cache
@st.cache_resource
def cached_predict_ner_tags(text):
    """Return ``FlairRecognizer.predict_ner_tags(text)``.

    The call is wrapped in ``st.cache_resource``, so the value computed for
    a given ``text`` is reused on later calls with the same argument.
    """
    ner_tags = FlairRecognizer.predict_ner_tags(text)
    return ner_tags

# Cache the text analysis function
@st.cache_resource
def cached_analyze_text(text, operator=None):
    """Return ``FlairRecognizer.analyze_text(text)``, cached by Streamlit.

    Args:
        text: The text to analyse.
        operator: Accepted only for signature compatibility; it is NOT
            forwarded to ``FlairRecognizer.analyze_text``. It now defaults
            to ``None`` so the function can be called with the text alone.
            NOTE(review): Streamlit builds the cache key from the call
            arguments, so passing different operators presumably creates
            separate cache entries for the same text - confirm if this
            wrapper is put back into use.

    Returns:
        Whatever ``FlairRecognizer.analyze_text`` returns for ``text``.
    """
    return FlairRecognizer.analyze_text(text)

@st.cache_resource
def cached_anonimize_text(text, operator):
    """Return ``FlairRecognizer2.anonymize(text, operator)``.

    Wrapped in ``st.cache_resource`` so a repeated call with the same
    ``text`` and ``operator`` reuses the earlier result.
    """
    anonymized = FlairRecognizer2.anonymize(text, operator)
    return anonymized

# The cache key includes the full input text, so the key space is unbounded.
# Cap the number of retained entries so that memory use stays bounded and
# previously submitted texts are not kept in the process indefinitely.
@st.cache_resource(max_entries=64)
def anonymize(text, operator, model):
    """Mask PII in ``text`` via ``PiiMaskingService`` and cache the result.

    Args:
        text: The text to de-identify.
        operator: The de-identification approach selected in the sidebar
            (e.g. ``"replace"``); forwarded unchanged to the service.
        model: The NER model package name; forwarded unchanged.

    Returns:
        Whatever ``PiiMaskingService().anonymize(text, operator, model)``
        returns. Callers in this file display it as text.
    """
    return PiiMaskingService().anonymize(text, operator, model)

def download_masked_file(masked_text, file_extension):
    """Show a download button that serves ``masked_text`` as a file.

    The text is handed to ``st.download_button`` directly as its ``data``
    argument. Passing a file *path* string there would make the download
    contain the path itself rather than the masked text, and no temporary
    file on disk is needed at all.

    Args:
        masked_text: The masked text to offer for download.
        file_extension: Extension (without the dot) used to build the
            suggested file name ``masked_output.<file_extension>``.
    """
    st.download_button(
        "Download Masked File",
        masked_text,
        file_name=f"masked_output.{file_extension}",
    )

def extract_text_from_pdf(file_contents):
    """Return the concatenated text of every page of a PDF.

    Args:
        file_contents: The PDF source, passed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages joined together with no separator. If any
        exception is raised while reading, the string
        ``"Error occurred: <message>"`` is returned instead of raising.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file_contents)
        # Iterate the pages directly and join once instead of indexing by
        # position and growing a string with "+=". The "or ''" keeps a page
        # that yields no text from breaking the join.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    except Exception as e:
        # Deliberate best-effort: callers receive the message as a string.
        return f"Error occurred: {str(e)}"



def create_pdf(text_content):
    """Build a one-page-start FPDF document containing ``text_content``.

    The text is written with ``multi_cell`` in the DejaVuSans font (size 12),
    which is loaded from ``DejaVuSans.ttf``. The FPDF object is returned
    without being written anywhere; the caller decides how to output it.
    """
    font_family = "DejaVuSans"

    document = FPDF()
    document.add_page()
    # Register the TrueType font before selecting it
    document.add_font(font_family, "", "DejaVuSans.ttf", uni=True)
    document.set_font(font_family, size=12)
    document.multi_cell(0, 10, txt=text_content)
    return document

def create_word_file(text_content):
    """Return an in-memory .docx holding ``text_content`` as one paragraph.

    The document is saved into a ``BytesIO`` buffer, which is rewound to
    position 0 before being returned so it can be read from the start.
    """
    buffer = io.BytesIO()

    word_doc = Document()
    word_doc.add_paragraph(text_content)
    word_doc.save(buffer)

    # Rewind so the consumer reads the document from its first byte
    buffer.seek(0)
    return buffer

def _show_masked_text(text, operator, model):
    """Mask ``text``, display the result in a text area, and return it."""
    with st.spinner('Wait for it... the model is loading'):
        masked_text = anonymize(text, operator, model)
    st.text_area("Masked text:", value=masked_text, height=200)
    return masked_text


def _pdf_to_bytes(pdf):
    """Serialise an FPDF document to bytes through a private temporary file."""
    import tempfile  # local import so this block does not depend on file-level edits

    # A per-call temporary directory avoids a fixed file name in the shared
    # working directory, and the file handle is closed before returning.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "masked_output.pdf")
        pdf.output(pdf_path)
        with open(pdf_path, "rb") as pdf_file:
            return pdf_file.read()


def main():
    """Render the PII masking page.

    The sidebar selects the input mode (typed text or an uploaded
    txt/pdf/docx file), the de-identification operator and the NER model.
    When "Analyze" is pressed the input text is passed to ``anonymize`` and
    the masked result is displayed; for uploaded files a download button
    offers the masked text in the same format as the upload.
    """
    st.title('PII Masking App')
    st.sidebar.header('Upload Options')
    upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))

    # The help text describes only the operators that can actually be selected.
    st_operator = st.sidebar.selectbox(
        "De-identification approach",
        ["redact", "replace", "encrypt", "hash", "mask"],
        index=1,
        help="""
        Select which manipulation to the text is requested after PII has been identified.\n
        - Redact: Completely remove the PII text\n
        - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
        - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
        - Hash: Replaces with the hash of the PII string\n
        - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
            """,
    )

    st_model = st.sidebar.selectbox(
        "NER model package",
        [
            "flair/ner-english-large",
            "HuggingFace/obi/deid_roberta_i2b2",
        ],
        index=0,
    )

    if upload_option == 'Text Input':
        input_text = st.text_area("Enter text here:")
        if st.button('Analyze'):
            _show_masked_text(input_text, st_operator, st_model)
        return

    if upload_option != 'File Upload':
        return

    uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
    if uploaded_file is None:
        return

    file_contents = uploaded_file.read()

    # Parsing the upload is only needed once the user asks for an analysis.
    if not st.button('Analyze'):
        return

    # Process PDF file
    if uploaded_file.type == 'application/pdf':
        # Parse from a fresh buffer: the upload stream itself has already
        # been consumed by read() above.
        extracted_text = extract_text_from_pdf(io.BytesIO(file_contents))
        masked_text = _show_masked_text(extracted_text, st_operator, st_model)
        if extracted_text:
            st.download_button(
                label="Download",
                data=_pdf_to_bytes(create_pdf(masked_text)),
                file_name="masked_output.pdf",
                mime="application/pdf",
            )
        else:
            st.warning("Please enter some text to download as PDF.")

    # Process Word document
    elif uploaded_file.type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        doc = docx.Document(io.BytesIO(file_contents))
        # Keep a line break between paragraphs so the last word of one
        # paragraph is not fused with the first word of the next.
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        masked_text = _show_masked_text(text, st_operator, st_model)
        st.download_button(
            label="Download",
            data=create_word_file(masked_text),
            file_name="masked_text.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

    # Anything else is treated as plain text
    else:
        try:
            text = file_contents.decode()
        except UnicodeDecodeError:
            # Report undecodable uploads instead of crashing the page.
            st.error("The uploaded text file is not valid UTF-8 and could not be read.")
            return
        masked_text = _show_masked_text(text, st_operator, st_model)
        st.download_button(label="Download", data=masked_text, file_name="masked_text.txt")


# Render the page only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()