File size: 5,392 Bytes
8850a9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
from Final_file import FlairRecognizer
import os
import PyPDF2
import docx
# from io import BytesIO
from fpdf import FPDF
import io
from docx import Document

# Cache the model loading and prediction function
@st.cache_resource
def cached_predict_ner_tags(text):
    """Call ``FlairRecognizer.predict_ner_tags`` through Streamlit's resource cache.

    Args:
        text: The text handed to the recognizer.

    Returns:
        Whatever ``FlairRecognizer.predict_ner_tags`` returns for *text*.
    """
    ner_tags = FlairRecognizer.predict_ner_tags(text)
    return ner_tags

# Cache the text analysis function
@st.cache_resource
def cached_analyze_text(text):
    """Call ``FlairRecognizer.analyze_text`` through Streamlit's resource cache.

    Args:
        text: The text handed to the recognizer.

    Returns:
        Whatever ``FlairRecognizer.analyze_text`` returns for *text*; callers
        in this file display it as the masked text.
    """
    analysis_result = FlairRecognizer.analyze_text(text)
    return analysis_result

def download_masked_file(masked_text, file_extension):
    """Render a Streamlit download button that serves the masked text.

    Args:
        masked_text: The text offered for download.
        file_extension: Extension (without the leading dot) used to build the
            suggested file name ``masked_output.<file_extension>``.
    """
    # Hand the text itself to ``data``. A string passed as ``data`` is served
    # as the download's contents, so passing a file path would make the user
    # download the path string instead of the masked text. Serving the text
    # directly also avoids writing it to a fixed file in the working directory.
    st.download_button(
        "Download Masked File",
        data=masked_text,
        file_name=f"masked_output.{file_extension}",
    )

def extract_text_from_pdf(file_contents):
    """Extract the text of every page of a PDF.

    Args:
        file_contents: The PDF source handed to ``PyPDF2.PdfReader``
            (for example a binary file-like object).

    Returns:
        The text of all pages concatenated in page order. If anything goes
        wrong, a string of the form ``"Error occurred: <message>"`` is
        returned instead of raising.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file_contents)
        # Iterate the pages directly and join once instead of indexing by
        # position and growing the string with repeated concatenation.
        return ''.join(page.extract_text() for page in pdf_reader.pages)
    except Exception as e:
        # NOTE: failures are reported in-band as text, so a caller cannot tell
        # an error message from extracted content without inspecting the
        # "Error occurred: " prefix. Kept for backward compatibility.
        return f"Error occurred: {str(e)}"



def create_pdf(text_content):
    """Build an ``FPDF`` document containing *text_content*.

    Args:
        text_content: The text written into the document.

    Returns:
        The ``FPDF`` object, not yet written to disk; the caller is
        responsible for calling ``output`` on it.
    """
    font_family = "DejaVuSans"

    document = FPDF()
    document.add_page()
    # The font must be registered from its .ttf file before it can be selected.
    document.add_font(font_family, "", "DejaVuSans.ttf", uni=True)
    document.set_font(font_family, size=12)
    document.multi_cell(0, 10, txt=text_content)
    return document

def create_word_file(text_content):
    """Build a Word document holding *text_content* and return it in memory.

    Args:
        text_content: The text added to the document as a single paragraph.

    Returns:
        An ``io.BytesIO`` buffer containing the saved document, positioned at
        offset 0 so it can be read from the start.
    """
    buffer = io.BytesIO()

    word_document = Document()
    word_document.add_paragraph(text_content)
    word_document.save(buffer)

    # Rewind so the consumer reads the document from its first byte.
    buffer.seek(0)
    return buffer

def _mask_text(text):
    """Run the cached analysis on *text*, display the masked result, and return it."""
    with st.spinner('Wait for it... the model is loading'):
        cached_predict_ner_tags(text)
        masked_text = cached_analyze_text(text)
    st.text_area("Masked text:", value=masked_text, height=200)
    return masked_text


def _pdf_to_bytes(pdf):
    """Serialize an FPDF document to bytes through a private temporary directory."""
    import tempfile

    # A per-call temporary directory keeps concurrent sessions from overwriting
    # each other's output and removes the masked file from disk afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "masked_output.pdf")
        pdf.output(pdf_path)
        with open(pdf_path, "rb") as pdf_file:
            return pdf_file.read()


def _handle_uploaded_file(uploaded_file):
    """Mask the contents of an uploaded txt/pdf/docx file and offer it for download."""
    file_contents = uploaded_file.read()

    # Parsing is deferred until the user asks for it, so an upload alone does
    # not trigger PDF/DOCX extraction on every rerun of the script.
    if not st.button('Analyze'):
        return

    if uploaded_file.type == 'application/pdf':
        # ``uploaded_file`` was already consumed by read() above; parse the
        # bytes from a fresh in-memory buffer instead.
        extracted_text = extract_text_from_pdf(io.BytesIO(file_contents))
        masked_text = _mask_text(extracted_text)
        if extracted_text:
            st.download_button(
                label="Download",
                data=_pdf_to_bytes(create_pdf(masked_text)),
                file_name="masked_output.pdf",
                mime="application/pdf",
            )
        else:
            st.warning("Please enter some text to download as PDF.")

    elif uploaded_file.type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        doc = docx.Document(io.BytesIO(file_contents))
        # Keep a line break between paragraphs so the last word of one
        # paragraph is not fused with the first word of the next.
        text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        masked_text = _mask_text(text)
        st.download_button(
            label="Download",
            data=create_word_file(masked_text),
            file_name="masked_text.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

    else:
        # Any other accepted upload is treated as plain text.
        try:
            text = file_contents.decode()
        except UnicodeDecodeError:
            st.error("Could not decode the uploaded file as UTF-8 text.")
            return
        masked_text = _mask_text(text)
        st.download_button(label="Download", data=masked_text, file_name="masked_text.txt")


def main():
    """Render the PII masking app.

    The sidebar offers two input modes. In 'Text Input' mode the text typed
    into the text area is analyzed when the 'Analyze' button is pressed and
    the masked result is displayed. In 'File Upload' mode a txt, pdf, or docx
    file is read, its text is analyzed on 'Analyze', the masked result is
    displayed, and a download button offers it in the same format as the
    upload.
    """
    st.title('PII Masking App')
    st.sidebar.header('Upload Options')
    upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))

    if upload_option == 'Text Input':
        input_text = st.text_area("Enter text here:")
        if st.button('Analyze'):
            _mask_text(input_text)
    elif upload_option == 'File Upload':
        uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
        if uploaded_file is not None:
            _handle_uploaded_file(uploaded_file)


# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()