Commit 8850a9d — KarishmaShirsath committed "Upload 5 files"
Parent: b0ba6f1

Files changed:
- DejaVuSans.ttf (+0, -0)
- Final_file.py (+699, -0)
- app.py (+128, -0)
- conlleval.py (+235, -0)
- requirements.txt (+0, -0)
DejaVuSans.ttf
ADDED
Binary file (757 kB).
Final_file.py
ADDED
@@ -0,0 +1,699 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


# get_ipython().system('pip3 install datasets')
# get_ipython().system('wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py')

import requests

url = "https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py"
response = requests.get(url)

with open("conlleval.py", "wb") as f:
    f.write(response.content)

# In[36]:


# get_ipython().system('pip install presidio-analyzer')


# In[38]:


# get_ipython().system('pip install flair')


# In[19]:


import os

os.environ["KERAS_BACKEND"] = "tensorflow"
import streamlit as st
import keras
import numpy as np
import tensorflow as tf
from keras import layers
from datasets import load_dataset
from collections import Counter
from conlleval import evaluate

import pandas as pd
# from google.colab import files
import matplotlib.pyplot as plt

from transformers import AutoModel, AutoTokenizer

import logging
from typing import Optional, List, Tuple, Set
from presidio_analyzer import (
    RecognizerResult,
    EntityRecognizer,
    AnalysisExplanation,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

from flair.data import Sentence
from flair.models import SequenceTagger
import tempfile

# In[4]:


class TransformerBlock(layers.Layer):
    """One Transformer encoder block: self-attention and a feed-forward
    sublayer, each followed by dropout, a residual connection, and layer norm."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
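
# Illustrative sanity check (not part of the original pipeline): the attention
# and feed-forward sublayers preserve the (batch, sequence, embed_dim) shape.
_demo_block = TransformerBlock(embed_dim=32, num_heads=2, ff_dim=32)
assert _demo_block(tf.random.uniform((4, 128, 32))).shape == (4, 128, 32)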

# In[5]:


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

# In[6]:


class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x
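
# Illustrative end-to-end shape check: a batch of integer token-id sequences maps
# to one softmax distribution over the tag set per token (values are arbitrary).
_demo_model = NERModel(num_tags=10, vocab_size=20000)
assert _demo_model(tf.zeros((2, 16), dtype=tf.int32)).shape == (2, 16, 10)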

# In[7]:

@st.cache_data
def load_data(dataset):
    return load_dataset(dataset)

conll_data = load_data("conll2003")

# In[8]:


def dataset_to_dataframe(dataset):
    data_dict = {key: dataset[key] for key in dataset.features}
    return pd.DataFrame(data_dict)

# Combine all splits (train, validation, test) into a single DataFrame
conll_df = pd.concat([dataset_to_dataframe(conll_data[split]) for split in conll_data.keys()])

# In[7]:


csv_file_path = "conll_data.csv"
# conll_df.to_csv(csv_file_path, index=False)

# Download the CSV file to the local machine

# files.download(csv_file_path)


# *****************************My code********************

# Create a temporary file to save the CSV data


# Function to download the CSV file
@st.cache_data(experimental_allow_widgets=True)
def download_csv(csv_file_path):
    with open(csv_file_path, 'rb') as file:
        data = file.read()
    # Wrap the download button inside a div with style="display: none;"
    st.markdown("<div style='display: none;'>", unsafe_allow_html=True)
    st.download_button(label="Download CSV", data=data, file_name='data.csv', mime='text/csv')
    st.markdown("</div>", unsafe_allow_html=True)


# Create a temporary file to save the CSV data
temp_file = tempfile.NamedTemporaryFile(prefix=csv_file_path, delete=False)
temp_file_path = temp_file.name
conll_df.to_csv(temp_file_path, index=False)
temp_file.close()

# Trigger the download automatically when the app starts
download_csv(temp_file_path)
st.markdown("<div style='display: none;'>Hidden download button</div>", unsafe_allow_html=True)


# **************************MY code *********************************

# In[8]:


# print(conll_df.head())


# In[10]:


# print(conll_df.describe())


# In[11]:


# print(conll_df.dtypes)


# In[12]:


# print(conll_df.isnull().sum())


# In[13]:


label_counts = conll_df['ner_tags'].value_counts()
print(label_counts)


# In[14]:


top_10_labels = label_counts.head(10)

# Plot the distribution of the top 10 NER tags
# plt.figure(figsize=(10, 6))
# top_10_labels.plot(kind='bar')
# plt.title('Top 10 Most Common NER Tags')
# plt.xlabel('NER Tag')
# plt.ylabel('Count')
# plt.show()

# In[9]:

@st.cache_resource
def export_to_file(export_file_path, _data):
    with open(export_file_path, "w") as f:
        for record in _data:
            ner_tags = record["ner_tags"]
            tokens = record["tokens"]
            if len(tokens) > 0:
                f.write(
                    str(len(tokens))
                    + "\t"
                    + "\t".join(tokens)
                    + "\t"
                    + "\t".join(map(str, ner_tags))
                    + "\n"
                )


os.makedirs("data", exist_ok=True)
export_to_file("./data/conll_train.txt", conll_data["train"])
export_to_file("./data/conll_val.txt", conll_data["validation"])
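
# Each exported line has the layout: <num_tokens> TAB <tokens...> TAB <raw tag ids...>.
# An illustrative three-token record (raw conll2003 ids, before the +1 shift applied later):
# 3	eu	rejects	german	3	0	7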

# In[10]:


def make_tag_lookup_table():
    iob_labels = ["B", "I"]
    ner_labels = ["PER", "ORG", "LOC", "MISC"]
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    all_labels = ["[PAD]", "O"] + all_labels
    return dict(zip(range(len(all_labels)), all_labels))


mapping = make_tag_lookup_table()
# {0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG',
#  6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}
print(mapping)

# In[11]:


all_tokens = sum(conll_data["train"]["tokens"], [])
all_tokens_array = np.array(list(map(str.lower, all_tokens)))

counter = Counter(all_tokens_array)
# print(len(counter))

num_tags = len(mapping)
vocab_size = 20000

# We only take the (vocab_size - 2) most common words from the training data, since
# the `StringLookup` class reserves 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token.
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLookup layer will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)
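
# Quick illustration of the lookup behaviour: in-vocabulary tokens map to their
# integer ids, and unseen tokens map to the reserved OOV index (exact ids depend
# on corpus frequencies, so the printed numbers will vary).
print(lookup_layer(tf.constant(["eu", "rejects", "zzz-unseen-token"])).numpy())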

# In[12]:


train_data = tf.data.TextLineDataset("./data/conll_train.txt")
val_data = tf.data.TextLineDataset("./data/conll_val.txt")


# In[13]:


print(list(train_data.take(1).as_numpy_iterator()))

# In[14]:


def map_record_to_training_data(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1  # shift raw ids by one so that 0 stays free for padding ([PAD])
    return tokens, tags


def lowercase_and_convert_to_ids(tokens):
    tokens = tf.strings.lower(tokens)
    return lookup_layer(tokens)


# We use `padded_batch` here because each record in the dataset has a
# different length.
batch_size = 32
train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)
val_dataset = (
    val_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
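
# Illustrative round trip of one exported record (same sample line as above):
# tokens -> [b'eu', b'rejects', b'german'], tags -> [4, 1, 8] after the +1 shift,
# which correspond to B-ORG, O, B-MISC in `mapping`.
print(map_record_to_training_data(tf.constant("3\teu\trejects\tgerman\t3\t0\t7")))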

# In[15]:


class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction='none'
        )
        loss = loss_fn(y_true, y_pred)
        # Zero out positions whose true label is 0 ([PAD]) and average over real tokens.
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)


loss = CustomNonPaddingTokenLoss()
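
# Illustrative check that padded positions contribute nothing: with uniform
# predictions over 10 tags, the masked mean equals -log(0.1) no matter how
# many trailing [PAD] labels the batch contains.
_y_true = tf.constant([[2, 5, 0, 0]])      # last two positions are [PAD]
_y_pred = tf.fill((1, 4, 10), 0.1)         # each row is a valid uniform distribution
print(loss(_y_true, _y_pred))              # ~2.3026, averaged over the 2 real tokens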

# In[16]:


ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(train_dataset, epochs=10)


def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)


# Sample inference using the trained model
sample_input = tokenize_and_convert_to_ids(
    "eu rejects german call to boycott british lamb"
)
sample_input = tf.reshape(sample_input, shape=[1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

# eu -> B-ORG, german -> B-MISC, british -> B-MISC
print(prediction)

# In[17]:

@st.cache_data
def calculate_metrics(_dataset):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in _dataset:
        output = ner_model.predict(x, verbose=0)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)


calculate_metrics(val_dataset)

# In[18]:

@st.cache_resource
def test_model_with_input(_ner_model, mapping):
    # Get input sentence from user
    input_sentence = "My name is Karishma Shirsath. I live in Toronto Canada."

    # Tokenize and convert input sentence to IDs
    sample_input = tokenize_and_convert_to_ids(input_sentence)
    sample_input = tf.reshape(sample_input, shape=[1, -1])

    # Predict tags using the trained model
    output = _ner_model.predict(sample_input)
    predictions = np.argmax(output, axis=-1)[0]
    predicted_tags = [mapping[i] for i in predictions]

    # Print the predicted tags for each token in the input sentence
    print("Input sentence:", input_sentence)
    print("Predicted tags:", predicted_tags)

# Test the model with user input
test_model_with_input(ner_model, mapping)

# In[20]:


logger = logging.getLogger("presidio-analyzer")


class FlairRecognizer(EntityRecognizer):
    """
    Wrapper for a flair model, if needed to be used within Presidio Analyzer.

    :example:
    >>> from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
    >>> flair_recognizer = FlairRecognizer()
    >>> registry = RecognizerRegistry()
    >>> registry.add_recognizer(flair_recognizer)
    >>> analyzer = AnalyzerEngine(registry=registry)
    >>> results = analyzer.analyze(
    ...     "My name is Christopher and I live in Irbid.",
    ...     language="en",
    ...     return_decision_process=True,
    ... )
    >>> for result in results:
    ...     print(result)
    ...     print(result.analysis_explanation)
    """

    ENTITIES = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        # "MISCELLANEOUS"  # - There is no direct correlation with Presidio entities.
    ]

    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG"}),
        # ({"MISCELLANEOUS"}, {"MISC"}),  # Probably not PII
    ]

    MODEL_LANGUAGES = {"en": "flair/ner-english-large"}

    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        # 'MISC': 'MISCELLANEOUS'  # - Probably not PII
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[Tuple[Set, Set]] = None,
        model: SequenceTagger = None,
        model_path: Optional[str] = None,
    ):
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )

        supported_entities = supported_entities if supported_entities else self.ENTITIES

        if model and model_path:
            raise ValueError("Only one of model or model_path should be provided.")
        elif model and not model_path:
            self.model = model
        elif not model and model_path:
            print(f"Loading model from {model_path}")
            self.model = SequenceTagger.load(model_path)
        else:
            print(f"Loading model for language {supported_language}")
            self.model = SequenceTagger.load(
                self.MODEL_LANGUAGES.get(supported_language)
            )

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.

        :return: List of the supported entities.
        """
        return self.supported_entities

    # Class to use Flair with Presidio as an external recognizer.
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Analyze text using Text Analytics.

        :param text: The text for analysis.
        :param entities: Not working properly for this recognizer.
        :param nlp_artifacts: Not used by this recognizer.
        :param language: Text language. Supported languages in MODEL_LANGUAGES
        :return: The list of Presidio RecognizerResult constructed from the recognized
            Flair detections.
        """

        results = []

        sentences = Sentence(text)
        self.model.predict(sentences)

        # If no specific list of entities is requested, look for all supported ones.
        if not entities:
            entities = self.supported_entities

        for entity in entities:
            if entity not in self.supported_entities:
                continue

            for ent in sentences.get_spans("ner"):
                if not self.__check_label(
                    entity, ent.labels[0].value, self.check_label_groups
                ):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(
                    ent.labels[0].value
                )
                explanation = self.build_flair_explanation(
                    round(ent.score, 2), textual_explanation
                )
                flair_result = self._convert_to_recognizer_result(ent, explanation)

                results.append(flair_result)

        return results

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
        flair_score = round(entity.score, 2)

        flair_results = RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

        return flair_results

    def build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return:
        """
        explanation = AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        return any(
            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
        )

# In[21]:


# # Use Flair NER for identifying PII
# sentence = Sentence(input_text)
# tagger.predict(sentence)
# entities = sentence.to_dict(tag_type='ner')['entities']

# # Mask PII using Presidio analyzer
# masked_text = analyzer.analyze(input_text, entities=entities)

def predict_ner_tags(input_text):
    # Load the tagger
    tagger = SequenceTagger.load("flair/ner-english-large")

    # Make a sentence from the input text
    # sentence = Sentence("My name is Karishma Shirsath. I live in Toronto Canada.")
    sentence = Sentence(input_text)

    # Predict NER tags
    tagger.predict(sentence)

    # Print the sentence
    print(sentence)

    # Print predicted NER spans
    print("The following NER tags are found:")
    # Iterate over entities and print them
    for entity in sentence.get_spans("ner"):
        print(entity)

# In[33]:


def analyze_text(input_text):
    # Load the tagger
    tagger = SequenceTagger.load("flair/ner-english-large")

    # Make a sentence from the input text
    sentence = Sentence(input_text)

    # Predict NER tags
    tagger.predict(sentence)

    # Print the sentence
    print(sentence)

    # Anonymize identified named entities by replacing each entity's surface
    # text with asterisks of the same length.
    anonymized_sentence = str(sentence)
    for entity in sentence.get_spans("ner"):
        entity_text = entity.text
        anonymized_text = "*" * len(entity_text)
        anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)

    # Print the anonymized sentence
    print("Anonymized sentence:")
    print(anonymized_sentence)
    return anonymized_sentence
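
# Note that global str.replace can over-mask repeated substrings elsewhere in the
# text. A sketch of an offset-based alternative, using the same start_position /
# end_position span attributes relied on above (illustrative, not wired into the app):
def mask_by_offsets(text, spans):
    out, cursor = [], 0
    for span in sorted(spans, key=lambda s: s.start_position):
        out.append(text[cursor:span.start_position])                  # keep text before the span
        out.append("*" * (span.end_position - span.start_position))   # mask the span itself
        cursor = span.end_position
    out.append(text[cursor:])                                         # keep the tail
    return "".join(out)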
app.py
ADDED
@@ -0,0 +1,128 @@
import streamlit as st
from Final_file import FlairRecognizer, predict_ner_tags, analyze_text
import os
import PyPDF2
import docx
# from io import BytesIO
from fpdf import FPDF
import io
from docx import Document

# Cache the model loading and prediction function
@st.cache_resource
def cached_predict_ner_tags(text):
    return predict_ner_tags(text)

# Cache the text analysis function
@st.cache_resource
def cached_analyze_text(text):
    return analyze_text(text)

def download_masked_file(masked_text, file_extension):
    # Create a temporary file to store the masked text
    temp_file_path = f"masked_output.{file_extension}"
    with open(temp_file_path, "w") as temp_file:
        temp_file.write(masked_text)

    # Display a download button (pass the file contents, not the path, as the payload)
    with open(temp_file_path, "rb") as f:
        st.download_button("Download Masked File", f.read(), file_name=f"masked_output.{file_extension}")

    # Clean up the temporary file
    os.remove(temp_file_path)

def extract_text_from_pdf(file_contents):
    try:
        # base64_pdf = base64.b64encode(file_contents.read()).decode('utf-8')
        pdf_reader = PyPDF2.PdfReader(file_contents)
        text = ''
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
        return text
    except Exception as e:
        return f"Error occurred: {str(e)}"

def create_pdf(text_content):
    pdf = FPDF()
    pdf.add_page()
    pdf.add_font("DejaVuSans", "", "DejaVuSans.ttf", uni=True)  # Add DejaVuSans font
    pdf.set_font("DejaVuSans", size=12)
    pdf.multi_cell(0, 10, txt=text_content)
    return pdf

def create_word_file(text_content):
    doc = Document()
    doc.add_paragraph(text_content)
    # Save the document to a BytesIO object
    doc_io = io.BytesIO()
    doc.save(doc_io)
    doc_io.seek(0)
    return doc_io
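
# The PDF download in main() writes to a file on disk first; it could also be kept
# entirely in memory. A sketch assuming the fpdf2 API, where output() without a
# destination returns the rendered document as a bytearray:
# pdf_bytes = bytes(create_pdf(masked_text).output())
# st.download_button("Download", data=pdf_bytes, file_name="masked_output.pdf", mime="application/pdf")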

def main():
    st.title('PII Masking App')
    st.sidebar.header('Upload Options')
    upload_option = st.sidebar.radio("Choose upload option:", ('Text Input', 'File Upload'))

    # # Dropdown menu with four choices
    # st.sidebar.header('Masking Options')
    # choice = st.sidebar.selectbox('Choose your masking option:', ['Option 1', 'Option 2', 'Option 3', 'Option 4'])
    masked_text_public = ''
    if upload_option == 'Text Input':
        input_text = st.text_area("Enter text here:")
        if st.button('Analyze'):
            with st.spinner('Wait for it... the model is loading'):
                cached_predict_ner_tags(input_text)
                masked_text = cached_analyze_text(input_text)
                st.text_area("Masked text:", value=masked_text, height=200)
    elif upload_option == 'File Upload':
        uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
        if uploaded_file is not None:
            file_contents = uploaded_file.read()
            # Process a PDF file
            if uploaded_file.type == 'application/pdf':
                extracted_text = extract_text_from_pdf(uploaded_file)
                if st.button('Analyze'):
                    with st.spinner('Wait for it... the model is loading'):
                        cached_predict_ner_tags(extracted_text)
                        masked_text = cached_analyze_text(extracted_text)
                        st.text_area("Masked text:", value=masked_text, height=200)  # Display the masked text
                        if extracted_text:
                            pdf = create_pdf(masked_text)
                            # Save the PDF to a temporary location
                            pdf_file_path = "masked_output.pdf"
                            pdf.output(pdf_file_path)

                            # Download button
                            st.download_button(label="Download", data=open(pdf_file_path, "rb"), file_name="masked_output.pdf", mime="application/pdf")
                        else:
                            st.warning("Please enter some text to download as PDF.")

            # Process a Word document
            elif uploaded_file.type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                doc = docx.Document(io.BytesIO(file_contents))
                text = ''
                for paragraph in doc.paragraphs:
                    text += paragraph.text
                if st.button('Analyze'):
                    with st.spinner('Wait for it... the model is loading'):
                        cached_predict_ner_tags(text)
                        masked_text = cached_analyze_text(text)
                        st.text_area("Masked text:", value=masked_text, height=200)
                        # Create a Word file from the masked text
                        doc_io = create_word_file(masked_text)
                        # Offer it for download
                        st.download_button(label="Download", data=doc_io, file_name="masked_text.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")

            # Process a plain-text file
            else:
                if st.button('Analyze'):
                    with st.spinner('Wait for it... the model is loading'):
                        cached_predict_ner_tags(file_contents.decode())
                        masked_text = cached_analyze_text(file_contents.decode())
                        st.text_area("Masked text:", value=masked_text, height=200)
                        st.download_button(label="Download", data=masked_text, file_name="masked_text.txt")


if __name__ == "__main__":
    main()
conlleval.py
ADDED
@@ -0,0 +1,235 @@
"""
This script applies to IOB2 or IOBES tagging scheme.
If you are using a different scheme, please convert to IOB2 or IOBES.

IOB2:
- B = begin,
- I = inside but not the first,
- O = outside

e.g.
John   lives in New   York  City  .
B-PER  O     O  B-LOC I-LOC I-LOC O

IOBES:
- B = begin,
- E = end,
- S = singleton,
- I = inside but not the first or the last,
- O = outside

e.g.
John   lives in New   York  City  .
S-PER  O     O  B-LOC I-LOC E-LOC O

prefix: IOBES
chunk_type: PER, LOC, etc.
"""
from __future__ import division, print_function, unicode_literals

import sys
from collections import defaultdict

def split_tag(chunk_tag):
    """
    split chunk tag into IOBES prefix and chunk_type
    e.g.
    B-PER -> (B, PER)
    O -> (O, None)
    """
    if chunk_tag == 'O':
        return ('O', None)
    return chunk_tag.split('-', maxsplit=1)

def is_chunk_end(prev_tag, tag):
    """
    check if the previous chunk ended between the previous and current word
    e.g.
    (B-PER, I-PER) -> False
    (B-LOC, O)     -> True

    Note: in case of contradicting tags, e.g. (B-PER, I-LOC)
    this is considered as (B-PER, B-LOC)
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix1 == 'O':
        return False
    if prefix2 == 'O':
        return prefix1 != 'O'

    if chunk_type1 != chunk_type2:
        return True

    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']

def is_chunk_start(prev_tag, tag):
    """
    check if a new chunk started between the previous and current word
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix2 == 'O':
        return False
    if prefix1 == 'O':
        return prefix2 != 'O'

    if chunk_type1 != chunk_type2:
        return True

    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']


def calc_metrics(tp, p, t, percent=True):
    """
    compute overall precision, recall and FB1 (default values are 0.0)
    if percent is True, return 100 * original decimal value
    """
    precision = tp / p if p else 0
    recall = tp / t if t else 0
    fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    if percent:
        return 100 * precision, 100 * recall, 100 * fb1
    else:
        return precision, recall, fb1
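
# Worked example: 1 correctly identified chunk out of 2 predicted and 2 gold
# chunks gives 50% precision, 50% recall, and FB1 = 50.0:
# >>> calc_metrics(1, 2, 2)
# (50.0, 50.0, 50.0)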

def count_chunks(true_seqs, pred_seqs):
    """
    true_seqs: a list of true tags
    pred_seqs: a list of predicted tags

    return:
    correct_chunks: a dict (counter),
                    key = chunk types,
                    value = number of correctly identified chunks per type
    true_chunks:    a dict, number of true chunks per type
    pred_chunks:    a dict, number of identified chunks per type

    correct_counts, true_counts, pred_counts: similar to above, but for tags
    """
    correct_chunks = defaultdict(int)
    true_chunks = defaultdict(int)
    pred_chunks = defaultdict(int)

    correct_counts = defaultdict(int)
    true_counts = defaultdict(int)
    pred_counts = defaultdict(int)

    prev_true_tag, prev_pred_tag = 'O', 'O'
    correct_chunk = None

    for true_tag, pred_tag in zip(true_seqs, pred_seqs):
        if true_tag == pred_tag:
            correct_counts[true_tag] += 1
        true_counts[true_tag] += 1
        pred_counts[pred_tag] += 1

        _, true_type = split_tag(true_tag)
        _, pred_type = split_tag(pred_tag)

        if correct_chunk is not None:
            true_end = is_chunk_end(prev_true_tag, true_tag)
            pred_end = is_chunk_end(prev_pred_tag, pred_tag)

            if pred_end and true_end:
                correct_chunks[correct_chunk] += 1
                correct_chunk = None
            elif pred_end != true_end or true_type != pred_type:
                correct_chunk = None

        true_start = is_chunk_start(prev_true_tag, true_tag)
        pred_start = is_chunk_start(prev_pred_tag, pred_tag)

        if true_start and pred_start and true_type == pred_type:
            correct_chunk = true_type
        if true_start:
            true_chunks[true_type] += 1
        if pred_start:
            pred_chunks[pred_type] += 1

        prev_true_tag, prev_pred_tag = true_tag, pred_tag
    if correct_chunk is not None:
        correct_chunks[correct_chunk] += 1

    return (correct_chunks, true_chunks, pred_chunks,
        correct_counts, true_counts, pred_counts)

def get_result(correct_chunks, true_chunks, pred_chunks,
    correct_counts, true_counts, pred_counts, verbose=True):
    """
    if verbose, print overall performance, as well as performance per chunk type;
    otherwise, simply return overall prec, rec, f1 scores
    """
    # sum counts
    sum_correct_chunks = sum(correct_chunks.values())
    sum_true_chunks = sum(true_chunks.values())
    sum_pred_chunks = sum(pred_chunks.values())

    sum_correct_counts = sum(correct_counts.values())
    sum_true_counts = sum(true_counts.values())

    nonO_correct_counts = sum(v for k, v in correct_counts.items() if k != 'O')
    nonO_true_counts = sum(v for k, v in true_counts.items() if k != 'O')

    chunk_types = sorted(list(set(list(true_chunks) + list(pred_chunks))))

    # compute overall precision, recall and FB1 (default values are 0.0)
    prec, rec, f1 = calc_metrics(sum_correct_chunks, sum_pred_chunks, sum_true_chunks)
    res = (prec, rec, f1)
    if not verbose:
        return res

    # print overall performance, and performance per chunk type

    print("processed %i tokens with %i phrases; " % (sum_true_counts, sum_true_chunks), end='')
    print("found: %i phrases; correct: %i.\n" % (sum_pred_chunks, sum_correct_chunks), end='')

    print("accuracy: %6.2f%%; (non-O)" % (100*nonO_correct_counts/nonO_true_counts))
    print("accuracy: %6.2f%%; " % (100*sum_correct_counts/sum_true_counts), end='')
    print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" % (prec, rec, f1))

    # for each chunk type, compute precision, recall and FB1 (default values are 0.0)
    for t in chunk_types:
        prec, rec, f1 = calc_metrics(correct_chunks[t], pred_chunks[t], true_chunks[t])
        print("%17s: " % t, end='')
        print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
                    (prec, rec, f1), end='')
        print("  %d" % pred_chunks[t])

    return res
    # you can generate LaTeX output for tables like in
    # http://cnts.uia.ac.be/conll2003/ner/example.tex
    # but I'm not implementing this

def evaluate(true_seqs, pred_seqs, verbose=True):
    (correct_chunks, true_chunks, pred_chunks,
        correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)
    result = get_result(correct_chunks, true_chunks, pred_chunks,
        correct_counts, true_counts, pred_counts, verbose=verbose)
    return result
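
# Minimal usage sketch: the PER chunk matches exactly but the LOC chunk does not,
# so chunk-level precision and recall both come out at 50% here.
# >>> evaluate(['B-PER', 'O', 'O', 'B-LOC', 'I-LOC'],
# ...          ['B-PER', 'O', 'O', 'B-LOC', 'O'])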

def evaluate_conll_file(fileIterator):
    true_seqs, pred_seqs = [], []

    for line in fileIterator:
        cols = line.strip().split()
        # each non-empty line must contain >= 3 columns
        if not cols:
            true_seqs.append('O')
            pred_seqs.append('O')
        elif len(cols) < 3:
            raise IOError("conlleval: too few columns in line %s\n" % line)
        else:
            # extract tags from last 2 columns
            true_seqs.append(cols[-2])
            pred_seqs.append(cols[-1])
    return evaluate(true_seqs, pred_seqs)

if __name__ == '__main__':
    """
    usage: conlleval < file
    """
    evaluate_conll_file(sys.stdin)
requirements.txt
ADDED
Binary file (5.85 kB).