miesnerjacob committed
Commit • 4b75840
1 Parent(s): ef5720a
Add application files

Files changed:
- .DS_Store +0 -0
- emotion_detection.py +40 -0
- keyword_extraction.py +84 -0
- named_entity_recognition.py +34 -0
- part_of_speech_tagging.py +14 -0
- requirements.txt +15 -0
- sentiment_analysis.py +50 -0
- streamlit_app.py +249 -0
- text_annotation.py +51 -0
- text_annotation_utils.py +127 -0
.DS_Store
ADDED
Binary file (6.15 kB)
emotion_detection.py
ADDED
@@ -0,0 +1,40 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer
import torch
import pandas as pd


class EmotionDetection:
    """Emotion detection classifier and explainer."""

    def __init__(self, chunksize=512):  # chunksize is accepted but not currently used
        hub_location = 'cardiffnlp/twitter-roberta-base-emotion'
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)
        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)

    def justify(self, text):
        """Generate an HTML visualization of word attributions for the prediction."""
        # Computing attributions is required before calling visualize().
        word_attributions = self.explainer(text)
        html = self.explainer.visualize("example.html")

        return html

    def classify(self, text):
        """Return the predicted probability of each emotion label as a pandas Series."""
        tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
        outputs = self.model(**tokens)
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs = probs.mean(dim=0).detach().numpy()
        labels = list(self.model.config.id2label.values())
        preds = pd.Series(probs, index=labels, name='Predicted Probability')

        return preds

    def run(self, text):
        """Classify the text and build the prediction justification HTML."""
        preds = self.classify(text)
        html = self.justify(text)

        return preds, html
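A minimal usage sketch for this class, assuming the transformers, transformers-interpret, torch, and pandas dependencies are installed (the sample sentence is illustrative only):

from emotion_detection import EmotionDetection

detector = EmotionDetection()
# Illustrative input; any string works.
preds, html = detector.run("I am thrilled with how this project turned out!")
print(preds)  # pandas Series of per-emotion probabilities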
keyword_extraction.py
ADDED
@@ -0,0 +1,84 @@
import spacy
import pytextrank  # registers the "textrank" spaCy pipeline component
import re
from operator import itemgetter


class KeywordExtractor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.add_pipe("textrank")

    def get_keywords(self, text, max_keywords):
        doc = self.nlp(text)

        kws = [i.text for i in doc._.phrases[:max_keywords]]

        return kws

    def get_keyword_indices(self, string_list, text):
        out = []
        for s in string_list:
            indices = [[m.start(), m.end()] for m in re.finditer(re.escape(s), text)]
            out.extend(indices)

        return out

    def merge_overlapping_indices(self, indices):
        # Sort the array on the basis of start values of intervals.
        indices.sort()
        stack = []
        # Insert the first interval into the stack.
        stack.append(indices[0])
        for i in indices[1:]:
            # Merge the current interval into the top of the stack if they
            # overlap or are adjacent; otherwise push it as a new interval.
            if (stack[-1][0] <= i[0] <= stack[-1][-1]) or (stack[-1][-1] == i[0] - 1):
                stack[-1][-1] = max(stack[-1][-1], i[-1])
            else:
                stack.append(i)
        return stack

    def merge_until_finished(self, indices):
        # Repeat the merge pass until the number of intervals stops shrinking.
        len_indices = 0
        while True:
            merged = self.merge_overlapping_indices(indices)
            if len_indices == len(merged):
                out_indices = sorted(merged, key=itemgetter(0))
                return out_indices
            else:
                len_indices = len(merged)

    def get_annotation(self, text, indices, kws):
        # Wrap each keyword span in <kw> markers, then split the text on the
        # markers and convert the marked spans into annotation tuples.
        arr = list(text)
        for idx in sorted(indices, reverse=True):
            arr.insert(idx[0], "<kw>")
            arr.insert(idx[1] + 1, "XXXxxxXXXxxxXXX <kw>")
        annotation = ''.join(arr)
        split = annotation.split('<kw>')
        final_annotation = [(x.replace('XXXxxxXXXxxxXXX ', ''), "KEY", "#26aaef") if "XXXxxxXXXxxxXXX" in x else x for x in split]

        return final_annotation

    def generate(self, text, max_keywords):
        kws = self.get_keywords(text, max_keywords)

        indices = list(self.get_keyword_indices(kws, text))
        if indices:
            indices_merged = self.merge_until_finished(indices)
            annotation = self.get_annotation(text, indices_merged, kws)
        else:
            annotation = None

        return annotation, kws
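A minimal usage sketch, assuming spaCy's en_core_web_sm model has been downloaded (e.g. python -m spacy download en_core_web_sm); the sample text is illustrative only:

from keyword_extraction import KeywordExtractor

extractor = KeywordExtractor()
# annotation is a list of plain strings and (keyword, "KEY", color) tuples,
# or None when no keyword indices were found.
annotation, keywords = extractor.generate("Streamlit lets you build data apps in pure Python.", max_keywords=5)
print(keywords)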
named_entity_recognition.py
ADDED
@@ -0,0 +1,34 @@
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


class NamedEntityRecognition:
    def __init__(self):
        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        self.nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

    def get_annotation(self, preds, text):
        # Split the text at every entity boundary and map each entity string
        # to its predicted entity group.
        splits = [0]
        entities = {}
        for i in preds:
            splits.append(i['start'])
            splits.append(i['end'])
            entities[i['word']] = i['entity_group']

        # Exclude bad preds
        exclude = ['', '.', '. ', ' ']
        for x in exclude:
            if x in entities.keys():
                entities.pop(x)

        parts = [text[i:j] for i, j in zip(splits, splits[1:] + [None])]

        final_annotation = [(x, entities[x], "") if x in entities.keys() else x for x in parts]

        return final_annotation

    def classify(self, text):
        preds = self.nlp(text)
        ner_annotation = self.get_annotation(preds, text)
        return preds, ner_annotation
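A minimal usage sketch; the example sentence is illustrative only:

from named_entity_recognition import NamedEntityRecognition

ner = NamedEntityRecognition()
preds, ner_annotation = ner.classify("Jacob Miesner built this dashboard with Streamlit.")
print(preds)           # grouped entity predictions from the transformers pipeline
print(ner_annotation)  # text parts interleaved with (entity, group, color) tuples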
part_of_speech_tagging.py
ADDED
@@ -0,0 +1,14 @@
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


class POSTagging:
    def __init__(self):
        pass

    def classify(self, text):
        text = word_tokenize(text)
        preds = nltk.pos_tag(text)
        return preds
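A minimal usage sketch; the sentence is illustrative only:

from part_of_speech_tagging import POSTagging

tagger = POSTagging()
print(tagger.classify("The quick brown fox jumps over the lazy dog."))
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ...] using Penn Treebank tags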
requirements.txt
ADDED
@@ -0,0 +1,15 @@
news-please~=1.5.20
sklearn~=0.0
keybert~=0.5.1
tensorflow
tensorflow-hub~=0.12.0
nltk~=3.5
gradio~=3.0
typing-extensions==3.10.0.2
yake~=0.4.8
streamlit-option-menu~=0.3.2
st-annotated-text~=3.0.0
transformers-interpret~=0.7.2
htbuilder==0.6.0
pytextrank
sentiment_analysis.py
ADDED
@@ -0,0 +1,50 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer
import torch
import pandas as pd


class SentimentAnalysis:
    """Sentiment classifier and explainer."""

    def __init__(self):
        # Load Tokenizer & Model
        hub_location = 'cardiffnlp/twitter-roberta-base-sentiment'
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)

        # Change model labels in config
        self.model.config.id2label[0] = "Negative"
        self.model.config.id2label[1] = "Neutral"
        self.model.config.id2label[2] = "Positive"
        self.model.config.label2id["Negative"] = self.model.config.label2id.pop("LABEL_0")
        self.model.config.label2id["Neutral"] = self.model.config.label2id.pop("LABEL_1")
        self.model.config.label2id["Positive"] = self.model.config.label2id.pop("LABEL_2")

        # Instantiate explainer
        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)

    def justify(self, text):
        """Generate an HTML visualization of word attributions for the prediction."""
        # Computing attributions is required before calling visualize().
        word_attributions = self.explainer(text)
        html = self.explainer.visualize("example.html")

        return html

    def classify(self, text):
        """Return the predicted probability of each sentiment class as a pandas Series."""
        tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
        outputs = self.model(**tokens)
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs = probs.mean(dim=0).detach().numpy()
        preds = pd.Series(probs, index=["Negative", "Neutral", "Positive"], name='Predicted Probability')

        return preds

    def run(self, text):
        """Classify the text and build the prediction justification HTML."""
        preds = self.classify(text)
        html = self.justify(text)

        return preds, html
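A minimal usage sketch, assuming the same dependencies as emotion_detection.py; the sample text is illustrative only:

from sentiment_analysis import SentimentAnalysis

analyzer = SentimentAnalysis()
preds, html = analyzer.run("This open-source dashboard is fantastic!")
print(preds)  # Series indexed by Negative / Neutral / Positive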
streamlit_app.py
ADDED
@@ -0,0 +1,249 @@
import pandas as pd
import streamlit as st
from text_annotation import annotated_text
from streamlit_option_menu import option_menu
from sentiment_analysis import SentimentAnalysis
from keyword_extraction import KeywordExtractor
from part_of_speech_tagging import POSTagging
from emotion_detection import EmotionDetection
from named_entity_recognition import NamedEntityRecognition

hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)


@st.cache(allow_output_mutation=True)
def load_sentiment_model():
    return SentimentAnalysis()

@st.cache(allow_output_mutation=True)
def load_keyword_model():
    return KeywordExtractor()

@st.cache(allow_output_mutation=True)
def load_pos_model():
    return POSTagging()

@st.cache(allow_output_mutation=True)
def load_emotion_model():
    return EmotionDetection()

@st.cache(allow_output_mutation=True)
def load_ner_model():
    return NamedEntityRecognition()


sentiment_analyzer = load_sentiment_model()
keyword_extractor = load_keyword_model()
pos_tagger = load_pos_model()
emotion_detector = load_emotion_model()
ner = load_ner_model()

with st.sidebar:
    page = option_menu(menu_title='Menu',
                       menu_icon="robot",
                       options=["Welcome!",
                                "Sentiment Analysis",
                                "Keyword Extraction",
                                "Part of Speech Tagging",
                                "Emotion Detection",
                                "Named Entity Recognition"],
                       icons=["house-door",
                              "emoji-heart-eyes",
                              "key",
                              "chat-dots",
                              "emoji-heart-eyes",
                              "building"],
                       default_index=0
                       )

st.title('Open-source NLP')

if page == "Welcome!":
    st.header('Welcome!')
    st.write(
        """
        Supercharge your workflow with this platform built using 100% open-source resources!
        """
    )

    st.markdown("![Alt Text](https://media.giphy.com/media/2fEvoZ9tajMxq/giphy.gif)")
    st.write(
        """


        """
    )
    st.subheader("Introduction")
    st.write("""
        Welcome! This application is a celebration of open source and of the power programmers have today
        thanks to those who give back to the community. This tool was constructed using Streamlit, Huggingface Transformers,
        Transformers-Interpret, NLTK, and Spacy, amongst other open-source Python libraries and models.

        Using this tool you can perform a multitude of Natural Language Processing tasks on a range of
        different texts. All you need to do is paste your input, select your task, and hit the start button!

        * This application currently supports:
            * Sentiment Analysis
            * Keyword Extraction
            * Part of Speech Tagging
            * Emotion Detection
            * Named Entity Recognition

        More features may be added in the future, depending on community feedback. Please reach out to me at
        [email protected] or at my Linkedin page listed below if you have ideas or suggestions for improvement.

        If you would like to contribute yourself, feel free to fork the Github repository listed below and submit a pull request.
        """
    )
    st.subheader("Notes")
    st.write(
        """
        * This dashboard was constructed by Jacob Miesner, but every resource used is open-source! If you are
          interested in his other work, you can view it here:

        [Project Github](https://github.com/MiesnerJacob/nlp-dashboard)

        [Jacob Miesner's Github](https://github.com/MiesnerJacob)

        [Jacob Miesner's Linkedin](https://www.linkedin.com/in/jacob-miesner-885050125/)

        [Jacob Miesner's Website](https://www.jacobmiesner.com)

        * The prediction justifications for some of the tasks are printed as the model views them. For this reason
          the text may contain special tokens like [CLS] or [SEP], or even hash symbols splitting words. If you are
          knowledgeable about language models and how they work, these tokens will be familiar; if you have no prior
          experience with language models, you can safely ignore them.
        """
    )

elif page == "Sentiment Analysis":
    st.header('Sentiment Analysis')
    st.markdown("![Alt Text](https://media.giphy.com/media/XIqCQx02E1U9W/giphy.gif)")
    st.write(
        """


        """
    )

    text = st.text_area("Paste text here", value="")

    if st.button('Start!'):
        with st.spinner("Loading..."):
            preds, html = sentiment_analyzer.run(text)
        st.success('All done!')
        st.write("")
        st.subheader("Sentiment Predictions")
        st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
        st.write("")
        st.subheader("Sentiment Justification")
        raw_html = html._repr_html_()
        st.components.v1.html(raw_html)

elif page == "Keyword Extraction":
    st.header('Keyword Extraction')
    st.markdown("![Alt Text](https://media.giphy.com/media/xT9C25UNTwfZuk85WP/giphy-downsized-large.gif)")
    st.write(
        """


        """
    )

    text = st.text_area("Paste text here", value="")

    max_keywords = st.slider('# of Keywords Max Limit', min_value=1, max_value=10, value=5, step=1)

    if st.button('Start!'):
        with st.spinner("Loading..."):
            annotation, keywords = keyword_extractor.generate(text, max_keywords)
        st.success('All done!')

        if annotation:
            st.subheader("Keyword Annotation")
            st.write("")
            annotated_text(*annotation)
            st.text("")

        st.subheader("Extracted Keywords")
        st.write("")
        df = pd.DataFrame(keywords, columns=['Extracted Keywords'])
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button('Download Keywords to CSV', csv, file_name='news_intelligence_keywords.csv')

        data_table = st.table(df)

elif page == "Part of Speech Tagging":
    st.header('Part of Speech Tagging')
    st.markdown("![Alt Text](https://media.giphy.com/media/WoWm8YzFQJg5i/giphy.gif)")
    st.write(
        """


        """
    )

    text = st.text_area("Paste text here", value="")

    if st.button('Start!'):
        with st.spinner("Loading..."):
            preds = pos_tagger.classify(text)
        st.success('All done!')
        st.write("")
        st.subheader("Part of Speech tags")
        annotated_text(*preds)
        st.write("")
        st.components.v1.iframe('https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html', height=1000)

elif page == "Emotion Detection":
    st.header('Emotion Detection')
    st.markdown("![Alt Text](https://media.giphy.com/media/fU8X6ozSszyEw/giphy.gif)")
    st.write(
        """


        """
    )

    text = st.text_area("Paste text here", value="")

    if st.button('Start!'):
        with st.spinner("Loading..."):
            preds, html = emotion_detector.run(text)
        st.success('All done!')
        st.write("")
        st.subheader("Emotion Predictions")
        st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
        raw_html = html._repr_html_()
        st.write("")
        st.subheader("Emotion Justification")
        st.components.v1.html(raw_html, height=500)

elif page == "Named Entity Recognition":
    st.header('Named Entity Recognition')
    st.markdown("![Alt Text](https://media.giphy.com/media/lxO8wdWdu4tig/giphy.gif)")
    st.write(
        """


        """
    )

    text = st.text_area("Paste text here", value="")

    if st.button('Start!'):
        with st.spinner("Loading..."):
            preds, ner_annotation = ner.classify(text)
        st.success('All done!')
        st.write("")
        st.subheader("NER Predictions")
        annotated_text(*ner_annotation)
        st.write("")
        st.subheader("NER Prediction Metadata")
        st.write(preds)
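To run the app locally, the standard Streamlit workflow should work: install the dependencies with pip install -r requirements.txt, download spaCy's en_core_web_sm model (python -m spacy download en_core_web_sm), then launch with streamlit run streamlit_app.py.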
text_annotation.py
ADDED
@@ -0,0 +1,51 @@
import streamlit as st
from text_annotation_utils import *


def annotated_text(*args, type=None):
    """Writes text with annotations into your Streamlit app.

    Parameters
    ----------
    *args : str, tuple or htbuilder.HtmlElement
        Arguments can be:
        - strings, to draw the string as-is on the screen.
        - tuples of the form (main_text, annotation_text, background, color) where
          background and foreground colors are optional and should be a CSS-valid string such as
          "#aabbcc" or "rgb(10, 20, 30)"
        - HtmlElement objects in case you want to customize the annotations further. In particular,
          you can import the `annotation()` function from this module to easily produce annotations
          whose CSS you can customize via keyword arguments.

    Examples
    --------
    >>> annotated_text(
    ...     "This ",
    ...     ("is", "verb", "#8ef"),
    ...     " some ",
    ...     ("annotated", "adj", "#faa"),
    ...     ("text", "noun", "#afa"),
    ...     " for those of ",
    ...     ("you", "pronoun", "#fea"),
    ...     " who ",
    ...     ("like", "verb", "#8ef"),
    ...     " this sort of ",
    ...     ("thing", "noun", "#afa"),
    ... )
    >>> annotated_text(
    ...     "Hello ",
    ...     annotation("world!", "noun", color="#8ef", border="1px dashed red"),
    ... )
    """
    if type == 'title':
        st.markdown(
            '<p class="big-font">' + get_annotated_html(*args) + '</p>',
            unsafe_allow_html=True,
        )
    elif type == 'description':
        st.markdown(
            '<p class="medium-font">' + get_annotated_html(*args) + '</p>',
            unsafe_allow_html=True,
        )
    else:
        st.markdown(
            get_annotated_html(*args),
            unsafe_allow_html=True,
        )
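A minimal usage sketch inside a Streamlit app, mirroring the docstring examples above:

from text_annotation import annotated_text

annotated_text(
    "This ",
    ("is", "verb", "#8ef"),
    " some ",
    ("annotated", "adj", "#faa"),
    " text.",
)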
text_annotation_utils.py
ADDED
@@ -0,0 +1,127 @@
import html
from htbuilder import H, HtmlElement, styles
from htbuilder.units import unit

# Only works in 3.7+: from htbuilder import div, span
div = H.div
span = H.span

# Only works in 3.7+: from htbuilder.units import px, rem, em
px = unit.px
rem = unit.rem
em = unit.em

# Colors from the Streamlit palette.
# These are red-70, orange-70, ..., violet-70, gray-70.
PALETTE = [
    "#ff4b4b",
    "#ffa421",
    "#ffe312",
    "#21c354",
    "#00d4b1",
    "#00c0f2",
    "#1c83e1",
    "#803df5",
    "#808495",
]

OPACITIES = [
    "33", "66",
]


def annotation(body, label="", background=None, color=None, **style):
    """Build an HtmlElement span object with the given body and annotation label.

    The end result will look something like this:
        [body | label]

    Parameters
    ----------
    body : string
        The string to put in the "body" part of the annotation.
    label : string
        The string to put in the "label" part of the annotation.
    background : string or None
        The color to use for the background "chip" containing this annotation.
        If None, will use a random color based on the label.
    color : string or None
        The color to use for the body and label text.
        If None, will use the document's default text color.
    style : dict
        Any CSS you want to apply to the containing "chip". This is useful
        for things like custom borders or padding.

    Examples
    --------
    Produce a simple annotation with default colors:
    >>> annotation("apple", "fruit")

    Produce an annotation with custom colors:
    >>> annotation("apple", "fruit", background="#FF0", color="black")

    Produce an annotation with crazy CSS:
    >>> annotation("apple", "fruit", background="#FF0", border="1px dashed red")
    """

    color_style = {}

    if color:
        color_style['color'] = color

    if not background:
        # Pick a deterministic background color and opacity from the palette
        # based on the characters in the label.
        label_sum = sum(ord(c) for c in label)
        background_color = PALETTE[label_sum % len(PALETTE)]
        background_opacity = OPACITIES[label_sum % len(OPACITIES)]
        background = background_color + background_opacity

    return (
        span(
            style=styles(
                background=background,
                border_radius=rem(0.33),
                padding=(rem(0.125), rem(0.5)),
                overflow="hidden",
                **color_style,
                **style,
            ))(
            html.escape(body),
            span(
                style=styles(
                    padding_left=rem(0.5),
                    text_transform="uppercase",
                ))(
                span(
                    style=styles(
                        font_size=em(0.67),
                        opacity=0.5,
                    ))(
                    html.escape(label),
                ),
            ),
        )
    )


def get_annotated_html(*args):
    """Writes text with annotations into an HTML string.

    Parameters
    ----------
    *args : see annotated_text()

    Returns
    -------
    str
        An HTML string.
    """

    out = div()

    for arg in args:
        if isinstance(arg, str):
            out(html.escape(arg))

        elif isinstance(arg, HtmlElement):
            out(arg)

        elif isinstance(arg, tuple):
            out(annotation(*arg))

        else:
            raise TypeError("Bad input: expected str, HtmlElement, or tuple")

    return str(out)
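A minimal usage sketch; the body and label strings are illustrative only:

from text_annotation_utils import annotation, get_annotated_html

# Build the annotated HTML as a standalone string, without Streamlit.
html_str = get_annotated_html("Hello ", annotation("world", "noun", background="#8ef"))
print(html_str)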