Spaces:

aubmindlab
/

Arabic-NLP

Build error

App Files Community

wissamantoun committed on Sep 11, 2021

Commit

90afd57

•

1 Parent(s): c64d018

added sarcasm and qa with logging

Browse files

Files changed (9) hide show

app.py +7 -1
backend/home.py +2 -1
backend/qa.py +47 -0
backend/qa_utils.py +163 -0
backend/sarcasm.py +26 -0
backend/services.py +202 -43
backend/utils.py +22 -0
images/is2alni_logo.png +0 -0
requirements.txt +5 -1

app.py CHANGED Viewed

@@ -1,22 +1,28 @@
 import awesome_streamlit as ast
 import streamlit as st
-from backend.utils import get_current_ram_usage
 import backend.aragpt
 import backend.home
 import backend.processor
 import backend.sa
 st.set_page_config(
     page_title="TEST", page_icon="📖", initial_sidebar_state="expanded", layout="wide"
 )
 PAGES = {
     "Home": backend.home,
     "Arabic Text Preprocessor": backend.processor,
     "Arabic Language Generation": backend.aragpt,
     "Arabic Sentiment Analysis": backend.sa,
 }

 import awesome_streamlit as ast
 import streamlit as st
+from backend.utils import get_current_ram_usage, ga
 import backend.aragpt
 import backend.home
 import backend.processor
 import backend.sa
+import backend.qa
+import backend.sarcasm
 st.set_page_config(
     page_title="TEST", page_icon="📖", initial_sidebar_state="expanded", layout="wide"
 )
+ga(st.__file__)
 PAGES = {
     "Home": backend.home,
     "Arabic Text Preprocessor": backend.processor,
     "Arabic Language Generation": backend.aragpt,
     "Arabic Sentiment Analysis": backend.sa,
+    "Arabic Sarcasm Detection": backend.sarcasm,
+    "Arabic Question Answering": backend.qa,
 }

backend/home.py CHANGED Viewed

@@ -14,7 +14,8 @@ def write():
     - Arabic Text Preprocessor: Test how text imput is treated by our preprocessor
     - Arabic Language Generation: Generate Arabic text using our AraGPT2 language models
     - Arabic Sentiment Analysis: Test the senitment analysis model that won the [Arabic Senitment Analysis competition @ KAUST](https://www.kaggle.com/c/arabic-sentiment-analysis-2021-kaust)
-    - Arabic Masked Language Modeling: Test our AraBERT models MLM capabilities
     """
     )
     st.markdown("#")

     - Arabic Text Preprocessor: Test how text imput is treated by our preprocessor
     - Arabic Language Generation: Generate Arabic text using our AraGPT2 language models
     - Arabic Sentiment Analysis: Test the senitment analysis model that won the [Arabic Senitment Analysis competition @ KAUST](https://www.kaggle.com/c/arabic-sentiment-analysis-2021-kaust)
+    - Arabic Sarcasm Detection: Test MARBERT trained for sarcasm detection
+    - Arabic Question Answering: Test our AraELECTRA QA capabilities
     """
     )
     st.markdown("#")

backend/qa.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import streamlit as st
+from qa_utils import annotate_answer, get_qa_answers
+_, col1, _ = st.beta_columns(3)
+with col1:
+    st.image("is2alni_logo.png", width=200)
+    st.title("إسألني أي شيء")
+st.markdown(
+    """
+<style>
+p, div, input, label {
+  text-align: right;
+}
+</style>
+    """,
+    unsafe_allow_html=True,
+)
+st.sidebar.header("Info")
+st.sidebar.image("AraELECTRA.png", width=150)
+st.sidebar.write("Powered by [AraELECTRA](https://github.com/aub-mind/arabert)")
+st.sidebar.write("\n")
+n_answers = st.sidebar.slider(
+    "Max. number of answers", min_value=1, max_value=10, value=2, step=1
+)
+question = st.text_input("", value="من هو جو بايدن؟")
+if "؟" not in question:
+    question += "؟"
+run_query = st.button("أجب")
+if run_query:
+    # https://discuss.streamlit.io/t/showing-a-gif-while-st-spinner-runs/5084
+    with st.spinner("... جاري البحث "):
+        results_dict = get_qa_answers(question)
+    if len(results_dict) > 0:
+        st.write("## :الأجابات هي")
+        for result in results_dict["results"][:n_answers]:
+            annotate_answer(result)
+            f"[**المصدر**](<{result['link']}>)"
+    else:
+        st.write("## 😞 ليس لدي جواب")

backend/qa_utils.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import streamlit.components.v1
+from htbuilder import HtmlElement, div, span, styles
+from htbuilder.units import px, rem, em
+def annotation(body, label="", background="#ddd", color="#333", **style):
+    """Build an HtmlElement span object with the given body and annotation label.
+    The end result will look something like this:
+        [body | label]
+    Parameters
+    ----------
+    body : string
+        The string to put in the "body" part of the annotation.
+    label : string
+        The string to put in the "label" part of the annotation.
+    background : string
+        The color to use for the background "chip" containing this annotation.
+    color : string
+        The color to use for the body and label text.
+    **style : dict
+        Any CSS you want to use to customize the containing "chip".
+    Examples
+    --------
+    Produce a simple annotation with default colors:
+    >>> annotation("apple", "fruit")
+    Produce an annotation with custom colors:
+    >>> annotation("apple", "fruit", background="#FF0", color="black")
+    Produce an annotation with crazy CSS:
+    >>> annotation("apple", "fruit", background="#FF0", border="1px dashed red")
+    """
+    if "font_family" not in style:
+        style["font_family"] = "sans-serif"
+    return span(
+        style=styles(
+            background=background,
+            border_radius=rem(0.33),
+            color=color,
+            padding=(rem(0.17), rem(0.67)),
+            display="inline-flex",
+            justify_content="center",
+            align_items="center",
+            **style,
+        )
+    )(
+        body,
+        span(
+            style=styles(
+                color=color,
+                font_size=em(0.67),
+                opacity=0.5,
+                padding_left=rem(0.5),
+                text_transform="uppercase",
+                margin_bottom=px(-2),
+            )
+        )(label),
+    )
+def annotated_text(*args, **kwargs):
+    """Writes test with annotations into your Streamlit app.
+    Parameters
+    ----------
+    *args : str, tuple or htbuilder.HtmlElement
+        Arguments can be:
+        - strings, to draw the string as-is on the screen.
+        - tuples of the form (main_text, annotation_text, background, color) where
+          background and foreground colors are optional and should be an CSS-valid string such as
+          "#aabbcc" or "rgb(10, 20, 30)"
+        - HtmlElement objects in case you want to customize the annotations further. In particular,
+          you can import the `annotation()` function from this module to easily produce annotations
+          whose CSS you can customize via keyword arguments.
+    Examples
+    --------
+    >>> annotated_text(
+    ...     "This ",
+    ...     ("is", "verb", "#8ef"),
+    ...     " some ",
+    ...     ("annotated", "adj", "#faa"),
+    ...     ("text", "noun", "#afa"),
+    ...     " for those of ",
+    ...     ("you", "pronoun", "#fea"),
+    ...     " who ",
+    ...     ("like", "verb", "#8ef"),
+    ...     " this sort of ",
+    ...     ("thing", "noun", "#afa"),
+    ... )
+    >>> annotated_text(
+    ...     "Hello ",
+    ...     annotation("world!", "noun", color="#8ef", border="1px dashed red"),
+    ... )
+    """
+    out = div(
+        style=styles(
+            font_family="sans-serif",
+            line_height="1.45",
+            font_size=px(16),
+            text_align="right",
+        )
+    )
+    for arg in args:
+        if isinstance(arg, str):
+            out(arg)
+        elif isinstance(arg, HtmlElement):
+            out(arg)
+        elif isinstance(arg, tuple):
+            out(annotation(*arg))
+        else:
+            raise Exception("Oh noes!")
+    streamlit.components.v1.html(str(out), **kwargs)
+def shorten_text(text, n, reverse=False):
+    if text.isspace() or text == "":
+        return text
+    if reverse:
+        text = text[::-1]
+    words = iter(text.split())
+    lines, current = [], next(words)
+    for word in words:
+        if len(current) + 1 + len(word) > n:
+            break
+        else:
+            current += " " + word
+    lines.append(current)
+    if reverse:
+        return lines[0][::-1]
+    return lines[0]
+def annotate_answer(result):
+    annotated_text(
+        shorten_text(
+            result["original"][: result["new_start"]],
+            500,
+            reverse=True,
+        ),
+        (result["new_answer"], "جواب", "#8ef"),
+        shorten_text(result["original"][result["new_end"] :], 500) + " ...... إلخ",
+    )

backend/sarcasm.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import streamlit as st
+from .sa import predictor
+def write():
+    st.markdown(
+        """
+        # Arabic Sarcasm Detection
+        This is a simple sarcasm detection app that uses the [MARBERT](https://huggingface.co/UBC-NLP/MARBERT) model trained on [ArSarcasm](https://github.com/iabufarha/ArSarcasm)
+        """
+    )
+    input_text = st.text_input(
+        "Enter your text here:",
+    )
+    if st.button("Predict"):
+        with st.spinner("Predicting..."):
+            prediction, scores = predictor.get_preds_from_sarcasm([input_text])
+            st.write(f"Result: {prediction[0]}")
+            detailed_score = {
+                "Sarcastic": scores[0][0],
+                "Not_Sarcastic": scores[0][1],
+            }
+            st.write("All scores:")
+            st.write(detailed_score)

backend/services.py CHANGED Viewed

@@ -13,6 +13,17 @@ from .preprocess import ArabertPreprocessor
 from .sa_utils import *
 from .utils import download_models, softmax
 logger = logging.getLogger(__name__)
 # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
 class TextGeneration:
@@ -72,6 +83,7 @@ class TextGeneration:
         do_sample: bool,
         num_beams: int,
     ):
         prompt = self.preprocessor.preprocess(prompt)
         return_full_text = False
         return_text = True
@@ -127,6 +139,9 @@ class TextGeneration:
                     return "Something happened 🤷‍♂️!!"
             else:
                 generated_text = generated_text[0]["generated_text"]
         return self.preprocessor.unpreprocess(generated_text)
     def query(self, payload, model_name):
@@ -219,7 +234,7 @@ class SentimentAnalyzer:
         preds_df = pd.DataFrame([])
         for i in range(0, 5):
             preds = []
-            for s in tqdm(more_itertools.chunked(list(prep_texts), 128)):
                 preds.extend(self.pipelines["sar_trial10"][i](s))
             preds_df[f"model_{i}"] = preds
@@ -245,55 +260,63 @@ class SentimentAnalyzer:
         return final_labels, final_scores
     def get_preds_from_a_model(self, texts: List[str], model_name):
-        prep = self.processors[model_name]
-        prep_texts = [prep.preprocess(x) for x in texts]
-        if model_name == "sa_sarcasm":
-            sarcasm_label, _ = self.get_preds_from_sarcasm(texts)
-            sarcastic_map = {"Not_Sarcastic": "غير ساخر", "Sarcastic": "ساخر"}
-            labeled_prep_texts = []
-            for t, l in zip(prep_texts, sarcasm_label):
-                labeled_prep_texts.append(sarcastic_map[l] + " [SEP] " + t)
-        preds_df = pd.DataFrame([])
-        for i in range(0, 5):
-            preds = []
-            for s in more_itertools.chunked(list(prep_texts), 128):
-                preds.extend(self.pipelines[model_name][i](s))
-            preds_df[f"model_{i}"] = preds
-        final_labels = []
-        final_scores = []
-        final_scores_list = []
-        for id, row in preds_df.iterrows():
-            pos_total = 0
-            neg_total = 0
-            neu_total = 0
-            for pred in row[2:]:
-                pos_total += pred[0]["score"]
-                neu_total += pred[1]["score"]
-                neg_total += pred[2]["score"]
-            pos_avg = pos_total / 5
-            neu_avg = neu_total / 5
-            neg_avg = neg_total / 5
-            if model_name == "sa_no_aoa_in_neutral":
-                final_labels.append(
-                    self.pipelines[model_name][0].model.config.id2label[
-                        np.argmax([neu_avg, neg_avg, pos_avg])
-                    ]
                 )
             else:
-                final_labels.append(
-                    self.pipelines[model_name][0].model.config.id2label[
-                        np.argmax([pos_avg, neu_avg, neg_avg])
-                    ]
-                )
-            final_scores.append(np.max([pos_avg, neu_avg, neg_avg]))
-            final_scores_list.append((pos_avg, neu_avg, neg_avg))
         return final_labels, final_scores, final_scores_list
     def predict(self, texts: List[str]):
@@ -355,3 +378,139 @@ class SentimentAnalyzer:
         logger.info(f"Score: {final_ensemble_score}")
         logger.info(f"All Scores: {final_ensemble_all_score}")
         return final_ensemble_prediction, final_ensemble_score, final_ensemble_all_score

 from .sa_utils import *
 from .utils import download_models, softmax
+from functools import lru_cache
+from urllib.parse import unquote
+import streamlit as st
+import wikipedia
+from codetiming import Timer
+from fuzzysearch import find_near_matches
+from googleapi import google
+from transformers import AutoTokenizer
 logger = logging.getLogger(__name__)
 # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
 class TextGeneration:
         do_sample: bool,
         num_beams: int,
     ):
+        logger.info(f"Generating with {model_name}")
         prompt = self.preprocessor.preprocess(prompt)
         return_full_text = False
         return_text = True
                     return "Something happened 🤷‍♂️!!"
             else:
                 generated_text = generated_text[0]["generated_text"]
+        logger.info(f"Prompt: {prompt}")
+        logger.info(f"Generated text: {generated_text}")
         return self.preprocessor.unpreprocess(generated_text)
     def query(self, payload, model_name):
         preds_df = pd.DataFrame([])
         for i in range(0, 5):
             preds = []
+            for s in more_itertools.chunked(list(prep_texts), 128):
                 preds.extend(self.pipelines["sar_trial10"][i](s))
             preds_df[f"model_{i}"] = preds
         return final_labels, final_scores
     def get_preds_from_a_model(self, texts: List[str], model_name):
+        try:
+            prep = self.processors[model_name]
+            prep_texts = [prep.preprocess(x) for x in texts]
+            if model_name == "sa_sarcasm":
+                sarcasm_label, _ = self.get_preds_from_sarcasm(texts)
+                sarcastic_map = {"Not_Sarcastic": "غير ساخر", "Sarcastic": "ساخر"}
+                labeled_prep_texts = []
+                for t, l in zip(prep_texts, sarcasm_label):
+                    labeled_prep_texts.append(sarcastic_map[l] + " [SEP] " + t)
+            preds_df = pd.DataFrame([])
+            for i in range(0, 5):
+                preds = []
+                for s in more_itertools.chunked(list(prep_texts), 128):
+                    preds.extend(self.pipelines[model_name][i](s))
+                preds_df[f"model_{i}"] = preds
+            final_labels = []
+            final_scores = []
+            final_scores_list = []
+            for id, row in preds_df.iterrows():
+                pos_total = 0
+                neg_total = 0
+                neu_total = 0
+                for pred in row[2:]:
+                    pos_total += pred[0]["score"]
+                    neu_total += pred[1]["score"]
+                    neg_total += pred[2]["score"]
+                pos_avg = pos_total / 5
+                neu_avg = neu_total / 5
+                neg_avg = neg_total / 5
+                if model_name == "sa_no_aoa_in_neutral":
+                    final_labels.append(
+                        self.pipelines[model_name][0].model.config.id2label[
+                            np.argmax([neu_avg, neg_avg, pos_avg])
+                        ]
+                    )
+                else:
+                    final_labels.append(
+                        self.pipelines[model_name][0].model.config.id2label[
+                            np.argmax([pos_avg, neu_avg, neg_avg])
+                        ]
+                    )
+                final_scores.append(np.max([pos_avg, neu_avg, neg_avg]))
+                final_scores_list.append((pos_avg, neu_avg, neg_avg))
+        except RuntimeError as e:
+            if model_name == "sa_cnnbert":
+                return (
+                    ["Neutral"] * len(texts),
+                    [0.0] * len(texts),
+                    [(0.0, 0.0, 0.0)] * len(texts),
                 )
             else:
+                raise RuntimeError(e)
         return final_labels, final_scores, final_scores_list
     def predict(self, texts: List[str]):
         logger.info(f"Score: {final_ensemble_score}")
         logger.info(f"All Scores: {final_ensemble_all_score}")
         return final_ensemble_prediction, final_ensemble_score, final_ensemble_all_score
+wikipedia.set_lang("ar")
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+preprocessor = ArabertPreprocessor("wissamantoun/araelectra-base-artydiqa")
+logger.info("Loading QA Pipeline...")
+tokenizer = AutoTokenizer.from_pretrained("wissamantoun/araelectra-base-artydiqa")
+qa_pipe = pipeline("question-answering", model="wissamantoun/araelectra-base-artydiqa")
+logger.info("Finished loading QA Pipeline...")
+@lru_cache(maxsize=100)
+def get_qa_answers(question):
+    logger.info("\n=================================================================")
+    logger.info(f"Question: {question}")
+    if "وسام أنطون" in question or "wissam antoun" in question.lower():
+        return {
+            "title": "Creator",
+            "results": [
+                {
+                    "score": 1.0,
+                    "new_start": 0,
+                    "new_end": 12,
+                    "new_answer": "My Creator 😜",
+                    "original": "My Creator 😜",
+                    "link": "https://github.com/WissamAntoun/",
+                }
+            ],
+        }
+    search_timer = Timer(
+        "search and wiki", text="Search and Wikipedia Time: {:.2f}", logger=logging.info
+    )
+    try:
+        search_timer.start()
+        search_results = google.search(
+            question + " site:ar.wikipedia.org", lang="ar", area="ar"
+        )
+        if len(search_results) == 0:
+            return {}
+        page_name = search_results[0].link.split("wiki/")[-1]
+        wiki_page = wikipedia.page(unquote(page_name))
+        wiki_page_content = wiki_page.content
+        search_timer.stop()
+    except:
+        return {}
+    sections = []
+    for section in re.split("== .+ ==[^=]", wiki_page_content):
+        if not section.isspace():
+            prep_section = tokenizer.tokenize(preprocessor.preprocess(section))
+            if len(prep_section) > 500:
+                subsections = []
+                for subsection in re.split("=== .+ ===", section):
+                    if subsection.isspace():
+                        continue
+                    prep_subsection = tokenizer.tokenize(
+                        preprocessor.preprocess(subsection)
+                    )
+                    subsections.append(subsection)
+                    # logger.info(f"Subsection found with length: {len(prep_subsection)}")
+                sections.extend(subsections)
+            else:
+                # logger.info(f"Regular Section with length: {len(prep_section)}")
+                sections.append(section)
+    full_len_sections = []
+    temp_section = ""
+    for section in sections:
+        if (
+            len(tokenizer.tokenize(preprocessor.preprocess(temp_section)))
+            + len(tokenizer.tokenize(preprocessor.preprocess(section)))
+            > 384
+        ):
+            if temp_section == "":
+                temp_section = section
+                continue
+            full_len_sections.append(temp_section)
+            # logger.info(
+            #     f"full section length: {len(tokenizer.tokenize(preprocessor.preprocess(temp_section)))}"
+            # )
+            temp_section = ""
+        else:
+            temp_section += " " + section + " "
+    if temp_section != "":
+        full_len_sections.append(temp_section)
+    reader_time = Timer("electra", text="Reader Time: {:.2f}", logger=logging.info)
+    reader_time.start()
+    results = qa_pipe(
+        question=[preprocessor.preprocess(question)] * len(full_len_sections),
+        context=[preprocessor.preprocess(x) for x in full_len_sections],
+    )
+    if not isinstance(results, list):
+        results = [results]
+    logger.info(f"Wiki Title: {unquote(page_name)}")
+    logger.info(f"Total Sections: {len(sections)}")
+    logger.info(f"Total Full Sections: {len(full_len_sections)}")
+    for result, section in zip(results, full_len_sections):
+        result["original"] = section
+        answer_match = find_near_matches(
+            " " + preprocessor.unpreprocess(result["answer"]) + " ",
+            result["original"],
+            max_l_dist=min(5, len(preprocessor.unpreprocess(result["answer"])) // 2),
+            max_deletions=0,
+        )
+        try:
+            result["new_start"] = answer_match[0].start
+            result["new_end"] = answer_match[0].end
+            result["new_answer"] = answer_match[0].matched
+            result["link"] = (
+                search_results[0].link + "#:~:text=" + result["new_answer"].strip()
+            )
+        except:
+            result["new_start"] = result["start"]
+            result["new_end"] = result["end"]
+            result["new_answer"] = result["answer"]
+            result["original"] = preprocessor.preprocess(result["original"])
+            result["link"] = search_results[0].link
+        logger.info(f"Answers: {preprocessor.preprocess(result['new_answer'])}")
+    sorted_results = sorted(results, reverse=True, key=lambda x: x["score"])
+    return_dict = {}
+    return_dict["title"] = unquote(page_name)
+    return_dict["results"] = sorted_results
+    reader_time.stop()
+    logger.info(f"Total time spent: {reader_time.last + search_timer.last}")
+    return return_dict

backend/utils.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import numpy as np
 import psutil
 import os
@@ -40,3 +41,24 @@ def download_models(models):
 def softmax(x):
     return np.exp(x) / sum(np.exp(x))

+import re
 import numpy as np
 import psutil
 import os
 def softmax(x):
     return np.exp(x) / sum(np.exp(x))
+def ga(file):
+    code = """
+    <!-- Global site tag (gtag.js) - Google Analytics -->
+        <script async src="https://www.googletagmanager.com/gtag/js?id=G-NH9HWCW08F"></script>
+        <script>
+        window.dataLayer = window.dataLayer || [];
+        function gtag(){dataLayer.push(arguments);}
+        gtag('js', new Date());
+        gtag('config', 'G-NH9HWCW08F');
+    </script>
+    """
+    a = os.path.dirname(file) + "/static/index.html"
+    with open(a, "r") as f:
+        data = f.read()
+        if len(re.findall("G-", data)) == 0:
+            with open(a, "w") as ff:
+                newdata = re.sub("<head>", "<head>" + code, data)
+                ff.write(newdata)

images/is2alni_logo.png ADDED Viewed

requirements.txt CHANGED Viewed

@@ -10,4 +10,8 @@ transformers==4.10.0
 psutil==5.8.0
 fuzzysearch==0.7.3
 more-itertools==8.9.0
-cookiecutter

 psutil==5.8.0
 fuzzysearch==0.7.3
 more-itertools==8.9.0
+cookiecutter
+git+https://github.com/dantru7/Google-Search-API
+codetiming==1.3.0
+htbuilder
+wikipedia==1.4.0