Spaces:

themeetjani
/

esg-tweet_classification

Sleeping

App Files Files Community

themeetjani committed on Dec 6, 2023

Commit

c63c7bb

•

1 Parent(s): dc83878

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -0

app.py CHANGED Viewed

@@ -24,8 +24,70 @@ lemmatizer = WordNetLemmatizer()
 # English stop-word list loaded from NLTK's stopwords corpus.
 stopwords = nltk.corpus.stopwords.words('english')
 import gradio as gr
 # Text-classification pipeline for the named model. Presumably Hugging Face
 # transformers' `pipeline`; its import is outside this hunk - TODO confirm.
 pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")
 def classify(text):
     # Calls the pipeline on `text` with top_k = 2 and returns its raw output
     # wrapped in a dict under the "class" key.
     output = pipe(text,top_k = 2)
     return {"class": output}
 # NOTE(review): the label reads "pdf link" while classify() hands its argument
 # straight to the text classifier - confirm the label is what users should see.
 inputs = gr.inputs.Textbox(label="pdf link")

 # English stop-word list from NLTK; pre_processing_str_esg filters words against it.
 stopwords = nltk.corpus.stopwords.words('english')
 import gradio as gr
+def pre_processing_str_esg(df_col):
+    df_col = df_col.lower()
+    #defining the function to remove punctuation
+    def remove_punctuation(text):
+        punctuationfree="".join([i for i in text if i not in string.punctuation])
+        return punctuationfree
+    #storing the puntuation free text
+    df_col= remove_punctuation(df_col)
+    df_col = re.sub(r"http\S+", " ", df_col)
+    def remove_stopwords(text):
+        return " ".join([word for word in str(text).split() if word not in stopwords])
+    #applying the function
+    df_col = remove_stopwords(df_col)
+    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ' , df_col)
+    df_col = df_col.replace("¶", "")
+    df_col = df_col.replace("§", "")
+    df_col = df_col.replace('“', ' ')
+    df_col = df_col.replace('”', ' ')
+    df_col = df_col.replace('-', ' ')
+    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
+    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
+    df_col = REPLACE_BY_SPACE_RE.sub(' ',df_col)
+    df_col = BAD_SYMBOLS_RE.sub(' ',df_col)
+#     df_col = re.sub('W*dw*','',df_col)
+    df_col = re.sub('[0-9]+', ' ', df_col)
+    df_col = re.sub('  ', ' ', df_col)
+    def remove_emoji(string):
+        emoji_pattern = re.compile("["
+                               u"\U0001F600-\U0001F64F"  # emoticons
+                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+                               u"\U00002702-\U000027B0"
+                               u"\U000024C2-\U0001F251"
+                               "]+", flags=re.UNICODE)
+        return emoji_pattern.sub(r'', string)
+    df_col = remove_emoji(df_col)
+    return df_col
+def pre_processing_str(df_col):
+#    df_col = df_col.lower()
+    if len(df_col.split()) >= 70:
+        return pre_processing_str_esg(df_col)
+    else:
+        df_col = df_col.replace('#', '')
+        df_col = df_col.replace('!', '')
+        df_col = re.sub(r"http\S+", " ", df_col)
+        df_col = re.sub('[0-9]+', ' ', df_col)
+        df_col = re.sub('  ', ' ', df_col)
+        def remove_emojis(text):
+            return emoji.replace_emoji(text)
+        df_col = remove_emojis(df_col)
+        df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)
+        df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)
+        df_col = df_col.strip()
+        return df_col
 # Text-classification pipeline for the named model. Presumably Hugging Face
 # transformers' `pipeline`; its import is outside this hunk - TODO confirm.
 pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")
 def classify(text):
     # Clean the raw input with pre_processing_str() before it reaches the model.
+    text = pre_processing_str(text)
     # Calls the pipeline with top_k = 2 and returns its raw output wrapped in
     # a dict under the "class" key.
     output = pipe(text,top_k = 2)
     return {"class": output}
 # NOTE(review): the label reads "pdf link" while classify() treats its argument
 # as text to clean and classify - confirm the label is what users should see.
 # NOTE(review): `gr.inputs` looks like Gradio's legacy component namespace -
 # confirm it exists in the Gradio version this Space pins.
 inputs = gr.inputs.Textbox(label="pdf link")