# NOTE(review): removed non-Python extraction residue that preceded this file
# (file-size line, git blame hashes, and a line-number gutter) — it was not
# part of the source and broke parsing.
# --- Imports and one-time NLP setup ----------------------------------------
from transformers import pipeline
import numpy as np
import transformers
import json
import pandas as pd
import emoji
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # PorterStemmer LancasterStemmer
from nltk.stem import WordNetLemmatizer
import re
# NOTE(review): stemmer is never used below — presumably left over from an
# earlier version of the cleanup pipeline.
stemmer = PorterStemmer()
# NLTK corpora required by the cleanup helpers. These downloads are active on
# every run (the old "uncomment when run first time" note was stale); NLTK
# caches them locally, so repeat runs are cheap.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
# NOTE(review): lemmatizer is also unused below.
lemmatizer = WordNetLemmatizer()
# Rebinds the imported `stopwords` MODULE name to the English stopword LIST;
# the preprocessing helpers below depend on this module-level list.
stopwords = nltk.corpus.stopwords.words('english')
import gradio as gr
def pre_processing_str_esg(df_col, stop_words=None):
    """Heavy cleanup for long (ESG-style) texts.

    Lowercases the text, strips punctuation, URLs, stopwords, layout
    symbols, digits and emoji, and collapses runs of spaces.

    Args:
        df_col: the raw input string.
        stop_words: optional iterable of stopwords to remove; defaults to
            the module-level English ``stopwords`` list.

    Returns:
        The cleaned string.
    """
    if stop_words is None:
        stop_words = stopwords  # module-level English stopword list
    df_col = df_col.lower()

    def _remove_punctuation(text):
        # Drop every character listed in string.punctuation.
        return "".join(ch for ch in text if ch not in string.punctuation)

    df_col = _remove_punctuation(df_col)
    # Remove URL remnants. This runs AFTER punctuation removal, so a URL has
    # already collapsed to a single token (e.g. "httpsxcom") which \S+ catches.
    df_col = re.sub(r"http\S+", " ", df_col)

    def _remove_stopwords(text):
        # split()/join also collapses internal whitespace as a side effect.
        return " ".join(w for w in str(text).split() if w not in stop_words)

    df_col = _remove_stopwords(df_col)
    # Defensive second punctuation pass, replacing with spaces this time.
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    # Layout/typography symbols not covered by ASCII punctuation.
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)
    # Strip digit runs, then collapse the resulting space runs.
    df_col = re.sub('[0-9]+', ' ', df_col)
    # BUGFIX: the original re.sub(' ', ' ', ...) was a no-op; the intent was
    # to squeeze consecutive spaces into one.
    df_col = re.sub(' +', ' ', df_col)

    def _remove_emoji(text):  # renamed param: original shadowed the `string` module
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    df_col = _remove_emoji(df_col)
    return df_col
def pre_processing_str(df_col):
    """Clean an input string before classification.

    Texts of 70+ words are routed to the heavier ``pre_processing_str_esg``
    pipeline; shorter texts get a light cleanup (hash/bang removal, URL and
    digit stripping, emoji removal, ASCII-only filtering).

    Note: deliberately does NOT lowercase short texts (original behavior).

    Args:
        df_col: the raw input string.

    Returns:
        The cleaned string (stripped of leading/trailing whitespace on the
        short-text path).
    """
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)
    df_col = df_col.replace('#', '')
    df_col = df_col.replace('!', '')
    df_col = re.sub(r"http\S+", " ", df_col)
    df_col = re.sub('[0-9]+', ' ', df_col)
    # BUGFIX: the original re.sub(' ', ' ', ...) was a no-op; the intent was
    # to squeeze consecutive spaces into one.
    df_col = re.sub(' +', ' ', df_col)
    # Strip emoji via the third-party `emoji` package.
    df_col = emoji.replace_emoji(df_col)
    # Remove @mentions and any surviving URLs.
    df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)
    # Keep printable ASCII only.
    df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)
    return df_col.strip()
pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")
def classify(text):
    """Classify a piece of text with the module-level pipeline.

    The input is cleaned with ``pre_processing_str`` first; the result is a
    dict holding the pipeline's top-2 label predictions under "class".
    """
    cleaned = pre_processing_str(text)
    predictions = pipe(cleaned, top_k=2)
    return {"class": predictions}
# Minimal Gradio UI: free-text in, raw classification result out.
# (Removed dead commented-out code using the deprecated gr.inputs/gr.outputs
# API, and a stray trailing "|" artifact that broke the final line.)
demo = gr.Interface(fn=classify, inputs="text", outputs="text")
demo.launch()