themeetjani
committed on
Commit
•
c63c7bb
1
Parent(s):
dc83878
Update app.py
Browse files
app.py
CHANGED
@@ -24,8 +24,70 @@ lemmatizer = WordNetLemmatizer()
|
|
24 |
stopwords = nltk.corpus.stopwords.words('english')
|
25 |
|
26 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
# Text-classification pipeline backed by a fine-tuned XLM-RoBERTa checkpoint.
pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")


def classify(text):
    """Return the two highest-scoring class predictions for *text*."""
    return {"class": pipe(text, top_k=2)}


inputs = gr.inputs.Textbox(label="pdf link")
|
|
|
24 |
stopwords = nltk.corpus.stopwords.words('english')
|
25 |
|
26 |
import gradio as gr
|
27 |
+
def pre_processing_str_esg(df_col):
    """Heavy-duty cleaning for long (ESG-style) text.

    Lower-cases the input, strips punctuation, URLs, English stopwords,
    special typographic symbols, digits and emoji, and returns the
    cleaned string.
    """
    df_col = df_col.lower()

    # First punctuation pass: delete punctuation characters outright.
    def remove_punctuation(text):
        return "".join(ch for ch in text if ch not in string.punctuation)

    df_col = remove_punctuation(df_col)
    df_col = re.sub(r"http\S+", " ", df_col)  # drop URLs

    # Drop English stopwords. Hoist the module-level list into a set once
    # so per-word membership tests are O(1) instead of O(n).
    stopword_set = set(stopwords)

    def remove_stopwords(text):
        return " ".join(w for w in str(text).split() if w not in stopword_set)

    df_col = remove_stopwords(df_col)

    # Second punctuation pass: replace with spaces rather than deleting.
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')

    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)

    # df_col = re.sub('W*dw*','',df_col)
    df_col = re.sub(r'[0-9]+', ' ', df_col)  # drop digits
    # BUG FIX: original was re.sub(' ', ' ', ...) — a no-op (replaces one
    # space with one space). The intent is to collapse runs of spaces.
    df_col = re.sub(r' +', ' ', df_col)

    # Renamed parameter from `string` to `text`: the old name shadowed the
    # `string` module used above in this function.
    def remove_emoji(text):
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE,
        )
        return emoji_pattern.sub(r'', text)

    df_col = remove_emoji(df_col)

    return df_col
|
69 |
+
|
70 |
+
def pre_processing_str(df_col):
    """Light cleaning for short text; defers to the ESG cleaner for long text.

    Inputs of 70 or more whitespace-separated words get the full
    pre_processing_str_esg treatment; shorter inputs only get hashtag,
    URL, digit, emoji, @mention and non-printable-ASCII stripping.
    """
    # df_col = df_col.lower()
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    df_col = df_col.replace('#', '')
    df_col = df_col.replace('!', '')
    df_col = re.sub(r"http\S+", " ", df_col)  # drop URLs

    df_col = re.sub(r'[0-9]+', ' ', df_col)  # drop digits
    # BUG FIX: original was re.sub(' ', ' ', ...) — a no-op (replaces one
    # space with one space). The intent is to collapse runs of spaces.
    df_col = re.sub(r' +', ' ', df_col)

    # NOTE(review): `emoji` is a third-party package; presumably imported
    # near the top of this file (outside this view) — verify.
    df_col = emoji.replace_emoji(df_col)
    df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)  # @mentions / leftover URLs
    df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)  # keep printable ASCII only
    df_col = df_col.strip()
    return df_col
|
88 |
# Classification pipeline (fine-tuned XLM-RoBERTa, 16 classes).
pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")


def classify(text):
    """Pre-process *text*, then return its top-2 predicted classes."""
    cleaned = pre_processing_str(text)
    return {"class": pipe(cleaned, top_k=2)}


inputs = gr.inputs.Textbox(label="pdf link")
|