themeetjani committed on
Commit
c63c7bb
1 Parent(s): dc83878

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -0
app.py CHANGED
@@ -24,8 +24,70 @@ lemmatizer = WordNetLemmatizer()
24
  stopwords = nltk.corpus.stopwords.words('english')
25
 
26
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")
28
  def classify(text):
 
29
  output = pipe(text,top_k = 2)
30
  return {"class": output}
31
  inputs = gr.inputs.Textbox(label="pdf link")
 
24
  stopwords = nltk.corpus.stopwords.words('english')
25
 
26
  import gradio as gr
27
def pre_processing_str_esg(df_col: str) -> str:
    """Reduce *df_col* to lowercase ``a-z`` words separated by single spaces.

    Steps, in order:

    1. Lowercase the text.
    2. Delete every ``string.punctuation`` character. Nothing is inserted in
       its place, so ``"e.g."`` becomes ``"eg"``.
    3. Replace each ``http`` followed by non-whitespace (up to the next
       whitespace) with a space.
    4. Drop words that appear in the module-level ``stopwords`` collection.
    5. Delete the pilcrow and section signs, then replace every remaining
       character outside ``a-z``/space (digits, curly quotes, emoji and any
       other non-ASCII character) with a space.
    6. Collapse runs of spaces into one space.

    Args:
        df_col: The text to clean.

    Returns:
        The cleaned text. It is not stripped, so it may still begin or end
        with a single space.
    """
    text = df_col.lower()

    # Punctuation is deleted rather than spaced out; this happens before the
    # URL and stop-word steps, so both of them see punctuation-free tokens.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"http\S+", " ", text)

    # Build the set once per call so each word is a hash lookup instead of a
    # scan over the whole stop-word list.
    stopword_set = set(stopwords)
    text = " ".join(word for word in text.split() if word not in stopword_set)

    # These two marks are removed outright so the letters around them stay
    # joined; every other non-letter becomes a word break below.
    text = text.replace("¶", "").replace("§", "")

    # One pass is enough here: ASCII punctuation is already gone, so the only
    # characters left outside a-z/space are digits and non-ASCII symbols.
    text = re.sub(r"[^a-z ]+", " ", text)

    # Collapse the gaps left behind by the substitutions above.
    return re.sub(r" {2,}", " ", text)
69
+
70
def pre_processing_str(df_col: str) -> str:
    """Clean *df_col* before classification, choosing the routine by length.

    Texts with 70 or more whitespace-separated words are handed to
    ``pre_processing_str_esg``. Shorter texts keep their case and are cleaned
    more lightly:

    * ``#`` and ``!`` are deleted;
    * ``http...`` tokens and runs of digits are replaced with a space;
    * emoji are removed with ``emoji.replace_emoji``;
    * tokens starting at ``@``, ``http://`` or ``https://`` are deleted;
    * characters outside printable ASCII (0x20-0x7E) are deleted;
    * whitespace is collapsed to single spaces and trimmed at both ends.

    Args:
        df_col: The text to clean.

    Returns:
        The cleaned text.
    """
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    text = df_col.replace("#", "").replace("!", "")
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[0-9]+", " ", text)
    text = emoji.replace_emoji(text)
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    # Turn newlines, tabs and other whitespace into plain spaces first;
    # otherwise the ASCII filter below deletes them and glues the words on
    # either side together.
    text = re.sub(r"\s", " ", text)
    text = re.sub(r"[^\x20-\x7E]+", "", text)

    # The removals above leave double spaces behind; split/join collapses
    # them and trims both ends in one step.
    return " ".join(text.split())
88
  pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")
89
  def classify(text):
90
+ text = pre_processing_str(text)
91
  output = pipe(text,top_k = 2)
92
  return {"class": output}
93
  inputs = gr.inputs.Textbox(label="pdf link")