Jainesh212 committed on
Commit
3042494
1 Parent(s): de71ace

Create inference-app.py

Files changed (1)
  1. inference-app.py +242 -0
inference-app.py ADDED
@@ -0,0 +1,242 @@
# math, AdamW and get_cosine_schedule_with_warmup are needed by configure_optimizers below.
import math
import random
import re

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
from bs4 import BeautifulSoup
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Build a combined test set: the comment text from test.csv joined with its labels.
train_path = "train.csv"
test_path = "test.csv"
test_labels_path = "test_labels.csv"
test_df = pd.read_csv(test_path)
test_labels_df = pd.read_csv(test_labels_path)
test_df = pd.concat([test_df.iloc[:, 1], test_labels_df.iloc[:, 1:]], axis=1)
test_df.to_csv("test-dataset.csv")
test_dataset_path = "test-dataset.csv"

# Let's make a new column labeled "healthy": 1 when every toxicity flag is 0.

def healthy_filter(df):
    if (df["toxic"]==0) and (df["severe_toxic"]==0) and (df["obscene"]==0) and (df["threat"]==0) and (df["insult"]==0) and (df["identity_hate"]==0):
        return 1
    else:
        return 0

attributes = ['toxic', 'severe_toxic', 'obscene', 'threat',
              'insult', 'identity_hate', 'healthy']
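
# Vectorized equivalent (an illustrative addition, not part of the original commit):
# computes the same "healthy" flag without a row-wise apply.
def healthy_filter_vectorized(frame: pd.DataFrame) -> pd.Series:
    toxicity_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    return (frame[toxicity_cols].sum(axis=1) == 0).astype(int)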

class Comments_Dataset(Dataset):
    def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=5000):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self):
        data = pd.read_csv(self.data_path)
        data["healthy"] = data.apply(healthy_filter, axis=1)
        data["unhealthy"] = np.where(data['healthy']==1, 0, 1)
        if self.sample is not None:
            # keep all unhealthy rows and downsample the healthy majority class
            unhealthy = data.loc[data["healthy"] == 0]
            healthy = data.loc[data["healthy"] == 1]
            self.data = pd.concat([unhealthy, healthy.sample(self.sample, random_state=42)])
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        comment = str(item.comment_text)
        attributes = torch.FloatTensor(item[self.attributes])
        tokens = self.tokenizer.encode_plus(comment,
                                            add_special_tokens=True,
                                            return_tensors='pt',
                                            truncation=True,
                                            padding='max_length',
                                            max_length=self.max_token_len,
                                            return_attention_mask=True)
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes}

class Comments_Data_Module(pl.LightningDataModule):

    def __init__(self, train_path, val_path, attributes, batch_size: int = 16, max_token_length: int = 128, model_name='roberta-base'):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_length = max_token_length
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage=None):
        if stage in (None, "fit"):
            self.train_dataset = Comments_Dataset(self.train_path, attributes=self.attributes, tokenizer=self.tokenizer)
            self.val_dataset = Comments_Dataset(self.val_path, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)
        if stage == 'predict':
            self.val_dataset = Comments_Dataset(self.val_path, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=4, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

comments_data_module = Comments_Data_Module(train_path, test_dataset_path, attributes=attributes)
comments_data_module.setup()
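
# Illustrative helper (an addition, not part of the original commit): peek at one
# training batch to confirm tensor shapes before wiring up the model.
def peek_batch(dm: Comments_Data_Module):
    batch = next(iter(dm.train_dataloader()))
    # expected: input_ids / attention_mask (batch_size, 128); labels (batch_size, len(attributes))
    return {name: tuple(tensor.shape) for name, tensor in batch.items()}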

class Comment_Classifier(pl.LightningModule):
    # the config dict holds the Hugging Face model name and training hyperparameters
    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict=True)
        self.hidden = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.CrossEntropyLoss()
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, labels=None):
        # roberta layer
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # final logits / classification layers
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # calculate loss
        loss = 0
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config['n_labels']), labels.view(-1, self.config['n_labels']))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        return outputs

    def configure_optimizers(self):
        # 'train_size' is len(train_dataloader), i.e. batches per epoch, so the
        # schedule spans all epochs rather than dividing by batch_size again.
        optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
        total_steps = self.config['train_size'] * self.config['n_epochs']
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]
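
# Note (an illustrative aside, not part of the original commit): torch.mean over
# last_hidden_state also averages embeddings at padding positions; a mask-aware
# mean pooling would look like this hypothetical helper.
def masked_mean(hidden: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).float()  # (batch, seq_len, 1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)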

config = {
    'model_name': 'distilroberta-base',
    'n_labels': len(attributes),
    'batch_size': 128,
    'lr': 1.5e-6,
    'warmup': 0.2,
    # number of batches per epoch, consumed by configure_optimizers above
    'train_size': len(comments_data_module.train_dataloader()),
    'weight_decay': 0.001,
    'n_epochs': 100
}

model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = Comment_Classifier(config=config)
# map_location lets the checkpoint load on CPU-only machines as well
model.load_state_dict(torch.load("model_state_dict.pt", map_location=torch.device(device)))
model.to(device)
model.eval()

def prepare_tokenized_review(raw_review):
    # Remove HTML tags with BeautifulSoup (an explicit parser avoids a bs4 warning)
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Remove non-letters (keeping ! and ?) with a regular expression
    review_text = re.sub("[^a-zA-Z!?]", " ", review_text)
    # Convert words to lower case and split them
    words = review_text.lower().split()

    return " ".join(words)

def get_encodings(text):
    # note: inference uses a longer max length (256) than the training dataset default (128)
    MAX_LEN = 256
    encodings = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt')
    return encodings

def run_inference(encoding):
    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device, dtype=torch.long)
        attention_mask = encoding['attention_mask'].to(device, dtype=torch.long)
        output = model(input_ids, attention_mask)
        # output is (loss, logits); softmax the first (only) row of logits
        final_output = torch.softmax(output[1][0], dim=0).cpu()
    return final_output.numpy().tolist()
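
# Illustrative composition (an addition, not part of the original commit) of the
# three helpers above: raw text in, highest-scoring attribute name out.
def classify_text(raw_text: str) -> str:
    cleaned = prepare_tokenized_review(raw_text)
    scores = run_inference(get_encodings(cleaned))
    return attributes[scores.index(max(scores))]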

test_tweets = test_df["comment_text"].values
# streamlit section
models = ["distilroberta-base"]
model_pointers = ["default: distilroberta-base"]

st.write("1. Hit the button to view a random tweet and its analysis")

with st.form(key="init_form"):
    # upper bound is len - 1 to avoid an IndexError on the last element
    current_random_tweet = test_tweets[random.randint(0, len(test_tweets) - 1)]
    current_random_tweet = prepare_tokenized_review(current_random_tweet)

    choice = st.selectbox("Choose Model", model_pointers)
    user_picked_model = models[model_pointers.index(choice)]

    with st.spinner("Analyzing..."):
        text_encoding = get_encodings(current_random_tweet)
        result = run_inference(text_encoding)
        df = pd.DataFrame({"Tweet": current_random_tweet}, index=[0])
        df["Highest Toxicity Class"] = attributes[result.index(max(result))]
        df["Sentiment Score"] = max(result)
        st.table(df)

    next_tweet = st.form_submit_button("Next Tweet")

    if next_tweet:
        # submitting the form reruns the script, which draws a new random tweet
        with st.spinner("Analyzing..."):
            st.write("")
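
# To launch the app locally with the standard Streamlit CLI:
#   streamlit run inference-app.py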