6023oji committed on
Commit
e5f7925
1 Parent(s): 74bb020

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +324 -0
app.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Automatically generated by Colaboratory.
3
+
4
+ Original file is located at
5
+ https://colab.research.google.com/drive/1_cVBwxsa7LcHzjzCcS4l1ds0wxNPQrjm
6
+ """
7
+
8
+ from google.colab import drive
9
+ drive.mount('/content/drive')
10
+
11
+ import pandas as pd
12
+ import numpy as np
13
+
14
+ import warnings
15
+ warnings.filterwarnings('ignore') # to avoid warnings
16
+
17
+ import random
18
+ import pandas as pd
19
+ from tqdm import tqdm
20
+ import seaborn as sns
21
+ import matplotlib.pyplot as plt
22
+
23
+ """
24
+ Sklearn Libraries
25
+ """
26
+ from sklearn.metrics import f1_score
27
+ from sklearn.model_selection import train_test_split
28
+
29
+ """
30
+ Transformer Libraries
31
+ """
32
# The `transformers` package must be installed before this module is run
# (for example `pip install transformers`, or an entry in requirements.txt).
# The notebook shell escape `!pip install transformers` that used to live here
# is IPython-only syntax and is a SyntaxError in a plain .py file.
33
+ from transformers import BertTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
34
+
35
+ """
36
+ Pytorch Libraries
37
+ """
38
+ import torch
39
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
40
+
41
# Load the labelled ESG corpus from the mounted Google Drive folder.
esg_data = pd.read_csv(
    "/content/drive/MyDrive/kpmg_personal/concat.csv",
    encoding='utf-8',
)

# Bare expression kept from the notebook cell: it displays the frame in a
# notebook and is a no-op when run as a script.
esg_data

# Bar chart with one bar per value of the `category` column.
plt.figure(figsize=(15, 8))
sns.set(style='darkgrid')
sns.set(font_scale=1.3)  # larger fonts so the figure is easier to read
sns.countplot(data=esg_data, x='category')
plt.title('ESG Category Distribution')
plt.xlabel('E,S,G,N')
plt.ylabel('Number of Contents')
56
+
57
def show_random_contents(total_number, df):
    """Print a random sample of rows from *df*.

    Args:
        total_number: Number of rows to draw (forwarded to ``DataFrame.sample``).
        df: DataFrame that has ``category`` and ``contents`` columns.

    For every sampled row the index label, the category and the contents are
    printed, followed by an empty line.
    """
    # Draw the random rows once.
    n_contents = df.sample(total_number)

    # Read the values straight from the sampled rows. Looking them up again
    # with ``df.iloc[label]`` would treat the index *label* as a *position*,
    # which prints the wrong row (or raises IndexError) whenever the index is
    # not the default 0..n-1 range.
    sampled_rows = zip(
        n_contents.index,
        n_contents["category"],
        n_contents["contents"],
    )
    for label, category, contents in sampled_rows:
        print("Contents #°{}".format(label))
        print(" - Category: {}".format(category))
        print(" - Contents: {}".format(contents))
        print("")
68
+
69
# Print five randomly chosen rows of the data set.
show_random_contents(total_number=5, df=esg_data)
71
+
72
def encode_categories_values(df):
    """Add an integer ``label`` column derived from ``df.category``.

    Every distinct category is numbered in order of first appearance
    (0, 1, 2, ...). The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``category`` column.

    Returns:
        A ``(df, category_dict)`` tuple where ``category_dict`` maps each
        category value to its integer code.
    """
    # category -> running integer id, in order of first occurrence
    category_dict = {
        category: code
        for code, category in enumerate(df.category.unique())
    }

    # Encode all the category values
    df['label'] = df.category.replace(category_dict)

    return df, category_dict
84
+
85
# Add the integer `label` column and keep the category -> id mapping.
esg_data, category_dict = encode_categories_values(esg_data)

# Stratified 85/15 split over the row index, so each split keeps the label
# proportions of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    esg_data.index.values,
    esg_data.label.values,
    test_size=0.15,
    random_state=2022,
    stratify=esg_data.label.values,
)

# Tag every row with the split it was assigned to.
esg_data.loc[X_train, 'data_type'] = 'train'
esg_data.loc[X_val, 'data_type'] = 'val'

# Visualize the number of rows per category in each split
# (notebook cell output; the result is discarded when run as a script).
esg_data.groupby(['category', 'label', 'data_type']).count()

# Get the FinBERT tokenizer
finbert_tokenizer = BertTokenizer.from_pretrained(
    'snunlp/KR-FinBert-SC',
    do_lower_case=True,
)
103
+
104
def get_contents_len(df):
    """Return the encoded length of every entry in ``df.contents``.

    Each text is encoded with the module-level ``finbert_tokenizer`` (special
    tokens included) and only the length of the resulting id list is kept.
    Progress is shown with ``tqdm`` and start/end messages are printed.

    Args:
        df: DataFrame with a ``contents`` column.

    Returns:
        A list with one length per row, in row order.
    """
    print("Encoding in progress...")
    contents_sequence_lengths = [
        len(finbert_tokenizer.encode(text, add_special_tokens=True))
        for text in tqdm(df.contents)
    ]
    print("End of Task.")

    return contents_sequence_lengths
118
+
119
def show_contents_distribution(sequence_lengths, figsize=(15, 8)):
    """Report and plot the distribution of encoded content lengths.

    Prints the maximum length and the share of entries longer than 512, then
    draws a histogram of all lengths.

    Args:
        sequence_lengths: Sized collection of integer lengths, one per entry.
        figsize: Matplotlib figure size as ``(width, height)``.

    Returns:
        None. With an empty input a message is printed and nothing is plotted.
    """
    total = len(sequence_lengths)
    if total == 0:
        # Avoid ZeroDivisionError / max() on an empty sequence.
        print("No sequence lengths to plot.")
        return

    # Percentage of entries with length > 512. This value was computed but
    # never shown before; it is now printed next to the maximum.
    over_limit = sum(1 for length in sequence_lengths if length > 512)
    percent = (over_limit / total) * 100

    print("Maximum Sequence Length is {}".format(max(sequence_lengths)))
    print("Contents longer than 512 tokens: {:.2f}%".format(percent))

    # Configure the plot size
    plt.figure(figsize=figsize)

    sns.set(style='darkgrid')

    # Increase information on the figure
    sns.set(font_scale=1.3)

    # Plot the result. `histplot` is the supported replacement for the
    # deprecated `distplot(..., kde=False, rug=False)`.
    sns.histplot(sequence_lengths, kde=False)
    plt.title('Contents Lengths Distribution')
    plt.xlabel('Contents Length')
    plt.ylabel('Number of Contents')
140
+
141
# Encode every document once and plot how long the encoded sequences are.
show_contents_distribution(sequence_lengths=get_contents_len(df=esg_data))
142
+
143
# Encode the training and validation data.
#
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` cuts texts that encode to more than `max_length` tokens.
# Without truncation such rows keep their full length, so the rows no longer
# share one length and cannot be returned as a single 'pt' tensor.
# The texts are passed as plain Python lists of strings.
encoded_data_train = finbert_tokenizer.batch_encode_plus(
    esg_data[esg_data.data_type == 'train'].contents.values.tolist(),
    return_tensors='pt',
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=200,  # fixed sequence length used for every example
)

encoded_data_val = finbert_tokenizer.batch_encode_plus(
    esg_data[esg_data.data_type == 'val'].contents.values.tolist(),
    return_tensors='pt',
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=200,  # must match the training sequence length
)
161
+
162
+
163
# Tensors for the training split: token ids, attention masks, integer labels.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(
    esg_data.loc[esg_data.data_type == 'train', 'label'].values
)

# The same three tensors for the validation split.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
sentiments_val = torch.tensor(
    esg_data.loc[esg_data.data_type == 'val', 'label'].values
)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, sentiments_val)

# Pretrained checkpoint with one output per entry of `category_dict`.
model = AutoModelForSequenceClassification.from_pretrained(
    "snunlp/KR-FinBert-SC",
    num_labels=len(category_dict),
    output_attentions=False,
    output_hidden_states=False,
    ignore_mismatched_sizes=True,
)

batch_size = 5

# Training batches are drawn in random order, validation batches in order.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train),
)

dataloader_validation = DataLoader(
    dataset_val,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val),
)

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

epochs = 5

# Linear schedule with no warmup, spanning every optimizer step of the run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs,
)
200
+
201
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores; the arg-max over axis 1 is taken
            as the predicted class.
        labels: Array of true class ids; it is flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')
205
+
206
def accuracy_per_class(preds, labels):
    """Print, for every class present in *labels*, correct/total counts.

    Args:
        preds: 2-D array of per-class scores; the arg-max over axis 1 is taken
            as the predicted class.
        labels: Array of true class ids.

    Class names are looked up through the module-level ``category_dict``.
    """
    # id -> category name, the inverse of the encoding mapping
    id_to_category = {code: name for name, code in category_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        # Predictions for the rows whose true class is `class_id`.
        class_preds = predicted[in_class]
        n_correct = np.count_nonzero(class_preds == class_id)
        n_total = np.count_nonzero(in_class)
        print(f'Class: {id_to_category[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')
217
+
218
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
# NOTE(review): this runs after the train/validation split and after the model
# was instantiated above, so it only affects randomness from here on.
seed_val = 2022
torch.cuda.manual_seed_all(seed_val)
torch.manual_seed(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
226
+
227
+
228
def evaluate(dataloader_val):
    """Run the module-level ``model`` over *dataloader_val* without gradients.

    Args:
        dataloader_val: Iterable of batches; each batch is indexed as
            ``(input_ids, attention_mask, labels)``.

    Returns:
        A ``(loss_val_avg, predictions, true_vals)`` tuple: the mean of the
        per-batch losses, the concatenated logits and the concatenated labels
        (both as NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logits_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        # Move every tensor of the batch to the device the model lives on.
        moved = tuple(tensor.to(device) for tensor in batch)
        labels = moved[2]

        with torch.no_grad():
            outputs = model(
                input_ids=moved[0],
                attention_mask=moved[1],
                labels=labels,
            )

        # outputs[0] is the loss, outputs[1] the logits.
        running_loss += outputs[0].item()
        logits_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    loss_val_avg = running_loss/len(dataloader_val)

    predictions = np.concatenate(logits_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals
262
+
263
+
264
# Fine-tuning loop: one pass over the training data per epoch, followed by a
# checkpoint and an evaluation on the validation split.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2],
                  }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss exactly as it is accumulated above. It used to
        # be divided by len(batch), but `batch` is the tuple of three tensors
        # (ids, masks, labels), so that divided every value by 3 rather than
        # by a batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # One checkpoint per epoch, named after the epoch number.
    torch.save(model.state_dict(), f'finetuned_finBERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
306
+
307
# Rebuild the architecture from the pretrained checkpoint, then overwrite its
# weights with the ones saved after epoch 4 of the loop above.
model = AutoModelForSequenceClassification.from_pretrained(
    "snunlp/KR-FinBert-SC",
    num_labels=len(category_dict),
    output_attentions=False,
    output_hidden_states=False,
    ignore_mismatched_sizes=True,
)

model.to(device)

# The checkpoint is read onto the CPU and copied into the model's parameters.
model.load_state_dict(
    torch.load(
        'finetuned_finBERT_epoch_4.model',
        map_location=torch.device('cpu'),
    )
)

# Only the logits and labels are needed here; the loss is discarded.
_, predictions, true_vals = evaluate(dataloader_validation)

accuracy_per_class(predictions, true_vals)
321
+
322
+ # max_length = 200
323
+
324
+