## Airline Sentiment Prediction using BERT

### Approach
I first analysed the data and found a large class imbalance in the dataset. To resolve this, I used TextAttack to augment the data.
Before augmenting the dataset, I used the following techniques to clean the data and reduce noise:
- Removed the @usernames
- Removed the URLs
- Removed the `#` symbol from hashtags
- Replacement of emojis with their meaning

After cleaning the data, I used TextAttack's EasyDataAugmenter to augment it; augmenting the data increased the accuracy of the model by more than 3%. I also tried CLARE (which replaces words with their synonyms), but it was very resource-intensive and took a very long time to produce output.

### Model
Since this was a binary classification task, I used BERT to train the model. I used the pretrained BERT model from the Hugging Face Transformers library with the following parameters:
- BERT-base-uncased
- Max length of the input sequence: 128
- Learning rate: 3e-5
- Batch size: 32

### Results
The dataset was split in an 80:20 ratio for training and validation.
I got the following results after training the model:
- Training loss: 0.0137
- Validation loss: 0.1209
- Training accuracy: 0.9955
- Validation accuracy: 0.9794


========================================================================================================================================

Install the required libraries

In [None]:
%pip install transformers
%pip install emoji
%pip install numpy pandas
%pip install scikit-learn
%pip install textattack

Importing the libraries

In [None]:
import numpy as np
import pandas as pd
from pprint import pprint

Reading the data

In [None]:
# Load the raw airline tweets; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("airline_sentiment_analysis.csv")
# Preview the first 20 rows.
df.head(20)

Assigning 1 to positive sentiment and 0 to negative sentiment

In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
# A single vectorised replace covers the whole column in one pass. Assigning
# the result back (instead of a chained `inplace=True` on a column selection)
# also keeps the update effective under pandas copy-on-write.
# Any label other than 'positive'/'negative' is left untouched.
df['airline_sentiment'] = df['airline_sentiment'].replace({'positive': 1, 'negative': 0})
df.head(20)

Remove the @usernames, URLs, hashtags & Replace the emojis with their meaning

In [None]:

import emoji

# Clean the tweet text with vectorised string operations.
#
# Order matters: URLs and @mentions are stripped FIRST. The emoji step below
# turns every ":" into a space, which would split "https://..." into
# "https //..." and stop the URL pattern from ever matching.
text = df['text'].str.replace("https?://[A-Za-z0-9./]+", "", regex=True)
text = text.str.replace("@[A-Za-z0-9]+", "", regex=True)
# Only the "#" symbol is dropped; the hashtag word itself is kept.
text = text.str.replace("#", "", regex=True)

# Replace each emoji with its textual name (":name:"), then turn the
# surrounding colons into spaces so the name reads as ordinary words.
text = text.map(emoji.demojize).str.replace(":", " ", regex=False)

# Keep only letters and basic punctuation; everything else becomes a space.
text = text.str.replace("[^a-zA-Z.!?']", " ", regex=True)

# Collapse the whitespace runs left behind by the replacements above.
df['text'] = text.str.split().str.join(" ")


df.head(20)

Augmenting positive sentiment using EasyDataAugmenter

In [None]:
from textattack.augmentation import EasyDataAugmenter

# Text of every row labelled positive (1); these are the rows to augment.
positive_feedback = df.loc[df["airline_sentiment"] == 1, "text"].tolist()
# pprint(positive_feedback)

esy_aug = EasyDataAugmenter()
# One augment() result per sentence; each result is iterated as a collection
# of augmented variants and flattened into a single list.
aug_list = [esy_aug.augment(sen) for sen in positive_feedback]
serial_list = [sen for variants in aug_list for sen in variants]

# Drop the first column by position — presumably the index column stored in
# the CSV; TODO confirm against the file's actual schema.
df = df.drop(df.columns[[0]], axis=1)

# All augmented sentences carry the positive label.
df2 = pd.DataFrame(
    {"airline_sentiment": [1] * len(serial_list), "text": serial_list},
    columns=["airline_sentiment", "text"],
)

# ignore_index=True gives the combined frame a fresh unique index; without it
# the augmented rows would reuse labels 0..m-1 already present in `df`, and the
# duplicates would also be written to the CSV.
df = pd.concat([df, df2], ignore_index=True)

# NOTE(review): augmentation happens before the train/validation split in the
# next cell, so variants of the same tweet can land on both sides and may
# inflate validation accuracy — consider augmenting only the training split.

df.to_csv("modified.csv")  # save the modified dataset
df.head()

Split dataset into train & validation in 80:20 ratio

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the validation set (named `test` here); the
# fixed seed makes the shuffle, and therefore the split, reproducible.
train, test = train_test_split(df, random_state=42, test_size=0.2)


Initialise the BERT model & tokenizer

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# Tokenizer and classifier are loaded from the same pretrained checkpoint.
# The classification head is left at the library's default configuration.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

Utility function to convert the data into the format required by BERT

In [None]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Wrap every row of the two frames in an ``InputExample``.

    Args:
        train: DataFrame holding the training rows.
        test: DataFrame holding the validation rows.
        DATA_COLUMN: name of the column containing the text.
        LABEL_COLUMN: name of the column containing the label.

    Returns:
        A ``(train_examples, validation_examples)`` pair, each being the
        result of a row-wise ``DataFrame.apply`` that builds one
        ``InputExample`` per row.
    """

    def row_to_example(row):
        # guid is a bookkeeping ID that is unused here; there is no second
        # text segment, so text_b stays None.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    train_InputExamples = train.apply(row_to_example, axis=1)
    validation_InputExamples = test.apply(row_to_example, axis=1)

    return train_InputExamples, validation_InputExamples

 
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` (the text) and
            ``label`` attributes.
        tokenizer: tokenizer used to encode each text; it is called with
            truncation and fixed-length padding enabled.
        max_length: length every sequence is truncated / padded to.

    Returns:
        A ``tf.data.Dataset`` built from a generator. Each element is a pair
        ``(inputs, label)`` where ``inputs`` is a dict with int32
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` entries and
        ``label`` is an int64 scalar.
    """
    features = []  # InputFeatures, tokenized eagerly and replayed by gen()

    for e in examples:
        # `padding="max_length"` replaces the deprecated
        # `pad_to_max_length=True`; together with `truncation=True` every
        # encoded sequence comes out exactly `max_length` tokens long.
        input_dict = tokenizer(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield (inputs, label) pairs in the structure declared below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


BERT model for training

In [None]:
# Columns holding the tweet text and its integer label.
DATA_COLUMN = 'text'
LABEL_COLUMN = 'airline_sentiment'

train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train, test, DATA_COLUMN, LABEL_COLUMN
)

# Training stream: shuffled with a 100-element buffer, batched by 32, and the
# batched stream repeated twice.
train_data = (
    convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Validation stream: batched only, no shuffling or repetition.
validation_data = convert_examples_to_tf_dataset(
    list(validation_InputExamples), tokenizer
).batch(32)

In [None]:
# Adam with a 3e-5 learning rate; gradients are clipped to norm 1.0.
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# from_logits=True: the loss is given raw logits rather than probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=adam, loss=loss_fn, metrics=[accuracy])

# Train for two epochs, evaluating on the validation stream after each one.
model.fit(train_data, epochs=2, validation_data=validation_data)

Saving the trained weights

In [None]:
# Persist only the model weights to "weights.h5" in the working directory.
model.save_weights("weights.h5")

Inference: Predicting the sentiment of the tweet

In [None]:
# Sample tweets to classify.
# NOTE(review): the second sample looks mis-encoded (probably meant to be an
# emoji) — confirm against the original notebook.
pred_data = ["@abc The flight was great", "@abc ☚ī¸","🎊 it was bad experience"]
pred_data = pd.DataFrame(pred_data)

# Apply the cleaning steps to column 0 with vectorised string operations, so
# each step builds on the result of the previous one. (A row loop that reads
# the stale row copy from iterrows would discard the demojize result.)
#
# URLs and @mentions go first: the colon clean-up below would otherwise split
# "https://" and stop the URL pattern from matching.
text = pred_data[0].str.replace("https?://[A-Za-z0-9./]+", "", regex=True)
text = text.str.replace("@[A-Za-z0-9]+", "", regex=True)
# Only the "#" symbol is dropped; the hashtag word itself is kept.
text = text.str.replace("#", "", regex=True)

# Emoji -> ":name:", then colons -> spaces so the name reads as plain words.
text = text.map(emoji.demojize).str.replace(":", " ", regex=False)

# Keep only letters and basic punctuation, then collapse whitespace runs.
text = text.str.replace("[^a-zA-Z.!?']", " ", regex=True)
pred_data[0] = text.str.split().str.join(" ")

pred_data.head()


In [None]:
# Turn the cleaned column into a plain list of strings for the tokenizer.
pred_data = pred_data[0].values.tolist()
print(pred_data)

# Encode the whole batch at once, padded to the longest sentence in it.
tf_batch = tokenizer(pred_data, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)

# First model output -> softmax over the last axis -> index of the top class.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1).numpy()

for sentence, class_idx in zip(pred_data, label):
    print(sentence, ": \n", labels[class_idx])