# -*- coding: utf-8 -*-
"""Roberta sentiment Analysis
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/10L1VfVMZLa62qTFdUIOURELW194TjJ4e
"""
# Install required libraries
!pip install datasets transformers huggingface_hub -q
# Import key libraries and packages
import numpy as np
import os
import pandas as pd
from datasets import load_dataset, load_metric  # load_metric is deprecated in newer `datasets` releases in favor of `evaluate.load`
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
from google.colab import drive
# Disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"
drive.mount('/content/drive')
# Load the datasets
train_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/hugging.csv").dropna(axis=0)
test_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/Testhugging.csv").fillna("")
# Preview the data and check the remaining missing-value counts
# (bare expressions display their output in Colab cells)
train_df.head()
test_df.head()
train_df.isnull().sum()
test_df.isnull().sum()
"""Fine-tuning the roberta model"""
# Hold out 20% of the training data for evaluation, stratified by label
train_df, eval_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])
print(f"new dataframe shapes: train is {train_df.shape}, eval is {eval_df.shape}")
# Save the split subsets
train_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv", index=False)
eval_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv", index=False)
dataset = load_dataset(
    'csv',
    data_files={'train': '/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv',
                'eval': '/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv'},
    encoding="ISO-8859-1")
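# Quick check (optional): the DatasetDict should report 'train' and 'eval'
# splits with the raw CSV columns (tweet_id, safe_text, label, agreement)
print(dataset)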
# Instantiate the tokenizer (num_labels is a model argument, not a tokenizer one)
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
# Define helper functions
## Function to transform labels
def transform_labels(label):
    label = label['label']
    num = 0
    if label == -1:    # 'Negative'
        num = 0
    elif label == 0:   # 'Neutral'
        num = 1
    elif label == 1:   # 'Positive'
        num = 2
    return {'labels': num}
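# Sanity check (optional): raw labels {-1, 0, 1} map to contiguous ids {0, 1, 2}
assert transform_labels({'label': -1}) == {'labels': 0}
assert transform_labels({'label': 1}) == {'labels': 2}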
## Function to tokenize data
def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length', truncation=True, max_length=256)
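# Sanity check (optional): a RoBERTa tokenizer returns input_ids and an
# attention_mask, each padded/truncated to max_length
sample = tokenize_data({'safe_text': "Vaccines save lives"})
print(list(sample.keys()))  # ['input_ids', 'attention_mask']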
# Tokenize the tweets
dataset = dataset.map(tokenize_data, batched=True)
# Transform labels and limit the columns
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)
# Define training arguments
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model",
    num_train_epochs=4,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
# Load the pretrained model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)
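# Optional: attach human-readable label names so downstream pipelines report
# 'Negative'/'Neutral'/'Positive' instead of the default LABEL_0/1/2
# (mapping taken from transform_labels above):
# model.config.id2label = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
# model.config.label2id = {'Negative': 0, 'Neutral': 1, 'Positive': 2}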
# Define evaluation metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
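# Example: argmax over the class axis turns a row of logits into a label id,
# e.g. [[0.1, 0.2, 1.5]] -> [2], i.e. 'Positive' under the mapping above
print(np.argmax([[0.1, 0.2, 1.5]], axis=-1))  # [2]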
# Instantiate the training and evaluation sets
train_dataset = dataset["train"].shuffle(seed=24)
eval_dataset = dataset["eval"].shuffle(seed=24)
# Data collator: assembles examples into padded PyTorch batches
# (inputs are already padded to max_length above, so this mainly handles batching)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Instantiate the trainer once with the tokenizer, collator, and metrics so
# the same setup serves both training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
# Launch the final evaluation
trainer.evaluate()
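# The test set loaded above is never scored in this script; a minimal
# batch-inference sketch, assuming test_df keeps the same 'safe_text' column:
# import torch
# enc = tokenizer(list(test_df['safe_text'][:8]), padding=True, truncation=True,
#                 max_length=256, return_tensors='pt').to(model.device)
# with torch.no_grad():
#     preds = model(**enc).logits.argmax(dim=-1)  # 0=Negative, 1=Neutral, 2=Positive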
# Login to HF hub
notebook_login()
# Push model and tokenizer to HF Hub
model.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
tokenizer.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
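# Once pushed, the model can be reloaded anywhere (a minimal sketch; labels
# surface as LABEL_0/1/2 unless the id2label mapping above is set):
# from transformers import pipeline
# classifier = pipeline("sentiment-analysis", model="MavisAJ/Sentiment_analysis_roberta_model")
# classifier("I got my vaccine today!")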