# -*- coding: utf-8 -*-
"""RoBERTa Sentiment Analysis

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/10L1VfVMZLa62qTFdUIOURELW194TjJ4e
"""
# Install required libraries
!pip install datasets transformers huggingface_hub -q
# Import key libraries and packages
import os

import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
from google.colab import drive
# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Mount Google Drive
drive.mount('/content/drive')
# Load the datasets
train_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/hugging.csv").dropna(axis=0)
test_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/Testhugging.csv").fillna("")

# Preview the data and confirm there are no missing values left
train_df.head()
test_df.head()
train_df.isnull().sum()
test_df.isnull().sum()
"""Fine-tuning the roberta model""" | |
train_df, eval = train_test_split(train_df, test_size=0.2, random_state=42, stratify= train_df['label']) | |
print(f"new dataframe shapes: train is {train_df.shape}, eval is {eval.shape}") | |
# Save splitted subsets | |
train_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv", index=False) | |
eval.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv", index=False) | |
dataset = load_dataset('csv',
                       data_files={'train': '/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv',
                                   'eval': '/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv'},
                       encoding="ISO-8859-1")
# Instantiate the tokenizer (num_labels is a model argument, not a tokenizer one)
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
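# Quick sanity check (added, illustrative; the sample text is made up): the
# tokenizer should return input_ids and an attention_mask for a sample tweet.
sample_encoding = tokenizer("Vaccines are safe and effective.", truncation=True, max_length=256)
print(sample_encoding.keys())  # dict_keys(['input_ids', 'attention_mask'])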
# Define helper functions

## Function to transform labels (-1/0/1 -> 0/1/2)
def transform_labels(label):
    label = label['label']
    num = 0
    if label == -1:   # 'Negative'
        num = 0
    elif label == 0:  # 'Neutral'
        num = 1
    elif label == 1:  # 'Positive'
        num = 2
    return {'labels': num}
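# Illustrative check (added): confirm the label mapping behaves as intended.
assert transform_labels({'label': -1}) == {'labels': 0}
assert transform_labels({'label': 0}) == {'labels': 1}
assert transform_labels({'label': 1}) == {'labels': 2}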
## Function to tokenize data
def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length', truncation=True, max_length=256)
# Tokenize the tweets
dataset = dataset.map(tokenize_data, batched=True)

# Transform labels and limit the columns
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)
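# Optional inspection (added): each example should now carry input_ids,
# attention_mask, and the integer 'labels' field the Trainer expects.
print(dataset['train'][0].keys())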
# Define training arguments
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model",
    num_train_epochs=4,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
# Load the pretrained model with a 3-class classification head
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)
# Define evaluation metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
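# Illustrative check (added): with these dummy logits, argmax predicts class 1 for
# both rows, so accuracy against references [1, 1] should be 1.0.
print(compute_metrics((np.array([[0.1, 0.8, 0.1], [0.2, 0.7, 0.1]]), np.array([1, 1]))))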
# Instantiate the training and evaluation sets
train_dataset = dataset["train"].shuffle(seed=24)
eval_dataset = dataset["eval"].shuffle(seed=24)
# Use a data collator to batch examples into padded PyTorch tensors
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Instantiate the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
# Reinstantiate the trainer for evaluation, this time with the tokenizer and data collator attached
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Launch the final evaluation
trainer.evaluate()
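# Illustrative inference sketch (added; the example text is made up): try the
# fine-tuned model on a new tweet via the text-classification pipeline. Since the
# checkpoint ships without named labels, predictions likely come back as
# LABEL_0/LABEL_1/LABEL_2 (negative/neutral/positive under the mapping above).
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("The vaccine rollout has been encouraging news"))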
# Log in to the Hugging Face Hub
notebook_login()

# Push the model and tokenizer to the Hub
model.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
tokenizer.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
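# Illustrative reload sketch (added): once pushed, the artifacts can be pulled
# back from the Hub by repo id for downstream use.
reloaded_model = AutoModelForSequenceClassification.from_pretrained("MavisAJ/Sentiment_analysis_roberta_model")
reloaded_tokenizer = AutoTokenizer.from_pretrained("MavisAJ/Sentiment_analysis_roberta_model")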