# -*- coding: utf-8 -*-
"""Roberta sentiment Analysis
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/10L1VfVMZLa62qTFdUIOURELW194TjJ4e
"""
# Install required libraries
!pip install datasets transformers huggingface_hub -q
# Import key libraries and packages
import numpy as np
import os
import pandas as pd
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
from google.colab import drive
# Disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"
drive.mount('/content/drive')
# Load the datasets
train_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/hugging.csv").dropna(axis=0)
test_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/Testhugging.csv").fillna("")
train_df.head()
test_df.head()
train_df.isnull().sum()
test_df.isnull().sum()
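# Optional sanity check before splitting (the label column is assumed, per the
# mapping used further below, to hold -1/0/1 for negative/neutral/positive):
# inspect the class balance.
train_df['label'].value_counts(normalize=True)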
"""Fine-tuning the roberta model"""
train_df, eval_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])
print(f"new dataframe shapes: train is {train_df.shape}, eval is {eval_df.shape}")
# Save the split subsets
train_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv", index=False)
eval_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv", index=False)
dataset = load_dataset('csv',
                       data_files={'train': '/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv',
                                   'eval': '/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv'},
                       encoding="ISO-8859-1")
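# Cheap structural check: the DatasetDict should expose 'train' and 'eval'
# splits carrying the original CSV columns.
print(dataset)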
# Instantiate the tokenizer (num_labels is a model argument, so it is passed
# to the model below rather than to the tokenizer)
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
# Define helper functions
## Function to transform labels
def transform_labels(label):
    label = label['label']
    num = 0
    if label == -1:    # 'Negative'
        num = 0
    elif label == 0:   # 'Neutral'
        num = 1
    elif label == 1:   # 'Positive'
        num = 2
    return {'labels': num}
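# Minimal sanity test of the mapping on hypothetical rows: -1 -> 0, 1 -> 2.
assert transform_labels({'label': -1}) == {'labels': 0}
assert transform_labels({'label': 1}) == {'labels': 2}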
## Function to tokenize data
def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length', truncation=True, max_length=256)
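# Illustrative check on a made-up tweet: every encoding is padded/truncated
# to exactly 256 token ids.
sample_encoding = tokenize_data({'safe_text': 'Vaccines save lives.'})
print(len(sample_encoding['input_ids']))  # 256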
# Tokenize the tweets
dataset = dataset.map(tokenize_data, batched=True)
# Transform labels and limit the columns
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)
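# After both map calls, each example should contain only the tokenizer
# outputs plus the integer 'labels' field the Trainer expects.
print(dataset['train'][0].keys())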
# Define training arguments
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model",
    num_train_epochs=4,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
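# Note (assumption about library defaults): batch size and learning rate are
# not set above, so the Transformers defaults apply
# (per_device_train_batch_size=8, learning_rate=5e-5); override them
# explicitly if they do not fit your GPU.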
# Load the pretrained model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)
# Define evaluation metrics (newer datasets releases move load_metric to the
# separate `evaluate` package; this notebook uses the original API)
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
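# Quick self-test with dummy logits: the argmax of [0.1, 0.2, 0.7] is class 2,
# which matches the reference label, so accuracy should be 1.0.
print(compute_metrics((np.array([[0.1, 0.2, 0.7]]), np.array([2]))))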
# Instantiate the training and evaluation sets
train_dataset = dataset["train"].shuffle(seed=24)
eval_dataset = dataset["eval"].shuffle(seed=24)
# Use a data collator to batch examples into padded PyTorch tensors
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
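# Illustrative check: the collator turns a list of feature dicts into one
# batch of PyTorch tensors (inputs here are already padded to length 256).
batch = data_collator([train_dataset[i] for i in range(2)])
print({k: v.shape for k, v in batch.items()})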
# Instantiate the trainer
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics)
trainer.train()
# Reinstantiate the trainer for evaluation, this time passing the tokenizer
# and the padding data collator as well
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Launch the final evaluation
trainer.evaluate()
# Login to HF hub
notebook_login()
# Push model and tokenizer to HF Hub
model.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
tokenizer.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
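# Illustrative inference (assumes the push above succeeded and the repo is
# accessible): reload the fine-tuned checkpoint from the Hub via a pipeline.
# Labels come back as LABEL_0/1/2 (negative/neutral/positive) because no
# id2label mapping was set on the model config.
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model="MavisAJ/Sentiment_analysis_roberta_model")
print(classifier("Vaccines are safe and effective."))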