# -*- coding: utf-8 -*-
"""Roberta sentiment Analysis

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/10L1VfVMZLa62qTFdUIOURELW194TjJ4e
"""

# Install required libraries
!pip install datasets transformers huggingface_hub -q

# Import key libraries and packages
import numpy as np
import os
import pandas as pd

from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)
from google.colab import files
from google.colab import drive

# Disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"

drive.mount('/content/drive')

# Load the datasets
train_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/hugging.csv").dropna(axis=0)
test_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/Testhugging.csv").fillna("")

train_df.head()

test_df.head()

train_df.isnull().sum()

test_df.isnull().sum()
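
# Inspect the class balance before splitting (the raw labels are expected to
# be -1/0/1, matching the mapping in transform_labels further below)
print(train_df['label'].value_counts())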

"""Fine-tuning the roberta model"""

train_df, eval_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])

print(f"New dataframe shapes: train is {train_df.shape}, eval is {eval_df.shape}")

# Save the split subsets
train_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv", index=False)
eval_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv", index=False)

dataset = load_dataset(
    'csv',
    data_files={'train': '/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv',
                'eval': '/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv'},
    encoding="ISO-8859-1")

# Instantiate the tokenizer (num_labels is a model argument, not a tokenizer one)
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Define helper functions
## Map the raw labels (-1/0/1) to contiguous class ids (0/1/2)
def transform_labels(label):
    label = label['label']
    num = 0
    if label == -1:    # 'Negative'
        num = 0
    elif label == 0:   # 'Neutral'
        num = 1
    elif label == 1:   # 'Positive'
        num = 2
    return {'labels': num}
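
# Quick self-check of the mapping on hypothetical rows (illustrative only)
assert transform_labels({'label': -1}) == {'labels': 0}
assert transform_labels({'label': 1}) == {'labels': 2}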

## Tokenize the tweet text
def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length', truncation=True, max_length=256)

# Tokenize the tweets
dataset = dataset.map(tokenize_data, batched=True)
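
# Sanity check: peek at one tokenized example (first few ids only; purely illustrative)
sample = dataset['train'][0]
print(sample['input_ids'][:10], sample['attention_mask'][:10])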

# Transform the labels and drop the raw columns
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)
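
# Confirm that only the model-facing columns remain after the mapping
print(dataset['train'].column_names)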

# Define training arguments
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model",
    num_train_epochs=4,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Load the pretrained model
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

# Define evaluation metrics (note: load_metric is deprecated in newer
# releases of `datasets`; `evaluate.load("accuracy")` is the replacement)
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
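
# Smoke-test compute_metrics with dummy logits: both predictions below are
# correct, so the reported accuracy should be 1.0
dummy_logits = np.array([[0.1, 0.2, 0.9], [0.8, 0.1, 0.1]])
dummy_labels = np.array([2, 0])
print(compute_metrics((dummy_logits, dummy_labels)))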

# Instantiate the training and evaluation sets
train_dataset = dataset["train"].shuffle(seed=24)
eval_dataset = dataset["eval"].shuffle(seed=24)

# Data collator that dynamically pads each batch and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Instantiate the trainer once, with the tokenizer and data collator included
# so padding is handled consistently during training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

# Launch the final evaluation
trainer.evaluate()
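
# Optional spot-check of the fine-tuned model on a raw tweet; the example text
# is made up, and labels come back as LABEL_0/1/2 (negative/neutral/positive
# under the mapping used above)
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print(classifier("The vaccine rollout has been going really well!"))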

# Login to HF hub
notebook_login()

# Push model and tokenizer to HF Hub
model.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
tokenizer.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
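
# Reload the pushed artifacts from the Hub to confirm the upload worked
# (assumes the push above completed successfully)
reloaded_model = AutoModelForSequenceClassification.from_pretrained("MavisAJ/Sentiment_analysis_roberta_model")
reloaded_tokenizer = AutoTokenizer.from_pretrained("MavisAJ/Sentiment_analysis_roberta_model")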