File size: 6,606 Bytes

435eb54

#!/usr/bin/env python
# coding: utf-8

# # Creating a Zero-Shot classifier based on BETO
# 
# This notebook/script fine-tunes a BETO model (Spanish BERT, 'dccuchile/bert-base-spanish-wwm-cased') on the Spanish XNLI dataset.
# The fine-tuned model can then be passed to a Hugging Face zero-shot classification pipeline to obtain a zero-shot classifier.

# In[ ]:


from datasets import load_dataset, Dataset, load_metric, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from pathlib import Path
# from ray import tune
# from ray.tune.suggest.hyperopt import HyperOptSearch
# from ray.tune.schedulers import ASHAScheduler


# # Prepare the datasets

# In[ ]:


xnli_es = load_dataset("xnli", "es")


# In[ ]:


xnli_es


# >joeddav
# >Aug '20
# >
# >@rsk97 In addition, just make sure the model used is trained on an NLI task and that the **last output label corresponds to entailment** while the **first output label corresponds to contradiction**.
# 
# => We swap label ids 0 and 2 of the original `label` column and store the result in a new `labels` column, which is the column name required by an `AutoModelForSequenceClassification`.

# In[ ]:


# see markdown above
def switch_label_id(row):
    """Return the remapped class id of one example as ``{"labels": new_id}``.

    Ids 0 and 2 trade places; any other value of ``row["label"]`` maps to 1.
    Written to be applied per example through ``Dataset.map``.
    """
    swapped_ids = {0: 2, 2: 0}
    # Everything that is neither 0 nor 2 falls back to 1.
    return {"labels": swapped_ids.get(row["label"], 1)}

# Run the label remapping over every split and store the result back under
# the same split name.
for split, split_ds in xnli_es.items():
    xnli_es[split] = split_ds.map(switch_label_id)


# ## Tokenize data

# In[ ]:


tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")


# In a first attempt I padded all data to the maximum length of the dataset (379). However, training takes substantially longer with all the padding; it is better to pass the tokenizer to the `Trainer` and let the `Trainer` do the padding at the batch level.

# In[ ]:


# Figured out max length of the dataset manually
# max_length = 379
def tokenize(row):
    """Encode premise/hypothesis pairs with the module-level ``tokenizer``.

    ``row`` must provide the ``"premise"`` and ``"hypothesis"`` entries; in this
    script it is called through ``Dataset.map(..., batched=True)``. Inputs are
    truncated to at most 512 tokens and no padding is requested here.
    """
    premise, hypothesis = row["premise"], row["hypothesis"]
    return tokenizer(premise, hypothesis, truncation=True, max_length=512)


# In[ ]:


# Tokenize every split in batches of 128, dropping the raw text columns and
# the original `label` column from the result.
data = {
    split: xnli_es[split].map(
        tokenize,
        remove_columns=["hypothesis", "premise", "label"],
        batched=True,
        batch_size=128,
    )
    for split in xnli_es
}


# In[ ]:


# Resolve both target directories to absolute path strings, then write the
# tokenized train and validation splits to disk.
train_path, valid_path = (
    str(Path(folder).absolute()) for folder in ("./train_ds", "./valid_ds")
)

data["train"].save_to_disk(train_path)
data["validation"].save_to_disk(valid_path)


# In[ ]:


# We can use `datasets.Dataset`s directly

# class XnliDataset(torch.utils.data.Dataset):
#     def __init__(self, data):
#         self.data = data

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val) for key, val in self.data[idx].items()}
#         return item

#     def __len__(self):
#         return len(self.data)


# In[ ]:


def trainable(config):
    """Fine-tune BETO as a 3-label sequence classifier for one hyperparameter set.

    Args:
        config: Mapping that must contain
            ``"epochs"``, ``"batch_size"``, ``"batch_size_eval"``,
            ``"warmup_steps"``, ``"weight_decay"``, ``"learning_rate"``
            (forwarded to ``TrainingArguments``) and
            ``"train_path"``, ``"valid_path"`` (directories read with
            ``load_from_disk``).

    Returns:
        None. Checkpoints are written below ``./results`` and logs below
        ``./logs``.
    """
    xnli_metric = load_metric("xnli", "es")

    def compute_metrics(eval_pred):
        # eval_pred unpacks into (model outputs, reference labels); the
        # predicted class is the index of the largest value on the last axis.
        scores, references = eval_pred
        return xnli_metric.compute(
            predictions=scores.argmax(axis=-1),
            references=references,
        )

    classifier = AutoModelForSequenceClassification.from_pretrained(
        "dccuchile/bert-base-spanish-wwm-cased",
        num_labels=3,
    )

    # Fixed settings first, then everything driven by `config`.
    # save_steps is intentionally not set (original note: it is ignored when
    # using load_best_model_at_end).
    settings = dict(
        output_dir="./results",
        do_train=True,
        do_eval=True,
        evaluation_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        num_train_epochs=config["epochs"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size_eval"],
        warmup_steps=config["warmup_steps"],
        weight_decay=config["weight_decay"],
        learning_rate=config["learning_rate"],
        logging_dir="./logs",
        logging_steps=250,
        save_total_limit=10,
        no_cuda=False,
        disable_tqdm=True,
    )
    training_args = TrainingArguments(**settings)

    # The tokenized `datasets.Dataset` objects are used directly, without a
    # torch Dataset wrapper.
    train_dataset = load_from_disk(config["train_path"])
    valid_dataset = load_from_disk(config["valid_path"])

    # The tokenizer is handed to the Trainer so that padding happens per batch.
    trainer = Trainer(
        classifier,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()


# In[ ]:


# Single training run with a fixed set of hyperparameters.
trainable(
    dict(
        train_path=train_path,
        valid_path=valid_path,
        batch_size=16,
        batch_size_eval=64,
        warmup_steps=500,
        weight_decay=0.001,
        learning_rate=5e-5,
        epochs=3,
    )
)


# # HPO

# In[ ]:


# config = {
#     "train_path": train_path,
#     "valid_path": valid_path,
#     "warmup_steps": tune.randint(0, 500),
#     "weight_decay": tune.loguniform(0.00001, 0.1),
#     "learning_rate": tune.loguniform(5e-6, 5e-4),
#     "epochs": tune.choice([2, 3, 4])
# }


# # In[ ]:


# analysis = tune.run(
#     trainable,
#     config=config,
#     metric="eval_acc",
#     mode="max",
#     #search_alg=HyperOptSearch(),
#     #scheduler=ASHAScheduler(),
#     num_samples=1,
# )


# # In[ ]:


# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", num_labels=3)

# trainer = Trainer(
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_dataset,         # training dataset
#     eval_dataset=valid_dataset,          # evaluation dataset
#     model_init=model_init,
#     compute_metrics=compute_metrics,
# )


# # In[ ]:


# best_trial = trainer.hyperparameter_search(
#     direction="maximize",
#     backend="ray",
#     n_trials=2,
#     # Choose among many libraries:
#     # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html
#     search_alg=HyperOptSearch(mode="max", metric="accuracy"),
#     # Choose among schedulers:
#     # https://docs.ray.io/en/latest/tune/api_docs/schedulers.html
#     scheduler=ASHAScheduler(mode="max", metric="accuracy"),
#     local_dir="tune_runs",
# )