Training this model

#33
by ottogutierrez - opened

How would you train this model?
I know you can use AutoTrain, but in what format should I include the training data?

Thanks!

This is a fantastic question that I am also trying to figure out!

I have included my training script below, which runs successfully on my RTX 3090. The problem I am running into is a large number of false positives when using multi_label inference on the trained models, which seems to be driven by very high entailment logits, even though the models perform very well on the evaluation data (>95% accuracy).

I suspect it is either an issue with the preprocessing (I have tried one-hot encoding entailment, neutral, and contradiction, which does not run) or that some processing of the logits is necessary. Any help from @facebook-llama would be greatly appreciated.

import numpy as np
import evaluate
import torch
from datasets import load_from_disk
from transformers import BartForSequenceClassification, BartTokenizerFast, Trainer, TrainingArguments, EvalPrediction
from convert_classified_feedback_to_zsc_training_data import dataset_output as balanced_dataset_file

# Load the balanced dataset
balanced_dataset = load_from_disk(balanced_dataset_file)

# Assuming balanced_dataset is your preprocessed and balanced dataset
train_test_split = balanced_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Initialize the tokenizer
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')


# Function to encode the dataset
def encode_examples(examples):
    # Tokenize the premise and the hypothesis
    encoding = tokenizer(examples['premise'], examples['hypothesis'], truncation='only_first', padding='max_length')

    # Convert labels to the numeric ids used by facebook/bart-large-mnli:
    # contradiction=0, neutral=1, entailment=2
    label_to_id = {
        'contradiction': 0,
        'neutral': 1,
        'entailment': 2,
    }

    encoding['labels'] = [label_to_id[label] for label in examples['label']]

    encoding["input_sentence"] = tokenizer.batch_decode(encoding.input_ids)

    return encoding


# print the first record from each dataset
print(train_dataset[0])
print(test_dataset[0])

# Encode the full dataset
train_dataset = train_dataset.map(encode_examples, batched=True, remove_columns=['premise', 'hypothesis', 'label'])
test_dataset = test_dataset.map(encode_examples, batched=True, remove_columns=['premise', 'hypothesis', 'label'])

train_dataset.set_format('torch')
test_dataset.set_format('torch')

# Load the model
model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli', num_labels=3)

learning_rate = 5e-5

batch_size = 8
eval_batch_size = 8

gradient_accumulation_steps = 16 // batch_size  # target an effective batch size of 16

# Define training arguments
training_args = TrainingArguments(
    learning_rate=learning_rate,  # The initial learning rate for Adam
    output_dir=f'./results/lr_{learning_rate}',  # Output directory
    num_train_epochs=5,  # Total number of training epochs
    per_device_train_batch_size=batch_size,  # Batch size per device during training
    per_device_eval_batch_size=eval_batch_size,  # Batch size for evaluation
    gradient_accumulation_steps=gradient_accumulation_steps,
    eval_accumulation_steps=eval_batch_size,
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    logging_dir=f'./logs/lr_{learning_rate}',  # Directory for storing logs
    logging_steps=10,  # log results every x steps
    evaluation_strategy="steps",
    eval_steps=100,  # evaluate every x steps
    save_strategy="steps",
    save_steps=100,  # save model every x steps
)


# Define the compute_metrics function for evaluation
def compute_metrics(p: EvalPrediction):
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    result = {}
    result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    return result


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()
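
In case it helps with reproducing the false-positive behaviour I mentioned above: my understanding is that, with multi_label=True, the zero-shot pipeline scores each candidate label with a softmax over only the contradiction and entailment logits, so inflated entailment logits translate directly into high scores. Below is a minimal sketch of that scoring (assuming the stock facebook/bart-large-mnli label order of contradiction=0, neutral=1, entailment=2 and the pipeline's default "This example is {}." hypothesis template; the premise and labels are placeholders):

import torch
from transformers import BartForSequenceClassification, BartTokenizerFast

model_name = 'facebook/bart-large-mnli'  # or the path to a fine-tuned checkpoint
tokenizer = BartTokenizerFast.from_pretrained(model_name)
model = BartForSequenceClassification.from_pretrained(model_name)
model.eval()

premise = "The battery drains within an hour."          # placeholder text
candidate_labels = ["battery life", "screen quality"]   # placeholder labels

for label in candidate_labels:
    hypothesis = f"This example is {label}."  # the pipeline's default template
    inputs = tokenizer(premise, hypothesis, return_tensors='pt', truncation='only_first')
    with torch.no_grad():
        logits = model(**inputs).logits[0]
    # multi_label scoring: softmax over [contradiction, entailment] only,
    # then take the entailment probability as the label's score
    score = logits[[0, 2]].softmax(dim=-1)[1].item()
    print(f"{label}: {score:.3f}")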

@lcahill have you solved the issue? Were you able to do it?

Hi @akashAD, yes, luckily I was! The key is to have your dataset mimic the MNLI dataset format: https://huggingface.co/datasets/nyu-mll/multi_nli. I'd recommend starting by re-creating the bart-large-mnli model from bart-large.
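
For example, a minimal dataset in that layout looks something like the sketch below (made-up rows and a hypothetical save path; the column names are the ones the training script above expects):

from datasets import Dataset

# Placeholder rows showing the premise / hypothesis / label layout
records = {
    "premise":    ["The battery drains within an hour.",
                   "The battery drains within an hour."],
    "hypothesis": ["This example is battery life.",
                   "This example is screen quality."],
    "label":      ["entailment", "contradiction"],
}

dataset = Dataset.from_dict(records)
dataset.save_to_disk("zsc_training_data")  # then load_from_disk() this path in the training script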

This has some good resources on it:
https://medium.com/@lidores98/finetuning-huggingface-facebook-bart-model-2c758472e340
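
If you take that route, initialising the head from plain bart-large might look roughly like the sketch below (the id2label ordering is my assumption, taken from the released bart-large-mnli config):

from transformers import BartForSequenceClassification, BartTokenizerFast

# Assumed label order, matching facebook/bart-large-mnli's config
id2label = {0: "contradiction", 1: "neutral", 2: "entailment"}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large")
model = BartForSequenceClassification.from_pretrained(
    "facebook/bart-large",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
# The new 3-way classification head is randomly initialised, so it needs to be
# trained on MNLI-style data, e.g. with a Trainer setup like the one above.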
