Generation after finetuning does not end at EOS token

#123
by zokica - opened

I formatted content like this, and generation does not stop when it should.

from datasets import load_dataset

train_dataset = load_dataset('json', data_files='train.json', split='train')
eval_dataset = load_dataset('json', data_files='test.json', split='train')

def formatting_func(example):
    #new_text_format = f'<s>[INST] {instruction} [/INST] ```python\n{content}```</s>'
    text = f"Answer: {example['instruction']}\n <-beg->{example['output']}<-end-><|endoftext|>"
    return text


max_length = 128 

def generate_and_tokenize_prompt2(prompt):
    result = tokenizer(
        formatting_func(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt2)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt2)

For example it outputs:
Answer:How many apples in a jar\n <-beg->Too many<-end-><-beg->Too many<-end-><-beg->Too many<-end->

and it should be just:
Answer:How many apples in a jar\n <-beg->Too many<-end->

Sign up or log in to comment