Generation after finetuning does not end at the EOS token
#123
by
zokica
- opened
I formatted content like this and generation does not stop when it should.
from datasets import load_dataset
# Load train/eval data from local JSON files. Note: for a single local JSON
# file, `load_dataset` exposes everything under the 'train' split, so
# split='train' is correct for test.json as well. Each record is assumed to
# provide 'instruction' and 'output' fields (consumed by formatting_func
# below) — TODO confirm against the JSON schema.
train_dataset = load_dataset('json', data_files='train.json', split='train')
eval_dataset = load_dataset('json', data_files='test.json', split='train')
def formatting_func(example, eos_token="<|endoftext|>"):
    """Build the training prompt string for one example.

    Args:
        example: Mapping with 'instruction' and 'output' keys.
        eos_token: End-of-sequence marker appended to the text. Defaults to
            the literal "<|endoftext|>" for backward compatibility, but you
            should pass ``tokenizer.eos_token`` so the marker matches the
            model's actual EOS token — a hard-coded literal that differs from
            the tokenizer's EOS (e.g. "</s>" for Llama/Mistral) is a common
            reason generation never stops after fine-tuning.

    Returns:
        The formatted prompt string.
    """
    return (
        f"Answer: {example['instruction']}\n "
        f"<-beg->{example['output']}<-end->{eos_token}"
    )
# Fixed sequence length used below: every example is truncated and padded to
# exactly this many tokens. NOTE(review): with truncation=True a long example
# can lose its trailing EOS token entirely — verify 128 is long enough for
# the data.
max_length = 128
def generate_and_tokenize_prompt2(prompt):
    """Tokenize one example into fixed-length model inputs with labels.

    Args:
        prompt: One dataset example with 'instruction' and 'output' keys
            (forwarded to ``formatting_func``).

    Returns:
        The tokenizer output dict ('input_ids', 'attention_mask', ...) with a
        'labels' entry added for causal-LM training.
    """
    result = tokenizer(
        formatting_func(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # Copy input_ids as labels, but mask every padding position with -100
    # (PyTorch CrossEntropyLoss ignore_index). The original code trained on
    # the pad tokens too; when pad_token == eos_token that teaches the model
    # to emit EOS-like tokens indiscriminately instead of a single clean
    # stop, which matches the runaway-generation symptom reported here.
    # attention_mask is 1 on real tokens (including the genuine EOS) and 0 on
    # padding, so the real EOS is still learned.
    result["labels"] = [
        token_id if mask == 1 else -100
        for token_id, mask in zip(result["input_ids"], result["attention_mask"])
    ]
    return result
# Apply the tokenization function example-by-example (non-batched map) to
# produce the tokenized splits consumed by the trainer.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt2)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt2)
For example it outputs:
Answer:How many apples in a jar\n <-beg->Too many<-end-><-beg->Too many<-end-><-beg->Too many<-end->
and it should be just:
Answer:How many apples in a jar\n <-beg->Too many<-end->