Getting messy responses (response includes '#')
I implemented the Instruct model locally. In the sample responses I get, the output contains multiple '#' characters.
Code:
def __call__(
    self, instruction: str, **generate_kwargs: Dict[str, Any]
) -> str:
    s = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction)
    input_ids = self.tokenizer(s, return_tensors="pt").input_ids
    input_ids = input_ids.to(self.model.device)
    with torch.no_grad():
        output_ids = self.model.generate(input_ids, **generate_kwargs)
    # Slice the output_ids tensor to keep only the newly generated tokens
    new_tokens = output_ids[0, len(input_ids[0]):]
    output_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
    return output_text
self.generate_text = InstructionTextGenerationPipeline(model=self.model, tokenizer=self.tokenizer)
response = self.generate_text(prompt, **data)
What generate kwargs are you using? Is this in bfloat16?
data ={"max_new_tokens": 100,
"temperature": 0.1,
"top_p": 1,
"use_cache": True,
"top_k": 0
}
Could this be a dtype issue? Are you using a GPU that supports bf16, and if not, is this fp32 or fp16?
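As a quick side check, PyTorch can tell you whether the current GPU supports bfloat16 (a minimal sketch using the standard torch API; pre-Ampere cards generally do not):

import torch

# bf16 needs an Ampere-or-newer GPU; otherwise fall back to fp16 or fp32
print(torch.cuda.is_available())
print(torch.cuda.is_bf16_supported())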
I am also getting this. It generates a good first response, but then a ton of '#' characters. My only idea so far is to add this to the stopping_ids?
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
config = AutoConfig.from_pretrained(
    'mosaicml/mpt-7b-chat',
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    'mosaicml/mpt-7b-chat',
    config=config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
model.to(device='cuda:0')

model_inputs = tokenizer(text, return_tensors="pt").to("cuda")
output_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
You need to use a stopping criterion, as shown below. This should get rid of the noisy responses.
import torch
from transformers import StoppingCriteria

stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])

# Define a custom stopping criteria
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Stop as soon as the most recently generated token is a stop token
        for stop_id in stop_token_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False
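To actually wire it in, wrap the class in a StoppingCriteriaList and pass it to generate (a minimal sketch, assuming the model and model_inputs from the snippet above):

from transformers import StoppingCriteriaList

# Generation halts once <|endoftext|> is produced
output_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
    stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
)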
I had the same problem, but with these generate kwargs I no longer get the spurious tokens after the end-of-sequence token is generated:
generate_params = {
    "max_new_tokens": 512,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 50,
    "use_cache": True,
    "do_sample": True,
    "eos_token_id": 0,
    "pad_token_id": 0
}
The important arg is eos_token_id: if you don't pass it, token generation continues past the EOS token and you get garbage tokens.
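If you prefer not to hard-code 0, you can look the id up from the tokenizer instead (a small sketch; for the EleutherAI/gpt-neox-20b tokenizer that MPT uses, <|endoftext|> happens to be id 0):

# Resolve the EOS id from the tokenizer rather than hard-coding it
eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
generate_params["eos_token_id"] = eos_id
generate_params["pad_token_id"] = eos_id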
For reference, this is what the full script looks like (using mpt-7b-chat, but it's the same for the instruct model, except for the input format):
import torch
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

model_name = 'mosaicml/mpt-7b-chat'

config = AutoConfig.from_pretrained(
    model_name,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
model.to(device='cuda:0')
model.eval()  # Evaluation mode is the default, but calling it anyway

system_prompt = '''<|im_start|> system
You are an AI assistant
<|im_end|>\n
'''
user_message = '''
what is the meaning of life?
'''
fmt_user_message = f'<|im_start|>user {user_message}<|im_end|>\n'

input_ids = tokenizer(system_prompt + fmt_user_message, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

generate_params = {
    "max_new_tokens": 512,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 50,
    "use_cache": True,
    "do_sample": True,
    "eos_token_id": 0,
    "pad_token_id": 0
}

generated_ids = model.generate(input_ids, **generate_params)
output = tokenizer.decode(generated_ids.cpu().tolist()[0], skip_special_tokens=True)

for line in output.split('\n'):
    print(line)
Using stopping criteria and finding good generate kwargs are both good solutions. Closing as complete.