Error when running in float16

#11
by Baicai003 - opened

With the model from https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16,
I ran this test:

from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import torch

print("start load")
model = AutoModelForCausalLM.from_pretrained("./gpt-j-6B", low_cpu_mem_usage=True, torch_dtype=torch.float16)
print("end 1")
tokenizer = AutoTokenizer.from_pretrained("./gpt-j-6B", low_cpu_mem_usage=True, torch_dtype=torch.float16)  # note: these are model kwargs; the tokenizer ignores them
print("end 2")

# from parallelformers import parallelize

# parallelize(model, num_gpus=1, fp16=True, verbose='detail')

start = time.time()
inputs = tokenizer("My Name is Mukesh ", return_tensors="pt")
outputs = model.generate(
    **inputs,
    num_beams=5,
    no_repeat_ngram_size=4,
    max_length=200,
)
print(f"Output: {tokenizer.batch_decode(outputs)[0]}")
end = time.time()
print(end - start)

start = time.time()
inputs = tokenizer("Q:what is AI\nA:AI is a cat.\nQ:why?\nA:", return_tensors="pt")
outputs = model.generate(
    **inputs,
    num_beams=5,
    no_repeat_ngram_size=4,
    max_length=200,
)
print(f"Output: {tokenizer.batch_decode(outputs)[0]}")
end = time.time()
print(end - start)

start = time.time()
inputs = tokenizer("Q:世界上最大的地方是哪\nA:是太阳.\nQ:太阳是世界上最大的地方吗?\nA:", return_tensors="pt")
outputs = model.generate(
    **inputs,
    num_beams=5,
    no_repeat_ngram_size=4,
    max_length=200,
)
print(f"Output: {tokenizer.batch_decode(outputs)[0]}")
end = time.time()
print(end - start)

but got this error:

start load
end 1
end 2
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Traceback (most recent call last):
  File "test.py", line 17, in <module>
    outputs = model.generate(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/generation/utils.py", line 1608, in generate
    return self.beam_search(
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/generation/utils.py", line 2799, in beam_search
    outputs = self(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gptj/modeling_gptj.py", line 821, in forward
    transformer_outputs = self.transformer(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gptj/modeling_gptj.py", line 676, in forward
    outputs = block(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gptj/modeling_gptj.py", line 309, in forward
    hidden_states = self.ln_1(hidden_states)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/normalization.py", line 189, in forward
    return F.layer_norm(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/functional.py", line 2503, in layer_norm
    return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'

This code works fine for me (the key change is moving the model and the inputs to the GPU):

from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import torch

print("start load")
model = AutoModelForCausalLM.from_pretrained("./gpt-j-6B", low_cpu_mem_usage=True, torch_dtype=torch.float16)
model = model.to(torch.device("cuda:0"))  # the fix: fp16 kernels exist on CUDA
print("end 1")
tokenizer = AutoTokenizer.from_pretrained("./gpt-j-6B", low_cpu_mem_usage=True, torch_dtype=torch.float16)
# tokenizer = tokenizer.to(torch.device("cuda:0"))  # not needed: tokenizers have no .to() and always run on CPU
print("end 2")

# from parallelformers import parallelize

# parallelize(model, num_gpus=1, fp16=True, verbose='detail')

start = time.time()
# using float16
inputs = tokenizer("My Name is Mukesh ", return_tensors="pt")
inputs = inputs.to(torch.device("cuda:0"))

outputs = model.generate(
    **inputs,
    num_beams=5,
    no_repeat_ngram_size=4,
    max_length=100,
)
print(f"Output: {tokenizer.batch_decode(outputs)[0]}")
end = time.time()
print(end - start)

start = time.time()
inputs = tokenizer("Q:what is AI\nA:AI is a cat.\nQ:why?\nA:", return_tensors="pt")
inputs = inputs.to(torch.device("cuda:0"))
outputs = model.generate(
    **inputs,
    num_beams=5,
    no_repeat_ngram_size=4,
    max_length=100,
)
print(f"Output: {tokenizer.batch_decode(outputs)[0]}")
end = time.time()
print(end - start)

start = time.time()
inputs = tokenizer("Q:请用python实现CNN\nA:", return_tensors="pt")
inputs = inputs.to(torch.device("cuda:0"))
outputs = model.generate(
    **inputs,
    num_beams=5,
    no_repeat_ngram_size=4,
    max_length=100,
)
print(f"Output: {tokenizer.batch_decode(outputs)[0]}")
end = time.time()
print(end - start)
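
If no GPU is available, a common workaround (not from the original post, just a sketch) is to load the weights in float32 so the CPU kernels exist; GPT-J-6B then needs roughly 24 GB of RAM:

from transformers import AutoModelForCausalLM
import torch

# CPU-only fallback sketch: fp32 weights, so LayerNorm has a CPU kernel
model = AutoModelForCausalLM.from_pretrained(
    "./gpt-j-6B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32,
)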
Baicai003 changed discussion status to closed
