The model is extremely slow in 4-bit, is my code for loading OK?

#7
by zokica - opened

I load it on an RTX 3090 in 4-bit and get about 1 token per second on the GPU, which is slower than a CPU. For comparison, LLaMA 7B runs at around 30 tokens/second on the same GPU, and I would expect roughly 10 tokens/second here.

I'm only showing the loading code:

import time
import torch
import transformers
from transformers import AutoTokenizer, BitsAndBytesConfig

timea = time.time()

# Tokenizer is shared with the base MPT-30B model
tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-30b")

base_model = "mosaicml/mpt-30b-chat"
config = transformers.AutoConfig.from_pretrained(base_model, trust_remote_code=True)
config.max_seq_len = 16384  # (input + output) tokens can now be up to 16384

# 4-bit NF4 quantization with double quantization, computing in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model,
    config=config,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

model.eval()

print("load time", time.time() - timea)
