support marlin kernel #11
by xun
Repacking weights to be compatible with Marlin kernel...:   5%|████      | 15/296 [00:00<00:02, 120.10it/s]
Traceback (most recent call last):
  File "/data/vllm/to_marlin.py", line 5, in <module>
    marlin_model = AutoGPTQForCausalLM.from_quantized(
  File "/data/test/AutoGPTQ/auto_gptq/modeling/auto.py", line 142, in from_quantized
    return quant_func(
  File "/data/test/AutoGPTQ/auto_gptq/modeling/_base.py", line 1100, in from_quantized
    model, model_save_name = prepare_model_for_marlin_load(
  File "/data/test/AutoGPTQ/auto_gptq/utils/marlin_utils.py", line 63, in prepare_model_for_marlin_load
    model = convert_to_marlin(model, quant_linear_class, quantize_config, repack=True)
  File "/data/anaconda3/envs/qwen-q/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/data/test/AutoGPTQ/auto_gptq/utils/marlin_utils.py", line 156, in convert_to_marlin
    new_module = MarlinQuantLinear(
  File "/data/test/AutoGPTQ/auto_gptq/nn_modules/qlinear/qlinear_marlin.py", line 98, in __init__
    raise ValueError(f"`infeatures:{infeatures}` must be divisible by 128 and `outfeatures:{outfeatures}` by 256.")
ValueError: `infeatures:2048` must be divisible by 128 and `outfeatures:5504` by 256.
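(For context: 2048 is divisible by 128, so it is the out_features check that fails here, since 5504 / 256 = 21.5. The 2048x5504 shape presumably comes from one of Qwen-1.8B's MLP projections, so the Marlin repack rejects that layer.)

The conversion script that produced the traceback: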
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

GPTQ_MODEL = "/data/test/Qwen-1_8B-Chat-Int4"

# Load the GPTQ checkpoint and repack its weights for the Marlin kernel.
marlin_model = AutoGPTQForCausalLM.from_quantized(
    GPTQ_MODEL,
    use_marlin=True,
    device_map='auto',
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "/data/test/Qwen-1_8B-Chat-Int4",
    trust_remote_code=True,
)

# Save the repacked model and tokenizer.
save_dir = "/data/test/Qwen-1_8B-Chat-Int4-marlin"
marlin_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
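
For reference, here is a minimal sketch (not from the original report) of how one might list the layers that the Marlin repack would reject before attempting the conversion. It assumes AutoGPTQ's quantized linear modules expose infeatures/outfeatures attributes and that loading with use_marlin=False skips the repack step that raises the error:

# Sketch (assumption, not part of the original report): scan the quantized
# linear layers and report those whose shapes violate Marlin's requirements
# (infeatures % 128 == 0 and outfeatures % 256 == 0).
from auto_gptq import AutoGPTQForCausalLM

GPTQ_MODEL = "/data/test/Qwen-1_8B-Chat-Int4"

# use_marlin=False loads the regular GPTQ kernels, so no repacking happens.
model = AutoGPTQForCausalLM.from_quantized(
    GPTQ_MODEL,
    use_marlin=False,
    device_map='auto',
    trust_remote_code=True,
)

# model.model is the underlying transformers model wrapped by AutoGPTQ.
for name, module in model.model.named_modules():
    infeatures = getattr(module, "infeatures", None)
    outfeatures = getattr(module, "outfeatures", None)
    if infeatures is None or outfeatures is None:
        continue  # not a quantized linear layer
    if infeatures % 128 != 0 or outfeatures % 256 != 0:
        print(f"{name}: infeatures={infeatures}, outfeatures={outfeatures} "
              f"-> incompatible with Marlin's 128/256 constraints")

On Qwen-1.8B-Chat-Int4 this should flag every layer whose out_features is 5504, matching the ValueError above.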