Support Marlin kernel

#11
by xun - opened
Repacking the GPTQ weights of Qwen-1_8B-Chat-Int4 for the Marlin kernel with AutoGPTQ fails partway through the conversion:

Repacking weights to be compatible with Marlin kernel...:   5%|███▉                                                                         | 15/296 [00:00<00:02, 120.10it/s]
Traceback (most recent call last):
  File "/data/vllm/to_marlin.py", line 5, in <module>
    marlin_model = AutoGPTQForCausalLM.from_quantized(
  File "/data/test/AutoGPTQ/auto_gptq/modeling/auto.py", line 142, in from_quantized
    return quant_func(
  File "/data/test/AutoGPTQ/auto_gptq/modeling/_base.py", line 1100, in from_quantized
    model, model_save_name = prepare_model_for_marlin_load(
  File "/data/test/AutoGPTQ/auto_gptq/utils/marlin_utils.py", line 63, in prepare_model_for_marlin_load
    model = convert_to_marlin(model, quant_linear_class, quantize_config, repack=True)
  File "/data/anaconda3/envs/qwen-q/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/data/test/AutoGPTQ/auto_gptq/utils/marlin_utils.py", line 156, in convert_to_marlin
    new_module = MarlinQuantLinear(
  File "/data/test/AutoGPTQ/auto_gptq/nn_modules/qlinear/qlinear_marlin.py", line 98, in __init__
    raise ValueError(f"`infeatures:{infeatures}` must be divisible by 128 and `outfeatures:{outfeatures}` by 256.")
ValueError: `infeatures:2048` must be divisible by 128 and `outfeatures:5504` by 256.
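
Here 2048 is a multiple of 128, so the check that actually fails is the output dimension: 5504 is not a multiple of 256 (5504 / 256 = 21.5). That 2048 -> 5504 shape looks like one of Qwen-1.8B's MLP projections, so the repack cannot proceed for this model as-is.

As a quick sanity check, this is the kind of sketch I would use to list every quantized layer that violates Marlin's shape constraints. It assumes the model loads fine with the default (non-Marlin) kernel and that AutoGPTQ's QuantLinear modules expose the same `infeatures` / `outfeatures` attributes the error message refers to, with the underlying HF model reachable via `model.model` (untested, illustrative only):

from auto_gptq import AutoGPTQForCausalLM

GPTQ_MODEL = "/data/test/Qwen-1_8B-Chat-Int4"

# Load with the default GPTQ kernel, which accepts these shapes.
model = AutoGPTQForCausalLM.from_quantized(
    GPTQ_MODEL,
    device_map="auto",
    trust_remote_code=True,
)

# Report every quantized linear layer that breaks Marlin's requirements
# (infeatures % 128 == 0 and outfeatures % 256 == 0, per the error above).
for name, module in model.model.named_modules():
    infeatures = getattr(module, "infeatures", None)
    outfeatures = getattr(module, "outfeatures", None)
    if infeatures is None or outfeatures is None:
        continue
    if infeatures % 128 != 0 or outfeatures % 256 != 0:
        print(f"{name}: infeatures={infeatures}, outfeatures={outfeatures} -> not Marlin-compatible")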
For reference, the full conversion script that produced the traceback:

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

GPTQ_MODEL = "/data/test/Qwen-1_8B-Chat-Int4"

# Load the GPTQ checkpoint and ask AutoGPTQ to repack it for the Marlin kernel.
marlin_model = AutoGPTQForCausalLM.from_quantized(
    GPTQ_MODEL,
    use_marlin=True,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(GPTQ_MODEL, trust_remote_code=True)

# Save the repacked weights and tokenizer to a new directory.
save_dir = "/data/test/Qwen-1_8B-Chat-Int4-marlin"
marlin_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
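
If the repack did succeed (i.e. for a model whose layer shapes meet the constraints), I would expect the saved directory to be loadable with the same call, still passing use_marlin=True. A sketch, untested here since the conversion fails for this model:

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

save_dir = "/data/test/Qwen-1_8B-Chat-Int4-marlin"

# Load the already-repacked Marlin checkpoint directly.
model = AutoGPTQForCausalLM.from_quantized(
    save_dir,
    use_marlin=True,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(save_dir, trust_remote_code=True)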