In [1]:
import logging

logging.basicConfig(level=logging.DEBUG)

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch

INFO:numexpr.utils:Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:datasets:PyTorch version 2.2.1+cu118 available.


In [4]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = 'llava-hf/llava-v1.6-34b-hf'
quant_path = './llava-v1.6-34b-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

In [5]:
model = AutoAWQForCausalLM.from_pretrained(
    model_path, torch_dtype = torch.bfloat16,
)
_ = model.cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /llava-hf/llava-v1.6-34b-hf/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/llava-hf/llava-v1.6-34b-hf/revision/main HTTP/1.1" 200 2489


Fetching 25 files:   0%|          | 0/25 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /llava-hf/llava-v1.6-34b-hf/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


In [6]:
!nvidia-smi

Tue May 28 04:52:08 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   32C    P0              65W / 300W |  66718MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [7]:
def load_wikitext():
    data = load_dataset('wikitext', 'wikitext-2-raw-v1', split="train")
    return [text for text in data["text"] if text.strip() != '' and len(text.split(' ')) > 30]

In [8]:
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_wikitext())

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/wikitext HTTP/1.1" 200 4846
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): s3.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/wikitext/wikitext.py HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/wikitext HTTP/1.1" 200 4846
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/wikitext/resolve/b08601e04326c79dfdd32d625aee71d232d685c3/README.md HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/wikitext/resolve/b08601e04326c79dfdd32d625aee71d232d685c3/.huggingface.yaml HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): datasets-server

In [9]:
model.save_quantized(quant_path)

[2024-05-28 06:02:42,856] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [10]:
tokenizer.save_pretrained(quant_path)

('./llava-v1.6-34b-awq/tokenizer_config.json',
 './llava-v1.6-34b-awq/special_tokens_map.json',
 './llava-v1.6-34b-awq/tokenizer.model',
 './llava-v1.6-34b-awq/added_tokens.json',
 './llava-v1.6-34b-awq/tokenizer.json')

In [17]:
tokenizer.push_to_hub('mesolitica/llava-v1.6-34b-awq')

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/repos/create HTTP/1.1" 409 108
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /mesolitica/llava-v1.6-34b-awq/resolve/main/README.md HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/validate-yaml HTTP/1.1" 200 27
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/models/mesolitica/llava-v1.6-34b-awq/preupload/main HTTP/1.1" 200 442
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /mesolitica/llava-v1.6-34b-awq.git/info/lfs/objects/batch HTTP/1.1" 200 908


tokenizer.model:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com:443 "PUT /repos/59/d4/59d45338b6a6ddb440f61ec405842ef87dffed6ec946242daa5c9bfe59de941a/386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20240528%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240528T071302Z&X-Amz-Expires=900&X-Amz-Signature=c521b865377a68a968cad9bef88b199a7ce8f5967af74d7a4d2c2197b35da6c5&X-Amz-SignedHeaders=host&x-amz-storage-class=INTELLIGENT_TIERING&x-id=PutObject HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /mesolitica/llava-v1.6-34b-awq.git/info/lfs/objects/verify HTTP/1.1" 200 2
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/models/mesolitica/llava-v1.6-34b-awq/commit/main HTTP/1.1" 200 202

CommitInfo(commit_url='https://huggingface.co/mesolitica/llava-v1.6-34b-awq/commit/03d9749ace4afe673620749b66ac77093bac742d', commit_message='Upload tokenizer', commit_description='', oid='03d9749ace4afe673620749b66ac77093bac742d', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
from transformers import AutoConfig, AwqConfig

quantization_config = AwqConfig(
    bits=quant_config['w_bit'],
    group_size=quant_config['q_group_size'],
    zero_point=quant_config['zero_point'],
    backend='autoawq',
    version=quant_config['version'].lower(),
)

config = AutoConfig.from_pretrained(model_path)
config.quantization_config = quantization_config

config.push_to_hub('mesolitica/llava-v1.6-34b-awq')

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /llava-hf/llava-v1.6-34b-hf/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/repos/create HTTP/1.1" 409 108
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /mesolitica/llava-v1.6-34b-awq/resolve/main/README.md HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 140629439752000 on /home/ubuntu/.cache/huggingface/hub/.locks/models--mesolitica--llava-v1.6-34b-awq/bc5f30d6632ac0efdc7be2e9095e9e9579af2e33.lock
DEBUG:filelock:Lock 140629439752000 acquired on /home/ubuntu/.cache/huggingface/hub/.locks/models--mesolitica--llava-v1.6-34b-awq/bc5f30d6632ac0efdc7be2e9095e9e9579af2e33.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /mesolitica/llava-v1.6-34b-awq/resolve/main/README.md HTTP/1.1" 200 5174


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140629439752000 on /home/ubuntu/.cache/huggingface/hub/.locks/models--mesolitica--llava-v1.6-34b-awq/bc5f30d6632ac0efdc7be2e9095e9e9579af2e33.lock
DEBUG:filelock:Lock 140629439752000 released on /home/ubuntu/.cache/huggingface/hub/.locks/models--mesolitica--llava-v1.6-34b-awq/bc5f30d6632ac0efdc7be2e9095e9e9579af2e33.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/validate-yaml HTTP/1.1" 200 27
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/models/mesolitica/llava-v1.6-34b-awq/preupload/main HTTP/1.1" 200 143
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/models/mesolitica/llava-v1.6-34b-awq/commit/main HTTP/1.1" 200 202


CommitInfo(commit_url='https://huggingface.co/mesolitica/llava-v1.6-34b-awq/commit/7f9ea6a51b95b743229de158f5bef5c5a33335db', commit_message='Upload config', commit_description='', oid='7f9ea6a51b95b743229de158f5bef5c5a33335db', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
from huggingface_hub import HfApi

api = HfApi()

In [24]:
api.upload_folder(
    folder_path='llava-v1.6-34b-awq',
    repo_id='mesolitica/llava-v1.6-34b-awq',
)