Update README.md
README.md CHANGED
@@ -22,4 +22,33 @@ model = HQQModelForCausalLM.from_quantized(model_id)
 #Optional
 from hqq.core.quantize import *
 HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE)
-```
+```
+
+### Quantization
+You can reproduce the model using the following quant configs:
+
+``` Python
+from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
+model_id = "mistralai/Mixtral-8x7B-v0.1"
+model = HQQModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path)
+
+#Quantize params: 4-bit attention, 2-bit experts
+from hqq.core.quantize import *
+attn_params = BaseQuantizeConfig(nbits=4, group_size=64, quant_zero=True, quant_scale=True)
+attn_params['scale_quant_params']['group_size'] = 256
+experts_params = BaseQuantizeConfig(nbits=2, group_size=16, quant_zero=True, quant_scale=True)
+
+quant_config = {}
+#Attention
+quant_config['self_attn.q_proj'] = attn_params
+quant_config['self_attn.k_proj'] = attn_params
+quant_config['self_attn.v_proj'] = attn_params
+quant_config['self_attn.o_proj'] = attn_params
+#Experts
+quant_config['block_sparse_moe.experts.w1'] = experts_params
+quant_config['block_sparse_moe.experts.w2'] = experts_params
+quant_config['block_sparse_moe.experts.w3'] = experts_params
+
+#Quantize
+model.quantize_model(quant_config=quant_config)
+```
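In the snippet above, `hf_auth` and `cache_path` are user-supplied values (a Hugging Face access token and a local cache directory). Once `quantize_model` returns, the wrapped model can be used like a regular `transformers` causal LM. The sketch below is illustrative rather than part of the card: it assumes a CUDA device, the tokenizer matching `model_id`, and made-up prompt and generation settings.

``` Python
#Inference sketch (assumes the quantization snippet above has already run)
from hqq.core.quantize import *
from hqq.engine.hf import AutoTokenizer

#Optional: compiled PyTorch backend for faster dequantization, as recommended above
HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE)

#Tokenizer for the same model_id; this sketch assumes the quantized model lives on the GPU
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "Explain mixture-of-experts models in one paragraph."  #illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

#Standard generate() call on the quantized model
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Alternatively, the already-quantized weights can be loaded directly with `HQQModelForCausalLM.from_quantized(model_id)`, as shown earlier in the card.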