mobicham committed
Commit b4d5653
1 Parent(s): 88e72ff

Update README.md

Files changed (1): README.md (+30 -1)
README.md CHANGED
@@ -22,4 +22,33 @@ model = HQQModelForCausalLM.from_quantized(model_id)
  #Optional
  from hqq.core.quantize import *
  HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE)
- ```
+ ```
+
+ ### Quantization
+ You can reproduce the model using the following quant configs:
+
+ ``` Python
+ from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
+ model_id = "mistralai/Mixtral-8x7B-v0.1"
+ model = HQQModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path)
+
+ #Quantize params
+ from hqq.core.quantize import *
+ attn_prams = BaseQuantizeConfig(nbits=4, group_size=64, quant_zero=True, quant_scale=True)
+ attn_prams['scale_quant_params']['group_size'] = 256
+ experts_params = BaseQuantizeConfig(nbits=2, group_size=16, quant_zero=True, quant_scale=True)
+
+ quant_config = {}
+ #Attention
+ quant_config['self_attn.q_proj'] = attn_prams
+ quant_config['self_attn.k_proj'] = attn_prams
+ quant_config['self_attn.v_proj'] = attn_prams
+ quant_config['self_attn.o_proj'] = attn_prams
+ #Experts
+ quant_config['block_sparse_moe.experts.w1'] = experts_params
+ quant_config['block_sparse_moe.experts.w2'] = experts_params
+ quant_config['block_sparse_moe.experts.w3'] = experts_params
+
+ #Quantize
+ model.quantize_model(quant_config=quant_config)
+ ```
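The `quant_config` keys in the added section are matched against Mixtral's per-layer module names: each decoder layer has `self_attn.{q,k,v,o}_proj` projections, and each `block_sparse_moe` expert exposes `w1`, `w2`, and `w3` linear layers. Note that `hf_auth` and `cache_path` in the snippet are assumed to be defined beforehand (an HF access token and a local cache directory). Below is a minimal sketch, assuming the `transformers` Mixtral implementation and `accelerate`'s `init_empty_weights`, that checks which linear layers the config keys cover without loading any weights:

```python
import re
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

# Instantiate Mixtral on the meta device: module structure only, no weight memory.
config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

keys = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj",
        "block_sparse_moe.experts.w1", "block_sparse_moe.experts.w2",
        "block_sparse_moe.experts.w3"]

for name, module in model.named_modules():
    if module.__class__.__name__ == "Linear":
        flat = re.sub(r"\.\d+\.", ".", name)  # drop layer/expert indices from the path
        if not any(flat.endswith(k) for k in keys):
            print("not covered by quant_config:", name)  # e.g. block_sparse_moe.gate, lm_head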
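As a back-of-the-envelope memory estimate: each group of weights stores one scale and one zero-point, and assuming both are quantized to 8 bits (an assumption of this sketch; second-level grouping such as the `group_size=256` scale override is ignored), the configs work out to roughly 4.25 bits per attention weight and 3 bits per expert weight:

```python
# Rough bits-per-weight estimate. Assumption: one 8-bit scale and one 8-bit
# zero-point per group; second-level meta-data grouping overhead is ignored.
def bits_per_weight(nbits, group_size, meta_bits=8):
    return nbits + 2 * meta_bits / group_size

print(f"attention (4-bit, g=64): {bits_per_weight(4, 64):.2f} bits/weight")  # 4.25
print(f"experts   (2-bit, g=16): {bits_per_weight(2, 16):.2f} bits/weight")  # 3.00
```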
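The context lines at the top of the hunk show how the quantized model is loaded back with `from_quantized`. An end-to-end inference sketch, assuming a CUDA device, that the HQQ wrapper exposes the standard `transformers` `generate` API, and with `model_id` left as a placeholder for this repo's id:

```python
import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.core.quantize import HQQLinear, HQQBackend

model_id = "<this-repo-id>"  # placeholder: replace with this model repo's id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE)  # optional backend, as in the README

inputs = tokenizer("Mixture-of-experts models are", return_tensors="pt").to("cuda")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```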