update model generated by v0.2
Browse files
Signed-off-by: wenhuach <[email protected]>
- README.md +21 -36
- config.json +7 -6
- model.safetensors +2 -2
- quantize_config.json +3 -3
- tokenizer.json +2 -2
README.md
CHANGED
@@ -1,10 +1,3 @@
|
|
1 |
-
---
|
2 |
-
license: apache-2.0
|
3 |
-
datasets:
|
4 |
-
- NeelNanda/pile-10k
|
5 |
-
language:
|
6 |
-
- en
|
7 |
-
---
|
8 |
|
9 |
|
10 |
|
@@ -21,11 +14,8 @@ This model is an int4 model with group_size 128 of [google/gemma-2b](https://hug
|
|
21 |
|
22 |
### INT4 Inference with AutoGPTQ's kernel
|
23 |
|
24 |
-
Install the latest [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ) from source first
|
25 |
-
|
26 |
```python
|
27 |
-
##pip install auto-gptq
|
28 |
-
##pip install triton==2.2.0
|
29 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
30 |
quantized_model_dir = "Intel/gemma-2b-int4-inc"
|
31 |
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
|
@@ -37,41 +27,40 @@ tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)
|
|
37 |
text = "There is a girl who likes adventure,"
|
38 |
inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
39 |
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=True)[0]))
|
|
|
40 |
```
|
41 |
|
42 |
|
43 |
|
44 |
### Evaluate the model
|
45 |
|
46 |
-
|
47 |
|
48 |
-
pip install auto-gptq
|
49 |
-
pip install triton==2.2.0
|
50 |
|
51 |
Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community.
|
52 |
|
53 |
```bash
|
54 |
-
lm_eval --model hf --model_args pretrained="Intel/gemma-2b-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,
|
55 |
```
|
56 |
|
57 |
-
| Metric | FP16 | int4 |
|
58 |
-
| -------------- | ------ | ------ |
|
59 |
-
| Avg. | 0.5383 | 0.5338 |
|
60 |
-
| mmlu | 0.3337 | 0.3276 |
|
61 |
-
| lambada_openai | 0.6398 | 0.6319 |
|
62 |
-
| hellaswag | 0.5271 | 0.5161 |
|
63 |
-
| winogrande | 0.6472 | 0.6472 |
|
64 |
-
| piqa | 0.7699 | 0.7622 |
|
65 |
-
| truthfulqa_mc1 | 0.2203 | 0.2191 |
|
66 |
-
| openbookqa | 0.3020 | 0.2980 |
|
67 |
-
| boolq | 0.6939 | 0.6939 |
|
68 |
-
| rte | 0.6426 | 0.6498 |
|
69 |
-
| arc_easy | 0.7424 | 0.7348 |
|
70 |
-
| arc_challenge | 0.4019 | 0.3908 |
|
71 |
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
### Reproduce the model
|
75 |
|
76 |
Here is the sample command to reproduce the model
|
77 |
|
@@ -85,6 +74,8 @@ python3 main.py \
|
|
85 |
--group_size 128 \
|
86 |
--bits 4 \
|
87 |
--iters 400 \
|
|
|
|
|
88 |
--deployment_device 'gpu' \
|
89 |
--output_dir "./tmp_autoround"
|
90 |
|
@@ -111,9 +102,3 @@ Here are a couple of useful links to learn more about Intel's AI software:
|
|
111 |
|
112 |
The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
|
113 |
|
114 |
-
|
115 |
-
## Cite
|
116 |
-
|
117 |
-
@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
|
118 |
-
|
119 |
-
[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
|
3 |
|
|
|
14 |
|
15 |
### INT4 Inference with AutoGPTQ's kernel
|
16 |
|
|
|
|
|
17 |
```python
|
18 |
+
##pip install auto-gptq
|
|
|
19 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
20 |
quantized_model_dir = "Intel/gemma-2b-int4-inc"
|
21 |
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
|
|
|
27 |
text = "There is a girl who likes adventure,"
|
28 |
inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
29 |
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=True)[0]))
|
30 |
+
##<bos>There is a girl who likes adventure, and she is a girl who likes to travel. She is a girl who likes to explore the world and see new things. She is a girl who likes to meet new people and learn about their cultures. She is a girl who likes to take risks
|
31 |
```
|
32 |
|
33 |
|
34 |
|
35 |
### Evaluate the model
|
36 |
|
37 |
+
pip3 install lm-eval==0.4.2
|
38 |
|
39 |
+
pip install auto-gptq
|
|
|
40 |
|
41 |
Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community.
|
42 |
|
43 |
```bash
|
44 |
+
lm_eval --model hf --model_args pretrained="Intel/gemma-2b-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16
|
45 |
```
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
|
49 |
+
| Metric | BF16 | FP16 | AutoRound v0.1 | AutoRound v0.2 |
|
50 |
+
| -------------- | ------ | ------ | -------------- | -------------- |
|
51 |
+
| Avg. | 0.5263 | 0.5277 | 0.5235 | 0.5248 |
|
52 |
+
| mmlu | 0.3287 | 0.3287 | 0.3297 | 0.3309 |
|
53 |
+
| lambada_openai | 0.6344 | 0.6375 | 0.6307 | 0.6379 |
|
54 |
+
| hellaswag | 0.5273 | 0.5281 | 0.5159 | 0.5184 |
|
55 |
+
| winogrande | 0.6504 | 0.6488 | 0.6543 | 0.6575 |
|
56 |
+
| piqa | 0.7671 | 0.7720 | 0.7612 | 0.7606 |
|
57 |
+
| truthfulqa_mc1 | 0.2203 | 0.2203 | 0.2203 | 0.2191 |
|
58 |
+
| openbookqa | 0.2980 | 0.3020 | 0.3000 | 0.3060 |
|
59 |
+
| boolq | 0.6927 | 0.6936 | 0.6939 | 0.6966 |
|
60 |
+
| arc_easy | 0.7420 | 0.7403 | 0.7353 | 0.7357 |
|
61 |
+
| arc_challenge | 0.4019 | 0.4061 | 0.3933 | 0.3857 |
|
62 |
+
|
63 |
|
|
|
64 |
|
65 |
Here is the sample command to reproduce the model
|
66 |
|
|
|
74 |
--group_size 128 \
|
75 |
--bits 4 \
|
76 |
--iters 400 \
|
77 |
+
--use_quant_input \
|
78 |
+
--model_dtype "float16" \
|
79 |
--deployment_device 'gpu' \
|
80 |
--output_dir "./tmp_autoround"
|
81 |
|
|
|
102 |
|
103 |
The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
config.json
CHANGED
@@ -9,6 +9,7 @@
|
|
9 |
"eos_token_id": 1,
|
10 |
"head_dim": 256,
|
11 |
"hidden_act": "gelu",
|
|
|
12 |
"hidden_size": 2048,
|
13 |
"initializer_range": 0.02,
|
14 |
"intermediate_size": 16384,
|
@@ -19,11 +20,12 @@
|
|
19 |
"num_key_value_heads": 1,
|
20 |
"pad_token_id": 0,
|
21 |
"quantization_config": {
|
22 |
-
"autoround_version": "0.
|
23 |
"bits": 4,
|
24 |
"damp_percent": 0.01,
|
25 |
"desc_act": false,
|
26 |
"enable_minmax_tuning": true,
|
|
|
27 |
"group_size": 128,
|
28 |
"is_marlin_format": false,
|
29 |
"iters": 400,
|
@@ -32,17 +34,16 @@
|
|
32 |
"model_file_base_name": "model",
|
33 |
"model_name_or_path": null,
|
34 |
"quant_method": "gptq",
|
35 |
-
"scale_dtype": "
|
36 |
"static_groups": false,
|
37 |
"sym": false,
|
38 |
-
"true_sequential": false
|
39 |
-
"use_quant_input": true
|
40 |
},
|
41 |
"rms_norm_eps": 1e-06,
|
42 |
"rope_scaling": null,
|
43 |
"rope_theta": 10000.0,
|
44 |
-
"torch_dtype": "
|
45 |
-
"transformers_version": "4.
|
46 |
"use_cache": true,
|
47 |
"vocab_size": 256000
|
48 |
}
|
|
|
9 |
"eos_token_id": 1,
|
10 |
"head_dim": 256,
|
11 |
"hidden_act": "gelu",
|
12 |
+
"hidden_activation": null,
|
13 |
"hidden_size": 2048,
|
14 |
"initializer_range": 0.02,
|
15 |
"intermediate_size": 16384,
|
|
|
20 |
"num_key_value_heads": 1,
|
21 |
"pad_token_id": 0,
|
22 |
"quantization_config": {
|
23 |
+
"autoround_version": "0.2.0.dev",
|
24 |
"bits": 4,
|
25 |
"damp_percent": 0.01,
|
26 |
"desc_act": false,
|
27 |
"enable_minmax_tuning": true,
|
28 |
+
"enable_quanted_input": true,
|
29 |
"group_size": 128,
|
30 |
"is_marlin_format": false,
|
31 |
"iters": 400,
|
|
|
34 |
"model_file_base_name": "model",
|
35 |
"model_name_or_path": null,
|
36 |
"quant_method": "gptq",
|
37 |
+
"scale_dtype": "float16",
|
38 |
"static_groups": false,
|
39 |
"sym": false,
|
40 |
+
"true_sequential": false
|
|
|
41 |
},
|
42 |
"rms_norm_eps": 1e-06,
|
43 |
"rope_scaling": null,
|
44 |
"rope_theta": 10000.0,
|
45 |
+
"torch_dtype": "float16",
|
46 |
+
"transformers_version": "4.40.2",
|
47 |
"use_cache": true,
|
48 |
"vocab_size": 256000
|
49 |
}
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fcbf563b9667464d9217348e712763af0ae6acd26c5b53dc15483c18d51e910d
|
3 |
+
size 3130472744
|
quantize_config.json
CHANGED
@@ -10,11 +10,11 @@
|
|
10 |
"model_file_base_name": "model",
|
11 |
"is_marlin_format": false,
|
12 |
"quant_method": "intel/auto-round",
|
13 |
-
"autoround_version": "0.
|
14 |
"iters": 400,
|
15 |
"lr": 0.0025,
|
16 |
"minmax_lr": 0.0025,
|
17 |
"enable_minmax_tuning": true,
|
18 |
-
"
|
19 |
-
"scale_dtype": "
|
20 |
}
|
|
|
10 |
"model_file_base_name": "model",
|
11 |
"is_marlin_format": false,
|
12 |
"quant_method": "intel/auto-round",
|
13 |
+
"autoround_version": "0.2.0.dev",
|
14 |
"iters": 400,
|
15 |
"lr": 0.0025,
|
16 |
"minmax_lr": 0.0025,
|
17 |
"enable_minmax_tuning": true,
|
18 |
+
"enable_quanted_input": true,
|
19 |
+
"scale_dtype": "float16"
|
20 |
}
|
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4db21bfaffa1fd75fd741df2d95dc51e539d5cc38b07934bae0d7d129db90662
|
3 |
+
size 17477581
|