bpawar committed on
Commit 1b438fb • 1 Parent(s): f8345df

Initial commit

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.onnx_data filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
3rd_party_licenses.txt ADDED
@@ -0,0 +1,48 @@
+ 1: Name: TensorRT Model Optimizer - Windows
+ Version: 0.19
+ LicenseText: MIT License
+
+ Copyright (c) 2023 MIT HAN Lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ 2: Name: onnxruntime-genai-directml
+ Version: 0.4.0
+ LicenseText: MIT License
+
+ Copyright (c) Microsoft Corporation.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
License.txt ADDED
@@ -0,0 +1 @@
+ GOVERNING TERMS: Use of this model is governed by the NVIDIA Open Model License Agreement (found at https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf). ADDITIONAL INFORMATION: Apache License, Version 2.0 (found at https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md).
Readme.txt ADDED
@@ -0,0 +1,10 @@
+ To run inference with this model, follow the steps below:
+
+ 1. Install NVIDIA graphics driver R565 or higher.
+ 2. Install Python 3.10.11.
+ 3. Set up GenAI DirectML:
+    a. Install numpy: pip install numpy==2.1.0
+    b. Install GenAI: pip install onnxruntime-genai-directml==0.5.0
+ 4. Download the inference script: curl -L https://raw.githubusercontent.com/microsoft/onnxruntime-genai/rel-0.5.0/examples/python/phi3-qa.py -o phi3-qa.py
+ 5. Run inference: python phi3-qa.py -m <model-downloaded-path>
+    a. Enter a prompt, e.g. "What is GenAI?"
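For reference, phi3-qa.py is at its core a small token-generation loop over the onnxruntime-genai API. Below is a minimal sketch of that loop, assuming the 0.5.0-era Python API; the [INST]...[/INST] prompt wrapping is an assumption for this Mistral model (phi3-qa.py itself applies a Phi-3 chat template), and max_length=512 is an arbitrary illustrative value.

    import onnxruntime_genai as og

    # point at the downloaded repo folder (genai_config.json, model.onnx, model.onnx_data, tokenizer.json)
    model = og.Model(r"<model-downloaded-path>")
    tokenizer = og.Tokenizer(model)
    stream = tokenizer.create_stream()  # incremental detokenizer for streaming output

    params = og.GeneratorParams(model)
    params.set_search_options(max_length=512)
    params.input_ids = tokenizer.encode("[INST] What is GenAI? [/INST]")

    generator = og.Generator(model, params)
    while not generator.is_done():
        generator.compute_logits()
        generator.generate_next_token()
        print(stream.decode(generator.get_next_tokens()[0]), end="", flush=True)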
genai_config.json ADDED
@@ -0,0 +1,54 @@
+ {
+     "model": {
+         "bos_token_id": 1,
+         "context_length": 1024000,
+         "decoder": {
+             "session_options": {
+                 "log_id": "onnxruntime-genai",
+                 "provider_options": [
+                     {
+                         "dml": {}
+                     }
+                 ]
+             },
+             "filename": "model.onnx",
+             "head_size": 128,
+             "hidden_size": 5120,
+             "inputs": {
+                 "input_ids": "input_ids",
+                 "attention_mask": "attention_mask",
+                 "position_ids": "position_ids",
+                 "past_key_names": "past_key_values.%d.key",
+                 "past_value_names": "past_key_values.%d.value"
+             },
+             "outputs": {
+                 "logits": "logits",
+                 "present_key_names": "present.%d.key",
+                 "present_value_names": "present.%d.value"
+             },
+             "num_attention_heads": 32,
+             "num_hidden_layers": 40,
+             "num_key_value_heads": 8
+         },
+         "eos_token_id": 2,
+         "pad_token_id": 2,
+         "type": "mistral",
+         "vocab_size": 131072
+     },
+     "search": {
+         "diversity_penalty": 0.0,
+         "do_sample": false,
+         "early_stopping": true,
+         "length_penalty": 1.0,
+         "max_length": 1024000,
+         "min_length": 0,
+         "no_repeat_ngram_size": 0,
+         "num_beams": 1,
+         "num_return_sequences": 1,
+         "past_present_share_buffer": true,
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_k": 1,
+         "top_p": 1.0
+     }
+ }
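Note that the "search" block amounts to greedy decoding (num_beams=1, top_k=1, do_sample=false) with max_length equal to the model's full 1,024,000-token context. These defaults can be overridden per request rather than by editing the file; a short sketch reusing og and model from the sketch above (set_search_options is the documented hook, and the option names mirror the JSON keys; the sampling values are illustrative):

    # sample instead of the greedy defaults baked into genai_config.json
    params = og.GeneratorParams(model)
    params.set_search_options(do_sample=True, temperature=0.7, top_k=40,
                              top_p=0.9, max_length=1024)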
model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:219f817360446debb5e2b557f2c2e40dc6a5cfc2c9a132ede25b9278b1344372
+ size 545151
model.onnx_data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ef8e2c8550824c282e318d9b074d4c93d054282c37d6493f03015ee825a2b36
+ size 8573839360
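model.onnx and model.onnx_data are stored as Git LFS pointer files: the repo records only a SHA-256 oid and a byte size, and the ~8.5 GB weight blob is fetched separately. A downloaded copy can be verified against the pointer using only the Python standard library:

    import hashlib, os

    def verify_lfs(path, expected_oid, expected_size):
        # stream through sha256 so the 8.5 GB file never sits in memory at once
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        assert os.path.getsize(path) == expected_size, "size mismatch"
        assert h.hexdigest() == expected_oid, "sha256 mismatch"

    verify_lfs("model.onnx_data",
               "9ef8e2c8550824c282e318d9b074d4c93d054282c37d6493f03015ee825a2b36",
               8573839360)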
quantization_log_and_pip_list.txt ADDED
@@ -0,0 +1,184 @@
+ (venv2) D:\modelopt-windows-scripts\ONNX_PTQ>python D:\opset21_patrice.py --onnx_path="D:\GenAI\models\FP16_Mistral-Nemo-Instruct-2407_ONNX\model.onnx" --output_path="D:\GenAI\models\FP16_Mistral-Nemo-Instruct-2407_ONNX\opset_21\model.onnx"
+ Printing opset info of given input model...
+
+ Domain:
+ Version: 14
+
+ Domain: com.microsoft
+ Version: 1
+
+ Printing opset info of output model...
+
+ Domain:
+ Version: 21
+
+ Domain: com.microsoft
+ Version: 1
+
+ (venv2) D:\modelopt-windows-scripts\ONNX_PTQ>python quantize_script.py --model_name=mistralai/Mistral-Nemo-Instruct-2407 --onnx_path=D:\GenAI\models\FP16_Mistral-Nemo-Instruct-2407_ONNX\opset_21\model.onnx --output_path="D:\GenAI\models\FP16_Mistral-Nemo-Instruct-2407_ONNX\opset_21\default_quant_cuda_ep_calib\model.onnx" --calibration_eps=cuda
+
+ --Quantize-Script-- algo=awq_lite, dataset=cnn, calib_size=32, batch_size=1, block_size=128, add-position-ids=True, past-kv=True, rcalib=False, device=cpu, use_zero_point=False
+
+ --Quantize-Script-- awqlite_alpha_step=0.1, awqlite_fuse_nodes=False, awqlite_run_per_subgraph=False, awqclip_alpha_step=0.05, awqclip_alpha_min=0.5, awqclip_bsz_col=1024, calibration_eps=['cuda']
+
+ D:\venv2\Lib\site-packages\transformers\models\auto\configuration_auto.py:1002: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
+   warnings.warn(
+ D:\venv2\Lib\site-packages\transformers\models\auto\tokenization_auto.py:809: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
+   warnings.warn(
+
+ --Quantize-Script-- number_of_batched_samples=32, batch-input-ids-list-len=32, batched_attention_mask=32
+
+ --Quantize-Script-- number of batched inputs = 32
+
+ INFO:root:
+ Quantizing the model....
+
+ INFO:root:Quantization Mode: int4
+ INFO:root:Finding quantizable weights and augmenting graph output with input activations
+ INFO:root:Augmenting took 0.031656503677368164 seconds
+ INFO:root:Saving the model took 60.20284128189087 seconds
+ 2024-11-05 22:37:34.5783341 [W:onnxruntime:, transformer_memcpy.cc:74 onnxruntime::MemcpyTransformer::ApplyImpl] 11 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.
+ 2024-11-05 22:37:34.5949880 [W:onnxruntime:, session_state.cc:1168 onnxruntime::VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
+ 2024-11-05 22:37:34.6026375 [W:onnxruntime:, session_state.cc:1170 onnxruntime::VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
+ Getting activation names maps...: 100%|██████████████████████████████████████████████████████| 280/280 [00:00<?, ?it/s]
+ Running AWQ scale search per node...: 100%|██████████████████████████████████████████| 280/280 [17:50<00:00,  3.82s/it]
+ INFO:root:AWQ scale search took 1070.4731740951538 seconds
+ Quantizing the weights...: 100%|█████████████████████████████████████████████████████| 280/280 [00:15<00:00, 17.78it/s]
+ INFO:root:Quantizing actual weights took 15.744078636169434 seconds
+ INFO:root:Inserting DQ nodes and input_pre_quant_scale node using quantized weights and scales ...
+ INFO:root:Inserting nodes took 0.17318105697631836 seconds
+ INFO:root:Exporting the quantized graph ...
+ Loading extension modelopt_round_and_pack_ext...
+
+ INFO:root:Exporting took 59.45134162902832 seconds
+ INFO:root:
+ Quantization process took 1223.9775414466858 seconds
+ INFO:root:Saving to D:\GenAI\models\FP16_Mistral-Nemo-Instruct-2407_ONNX\opset_21\default_quant_cuda_ep_calib\model.onnx took 9.476586818695068 seconds
+
+ Done
+
+ (venv2) D:\modelopt-windows-scripts\ONNX_PTQ>pip list
+ Package              Version
+ -------------------- -------------------------
+ aiohappyeyeballs     2.4.3
+ aiohttp              3.10.10
+ aiosignal            1.3.1
+ annotated-types      0.7.0
+ attrs                24.2.0
+ certifi              2024.8.30
+ charset-normalizer   3.4.0
+ cloudpickle          3.1.0
+ colorama             0.4.6
+ coloredlogs          15.0.1
+ cppimport            22.8.2
+ cupy-cuda12x         13.3.0
+ datasets             3.1.0
+ dill                 0.3.8
+ fastrlock            0.8.2
+ filelock             3.16.1
+ flatbuffers          24.3.25
+ frozenlist           1.5.0
+ fsspec               2024.9.0
+ huggingface-hub      0.26.2
+ humanfriendly        10.0
+ idna                 3.10
+ Jinja2               3.1.4
+ Mako                 1.3.6
+ markdown-it-py       3.0.0
+ MarkupSafe           3.0.2
+ mdurl                0.1.2
+ mpmath               1.3.0
+ multidict            6.1.0
+ multiprocess         0.70.16
+ networkx             3.4.2
+ ninja                1.11.1.1
+ numpy                1.26.4
+ nvidia-modelopt      0.20.1.dev20+g299b7f8a098
+ onnx                 1.16.0
+ onnx-graphsurgeon    0.5.2
+ onnxconverter-common 1.14.0
+ onnxmltools          1.12.0
+ onnxruntime-gpu      1.20.0
+ packaging            24.1
+ pandas               2.2.3
+ pip                  24.0
+ propcache            0.2.0
+ protobuf             3.20.2
+ pyarrow              18.0.0
+ pybind11             2.13.6
+ pydantic             2.9.2
+ pydantic_core        2.23.4
+ Pygments             2.18.0
+ pyreadline3          3.5.4
+ python-dateutil      2.9.0.post0
+ pytz                 2024.2
+ PyYAML               6.0.2
+ regex                2024.9.11
+ requests             2.32.3
+ rich                 13.9.4
+ safetensors          0.4.5
+ scipy                1.14.1
+ setuptools           65.5.0
+ six                  1.16.0
+ sympy                1.13.3
+ tokenizers           0.20.2
+ torch                2.4.0
+ tqdm                 4.66.6
+ transformers         4.46.1
+ typing_extensions    4.12.2
+ tzdata               2024.2
+ urllib3              2.2.3
+ xxhash               3.5.0
+ yarl                 1.17.1
+
+ [notice] A new release of pip is available: 24.0 -> 24.3.1
+ [notice] To update, run: python.exe -m pip install --upgrade pip
+
+ (venv2) D:\modelopt-windows-scripts\ONNX_PTQ>
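The first command in the log raises the default ai.onnx opset from 14 to 21 while leaving the com.microsoft domain at version 1. opset21_patrice.py itself is not included in this repo; a rough sketch of the same step using onnx's stock version converter (large models with external weights may need extra handling, so treat this as illustrative, not a reproduction of that script):

    import onnx
    from onnx import version_converter

    # external weights in model.onnx_data are resolved from the same folder
    model = onnx.load("model.onnx")
    converted = version_converter.convert_version(model, 21)
    onnx.save(converted, "opset_21/model.onnx",
              save_as_external_data=True)  # keep the >2 GB weights outside the protobuf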
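Similarly, the quantize_script.py invocation drives TensorRT Model Optimizer's ONNX INT4 PTQ path (nvidia-modelopt 0.20.1 in the pip list above). That script is not published either, so the following is only an assumed sketch of the underlying call; the import path, the calibration_method value, and data_reader are assumptions inferred from the log's parameters and the ModelOpt ONNX PTQ documentation, not a confirmed API reproduction:

    import onnx
    # assumed import path for nvidia-modelopt's ONNX INT4 PTQ API
    from modelopt.onnx.quantization.int4 import quantize as quantize_int4

    quantized = quantize_int4(
        "opset_21/model.onnx",
        calibration_method="awq_lite",        # matches algo=awq_lite in the log
        calibration_data_reader=data_reader,  # hypothetical reader yielding the 32 cnn calibration batches
        calibration_eps=["cuda"],             # matches --calibration_eps=cuda
    )
    onnx.save(quantized, "default_quant_cuda_ep_calib/model.onnx", save_as_external_data=True)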
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "bos_token": {
+         "content": "<s>",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     },
+     "eos_token": {
+         "content": "</s>",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     },
+     "unk_token": {
+         "content": "<unk>",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     }
+ }
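These special-token strings should line up with the ids in genai_config.json (bos_token_id 1 for <s>, eos_token_id/pad_token_id 2 for </s>). The mapping can be spot-checked against tokenizer.json with the tokenizers library; the expected ids in the comments below are inferred from genai_config.json, not independently verified:

    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizer.json")
    print(tok.token_to_id("<s>"))   # expected 1, per bos_token_id in genai_config.json
    print(tok.token_to_id("</s>"))  # expected 2, per eos_token_id / pad_token_id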
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0240ce510f08e6c2041724e9043e33be9d251d1e4a4d94eb68cd47b954b61d2
+ size 17078292
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff