luow-amd and haoyang-amd committed
Commit b97b5e3
1 parent: 17fe796

Update README.md (#13)


- Update README.md (b1f366effa19905a257abc32ed1d492c78581c58)


Co-authored-by: haoyanli <[email protected]>

Files changed (1)
  1. README.md +5 -3
README.md CHANGED
@@ -23,8 +23,9 @@ python3 quantize_quark.py \
     --output_dir Meta-Llama-3.1-8B-Instruct-FP8-KV \
     --quant_scheme w_fp8_a_fp8 \
     --kv_cache_dtype fp8 \
-    --num_calib_data 128 \
-    --model_export quark_safetensors
+    --num_calib_data 128 \
+    --model_export quark_safetensors \
+    --no_weight_matrix_merge
 
 # If model size is too large for single GPU, please use multi GPU instead.
 python3 quantize_quark.py \
@@ -32,8 +33,9 @@ python3 quantize_quark.py \
     --output_dir Meta-Llama-3.1-8B-Instruct-FP8-KV \
     --quant_scheme w_fp8_a_fp8 \
     --kv_cache_dtype fp8 \
-    --num_calib_data 128 \
+    --num_calib_data 128 \
     --model_export quark_safetensors \
+    --no_weight_matrix_merge \
     --multi_gpu
 ```
 ## Deployment
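
For reference, here is a sketch of the single-GPU quantization command as it reads after this change, assembled from the new side of the diff above. Any arguments that appear between the `python3 quantize_quark.py` invocation and `--output_dir` in the README (for example, the input model path) lie outside the shown hunks and are omitted here.

```shell
# Single-GPU quantization command as updated by this commit.
# NOTE: arguments between the script invocation and --output_dir are not
# visible in the diff hunks above and are intentionally omitted.
python3 quantize_quark.py \
    --output_dir Meta-Llama-3.1-8B-Instruct-FP8-KV \
    --quant_scheme w_fp8_a_fp8 \
    --kv_cache_dtype fp8 \
    --num_calib_data 128 \
    --model_export quark_safetensors \
    --no_weight_matrix_merge
```

The multi-GPU variant in the README is identical except that `--no_weight_matrix_merge` keeps its trailing backslash and `--multi_gpu` is appended as the final argument.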