yuekai committed
Commit f605912
1 parent: a3eaad0

Upload folder using huggingface_hub

build_whisper_fp16.sh ADDED
@@ -0,0 +1,34 @@
+ export CUDA_VISIBLE_DEVICES=1
+ INFERENCE_PRECISION=float16
+ MAX_BEAM_WIDTH=4
+ MAX_BATCH_SIZE=8
+ checkpoint_dir=tllm_checkpoint
+ output_dir=distill_whisper_large_v3
+
+
+ # trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
+ # --output_dir ${output_dir}/encoder \
+ # --paged_kv_cache disable \
+ # --moe_plugin disable \
+ # --enable_xqa disable \
+ # --max_batch_size ${MAX_BATCH_SIZE} \
+ # --gemm_plugin disable \
+ # --bert_attention_plugin ${INFERENCE_PRECISION} \
+ # --remove_input_padding disable
+
+ # trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
+ # --output_dir ${output_dir}/decoder \
+ # --paged_kv_cache disable \
+ # --moe_plugin disable \
+ # --enable_xqa disable \
+ # --max_beam_width ${MAX_BEAM_WIDTH} \
+ # --max_batch_size ${MAX_BATCH_SIZE} \
+ # --max_seq_len 100 \
+ # --max_input_len 14 \
+ # --max_encoder_input_len 1500 \
+ # --gemm_plugin ${INFERENCE_PRECISION} \
+ # --bert_attention_plugin ${INFERENCE_PRECISION} \
+ # --gpt_attention_plugin ${INFERENCE_PRECISION} \
+ # --remove_input_padding disable
+
+ python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --name librispeech_dummy_${output_dir}
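Note that the trtllm-build commands in this script are committed commented out, so as uploaded it only re-runs the evaluation against engines that were already built. If the fp16 checkpoint in tllm_checkpoint/ needs to be regenerated first, a minimal sketch of the conversion step, adapted from the commented convert_checkpoint.py call in build_whisper_int8.sh below (the --model_dir path is an assumption; point it at a local copy of distil-whisper/distil-large-v3):

    # Hypothetical fp16 conversion: same call as the int8 script but
    # without the weight-only flags, so weights stay in float16.
    python3 convert_checkpoint.py \
        --model_dir ./distil-large-v3 \
        --output_dir tllm_checkpoint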
build_whisper_int8.sh ADDED
@@ -0,0 +1,37 @@
+ export CUDA_VISIBLE_DEVICES=1
+ INFERENCE_PRECISION=float16
+ WEIGHT_ONLY_PRECISION=int8
+ MAX_BEAM_WIDTH=4
+ MAX_BATCH_SIZE=8
+ checkpoint_dir=tllm_checkpoint_${WEIGHT_ONLY_PRECISION}
+ output_dir=distil_whisper_large_v3_${WEIGHT_ONLY_PRECISION}
+
+ # python3 convert_checkpoint.py --use_weight_only \
+ # --weight_only_precision $WEIGHT_ONLY_PRECISION \
+ # --output_dir $checkpoint_dir
+
+ trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
+ --output_dir ${output_dir}/encoder \
+ --paged_kv_cache disable \
+ --moe_plugin disable \
+ --enable_xqa disable \
+ --max_batch_size ${MAX_BATCH_SIZE} \
+ --gemm_plugin ${INFERENCE_PRECISION} \
+ --bert_attention_plugin ${INFERENCE_PRECISION} \
+ --remove_input_padding disable
+
+ trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
+ --output_dir ${output_dir}/decoder \
+ --paged_kv_cache disable \
+ --moe_plugin disable \
+ --enable_xqa disable \
+ --max_beam_width ${MAX_BEAM_WIDTH} \
+ --max_batch_size ${MAX_BATCH_SIZE} \
+ --max_seq_len 100 \
+ --max_input_len 14 \
+ --max_encoder_input_len 1500 \
+ --gemm_plugin ${INFERENCE_PRECISION} \
+ --bert_attention_plugin ${INFERENCE_PRECISION} \
+ --gpt_attention_plugin ${INFERENCE_PRECISION} \
+ --remove_input_padding disable
+ python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --name librispeech_dummy_${output_dir}
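A successful build should leave one engine per component under the output directory. A quick sanity check (the engine file names follow recent TensorRT-LLM releases and are an assumption):

    # Expect a config.json plus a rank0.engine in each subdirectory.
    ls distil_whisper_large_v3_int8/encoder
    ls distil_whisper_large_v3_int8/decoder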
tllm_checkpoint/decoder/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+     "architecture": "DecoderModel",
+     "dtype": "float16",
+     "logits_dtype": "float16",
+     "num_hidden_layers": 2,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "norm_epsilon": 1e-05,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "use_parallel_embedding": false,
+     "embedding_sharding_dim": 0,
+     "max_position_embeddings": 448,
+     "use_prompt_tuning": false,
+     "head_size": 64,
+     "has_position_embedding": true,
+     "layernorm_type": 0,
+     "has_attention_qkvo_bias": true,
+     "has_mlp_bias": true,
+     "has_model_final_layernorm": true,
+     "has_embedding_layernorm": false,
+     "has_embedding_scale": false,
+     "ffn_hidden_size": 5120,
+     "q_scaling": 1.0,
+     "layernorm_position": 0,
+     "relative_attention": false,
+     "max_distance": 0,
+     "num_buckets": 0,
+     "model_type": "whisper",
+     "rescale_before_lm_head": false,
+     "encoder_hidden_size": 1280,
+     "encoder_num_heads": 20,
+     "encoder_head_size": null,
+     "skip_cross_qkv": false,
+     "quantization": {
+         "quant_algo": null
+     }
+ }
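The derived fields here are internally consistent: head_size 64 is hidden_size 1280 divided by num_attention_heads 20, and ffn_hidden_size 5120 is 4x the hidden size. A throwaway check (path assumes the repo root):

    # Verify head_size and ffn_hidden_size against the base dimensions.
    python3 -c 'import json; c = json.load(open("tllm_checkpoint/decoder/config.json")); assert c["hidden_size"] // c["num_attention_heads"] == c["head_size"]; assert c["ffn_hidden_size"] == 4 * c["hidden_size"]; print("decoder config consistent")'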
tllm_checkpoint/decoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a2e1b9d985e4764effd3367fb7994c5b5cc4ac1dedd63f3c829d05f30113118
+ size 371665280
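The .safetensors entries in this commit are Git LFS pointer files, not the weights themselves. To materialize the actual tensors after cloning (assumes git-lfs is installed; the include pattern below matches both checkpoint directories):

    # Fetch the real weight files behind the LFS pointers.
    git lfs pull --include="tllm_checkpoint*"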
tllm_checkpoint/encoder/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "architecture": "WhisperEncoder",
+     "dtype": "float16",
+     "num_hidden_layers": 32,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "n_mels": 128,
+     "n_audio_ctx": 1500,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "num_languages": 100,
+     "quantization": {
+         "quant_algo": null
+     }
+ }
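n_mels 128 and vocab_size 51866 match the Whisper large-v3 frontend, which distil-large-v3 keeps, and n_audio_ctx 1500 is the encoder output length for a 30 s window (3000 mel frames halved by the stride-2 conv), which is why the decoder builds above pass --max_encoder_input_len 1500. A quick cross-check (paths assume the repo root):

    # Confirm the encoder context length and the build flag agree.
    python3 -c 'import json; c = json.load(open("tllm_checkpoint/encoder/config.json")); print("n_audio_ctx =", c["n_audio_ctx"])'
    grep max_encoder_input_len build_whisper_int8.sh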
tllm_checkpoint/encoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc9f10fdcb40284710f7488bba8c1a021c0d3099487ee886c95c537aacbc22b9
+ size 1288720864
tllm_checkpoint_int8/decoder/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+     "architecture": "DecoderModel",
+     "dtype": "float16",
+     "logits_dtype": "float16",
+     "num_hidden_layers": 2,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "norm_epsilon": 1e-05,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "use_parallel_embedding": false,
+     "embedding_sharding_dim": 0,
+     "max_position_embeddings": 448,
+     "use_prompt_tuning": false,
+     "head_size": 64,
+     "has_position_embedding": true,
+     "layernorm_type": 0,
+     "has_attention_qkvo_bias": true,
+     "has_mlp_bias": true,
+     "has_model_final_layernorm": true,
+     "has_embedding_layernorm": false,
+     "has_embedding_scale": false,
+     "ffn_hidden_size": 5120,
+     "q_scaling": 1.0,
+     "layernorm_position": 0,
+     "relative_attention": false,
+     "max_distance": 0,
+     "num_buckets": 0,
+     "model_type": "whisper",
+     "rescale_before_lm_head": false,
+     "encoder_hidden_size": 1280,
+     "encoder_num_heads": 20,
+     "encoder_head_size": null,
+     "skip_cross_qkv": false,
+     "quantization": {
+         "quant_algo": "W8A16"
+     }
+ }
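This config is identical to tllm_checkpoint/decoder/config.json except for quant_algo: "W8A16" denotes weight-only int8 quantization (int8 weights, float16 activations), matching WEIGHT_ONLY_PRECISION=int8 in the build script. That also lines up with the shard sizes below: the 32-layer encoder roughly halves (1288720864 to 660326480 bytes), while the two-layer decoder shrinks less (371665280 to 319304296 bytes), plausibly because its large embedding table stays in fp16 under weight-only quantization (an inference from the sizes, not verified). To see the single differing field:

    # The only difference between the fp16 and int8 decoder configs.
    diff tllm_checkpoint/decoder/config.json tllm_checkpoint_int8/decoder/config.json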
tllm_checkpoint_int8/decoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85c059ed4fa6c74025538b2c38b42bce19755dfad6390ee1930dfc5c111fed19
+ size 319304296
tllm_checkpoint_int8/encoder/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "architecture": "WhisperEncoder",
+     "dtype": "float16",
+     "num_hidden_layers": 32,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "n_mels": 128,
+     "n_audio_ctx": 1500,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "num_languages": 100,
+     "quantization": {
+         "quant_algo": "W8A16"
+     }
+ }
tllm_checkpoint_int8/encoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5637200b3af4798f5d775e9c150efbda0084902937aa3a2cf200f9a4dfecb35
+ size 660326480