yuekai committed
Commit f605912
1 parent: a3eaad0

Upload folder using huggingface_hub

build_whisper_fp16.sh ADDED
@@ -0,0 +1,34 @@
+ export CUDA_VISIBLE_DEVICES=1
+ INFERENCE_PRECISION=float16
+ MAX_BEAM_WIDTH=4
+ MAX_BATCH_SIZE=8
+ checkpoint_dir=tllm_checkpoint
+ output_dir=distill_whisper_large_v3
+
+
+ # trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
+ # --output_dir ${output_dir}/encoder \
+ # --paged_kv_cache disable \
+ # --moe_plugin disable \
+ # --enable_xqa disable \
+ # --max_batch_size ${MAX_BATCH_SIZE} \
+ # --gemm_plugin disable \
+ # --bert_attention_plugin ${INFERENCE_PRECISION} \
+ # --remove_input_padding disable
+
+ # trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
+ # --output_dir ${output_dir}/decoder \
+ # --paged_kv_cache disable \
+ # --moe_plugin disable \
+ # --enable_xqa disable \
+ # --max_beam_width ${MAX_BEAM_WIDTH} \
+ # --max_batch_size ${MAX_BATCH_SIZE} \
+ # --max_seq_len 100 \
+ # --max_input_len 14 \
+ # --max_encoder_input_len 1500 \
+ # --gemm_plugin ${INFERENCE_PRECISION} \
+ # --bert_attention_plugin ${INFERENCE_PRECISION} \
+ # --gpt_attention_plugin ${INFERENCE_PRECISION} \
+ # --remove_input_padding disable
+
+ python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --name librispeech_dummy_${output_dir}
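Note that the trtllm-build commands in this script are committed commented out, so as uploaded it only re-runs the evaluation against engines that were already built. If the fp16 checkpoint in tllm_checkpoint/ needs to be regenerated first, a minimal sketch of the conversion step, adapted from the commented convert_checkpoint.py call in build_whisper_int8.sh below (the --model_dir path is an assumption; point it at a local copy of distil-whisper/distil-large-v3):

    # Hypothetical fp16 conversion: same call as the int8 script but
    # without the weight-only flags, so weights stay in float16.
    python3 convert_checkpoint.py \
        --model_dir ./distil-large-v3 \
        --output_dir tllm_checkpoint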
build_whisper_int8.sh ADDED
@@ -0,0 +1,37 @@
+ export CUDA_VISIBLE_DEVICES=1
+ INFERENCE_PRECISION=float16
+ WEIGHT_ONLY_PRECISION=int8
+ MAX_BEAM_WIDTH=4
+ MAX_BATCH_SIZE=8
+ checkpoint_dir=tllm_checkpoint_${WEIGHT_ONLY_PRECISION}
+ output_dir=distil_whisper_large_v3_${WEIGHT_ONLY_PRECISION}
+
+ # python3 convert_checkpoint.py --use_weight_only \
+ # --weight_only_precision $WEIGHT_ONLY_PRECISION \
+ # --output_dir $checkpoint_dir
+
+ trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
+ --output_dir ${output_dir}/encoder \
+ --paged_kv_cache disable \
+ --moe_plugin disable \
+ --enable_xqa disable \
+ --max_batch_size ${MAX_BATCH_SIZE} \
+ --gemm_plugin ${INFERENCE_PRECISION} \
+ --bert_attention_plugin ${INFERENCE_PRECISION} \
+ --remove_input_padding disable
+
+ trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
+ --output_dir ${output_dir}/decoder \
+ --paged_kv_cache disable \
+ --moe_plugin disable \
+ --enable_xqa disable \
+ --max_beam_width ${MAX_BEAM_WIDTH} \
+ --max_batch_size ${MAX_BATCH_SIZE} \
+ --max_seq_len 100 \
+ --max_input_len 14 \
+ --max_encoder_input_len 1500 \
+ --gemm_plugin ${INFERENCE_PRECISION} \
+ --bert_attention_plugin ${INFERENCE_PRECISION} \
+ --gpt_attention_plugin ${INFERENCE_PRECISION} \
+ --remove_input_padding disable
+ python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --name librispeech_dummy_${output_dir}
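A successful build should leave one engine per component under the output directory. A quick sanity check (the engine file names follow recent TensorRT-LLM releases and are an assumption):

    # Expect a config.json plus a rank0.engine in each subdirectory.
    ls distil_whisper_large_v3_int8/encoder
    ls distil_whisper_large_v3_int8/decoder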
tllm_checkpoint/decoder/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+     "architecture": "DecoderModel",
+     "dtype": "float16",
+     "logits_dtype": "float16",
+     "num_hidden_layers": 2,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "norm_epsilon": 1e-05,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "use_parallel_embedding": false,
+     "embedding_sharding_dim": 0,
+     "max_position_embeddings": 448,
+     "use_prompt_tuning": false,
+     "head_size": 64,
+     "has_position_embedding": true,
+     "layernorm_type": 0,
+     "has_attention_qkvo_bias": true,
+     "has_mlp_bias": true,
+     "has_model_final_layernorm": true,
+     "has_embedding_layernorm": false,
+     "has_embedding_scale": false,
+     "ffn_hidden_size": 5120,
+     "q_scaling": 1.0,
+     "layernorm_position": 0,
+     "relative_attention": false,
+     "max_distance": 0,
+     "num_buckets": 0,
+     "model_type": "whisper",
+     "rescale_before_lm_head": false,
+     "encoder_hidden_size": 1280,
+     "encoder_num_heads": 20,
+     "encoder_head_size": null,
+     "skip_cross_qkv": false,
+     "quantization": {
+         "quant_algo": null
+     }
+ }
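The derived fields here are internally consistent: head_size 64 is hidden_size 1280 divided by num_attention_heads 20, and ffn_hidden_size 5120 is 4x the hidden size. A throwaway check (path assumes the repo root):

    # Verify head_size and ffn_hidden_size against the base dimensions.
    python3 -c 'import json; c = json.load(open("tllm_checkpoint/decoder/config.json")); assert c["hidden_size"] // c["num_attention_heads"] == c["head_size"]; assert c["ffn_hidden_size"] == 4 * c["hidden_size"]; print("decoder config consistent")'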
tllm_checkpoint/decoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a2e1b9d985e4764effd3367fb7994c5b5cc4ac1dedd63f3c829d05f30113118
+ size 371665280
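The .safetensors entries in this commit are Git LFS pointer files, not the weights themselves. To materialize the actual tensors after cloning (assumes git-lfs is installed; the include pattern below matches both checkpoint directories):

    # Fetch the real weight files behind the LFS pointers.
    git lfs pull --include="tllm_checkpoint*"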
tllm_checkpoint/encoder/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "architecture": "WhisperEncoder",
+     "dtype": "float16",
+     "num_hidden_layers": 32,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "n_mels": 128,
+     "n_audio_ctx": 1500,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "num_languages": 100,
+     "quantization": {
+         "quant_algo": null
+     }
+ }
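n_mels 128 and vocab_size 51866 match the Whisper large-v3 frontend, which distil-large-v3 keeps, and n_audio_ctx 1500 is the encoder output length for a 30 s window (3000 mel frames halved by the stride-2 conv), which is why the decoder builds above pass --max_encoder_input_len 1500. A quick cross-check (paths assume the repo root):

    # Confirm the encoder context length and the build flag agree.
    python3 -c 'import json; c = json.load(open("tllm_checkpoint/encoder/config.json")); print("n_audio_ctx =", c["n_audio_ctx"])'
    grep max_encoder_input_len build_whisper_int8.sh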
tllm_checkpoint/encoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc9f10fdcb40284710f7488bba8c1a021c0d3099487ee886c95c537aacbc22b9
+ size 1288720864
tllm_checkpoint_int8/decoder/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+     "architecture": "DecoderModel",
+     "dtype": "float16",
+     "logits_dtype": "float16",
+     "num_hidden_layers": 2,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "norm_epsilon": 1e-05,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "use_parallel_embedding": false,
+     "embedding_sharding_dim": 0,
+     "max_position_embeddings": 448,
+     "use_prompt_tuning": false,
+     "head_size": 64,
+     "has_position_embedding": true,
+     "layernorm_type": 0,
+     "has_attention_qkvo_bias": true,
+     "has_mlp_bias": true,
+     "has_model_final_layernorm": true,
+     "has_embedding_layernorm": false,
+     "has_embedding_scale": false,
+     "ffn_hidden_size": 5120,
+     "q_scaling": 1.0,
+     "layernorm_position": 0,
+     "relative_attention": false,
+     "max_distance": 0,
+     "num_buckets": 0,
+     "model_type": "whisper",
+     "rescale_before_lm_head": false,
+     "encoder_hidden_size": 1280,
+     "encoder_num_heads": 20,
+     "encoder_head_size": null,
+     "skip_cross_qkv": false,
+     "quantization": {
+         "quant_algo": "W8A16"
+     }
+ }
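This config is identical to tllm_checkpoint/decoder/config.json except for quant_algo: "W8A16" denotes weight-only int8 quantization (int8 weights, float16 activations), matching WEIGHT_ONLY_PRECISION=int8 in the build script. That also lines up with the shard sizes below: the 32-layer encoder roughly halves (1288720864 to 660326480 bytes), while the two-layer decoder shrinks less (371665280 to 319304296 bytes), plausibly because its large embedding table stays in fp16 under weight-only quantization (an inference from the sizes, not verified). To see the single differing field:

    # The only difference between the fp16 and int8 decoder configs.
    diff tllm_checkpoint/decoder/config.json tllm_checkpoint_int8/decoder/config.json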
tllm_checkpoint_int8/decoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85c059ed4fa6c74025538b2c38b42bce19755dfad6390ee1930dfc5c111fed19
+ size 319304296
tllm_checkpoint_int8/encoder/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "architecture": "WhisperEncoder",
+     "dtype": "float16",
+     "num_hidden_layers": 32,
+     "num_attention_heads": 20,
+     "hidden_size": 1280,
+     "n_mels": 128,
+     "n_audio_ctx": 1500,
+     "vocab_size": 51866,
+     "hidden_act": "gelu",
+     "num_languages": 100,
+     "quantization": {
+         "quant_algo": "W8A16"
+     }
+ }
tllm_checkpoint_int8/encoder/rank0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5637200b3af4798f5d775e9c150efbda0084902937aa3a2cf200f9a4dfecb35
+ size 660326480