|
export CUDA_VISIBLE_DEVICES=1 |
|
INFERENCE_PRECISION=float16 |
|
WEIGHT_ONLY_PRECISION=int8 |
|
MAX_BEAM_WIDTH=4 |
|
MAX_BATCH_SIZE=8 |
|
checkpoint_dir=tllm_checkpoint_${WEIGHT_ONLY_PRECISION} |
|
output_dir=distil_whisper_large_v3_${WEIGHT_ONLY_PRECISION} |
|
|
|
|
|
|
|
|
|
|
|
# Build the Whisper encoder engine.
# Paged KV cache, MoE plugin, XQA and input-padding removal are disabled:
# the encoder is a BERT-style (non-autoregressive) network, so it uses the
# bert_attention plugin rather than the decoder-side attention path.
trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \
  --output_dir "${output_dir}/encoder" \
  --paged_kv_cache disable \
  --moe_plugin disable \
  --enable_xqa disable \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --gemm_plugin "${INFERENCE_PRECISION}" \
  --bert_attention_plugin "${INFERENCE_PRECISION}" \
  --remove_input_padding disable
# Build the Whisper decoder engine.
# max_input_len 14 covers the decoder prompt tokens; max_seq_len 100 bounds
# the generated transcript length; max_encoder_input_len 1500 matches the
# Whisper encoder output frames (30 s of audio at 50 frames/s).
trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
  --output_dir "${output_dir}/decoder" \
  --paged_kv_cache disable \
  --moe_plugin disable \
  --enable_xqa disable \
  --max_beam_width "${MAX_BEAM_WIDTH}" \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --max_seq_len 100 \
  --max_input_len 14 \
  --max_encoder_input_len 1500 \
  --gemm_plugin "${INFERENCE_PRECISION}" \
  --bert_attention_plugin "${INFERENCE_PRECISION}" \
  --gpt_attention_plugin "${INFERENCE_PRECISION}" \
  --remove_input_padding disable
# Smoke-test the built engines on the dummy LibriSpeech split.
python3 run.py \
  --engine_dir "${output_dir}" \
  --dataset hf-internal-testing/librispeech_asr_dummy \
  --name "librispeech_dummy_${output_dir}"