# Select the GPU and set the engine build parameters.
export CUDA_VISIBLE_DEVICES=1
INFERENCE_PRECISION=float16   # compute precision for both engines
MAX_BEAM_WIDTH=4              # maximum beam width baked into the decoder engine
MAX_BATCH_SIZE=8              # maximum batch size for both engines
checkpoint_dir=tllm_checkpoint        # converted TensorRT-LLM checkpoint (input)
output_dir=distill_whisper_large_v3   # built engines (output)
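
# Assumption: ${checkpoint_dir} already contains encoder/ and decoder/
# subdirectories produced by the TensorRT-LLM whisper example's checkpoint
# conversion step; the exact conversion command and its flags depend on the
# TensorRT-LLM version (see the example's convert_checkpoint.py --help).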
# Build the encoder engine (commented out here; uncomment to rebuild).
# trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
#              --output_dir ${output_dir}/encoder \
#              --paged_kv_cache disable \
#              --moe_plugin disable \
#              --enable_xqa disable \
#              --max_batch_size ${MAX_BATCH_SIZE} \
#              --gemm_plugin disable \
#              --bert_attention_plugin ${INFERENCE_PRECISION} \
#              --remove_input_padding disable
# Build the decoder engine (commented out here; uncomment to rebuild).
# max_encoder_input_len=1500 matches Whisper's encoder output length
# (30 s of audio -> 3000 mel frames, downsampled by 2 in the encoder).
# trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
#              --output_dir ${output_dir}/decoder \
#              --paged_kv_cache disable \
#              --moe_plugin disable \
#              --enable_xqa disable \
#              --max_beam_width ${MAX_BEAM_WIDTH} \
#              --max_batch_size ${MAX_BATCH_SIZE} \
#              --max_seq_len 100 \
#              --max_input_len 14 \
#              --max_encoder_input_len 1500 \
#              --gemm_plugin ${INFERENCE_PRECISION} \
#              --bert_attention_plugin ${INFERENCE_PRECISION} \
#              --gpt_attention_plugin ${INFERENCE_PRECISION} \
#              --remove_input_padding disable
# Run the built engines on the LibriSpeech dummy split.
python3 run.py --engine_dir ${output_dir} --dataset hf-internal-testing/librispeech_asr_dummy --name librispeech_dummy_${output_dir}
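# run.py here refers to the evaluation script shipped with the TensorRT-LLM
# whisper example; it decodes the dataset and reports word error rate (WER)
# for the run, with results labeled by the --name argument.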