#!/usr/bin/env bash
# Build TensorRT-LLM engines (encoder + decoder) for Distil-Whisper large-v3
# with int8 weight-only quantization, then run a smoke-test transcription on
# the dummy LibriSpeech dataset.
#
# Requires: trtllm-build on PATH, convert_checkpoint.py / run.py from the
# TensorRT-LLM whisper example, and a pre-converted checkpoint in
# $checkpoint_dir (see the commented-out conversion step below).
set -euo pipefail

# All variables are exported so child processes (trtllm-build, python3) can
# read them, matching the original single `export` statement.
export CUDA_VISIBLE_DEVICES=1
export INFERENCE_PRECISION=float16
export WEIGHT_ONLY_PRECISION=int8
export MAX_BEAM_WIDTH=4
export MAX_BATCH_SIZE=8
export checkpoint_dir="tllm_checkpoint_${WEIGHT_ONLY_PRECISION}"
export output_dir="distil_whisper_large_v3_${WEIGHT_ONLY_PRECISION}"

# One-time step: convert the HF weights into a TensorRT-LLM checkpoint.
# Left commented out, as in the original — uncomment on first run.
# python3 convert_checkpoint.py --use_weight_only \
#   --weight_only_precision "${WEIGHT_ONLY_PRECISION}" \
#   --output_dir "${checkpoint_dir}"

# Encoder engine. NOTE(review): paged KV cache / XQA / input-padding removal
# are disabled — presumably required by the whisper encoder architecture;
# confirm against the TensorRT-LLM whisper example before changing.
trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \
  --output_dir "${output_dir}/encoder" \
  --paged_kv_cache disable \
  --moe_plugin disable \
  --enable_xqa disable \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --gemm_plugin "${INFERENCE_PRECISION}" \
  --bert_attention_plugin "${INFERENCE_PRECISION}" \
  --remove_input_padding disable

# Decoder engine. max_encoder_input_len=1500 matches Whisper's fixed
# 30 s / 1500-frame encoder output; max_input_len=14 covers the decoder
# prompt tokens; max_seq_len=100 caps generated text length.
trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
  --output_dir "${output_dir}/decoder" \
  --paged_kv_cache disable \
  --moe_plugin disable \
  --enable_xqa disable \
  --max_beam_width "${MAX_BEAM_WIDTH}" \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --max_seq_len 100 \
  --max_input_len 14 \
  --max_encoder_input_len 1500 \
  --gemm_plugin "${INFERENCE_PRECISION}" \
  --bert_attention_plugin "${INFERENCE_PRECISION}" \
  --gpt_attention_plugin "${INFERENCE_PRECISION}" \
  --remove_input_padding disable

# Smoke test: transcribe the dummy LibriSpeech set with the freshly built
# engines.
python3 run.py --engine_dir "${output_dir}" \
  --dataset hf-internal-testing/librispeech_asr_dummy \
  --name "librispeech_dummy_${output_dir}"