export CUDA_VISIBLE_DEVICES=1
export INFERENCE_PRECISION=float16
export MAX_BEAM_WIDTH=4
export MAX_BATCH_SIZE=8
export checkpoint_dir=tllm_checkpoint
export output_dir=distill_whisper_large_v3

# trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
#              --output_dir ${output_dir}/encoder \
#              --paged_kv_cache disable \
#              --moe_plugin disable \
#              --enable_xqa disable \
#              --max_batch_size ${MAX_BATCH_SIZE} \
#              --gemm_plugin disable \
#              --bert_attention_plugin ${INFERENCE_PRECISION} \
#              --remove_input_padding disable

# trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
#              --output_dir ${output_dir}/decoder \
#              --paged_kv_cache disable \
#              --moe_plugin disable \
#              --enable_xqa disable \
#              --max_beam_width ${MAX_BEAM_WIDTH} \
#              --max_batch_size ${MAX_BATCH_SIZE} \
#              --max_seq_len 100 \
#              --max_input_len 14 \
#              --max_encoder_input_len 1500 \
#              --gemm_plugin ${INFERENCE_PRECISION} \
#              --bert_attention_plugin ${INFERENCE_PRECISION} \
#              --gpt_attention_plugin ${INFERENCE_PRECISION} \
#              --remove_input_padding disable

python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --name librispeech_dummy_${output_dir}
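
# Note: with both trtllm-build blocks commented out, run.py assumes the encoder
# and decoder engines were already built into ${output_dir}. A minimal pre-flight
# check could be placed before the run.py call above (a sketch; it relies only on
# the encoder/ and decoder/ subdirectories that the --output_dir flags in the
# commented build steps would create):
#
# for d in encoder decoder; do
#     if [ ! -d "${output_dir}/${d}" ]; then
#         echo "missing ${output_dir}/${d}: uncomment the matching trtllm-build step" >&2
#         exit 1
#     fi
# done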