#!/usr/bin/env bash
# Build TensorRT-LLM engines (encoder + decoder) for the distil-whisper
# multi-zh model from a converted checkpoint in ${checkpoint_dir}.
# The checkpoint-conversion and evaluation steps are kept below as
# commented-out references.
set -euo pipefail

INFERENCE_PRECISION=float16
MAX_BEAM_WIDTH=4
MAX_BATCH_SIZE=64
checkpoint_dir=multi_zh_distil_tllm_checkpoint_pos_emb_true
output_dir=distil_whisper_multi_zh_remove_padding

# Step 1 (reference): convert the HF checkpoint to TRT-LLM format.
# python3 convert_checkpoint.py \
#     --output_dir $checkpoint_dir \
#     --model_name distil-large-v2

# int8 weight-only variant (reference):
# checkpoint_dir=multi_zh_distil_tllm_int8_checkpoint_pos_emb_true
# output_dir=distil_whisper_multi_zh_int8_remove_padding
# python3 convert_checkpoint.py --use_weight_only \
#     --weight_only_precision int8 \
#     --output_dir $checkpoint_dir --model_name distil-large-v2

# Encoder engine: mel-spectrogram input is up to 3000 frames; only the
# BERT-attention plugin runs in ${INFERENCE_PRECISION}, gemm/moe/xqa disabled.
trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \
    --output_dir "${output_dir}/encoder" \
    --moe_plugin disable \
    --enable_xqa disable \
    --max_batch_size "${MAX_BATCH_SIZE}" \
    --gemm_plugin disable \
    --bert_attention_plugin "${INFERENCE_PRECISION}" \
    --max_input_len 3000 --max_seq_len=3000

# Decoder engine: short prompt (max_input_len 14), generation capped at
# max_seq_len 114, cross-attends to up to 3000 encoder frames, beam search
# up to ${MAX_BEAM_WIDTH}.
trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
    --output_dir "${output_dir}/decoder" \
    --moe_plugin disable \
    --enable_xqa disable \
    --max_beam_width "${MAX_BEAM_WIDTH}" \
    --max_batch_size "${MAX_BATCH_SIZE}" \
    --max_seq_len 114 \
    --max_input_len 14 \
    --max_encoder_input_len 3000 \
    --gemm_plugin "${INFERENCE_PRECISION}" \
    --bert_attention_plugin "${INFERENCE_PRECISION}" \
    --gpt_attention_plugin "${INFERENCE_PRECISION}"

# Step 3 (reference): evaluate CER on WenetSpeech TEST_MEETING.
# batch_size=32
# padding_strategy=zero
# dataset=wenet-e2e/wenetspeech
# dataset_name=TEST_MEETING
# python3 run.py --engine_dir $output_dir \
#     --enable_warmup \
#     --dataset $dataset \
#     --dataset_name $dataset_name \
#     --dataset_split test \
#     --compute_cer \
#     --text_prefix "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \
#     --name aishell_${dataset_name}_${output_dir}_padding_${padding_strategy}_batch_${batch_size}_cppsession \
#     --batch_size $batch_size --padding_strategy $padding_strategy