# Provenance: uploaded to HuggingFace by yuekai via huggingface_hub
# (commit f605912, verified).
# Pin all TensorRT-LLM work to GPU 1.
export CUDA_VISIBLE_DEVICES=1

# Fail fast: abort on command errors, unset variables, and pipeline failures,
# so a failed encoder build does not silently continue into the decoder build.
set -euo pipefail

# Engine build configuration.
INFERENCE_PRECISION=float16    # activation / plugin precision
WEIGHT_ONLY_PRECISION=int8     # weight-only quantization mode
MAX_BEAM_WIDTH=4               # decoder beam-search width
MAX_BATCH_SIZE=8               # max batch size baked into the engines

# Derived paths: quantized checkpoint input and built-engine output.
checkpoint_dir=tllm_checkpoint_${WEIGHT_ONLY_PRECISION}
output_dir=distil_whisper_large_v3_${WEIGHT_ONLY_PRECISION}

# One-time checkpoint conversion (already performed; kept for reference).
# python3 convert_checkpoint.py --use_weight_only \
#                 --weight_only_precision $WEIGHT_ONLY_PRECISION \
#                 --output_dir $checkpoint_dir
# Build the Whisper encoder engine (no KV-cache paging, static padding).
encoder_args=(
  --checkpoint_dir "${checkpoint_dir}/encoder"
  --output_dir "${output_dir}/encoder"
  --paged_kv_cache disable
  --moe_plugin disable
  --enable_xqa disable
  --max_batch_size "${MAX_BATCH_SIZE}"
  --gemm_plugin "${INFERENCE_PRECISION}"
  --bert_attention_plugin "${INFERENCE_PRECISION}"
  --remove_input_padding disable
)
trtllm-build "${encoder_args[@]}"
# Build the Whisper decoder engine with beam-search support.
# max_input_len 14 bounds the prompt tokens; max_encoder_input_len 1500
# matches the encoder's output frame count.
decoder_args=(
  --checkpoint_dir "${checkpoint_dir}/decoder"
  --output_dir "${output_dir}/decoder"
  --paged_kv_cache disable
  --moe_plugin disable
  --enable_xqa disable
  --max_beam_width "${MAX_BEAM_WIDTH}"
  --max_batch_size "${MAX_BATCH_SIZE}"
  --max_seq_len 100
  --max_input_len 14
  --max_encoder_input_len 1500
  --gemm_plugin "${INFERENCE_PRECISION}"
  --bert_attention_plugin "${INFERENCE_PRECISION}"
  --gpt_attention_plugin "${INFERENCE_PRECISION}"
  --remove_input_padding disable
)
trtllm-build "${decoder_args[@]}"
# Smoke-test the built engines on a tiny LibriSpeech subset.
# Expansions are quoted (SC2086) so the command stays correct even if the
# output directory name ever contains spaces or glob characters.
python3 run.py --engine_dir "${output_dir}" \
  --dataset hf-internal-testing/librispeech_asr_dummy \
  --name "librispeech_dummy_${output_dir}"