#!/usr/bin/env bash
#
# Launches the speech recognition (CTC adapter) fine-tuning run.
#
# The script:
#   1. defines the dataset / language / model configuration,
#   2. exports the single-process distributed-training environment variables,
#   3. runs run_speech_recognition_ctc_adapter.py (located next to this script)
#      and mirrors its stdout into a log file inside the output directory,
#   4. copies itself into the output directory so the run can be recreated.
#
# Environment:
#   MASTER_PORT  optional; defaults to 29500.
#
# Exit status: 0 on success, otherwise the status of the failed step.

# Fail on errors and unset variables, and make a pipeline fail when any stage
# fails, so a crashed training run is not hidden behind `tee`.
set -euo pipefail

#
# CHANGE THESE AS PER YOUR REQUIREMENTS
#

# Language as it is referred to in the dataset (2 letter ISO code).
# NOTE: deliberately NOT named LANG. LANG is the POSIX locale variable;
# assigning to it would change the locale inherited by every child process.
DATASET_LANG=ml
LANG_ISO_3=mal      # 3 letter ISO code for the language
LANGUAGE=Malayalam  # Full language name as per Whisper convention (not used below)

# Mozilla Common Voice dataset (active). Comment these out to use another one.
DATASET="mozilla-foundation/common_voice_16_0"
TEXT_COLUMN="sentence"

# For Google Fleurs datasets, uncomment the following
# DATASET="google/fleurs"
# TEXT_COLUMN="transcription"

# Custom datasets
#DATASET="parambharat/kannada_asr_corpus"
#TEXT_COLUMN=${TEXT_COLUMN:-"sentence"}

#######################################
# Prints the fine-tuning learning rate for a model size.
# This helper is not called anywhere in this script.
# Arguments: $1 - model size: tiny, base, small, medium, large or large-v2
# Outputs:   the learning rate on stdout; an error message on stderr
# Returns:   0 on success, 1 if the model size is not recognised
#######################################
get_fine_tuning_lr() {
  local model_size=${1:-}
  local lr

  case "${model_size}" in
    "tiny") lr="3.75e-5" ;;
    "base") lr="2.5e-5" ;;
    "small") lr="1.25e-5" ;;
    "medium") lr="6.25e-6" ;;
    "large") lr="4.375e-6" ;;
    "large-v2") lr="5e-6" ;;
    *)
      # stderr, so a caller using $(...) never captures the message as a value.
      echo "Invalid model size" >&2
      return 1
      ;;
  esac

  printf '%s\n' "${lr}"
}

SCRIPT_PATH=$(realpath -- "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(realpath -- "$(dirname -- "${BASH_SOURCE[0]}")")

# Port to use
export MASTER_PORT="${MASTER_PORT:-29500}"
echo "Using master_port for deepspeech: ${MASTER_PORT}"

# Single-process settings exported for the training process.
export MASTER_ADDR="localhost"
export RANK="0"
export LOCAL_RANK="0"
export WORLD_SIZE="1"

# Base model variant
MODEL=w2v2

# Model names and other stuff
BASE_MODEL="facebook/mms-1b-all"
JUST_LANG=${DATASET_LANG%%_*}  # drop everything from the first "_" onwards
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}"
OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
echo "OUTDIR: ${OUTDIR}"

# Training parameters you can tweak. Feel free to directly change any of the
# parameters below.
MAX_EPOCHS=4
TRAIN_BATCH_SIZE=4
EVAL_BATCH_SIZE=4  # defined for convenience; not passed to the trainer below
LEARNING_RATE="1e-3"
EVAL_STEPS="200"
SAVE_STEPS="200"

# Create dir
mkdir -p -- "${OUTDIR}"

# Arguments are kept in an array so every value stays one word and optional
# flags can be toggled by (un)commenting a single line.
train_args=(
  --dataset_name="${DATASET}"
  --model_name_or_path="${BASE_MODEL}"
  --dataset_config_name="${DATASET_LANG}"
  --target_language="${LANG_ISO_3}"
  --output_dir="${OUTDIR}"
  # --overwrite_output_dir
  # If you want to resume from an existing checkpoint, include the following
  # argument as well. Modify the checkpoint directory.
  # --resume_from_checkpoint="${MY_MODEL}/checkpoint-400"
  --num_train_epochs="${MAX_EPOCHS}"
  --per_device_train_batch_size="${TRAIN_BATCH_SIZE}"
  --learning_rate="${LEARNING_RATE}"
  --warmup_steps="100"
  --evaluation_strategy="steps"
  --text_column_name="${TEXT_COLUMN}"
  --length_column_name="input_length"
  --save_steps="${SAVE_STEPS}"
  --eval_steps="${EVAL_STEPS}"
  --save_total_limit="3"
  --optim="adamw_bnb_8bit"
  --hub_model_id "simpragma/${MY_MODEL}"
  --gradient_checkpointing
  # Each character is quoted so the shell passes it literally; an unquoted
  # "?" is a glob and would expand to any single-character file name in the
  # current directory.
  --chars_to_ignore ',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�'
  --fp16
  --group_by_length
  --do_train
  --do_eval
  --push_to_hub
)

echo "================ TRAINING: START ================"

# With pipefail the pipeline reports python's failure even though tee succeeds.
# The status is captured instead of aborting so the script copy below still runs.
train_status=0
python "${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py" "${train_args[@]}" \
  | tee -- "${OUTDIR}/${MY_MODEL}.log" || train_status=$?

# Copy the script to the output directory so that we can recreate the model
cp -- "${SCRIPT_PATH}" "${OUTDIR}"

if ((train_status != 0)); then
  echo "================ TRAINING: FAILED (exit status ${train_status}) ================" >&2
  exit "${train_status}"
fi

echo "================ TRAINING: DONE ================"
exit 0