breeze-listen-w2v2-ml / train-ctc-model.sh
hanasim's picture
Training in progress, step 200
abb8a1f verified
#!/usr/bin/env bash
#
# This script runs the speech recognition training using DeepSpeed
#
# Fail fast: abort on any command error, on use of an unset variable, and when
# any stage of a pipeline fails. Without this, a crashed training run piped
# into `tee` would still be reported as a success by the rest of the script.
set -euo pipefail

# CHANGE THESE AS PER YOUR REQUIREMENTS
#
# LANG as it is referred to in the dataset.
# NOTE(review): LANG is also the POSIX locale variable. If it is already
# exported by the calling environment, this assignment changes the locale seen
# by every child process. Renaming it (e.g. to DATASET_LANG) requires updating
# every later use of ${LANG} in this script, so it is only flagged here.
LANG=ml            # 2 letter ISO code for the language
LANG_ISO_3=mal     # 3 letter ISO code for the language
LANGUAGE=Malayalam # Full language name as per Whisper convention

# Mozilla Common Voice datasets (currently active).
DATASET="mozilla-foundation/common_voice_16_0"
TEXT_COLUMN="sentence"

# For Google Fleurs datasets, comment out the two lines above and uncomment
# the following:
# DATASET="google/fleurs"
# TEXT_COLUMN="transcription"

# Custom datasets
#DATASET="parambharat/kannada_asr_corpus"
#TEXT_COLUMN=${TEXT_COLUMN:-"sentence"}
#######################################
# Look up the fine-tuning learning rate for a model size.
# Arguments: $1 - model size: tiny, base, small, medium, large or large-v2
# Outputs:   the learning rate on stdout; for an unrecognised size, an error
#            message on stderr and nothing on stdout
# Returns:   0 on success, 1 if the model size is not recognised
#######################################
get_fine_tuning_lr() {
  local model_size=${1-}
  local lr
  case "${model_size}" in
    "tiny")
      lr="3.75e-5"
      ;;
    "base")
      lr="2.5e-5"
      ;;
    "small")
      lr="1.25e-5"
      ;;
    "medium")
      lr="6.25e-6"
      ;;
    "large")
      lr="4.375e-6"
      ;;
    "large-v2")
      lr="5e-6"
      ;;
    *)
      # Diagnostics go to stderr: this function is meant to be captured with
      # $(...), and a message on stdout would be mistaken for a learning rate.
      # `return` (not `exit`) leaves the decision to abort with the caller.
      echo "Invalid model size" >&2
      return 1
      ;;
  esac
  printf '%s\n' "${lr}"
}
# Absolute path of this script, and of the directory it was invoked from.
# The inner command substitution is quoted so that a directory name containing
# spaces is passed to realpath as a single argument.
SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")

# Port to use; an already-set MASTER_PORT in the environment takes precedence.
export MASTER_PORT="${MASTER_PORT:-29500}"
echo "Using master_port for deepspeed: ${MASTER_PORT}"

# Single-process settings exported to the training process.
# NOTE(review): presumably read by the distributed backend the Python script
# initialises - confirm against the training script.
export MASTER_ADDR="localhost"
export RANK="0"
export LOCAL_RANK="0"
export WORLD_SIZE="1"
# Pretrained checkpoint that fine-tuning starts from.
BASE_MODEL='facebook/mms-1b-all'

# Short tag for the base model variant; it becomes part of the model name.
MODEL='w2v2'

# Language code with everything from the first underscore onwards removed.
JUST_LANG="${LANG%%_*}"

# Name of the fine-tuned model and the directory its artifacts are written to.
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}"
OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
echo "OUTDIR: ${OUTDIR}"
# Training parameters you can tweak. Feel free to directly change any of the parameters below.
MAX_EPOCHS=4
TRAIN_BATCH_SIZE=4
EVAL_BATCH_SIZE=4
LEARNING_RATE="1e-3"
EVAL_STEPS="200"
SAVE_STEPS="200"

# Create the output directory (and any missing parents). The path is quoted so
# that a directory name containing spaces or glob characters stays a single
# argument, and `--` keeps a leading dash from being parsed as an option.
mkdir -p -- "${OUTDIR}"
# Optional extra arguments - add them to train_args below when needed:
#   --overwrite_output_dir
# If you want to resume from an existing checkpoint, include the following
# argument as well. Modify the checkpoint directory.
#   --resume_from_checkpoint="${MY_MODEL}/checkpoint-400"
echo "================ TRAINING: START ================"

# Characters passed to --chars_to_ignore. Each one is quoted individually so
# the shell hands it over literally; in particular an unquoted `?` is a glob
# and would be replaced by any single-character file name in the current
# directory.
chars_to_ignore=(',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�')

# Arguments for the training script, kept in an array so that values with
# spaces survive intact and individual options are easy to add or remove.
train_args=(
  --dataset_name="${DATASET}"
  --model_name_or_path="${BASE_MODEL}"
  --dataset_config_name="${LANG}"
  --target_language="${LANG_ISO_3}"
  --output_dir="${OUTDIR}"
  --num_train_epochs="${MAX_EPOCHS}"
  --per_device_train_batch_size="${TRAIN_BATCH_SIZE}"
  # EVAL_BATCH_SIZE is defined with the other tunables; pass it on so that
  # changing it actually takes effect.
  --per_device_eval_batch_size="${EVAL_BATCH_SIZE}"
  --learning_rate="${LEARNING_RATE}"
  --warmup_steps="100"
  --evaluation_strategy="steps"
  --text_column_name="${TEXT_COLUMN}"
  --length_column_name="input_length"
  --save_steps="${SAVE_STEPS}"
  --eval_steps="${EVAL_STEPS}"
  --save_total_limit="3"
  --optim="adamw_bnb_8bit"
  --hub_model_id "simpragma/${MY_MODEL}"
  --gradient_checkpointing
  --chars_to_ignore "${chars_to_ignore[@]}"
  --fp16
  --group_by_length
  --do_train
  --do_eval
  --push_to_hub
)

# By default a pipeline's exit status is that of its last command (tee), which
# would hide a failed training run. With pipefail the python status is
# reported, and the script stops instead of announcing success.
set -o pipefail
if ! python "${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py" "${train_args[@]}" \
  | tee "${OUTDIR}/${MY_MODEL}.log"; then
  echo "================ TRAINING: FAILED ================" >&2
  exit 1
fi
# Copy the script to the output directory so that we can recreate the model.
# Paths are quoted so directories containing spaces work. The copy is
# best-effort: a failure is reported but does not change the exit status.
cp -- "${SCRIPT_PATH}" "${OUTDIR}" \
  || echo "WARNING: could not copy ${SCRIPT_PATH} to ${OUTDIR}" >&2
echo "================ TRAINING: DONE ================"
exit 0