simpragma
/

breeze-listen-w2v2-ml

Automatic Speech Recognition

mozilla-foundation/common_voice_16_0

Generated from Trainer

Inference Endpoints

Model card Files Files and versions Community

breeze-listen-w2v2-ml / train-ctc-model.sh

hanasim's picture

Training in progress, step 200

abb8a1f verified 10 months ago

history blame contribute delete

3.37 kB

	#! /usr/bin/bash

	#
	# This script runs the speech recognition training using DeepSpeed
	#

	# ---- User configuration: adjust these values to your needs ----

	# Language identifiers, written the way the dataset refers to them.
	# NOTE(review): LANG is also the standard locale environment variable; if it
	# is already exported, this assignment changes the locale that child
	# processes see. Consider a different name - confirm before renaming, since
	# the rest of the script reads ${LANG}.
	LANG="ml"             # two-letter ISO code for the language
	LANG_ISO_3="mal"      # three-letter ISO code for the language
	LANGUAGE="Malayalam"  # full language name, following the Whisper convention

	# Active dataset: Mozilla Common Voice.
	DATASET="mozilla-foundation/common_voice_16_0"
	TEXT_COLUMN="sentence"

	# Alternative: Google Fleurs. Uncomment both lines to use it.
	# DATASET="google/fleurs"
	# TEXT_COLUMN="transcription"

	# Alternative: a custom dataset. Uncomment both lines to use it.
	#DATASET="parambharat/kannada_asr_corpus"
	#TEXT_COLUMN=${TEXT_COLUMN:-"sentence"}
	#######################################
	# Print the fine-tuning learning rate associated with a model size.
	# Arguments: $1 - model size: tiny, base, small, medium, large or large-v2
	# Outputs:   the learning rate on stdout; for an unrecognised size, an
	#            error message on stderr and nothing on stdout
	# Returns:   exits with status 1 for an unrecognised size. When called
	#            directly (not inside $(...) or a subshell) this terminates
	#            the calling script.
	#######################################
	get_fine_tuning_lr() {
	  local model_size=${1-}
	  local lr

	  case "$model_size" in
	    "tiny")     lr="3.75e-5" ;;
	    "base")     lr="2.5e-5" ;;
	    "small")    lr="1.25e-5" ;;
	    "medium")   lr="6.25e-6" ;;
	    "large")    lr="4.375e-6" ;;
	    "large-v2") lr="5e-6" ;;
	    *)
	      # Diagnostics go to stderr so that lr=$(get_fine_tuning_lr ...) never
	      # captures the error text as if it were a learning rate.
	      echo "Invalid model size" >&2
	      exit 1
	      ;;
	  esac

	  printf '%s\n' "$lr"
	}

	# Absolute path of this script, and of the directory it was invoked from
	# (the training Python script is looked up in that directory).
	SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
	SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")

	# Port to use; an existing MASTER_PORT in the environment wins.
	export MASTER_PORT="${MASTER_PORT:-29500}"
	echo "Using master_port for deepspeed: ${MASTER_PORT}"

	# Single-process settings (rank 0 of a world of 1 on localhost).
	# NOTE(review): presumably read by the DeepSpeed / torch.distributed
	# runtime inside the training script - confirm.
	export MASTER_ADDR="localhost"
	export RANK="0"
	export LOCAL_RANK="0"
	export WORLD_SIZE="1"

	# Base model variant
	MODEL=w2v2

	# Model names and other stuff
	BASE_MODEL="facebook/mms-1b-all"

	# Drop everything from the first underscore onwards in the language code.
	JUST_LANG=${LANG%%_*}
	MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}"

	OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
	echo "OUTDIR: ${OUTDIR}"

	# Training parameters you can tweak. Feel free to directly change any of the parameters below.

	MAX_EPOCHS=4
	TRAIN_BATCH_SIZE=4
	EVAL_BATCH_SIZE=4
	LEARNING_RATE="1e-3"

	EVAL_STEPS="200"
	SAVE_STEPS="200"

	# Characters passed to --chars_to_ignore. Each one is quoted individually
	# so the shell hands it over literally; an unquoted ? is a glob and could
	# expand to single-character file names in the working directory.
	CHARS_TO_IGNORE=(',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�')

	# Create the output directory; the log file below is written into it.
	mkdir -p -- "${OUTDIR}" || {
	  echo "Cannot create output directory: ${OUTDIR}" >&2
	  exit 1
	}

	# Optional extra arguments for the python command below:
	#   --overwrite_output_dir \
	#
	# If you want to resume from existing checkpoint, include the following argument as well. Modify the checkpoint directory.
	#   --resume_from_checkpoint="${MY_MODEL}/checkpoint-400" \

	echo "================ TRAINING: START ================"

	# The trainer's stdout is piped through tee so it is shown on the terminal
	# and saved to the log file at the same time.
	python "${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py" \
	  --dataset_name="${DATASET}" \
	  --model_name_or_path="${BASE_MODEL}" \
	  --dataset_config_name="${LANG}" \
	  --target_language="${LANG_ISO_3}" \
	  --output_dir="${OUTDIR}" \
	  --num_train_epochs="${MAX_EPOCHS}" \
	  --per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
	  --per_device_eval_batch_size="${EVAL_BATCH_SIZE}" \
	  --learning_rate="${LEARNING_RATE}" \
	  --warmup_steps="100" \
	  --evaluation_strategy="steps" \
	  --text_column_name="${TEXT_COLUMN}" \
	  --length_column_name="input_length" \
	  --save_steps="${SAVE_STEPS}" \
	  --eval_steps="${EVAL_STEPS}" \
	  --save_total_limit="3" \
	  --optim="adamw_bnb_8bit" \
	  --hub_model_id "simpragma/${MY_MODEL}" \
	  --gradient_checkpointing \
	  --chars_to_ignore "${CHARS_TO_IGNORE[@]}" \
	  --fp16 \
	  --group_by_length \
	  --do_train \
	  --do_eval \
	  --push_to_hub \
	  | tee "${OUTDIR}/${MY_MODEL}.log"
	# A pipeline reports tee's status, so capture the trainer's own exit
	# status right away, before any other command overwrites PIPESTATUS.
	TRAIN_STATUS=${PIPESTATUS[0]}

	# Copy the script to the output directory so that we can recreate the model
	cp -- "${SCRIPT_PATH}" "${OUTDIR}"

	# Report a failed run instead of unconditionally claiming success.
	if [[ "${TRAIN_STATUS}" -ne 0 ]]; then
	  echo "================ TRAINING: FAILED (exit status ${TRAIN_STATUS}) ================" >&2
	  exit "${TRAIN_STATUS}"
	fi

	echo "================ TRAINING: DONE ================"

	exit 0