File size: 3,368 Bytes
abb8a1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#! /usr/bin/bash

#
# This script runs the speech recognition training using DeepSpeed
#
# NOTE(review): the invocation below launches plain `python`, not the
# `deepspeed` launcher — the DeepSpeed claim above may be stale; confirm.
#
# Edit the variables in this section to select dataset/language; the
# training invocation itself is further down in the file.
#

# CHANGE THESE AS PER YOUR REQUIREMENTS

# LANG as it is referred in the dataset (used as the dataset config name).
# NOTE(review): LANG is also the POSIX locale environment variable; if LANG
# is exported in the calling environment this assignment leaks "ml" to child
# processes, which may trigger locale warnings. Renaming it would need
# coordinated changes everywhere ${LANG} is used below.
LANG=ml			# 2 letter ISO code for the language
LANG_ISO_3=mal		# 3 letter ISO code for the language (passed as --target_language)
LANGUAGE=Malayalam	# Full language name as per Whisper convention (not referenced in the visible part of this script — confirm it is still needed)

# For Mozilla Commonvoice datasets, uncomment the following
DATASET="mozilla-foundation/common_voice_16_0"
TEXT_COLUMN="sentence"

# For Google Fleurs datasets, uncomment the following
# DATASET="google/fleurs"
# TEXT_COLUMN="transcription"

# Custom datasets
#DATASET="parambharat/kannada_asr_corpus"
#TEXT_COLUMN=${TEXT_COLUMN:-"sentence"}
# Function to get fine tuning learning rate
get_fine_tuning_lr() {
    local model_size=$1
    local lr

    case $model_size in
        "tiny")
            lr="3.75e-5"
            ;;
        "base")
            lr="2.5e-5"
            ;;
        "small")
            lr="1.25e-5"
            ;;
        "medium")
            lr="6.25e-6"
            ;;
        "large")
            lr="4.375e-6"
            ;;
        "large-v2")
            lr="5e-6"
            ;;
        *)
            echo "Invalid model size"
            exit 1
            ;;
    esac

    echo $lr
}

# Absolute path of this script (copied into OUTDIR later for reproducibility)
# and of the directory containing the training python script.
SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")

# Rendezvous port for the distributed setup; honour a value already present
# in the environment, otherwise default to 29500.
export MASTER_PORT="${MASTER_PORT:-29500}"
echo "Using master_port for deepspeed: ${MASTER_PORT}"

# Single-node, single-process "distributed" topology.
export MASTER_ADDR="localhost"
export RANK="0"
export LOCAL_RANK="0"
export WORLD_SIZE="1"

# Base model variant
MODEL=w2v2

# Model names and other stuff
BASE_MODEL="facebook/mms-1b-all"

# Strip any locale-style suffix (e.g. "pt_BR" -> "pt") for the model name.
JUST_LANG=${LANG%%_*}
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}"

OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
echo "OUTDIR: ${OUTDIR}"

# Training parameters you can tweak. Feel free to directly change any of the parameters below.

MAX_EPOCHS=4
TRAIN_BATCH_SIZE=4
EVAL_BATCH_SIZE=4
LEARNING_RATE="1e-3"

EVAL_STEPS="200"
SAVE_STEPS="200"

# Create dir (quoted + `--` so a path with spaces or a leading dash survives)
mkdir -p -- "${OUTDIR}"

#	--overwrite_output_dir \

# If you want to resume from existing checkpoint, include the following argument as well. Modify the checkpoint directory.
# --resume_from_checkpoint="${MY_MODEL}/checkpoint-400" \

echo "================ TRAINING: START ================"

# Run the CTC-adapter fine-tuning. stdout goes both to the console and to a
# log file in OUTDIR via tee.
# The --chars_to_ignore values are single-quoted: previously bare `?` and `.`
# were subject to filename globbing against the current directory.
python "${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py" \
	--dataset_name="${DATASET}" \
	--model_name_or_path="${BASE_MODEL}" \
	--dataset_config_name="${LANG}" \
	--target_language="${LANG_ISO_3}"	\
	--output_dir="${OUTDIR}" \
	--num_train_epochs="${MAX_EPOCHS}" \
	--per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
	--learning_rate="${LEARNING_RATE}" \
	--warmup_steps="100" \
	--evaluation_strategy="steps" \
	--text_column_name="${TEXT_COLUMN}" \
	--length_column_name="input_length" \
	--save_steps="${SAVE_STEPS}" \
	--eval_steps="${EVAL_STEPS}" \
	--save_total_limit="3" \
	--optim="adamw_bnb_8bit"	\
	--hub_model_id "simpragma/${MY_MODEL}" \
	--gradient_checkpointing \
	--chars_to_ignore ',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�' \
	--fp16 \
	--group_by_length \
	--do_train 	\
	--do_eval \
	--push_to_hub	\
	| tee "${OUTDIR}/${MY_MODEL}.log"

# tee masks the python exit status (the pipeline's status is tee's);
# recover it from PIPESTATUS so the script's exit code reflects training
# success/failure instead of an unconditional 0.
TRAIN_STATUS=${PIPESTATUS[0]}

# Copy the script to the output directory so that we can recreate the model
cp -- "${SCRIPT_PATH}" "${OUTDIR}"

echo "================ TRAINING: DONE ================"

exit "${TRAIN_STATUS}"