Create train.sh
Browse files
train.sh
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/bash
#
# Slurm batch script: requests 6 nodes (one task per node, 8 GPUs and 128 CPUs
# each) for up to two days and starts the E-Branchformer English ASR training
# through srun at the bottom of this file.
#
# The scheduler directives below must stay ahead of the first executable
# command, otherwise sbatch stops reading them.
# NOTE(review): the account is OPEN-28-57 while the log path lives under
# open-28-58 -- confirm that this is intended.
#
#SBATCH --job-name TED
#SBATCH --account OPEN-28-57
#SBATCH --partition qgpu
#SBATCH --nodes=6
#SBATCH --ntasks=6
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node 8
#SBATCH --cpus-per-task=128
#SBATCH --time 2-00:00:00
#SBATCH --output=/mnt/proj1/open-28-58/lakoc/huggingface_asr/outputs/ebranchformer_english_medium_normalized_regularized.out
|
12 |
+
|
13 |
+
# Experiment identity and filesystem layout. All five values, together with
# HF_HOME, are handed to the launcher as positional arguments by the srun call
# at the end of the script.
EXPERIMENT="ebranchformer_english_medium_normalized_regularized"
PROJECT="regularizations_english_corpus"

# Repository checkout; recipes and experiment outputs are resolved against it.
WORK_DIR="/mnt/proj1/open-28-58/lakoc/huggingface_asr"
RECIPE_DIR="${WORK_DIR}/recipes/ebranchformer_english"
EXPERIMENT_PATH="${WORK_DIR}/experiments/${EXPERIMENT}"

# Not exported here: the value reaches the launcher only as an argument.
HF_HOME="/scratch/project/open-28-57/lakoc/huggingface_cache"
|
19 |
+
|
20 |
+
# Training arguments, kept as an array so that every option reaches the
# launcher as exactly one word when expanded with "${args[@]}".
args=(
  # General training arguments
  --output_dir="${EXPERIMENT_PATH}"
  --per_device_train_batch_size="64"
  --per_device_eval_batch_size="8"
  --dataloader_num_workers="24"
  --num_train_epochs="100"
  --group_by_length="True"
  --bf16
  --do_train
  --do_evaluate
  --joint_decoding_during_training
  --load_best_model_at_end
  --metric_for_best_model="eval_wer"

  # Optimizer related arguments
  --optim="adamw_torch"
  --learning_rate="1e-3"
  --warmup_steps="40000"
  --early_stopping_patience="10"
  --weight_decay="1e-6"
  --max_grad_norm="0.5"
  --lsm_factor="0.1"
  --mask_unks
  --gradient_accumulation_steps="1"

  # Logging, saving and evaluation related arguments
  --report_to="wandb"
  --logging_steps="10"
  --save_strategy="epoch"
  --evaluation_strategy="epoch"
  --wandb_predictions_to_save=500
  --greater_is_better="False"
  --save_total_limit="5"

  # Data related arguments
  --max_duration_in_seconds="20.0"
  --min_duration_in_seconds="0.2"
  --length_column_name="input_len"
  --remove_unused_columns="False"
  --preprocessing_num_workers="32"
  --dataset_name="/scratch/project/open-28-57/lakoc/processed_dataset_full"
  --writer_batch_size="500"
  # Deliberately several words: the flag is followed by one word per split.
  --test_splits wsj_test fisher_swbd_dev voxpopuli_test tedlium3_test librispeech_test.clean librispeech_test.other commonvoice_en_test fleurs_test
  --validation_slice="20%"
  --validation_slice_seed=42

  # Preprocessing related arguments
  --data_preprocessing_config="${RECIPE_DIR}/data_preprocessing.json"

  # Model related arguments
  --from_encoder_decoder_config
  --tokenizer_name="Lakoc/english_corpus_uni5000_normalized"
  --feature_extractor_name="Lakoc/log_80mel_extractor_16k"
  --base_encoder_model="Lakoc/ebranchformer_16l_512h"
  --base_decoder_model="Lakoc/gpt2_512h_8l_add_head6_04"
  --ctc_weight="0.3"
  --decoder_pos_emb_fixed
  --expect_2d_input

  # Generation related arguments
  --num_beams="1"
  --max_length="512"
  --predict_with_generate
  --decoding_ctc_weight="0"
)
|
86 |
+
|
87 |
+
# Copies stdin to stdout, dropping every line that names host $1.
# The match is a fixed-string, whole-word one, so "n1" removes "n1" and
# "n1.example.org" but keeps "n10"; a plain `grep -v n1` would drop all three.
_drop_host() {
  grep -vwF -- "$1"
}

# Rendezvous information exported for the processes started by srun.
# Assignments are kept separate from `export` so that a failing command
# substitution is not hidden behind export's own exit status.
PARENT=$(/bin/hostname -s)   # short name of the node running this batch script
MPORT=13000
# Remaining hosts of the allocation, one per line as printed by scontrol.
CHILDREN=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | _drop_host "${PARENT}")
HOSTLIST="${PARENT} ${CHILDREN}"
WORLD_SIZE=${SLURM_NTASKS}
export PARENT MPORT CHILDREN HOSTLIST WORLD_SIZE
|
92 |
+
|
93 |
+
# Leave whatever conda environment is currently active, then enter the one
# used for training.
# NOTE(review): `conda deactivate` relies on conda's shell integration being
# available in this shell; if it is not, the command only prints an error and
# the script carries on -- confirm this is the intended behaviour.
conda deactivate
source activate loco_asr
|
95 |
+
|
96 |
+
# Make sure the experiment directory exists before the job step starts.
mkdir -p -- "${EXPERIMENT_PATH}"

# Start the per-node launcher through srun, giving each task all CPUs and GPUs
# of its node. The launcher receives the experiment name, project, work dir,
# recipe dir and HF cache dir as positional arguments, followed by the
# training options. Every expansion is quoted so that no value can be split
# or glob-expanded, and the Slurm variables abort with a message when unset
# instead of silently shifting the following arguments.
srun \
  --cpus-per-task "${SLURM_CPUS_ON_NODE:?not running inside a Slurm allocation}" \
  --gpus-per-task "${SLURM_GPUS_ON_NODE:?no GPUs allocated to this job}" \
  "${WORK_DIR}/recipes/multinode_training/start_single_node_job.sh" \
  "${EXPERIMENT}" "${PROJECT}" "${WORK_DIR}" "${RECIPE_DIR}" "${HF_HOME}" "${args[@]}"
|