Lakoc committed on
Commit
5b20446
1 Parent(s): e78bc0e

Create train.sh

Browse files
Files changed (1) hide show
  1. train.sh +100 -0
train.sh ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/bash
#SBATCH --job-name TED
#SBATCH --account OPEN-28-57
#SBATCH --partition qgpu
#SBATCH --nodes=6
#SBATCH --ntasks=6
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node 8
#SBATCH --cpus-per-task=128
#SBATCH --time 2-00:00:00
#SBATCH --output=/mnt/proj1/open-28-58/lakoc/huggingface_asr/outputs/ebranchformer_english_medium_normalized_regularized.out

# Slurm batch script: requests 6 nodes (one task per node, 8 GPUs and
# 128 CPUs per task) and uses srun to start
# recipes/multinode_training/start_single_node_job.sh on every node,
# forwarding the experiment name, project name, directories and the
# training arguments collected in `args` below.
#
# Usage: sbatch train.sh
#
# NOTE(review): the account is OPEN-28-57 while the output/work paths live
# under open-28-58 -- confirm that this mix of project ids is intentional.

EXPERIMENT="ebranchformer_english_medium_normalized_regularized"
PROJECT="regularizations_english_corpus"
WORK_DIR="/mnt/proj1/open-28-58/lakoc/huggingface_asr"
RECIPE_DIR="${WORK_DIR}/recipes/ebranchformer_english"
EXPERIMENT_PATH="${WORK_DIR}/experiments/${EXPERIMENT}"
# Not exported here; it is handed to the launcher as a positional argument.
# NOTE(review): presumably the launcher exports it itself -- verify.
HF_HOME="/scratch/project/open-28-57/lakoc/huggingface_cache"

# Arguments forwarded verbatim to the launcher script (and, presumably, on
# to the training entry point -- confirm against start_single_node_job.sh).
args=(
  # General training arguments
  --output_dir="${EXPERIMENT_PATH}"
  --per_device_train_batch_size="64"
  --per_device_eval_batch_size="8"
  --dataloader_num_workers="24"
  --num_train_epochs="100"
  --group_by_length="True"
  --bf16
  --do_train
  --do_evaluate
  --joint_decoding_during_training
  --load_best_model_at_end
  --metric_for_best_model="eval_wer"

  # Optimizer related arguments
  --optim="adamw_torch"
  --learning_rate="1e-3"
  --warmup_steps="40000"
  --early_stopping_patience="10"
  --weight_decay="1e-6"
  --max_grad_norm="0.5"
  --lsm_factor="0.1"
  --mask_unks
  --gradient_accumulation_steps="1"

  # Logging, saving and evaluation related arguments
  --report_to="wandb"
  --logging_steps="10"
  --save_strategy="epoch"
  --evaluation_strategy="epoch"
  --wandb_predictions_to_save=500
  --greater_is_better="False"
  --save_total_limit="5"

  # Data related arguments
  --max_duration_in_seconds="20.0"
  --min_duration_in_seconds="0.2"
  --length_column_name="input_len"
  --remove_unused_columns="False"
  --preprocessing_num_workers="32"
  --dataset_name="/scratch/project/open-28-57/lakoc/processed_dataset_full"
  --writer_batch_size="500"
  # Each split name is deliberately a separate array element.
  --test_splits wsj_test fisher_swbd_dev voxpopuli_test tedlium3_test librispeech_test.clean librispeech_test.other commonvoice_en_test fleurs_test
  --validation_slice="20%"
  --validation_slice_seed=42

  # Preprocessing related arguments
  --data_preprocessing_config="${RECIPE_DIR}/data_preprocessing.json"

  # Model related arguments
  --from_encoder_decoder_config
  --tokenizer_name="Lakoc/english_corpus_uni5000_normalized"
  --feature_extractor_name="Lakoc/log_80mel_extractor_16k"
  --base_encoder_model="Lakoc/ebranchformer_16l_512h"
  --base_decoder_model="Lakoc/gpt2_512h_8l_add_head6_04"
  --ctc_weight="0.3"
  --decoder_pos_emb_fixed
  --expect_2d_input

  # Generation related arguments
  --num_beams="1"
  --max_length="512"
  --predict_with_generate
  --decoding_ctc_weight="0"
)

# Node topology exported for the launcher. Assignment and export are kept
# separate so a failing command substitution is not masked by `export`.
PARENT=$(/bin/hostname -s)
export PARENT
# NOTE(review): presumably the rendezvous port read by the launcher -- verify.
export MPORT=13000
# Every allocated node except the parent. -F/-w match the parent name as a
# literal whole word, so a parent "cn1" no longer filters out "cn10" too.
CHILDREN=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | grep -vwF -- "${PARENT}")
export CHILDREN
export HOSTLIST="${PARENT} ${CHILDREN}"
export WORLD_SIZE="${SLURM_NTASKS}"

# Strict mode (set -euo pipefail) is intentionally not enabled: conda's
# activate/deactivate hooks commonly trip -e/-u in non-interactive shells.
conda deactivate
source activate loco_asr

mkdir -p -- "${EXPERIMENT_PATH}"

# One launcher instance per node; each task gets the whole node's CPUs/GPUs.
srun --cpus-per-task "${SLURM_CPUS_ON_NODE}" --gpus-per-task "${SLURM_GPUS_ON_NODE}" \
  "${WORK_DIR}/recipes/multinode_training/start_single_node_job.sh" \
  "${EXPERIMENT}" "${PROJECT}" "${WORK_DIR}" "${RECIPE_DIR}" "${HF_HOME}" "${args[@]}"