#!/usr/bin/bash
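# SLURM resource request: 6 nodes, one task per node, 8 GPUs and 128 CPUs per node, 2-day wall time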
#SBATCH --job-name DeCRED
#SBATCH --account OPEN-28-57
#SBATCH --partition qgpu
#SBATCH --nodes=6
#SBATCH --ntasks=6
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node 8
#SBATCH --cpus-per-task=128
#SBATCH --time 2-00:00:00
#SBATCH --output=/mnt/proj1/open-28-58/lakoc/huggingface_asr/outputs/ebranchformer_english_medium_normalized_regularized.out
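# Experiment name, W&B project, and paths shared with the per-node launcher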
EXPERIMENT="ebranchformer_english_medium_normalized_regularized"
PROJECT="regularizations_english_corpus"
WORK_DIR="/mnt/proj1/open-28-58/lakoc/huggingface_asr"
RECIPE_DIR="${WORK_DIR}/recipes/ebranchformer_english"
EXPERIMENT_PATH="${WORK_DIR}/experiments/${EXPERIMENT}"
HF_HOME="/scratch/project/open-28-57/lakoc/huggingface_cache"
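# Training arguments forwarded unchanged to start_single_node_job.sh on every node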
args=(
# General training arguments
--output_dir=$EXPERIMENT_PATH
--per_device_train_batch_size="64"
--per_device_eval_batch_size="8"
--dataloader_num_workers="24"
--num_train_epochs="100"
--group_by_length="True"
--bf16
--do_train
--do_evaluate
--joint_decoding_during_training
--load_best_model_at_end
--metric_for_best_model="eval_wer"
# Optimizer related arguments
--optim="adamw_torch"
--learning_rate="1e-3"
--warmup_steps="40000"
--early_stopping_patience="10"
--weight_decay="1e-6"
--max_grad_norm="0.5"
--lsm_factor="0.1"
--mask_unks
--gradient_accumulation_steps="1"
# Logging, saving and evaluation related arguments
--report_to="wandb"
--logging_steps="10"
--save_strategy="epoch"
--evaluation_strategy="epoch"
--wandb_predictions_to_save=500
--greater_is_better="False"
--save_total_limit="5"
# Data related arguments
--max_duration_in_seconds="20.0"
--min_duration_in_seconds="0.2"
--length_column_name="input_len"
--remove_unused_columns="False"
--preprocessing_num_workers="32"
--dataset_name="/scratch/project/open-28-57/lakoc/processed_dataset_full"
--writer_batch_size="500"
--test_splits wsj_test fisher_swbd_dev voxpopuli_test tedlium3_test librispeech_test.clean librispeech_test.other commonvoice_en_test fleurs_test
--validation_slice="20%"
--validation_slice_seed=42
# Preprocessing related arguments
--data_preprocessing_config="${RECIPE_DIR}/data_preprocessing.json"
# Model related arguments
--from_encoder_decoder_config
--tokenizer_name="Lakoc/english_corpus_uni5000_normalized"
--feature_extractor_name="Lakoc/log_80mel_extractor_16k"
--base_encoder_model="Lakoc/ebranchformer_16l_512h"
--base_decoder_model="Lakoc/gpt2_512h_8l_add_head6_04"
--ctc_weight="0.3"
--decoder_pos_emb_fixed
--expect_2d_input
# Generation related arguments
--num_beams="1"
--max_length="512"
--predict_with_generate
--decoding_ctc_weight="0"
)
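# Multi-node setup: the first node acts as master (PARENT), MPORT is the port exported for distributed communication,
# CHILDREN are the remaining allocated nodes, and WORLD_SIZE matches the number of SLURM tasks (one per node)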
export PARENT=$(/bin/hostname -s)
export MPORT=13000
export CHILDREN=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | grep -v "$PARENT")
export HOSTLIST="$PARENT $CHILDREN"
export WORLD_SIZE=$SLURM_NTASKS
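# Switch to the training conda environment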
conda deactivate
source activate loco_asr
mkdir -p $EXPERIMENT_PATH
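# Launch one training process per node; each receives the experiment metadata and the shared args array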
srun --cpus-per-task $SLURM_CPUS_ON_NODE --gpus-per-task $SLURM_GPUS_ON_NODE \
/mnt/proj1/open-28-58/lakoc/huggingface_asr/recipes/multinode_training/start_single_node_job.sh \
"${EXPERIMENT}" $PROJECT $WORK_DIR $RECIPE_DIR $HF_HOME "${args[@]}"