#!/bin/bash
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH -p pilot
#SBATCH -t 48:00:00
#SBATCH --gpus-per-node=mi250:8
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err
# If run without sbatch, uncomment this block to resubmit the script through sbatch
#if [ -z $SLURM_JOB_ID ]; then
# mkdir -p logs
# sbatch "$0"
# exit
#fi
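# Run variant tag; used below to namespace the kill switch, checkpoint and TensorBoard paths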
VARIANT=7b1ru2
set -euo pipefail
# symlink logs/latest.out and logs/latest.err
ln -f -s $SLURM_JOB_ID.out logs/latest.out
ln -f -s $SLURM_JOB_ID.err logs/latest.err
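# Creating a file at KILL_SWITCH_PATH is expected to make the training loop exit cleanly (checked via --kill-switch-path below)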
KILL_SWITCH_PATH=kill-switch-$VARIANT
CHECKPOINT_PATH=checkpoints_$VARIANT
TENSORBOARD_PATH=tensorboard_$VARIANT
# Data
TOKENIZER_NAME_OR_PATH=bigscience/tokenizer
TRAIN_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3ru_train.txt
VALID_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3_validation_ru.txt
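# Parallelism and batch size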
PP_SIZE=1
TP_SIZE=1
MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=16
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
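# e.g. with 8 nodes x 8 GPUs per node: WORLD_SIZE=64, so GLOBAL_BATCH_SIZE = 2 * 64 * 16 = 2048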
# Model parameters
NLAYERS=30
NHIDDEN=4096
NHEADS=32
SEQ_LEN=2048
TRAIN_SAMPLES=6_348_800
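# at a global batch size of 2048 this corresponds to 6,348,800 / 2048 = 3100 optimizer steps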
SAVE_INTERVAL=500
ZERO_STAGE=1
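# Write a per-job DeepSpeed config (ZeRO stage 1, dynamic fp16 loss scaling)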
mkdir -p ds_configs
config_json="ds_configs/$SLURM_JOB_ID.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
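# Megatron-DeepSpeed multitask finetuning arguments, passed to finetune_t0.py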
CMD=" \
Megatron-DeepSpeed/finetune_t0.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
--init-method-std 0.0048 \
--embed-layernorm \
--fp16 \
--seed 42 \
--position-embedding-type alibi \
--abort-on-unmet-fused-kernel-constraints \
--clip-grad 1.0 \
--kill-switch-path $KILL_SWITCH_PATH \
--checkpoint-activations \
--pad-vocab-size-to 250880 \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 2e-5 \
--lr-decay-style constant \
--lr-warmup-samples 0 \
--weight-decay 1e-4 \
--no-load-optim \
--reset-progress \
--norm-target-loss \
--log-interval 10 \
--save-interval $SAVE_INTERVAL \
--eval-interval 500 \
--eval-iters 1 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths-path $TRAIN_DATA_PATH \
--valid-weighted-split-paths-path $VALID_DATA_PATH \
--dataloader-type single \
--data-impl mmap \
--deepspeed \
--deepspeed_config $config_json \
--zero-stage $ZERO_STAGE \
"
echo $CMD
echo "START $SLURM_JOBID: $(date)"
srun --label launch.sh $CMD
echo "END $SLURM_JOBID: $(date)"