From e193c662a1b37cc7588c217b3540cb9167a6c351 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Wed, 25 Aug 2021 18:25:47 +0200 Subject: [PATCH 1/3] Add config for rotary 350M debug --- train/tr4-1B3-rotary/tr3b-350M.slurm | 180 +++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 train/tr4-1B3-rotary/tr3b-350M.slurm diff --git a/train/tr4-1B3-rotary/tr3b-350M.slurm b/train/tr4-1B3-rotary/tr3b-350M.slurm new file mode 100644 index 00000000..f368a6d7 --- /dev/null +++ b/train/tr4-1B3-rotary/tr3b-350M.slurm @@ -0,0 +1,180 @@ +#!/bin/bash +#SBATCH --job-name=350M.slurm +#SBATCH --qos=qos_gpu-t3 +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.out # error file name (same to watch just one file) +#SBATCH --account=six@gpu + +set -x -e + + +ROUND=2 +TESTING=0 + +OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr4-350M-rotary +MEGATRON_DEEPSPEED_REPO=$OUTPUT_PATH/code/Megatron-DeepSpeed + +if [[ ${TESTING} == 1 ]]; then + # testing on 10k + DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_100k_text_document +else + # production on full 304M records + DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_en_train_text_document + +fi + +source $six_ALL_CCFRWORK/start-prod +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +cd $MEGATRON_DEEPSPEED_REPO + +MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; 
print'` +MASTER_PORT=6000 + +# adjust depending on the number of the nodes + +# XXX: edit me +GPUS_PER_NODE=4 +NNODES=4 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=4 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 +TRAIN_ITER=146_484_375 + +NLAYERS=24 +NHIDDEN=1024 +NHEADS=16 +FFN_HIDDEN_SIZE=4096 +SEQ_LEN=2048 + +if [[ ${ROUND} == 1 ]]; then EXIT_INTERVAL=100 SAVE_INTERVAL=10 +elif [[ ${ROUND} == 2 ]]; then SAVE_INTERVAL=1500 +else echo "invalid ROUND: $ROUND" +fi + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 3e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 126_953_125 \ + --lr-warmup-samples 183_105 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path t5-small \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --position-embedding-type rotary \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 100 \ + --tensorboard-dir $OUTPUT_PATH/tensorboard \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, 
"train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + " + +export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + " + +export CMD=" \ + `pwd`/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $OUTPUT_PATH/checkpoints \ + --load $OUTPUT_PATH/checkpoints \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# to debug - add echo (it exits and prints what it would have launched) +srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee OUTPUT_PATH/logs/tr3c-350M-modeling-baseline.$SLURM_JOBID.out From a836be96817217ab52d5f47a24086a5ce26a80e7 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 16 Sep 2021 14:27:55 +0200 Subject: [PATCH 2/3] Create 350M config for rotary on oscar --- .../tr4-1B3-modeling-rotary.slurm | 1 - ....slurm => tr4b-350M-modeling-rotary.slurm} | 39 +++++++++---------- 2 files changed, 18 insertions(+), 22 deletions(-) rename train/tr4-1B3-rotary/{tr3b-350M.slurm => tr4b-350M-modeling-rotary.slurm} (82%) diff --git a/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm b/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm index 96b51f0c..7d03ff4b 100644 --- 
a/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm +++ b/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm @@ -7,7 +7,6 @@ #SBATCH --gres=gpu:4 # number of gpus #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) #SBATCH --output=logs/%x-%j.out # output file name -#SBATCH --error=logs/%x-%j.out # error file name (same to watch just one file) #SBATCH --account=six@gpu #SBATCH --array=1-10%1 diff --git a/train/tr4-1B3-rotary/tr3b-350M.slurm b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm similarity index 82% rename from train/tr4-1B3-rotary/tr3b-350M.slurm rename to train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm index f368a6d7..e4daac0b 100644 --- a/train/tr4-1B3-rotary/tr3b-350M.slurm +++ b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm @@ -1,14 +1,13 @@ #!/bin/bash -#SBATCH --job-name=350M.slurm -#SBATCH --qos=qos_gpu-t3 +#!/bin/bash +#SBATCH --job-name=350M-v2.slurm #SBATCH --nodes=4 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=40 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:4 # number of gpus #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.out # error file name (same to watch just one file) +#SBATCH --output=logs/%x-%j.out # output file name #SBATCH --account=six@gpu set -x -e @@ -17,17 +16,12 @@ set -x -e ROUND=2 TESTING=0 -OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr4-350M-rotary +OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr4b-350M-rotary MEGATRON_DEEPSPEED_REPO=$OUTPUT_PATH/code/Megatron-DeepSpeed -if [[ ${TESTING} == 1 ]]; then - # testing on 10k - DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_100k_text_document -else - # production on full 304M records - DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_en_train_text_document - -fi +VOCAB_FILE=$OUTPUT_PATH/data/gpt2-vocab.json 
+MERGE_FILE=$OUTPUT_PATH/data/gpt2-merges.txt +DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document source $six_ALL_CCFRWORK/start-prod export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models @@ -50,9 +44,9 @@ PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here TP_SIZE=4 # always fixed to the size of a single node DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 -TRAIN_ITER=146_484_375 +MICRO_BATCH_SIZE=8 +GLOBAL_BATCH_SIZE=512 +TRAIN_ITER=73_242_187 NLAYERS=24 NHIDDEN=1024 @@ -73,7 +67,7 @@ OPTIMIZER_ARGS=" \ --lr 3e-4 \ --min-lr 1e-5 \ --lr-decay-style cosine \ - --lr-decay-samples 126_953_125 \ + --lr-decay-samples 73_242_187 \ --lr-warmup-samples 183_105 \ --clip-grad 1.0 \ --weight-decay 1e-1 \ @@ -89,15 +83,17 @@ GPT_ARGS=" \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ --seq-length $SEQ_LEN \ + --position-embedding-type rotary\ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ + --rampup-batch-size 32 32 2_000_000 \ --train-samples $TRAIN_ITER \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path t5-small \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ --loss-scale 12 \ --clip-grad 1.0 \ --fp16 \ - --position-embedding-type rotary \ + --checkpoint-activations \ $OPTIMIZER_ARGS \ $EXIT_OPTS \ " @@ -145,6 +141,7 @@ DEEPSPEED_ARGS=" \ --deepspeed \ --deepspeed_config ${config_json} \ --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ " export LAUNCHER="python -u -m torch.distributed.launch \ @@ -177,4 +174,4 @@ export CMD=" \ echo $CMD # to debug - add echo (it exits and prints what it would have launched) -srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee OUTPUT_PATH/logs/tr3c-350M-modeling-baseline.$SLURM_JOBID.out +srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee 
$OUTPUT_PATH/logs/tr3h-350M-v2.$SLURM_JOBID.out From f97caf30caa6a4f7ca6747e6406e12276d3a25bf Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 16 Sep 2021 14:38:04 +0200 Subject: [PATCH 3/3] Fix formatting --- train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm index e4daac0b..1ccd63ec 100644 --- a/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm +++ b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm @@ -1,6 +1,6 @@ #!/bin/bash #!/bin/bash -#SBATCH --job-name=350M-v2.slurm +#SBATCH --job-name=350M-rotary.slurm #SBATCH --nodes=4 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=40 # number of cores per tasks @@ -174,4 +174,4 @@ export CMD=" \ echo $CMD # to debug - add echo (it exits and prints what it would have launched) -srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee $OUTPUT_PATH/logs/tr3h-350M-v2.$SLURM_JOBID.out +srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee $OUTPUT_PATH/logs/tr4b-350M.$SLURM_JOBID.out