From e193c662a1b37cc7588c217b3540cb9167a6c351 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Wed, 25 Aug 2021 18:25:47 +0200
Subject: [PATCH 1/3] Add config for rotary 350M debug

---
 train/tr4-1B3-rotary/tr3b-350M.slurm | 180 +++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 train/tr4-1B3-rotary/tr3b-350M.slurm

diff --git a/train/tr4-1B3-rotary/tr3b-350M.slurm b/train/tr4-1B3-rotary/tr3b-350M.slurm
new file mode 100644
index 00000000..f368a6d7
--- /dev/null
+++ b/train/tr4-1B3-rotary/tr3b-350M.slurm
@@ -0,0 +1,180 @@
+#!/bin/bash
+#SBATCH --job-name=350M.slurm
+#SBATCH --qos=qos_gpu-t3
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=40         # number of cores per tasks
+#SBATCH --hint=nomultithread         # we get physical cores not logical
+#SBATCH --gres=gpu:4                 # number of gpus
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=%x-%j.out           # output file name
+#SBATCH --error=%x-%j.out            # error file name (same to watch just one file)
+#SBATCH --account=six@gpu
+
+set -x -e
+set -o pipefail  # the final `srun ... | tee` must report srun's failure instead of tee's exit status
+
+ROUND=2
+TESTING=0
+
+OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr4-350M-rotary
+MEGATRON_DEEPSPEED_REPO=$OUTPUT_PATH/code/Megatron-DeepSpeed
+
+if [[ ${TESTING} == 1 ]]; then
+    # testing on 10k
+    DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_100k_text_document
+else
+    # production on full 304M records
+    DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_en_train_text_document
+
+fi
+
+source $six_ALL_CCFRWORK/start-prod
+export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
+export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
+export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
+export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+cd $MEGATRON_DEEPSPEED_REPO
+# Rendezvous endpoint: first host of the compressed SLURM_JOB_NODELIST (e.g. "r1n[0-3],r2n5" -> "r1n0")
+MASTER_ADDR=$(perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print')
+MASTER_PORT=6000
+
+# adjust depending on the number of nodes
+# (keep NNODES / GPUS_PER_NODE in sync with the #SBATCH --nodes / --gres=gpu values at the top)
+# XXX: edit me
+GPUS_PER_NODE=4
+NNODES=4
+PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here
+TP_SIZE=4 # always fixed to the size of a single node
+DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer
+
+MICRO_BATCH_SIZE=4
+GLOBAL_BATCH_SIZE=256
+TRAIN_ITER=146_484_375
+
+NLAYERS=24
+NHIDDEN=1024
+NHEADS=16
+FFN_HIDDEN_SIZE=4096
+SEQ_LEN=2048
+# ROUND selects the checkpoint cadence; any other value aborts, since SAVE_INTERVAL would otherwise stay unset.
+if   [[ ${ROUND} == 1 ]]; then  EXIT_INTERVAL=100    SAVE_INTERVAL=10
+elif [[ ${ROUND} == 2 ]]; then  SAVE_INTERVAL=1500
+else echo "invalid ROUND: $ROUND" >&2; exit 1
+fi
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --adam-eps 1e-8 \
+    --lr 3e-4 \
+    --min-lr 1e-5 \
+    --lr-decay-style cosine \
+    --lr-decay-samples 126_953_125 \
+    --lr-warmup-samples 183_105 \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    "
+# 1190 min = 19h50m: 10 min under the 20:00:00 #SBATCH --time limit (presumably to exit cleanly before Slurm's kill; confirm)
+EXIT_OPTS=" \
+    --exit-duration-in-mins 1190 \
+    "
+
+GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --ffn-hidden-size $FFN_HIDDEN_SIZE \
+    --seq-length $SEQ_LEN \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --train-samples $TRAIN_ITER \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path t5-small \
+    --loss-scale 12 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --position-embedding-type rotary \
+    $OPTIMIZER_ARGS \
+    $EXIT_OPTS \
+    "
+
+OUTPUT_ARGS=" \
+    --log-interval 200 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 1000 \
+    --eval-iters 100 \
+    --tensorboard-dir $OUTPUT_PATH/tensorboard \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+
+ZERO_STAGE=1
+
+config_json="./ds_config.$SLURM_JOBID.json"
+
+# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
+cat <<EOT > $config_json
+{
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+  "train_batch_size": $GLOBAL_BATCH_SIZE,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": $ZERO_STAGE
+  },
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+  },
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
+EOT
+
+
+DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --zero-stage ${ZERO_STAGE} \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.launch \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    "
+# Trainer command line; exported because the single-quoted `bash -c` payload of the final srun expands $CMD in a child shell.
+export CMD=" \
+    $(pwd)/pretrain_gpt.py \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $OUTPUT_ARGS \
+    --save $OUTPUT_PATH/checkpoints \
+    --load $OUTPUT_PATH/checkpoints \
+    --data-path $DATA_PATH \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+     $DEEPSPEED_ARGS \
+    "
+
+
+# # clear old checkpoint as it'd mismatch while we sort things out
+#     rm -rf $SAVE_CHECKPOINT_PATH
+
+mkdir -p "$OUTPUT_PATH/logs"  # tee (last line) cannot create the directory its log file lives in
+echo $CMD
+
+# to debug - add echo (it exits and prints what it would have launched)
+srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee OUTPUT_PATH/logs/tr3c-350M-modeling-baseline.$SLURM_JOBID.out

From a836be96817217ab52d5f47a24086a5ce26a80e7 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 14:27:55 +0200
Subject: [PATCH 2/3] Create 350M config for rotary on oscar

---
 .../tr4-1B3-modeling-rotary.slurm             |  1 -
 ....slurm => tr4b-350M-modeling-rotary.slurm} | 39 +++++++++----------
 2 files changed, 18 insertions(+), 22 deletions(-)
 rename train/tr4-1B3-rotary/{tr3b-350M.slurm => tr4b-350M-modeling-rotary.slurm} (82%)

diff --git a/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm b/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm
index 96b51f0c..7d03ff4b 100644
--- a/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm
+++ b/train/tr4-1B3-rotary/tr4-1B3-modeling-rotary.slurm
@@ -7,7 +7,6 @@
 #SBATCH --gres=gpu:4                 # number of gpus
 #SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/%x-%j.out           # output file name
-#SBATCH --error=logs/%x-%j.out            # error file name (same to watch just one file)
 #SBATCH --account=six@gpu
 #SBATCH --array=1-10%1
 
diff --git a/train/tr4-1B3-rotary/tr3b-350M.slurm b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm
similarity index 82%
rename from train/tr4-1B3-rotary/tr3b-350M.slurm
rename to train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm
index f368a6d7..e4daac0b 100644
--- a/train/tr4-1B3-rotary/tr3b-350M.slurm
+++ b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm
@@ -1,14 +1,13 @@
 #!/bin/bash
-#SBATCH --job-name=350M.slurm
-#SBATCH --qos=qos_gpu-t3
+#!/bin/bash
+#SBATCH --job-name=350M-v2.slurm
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --cpus-per-task=40         # number of cores per tasks
 #SBATCH --hint=nomultithread         # we get physical cores not logical
 #SBATCH --gres=gpu:4                 # number of gpus
 #SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
-#SBATCH --output=%x-%j.out           # output file name
-#SBATCH --error=%x-%j.out            # error file name (same to watch just one file)
+#SBATCH --output=logs/%x-%j.out           # output file name
 #SBATCH --account=six@gpu
 
 set -x -e
@@ -17,17 +16,12 @@ set -x -e
 ROUND=2
 TESTING=0
 
-OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr4-350M-rotary
+OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr4b-350M-rotary
 MEGATRON_DEEPSPEED_REPO=$OUTPUT_PATH/code/Megatron-DeepSpeed
 
-if [[ ${TESTING} == 1 ]]; then
-    # testing on 10k
-    DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_100k_text_document
-else
-    # production on full 304M records
-    DATA_PATH=$six_ALL_CCFRSCRATCH/datasets-custom/c4_preprocessing/c4_en_train_text_document
-
-fi
+VOCAB_FILE=$OUTPUT_PATH/data/gpt2-vocab.json
+MERGE_FILE=$OUTPUT_PATH/data/gpt2-merges.txt
+DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document
 
 source $six_ALL_CCFRWORK/start-prod
 export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
@@ -50,9 +44,9 @@ PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here
 TP_SIZE=4 # always fixed to the size of a single node
 DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer
 
-MICRO_BATCH_SIZE=4
-GLOBAL_BATCH_SIZE=256
-TRAIN_ITER=146_484_375
+MICRO_BATCH_SIZE=8
+GLOBAL_BATCH_SIZE=512
+TRAIN_ITER=73_242_187
 
 NLAYERS=24
 NHIDDEN=1024
@@ -73,7 +67,7 @@ OPTIMIZER_ARGS=" \
     --lr 3e-4 \
     --min-lr 1e-5 \
     --lr-decay-style cosine \
-    --lr-decay-samples 126_953_125 \
+    --lr-decay-samples 73_242_187 \
     --lr-warmup-samples 183_105 \
     --clip-grad 1.0 \
     --weight-decay 1e-1 \
@@ -89,15 +83,17 @@ GPT_ARGS=" \
     --num-attention-heads $NHEADS \
     --ffn-hidden-size $FFN_HIDDEN_SIZE \
     --seq-length $SEQ_LEN \
+    --position-embedding-type rotary \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --global-batch-size $GLOBAL_BATCH_SIZE \
+    --rampup-batch-size 32 32 2_000_000 \
     --train-samples $TRAIN_ITER \
-    --tokenizer-type PretrainedFromHF \
-    --tokenizer-name-or-path t5-small \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
     --loss-scale 12 \
     --clip-grad 1.0 \
     --fp16 \
-    --position-embedding-type rotary \
+    --checkpoint-activations \
     $OPTIMIZER_ARGS \
     $EXIT_OPTS \
     "
@@ -145,6 +141,7 @@ DEEPSPEED_ARGS=" \
     --deepspeed \
     --deepspeed_config ${config_json} \
     --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
     "
 
 export LAUNCHER="python -u -m torch.distributed.launch \
@@ -177,4 +174,4 @@ export CMD=" \
 echo $CMD
 
 # to debug - add echo (it exits and prints what it would have launched)
-srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee OUTPUT_PATH/logs/tr3c-350M-modeling-baseline.$SLURM_JOBID.out
+srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee $OUTPUT_PATH/logs/tr3h-350M-v2.$SLURM_JOBID.out

From f97caf30caa6a4f7ca6747e6406e12276d3a25bf Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 14:38:04 +0200
Subject: [PATCH 3/3] Fix job name and log file name

---
 train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm
index e4daac0b..1ccd63ec 100644
--- a/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm
+++ b/train/tr4-1B3-rotary/tr4b-350M-modeling-rotary.slurm
@@ -1,6 +1,6 @@
 #!/bin/bash
 #!/bin/bash
-#SBATCH --job-name=350M-v2.slurm
+#SBATCH --job-name=350M-rotary.slurm
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --cpus-per-task=40         # number of cores per tasks
@@ -174,4 +174,4 @@ export CMD=" \
 echo $CMD
 
 # to debug - add echo (it exits and prints what it would have launched)
-srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee $OUTPUT_PATH/logs/tr3h-350M-v2.$SLURM_JOBID.out
+srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee $OUTPUT_PATH/logs/tr4b-350M.$SLURM_JOBID.out