From 7f1f04ebb6e5382e849d1ea79562e4742cd0f3de Mon Sep 17 00:00:00 2001
From: rraminen
Date: Fri, 12 Nov 2021 22:55:34 +0000
Subject: [PATCH] Enabled Megatron-LM-v1.1.5-ZeRO3-8.3B_param model on 8 GPUs
 and MP_SIZE=1

---
 .../ds_pretrain_gpt2-zero3_8.3B_params.sh   | 167 ++++++++++++++++++
 .../ds_zero_stage_3_config_8.3B_params.json |  24 +++
 2 files changed, 191 insertions(+)
 create mode 100644 Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh
 create mode 100755 Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json

diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh
new file mode 100644
index 000000000..eb2776232
--- /dev/null
+++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh
@@ -0,0 +1,167 @@
+#! /bin/bash
+
+# Change for multinode config
+MP_SIZE=1
+
+DEBUG=1
+if [[ ${DEBUG} == 1 ]]; then
+    MP_SIZE=1
+    NUM_WORKERS=1
+    NUM_GPUS_PER_WORKER=8
+    HIDDEN_SIZE=3072
+    NUM_ATTN_HEADS=24
+    NUM_LAYERS=72
+    BATCHSIZE=4
+else
+    NUM_WORKERS=${DLTS_NUM_WORKER}
+    NUM_GPUS_PER_WORKER=${DLTS_NUM_GPU_PER_WORKER}
+    HIDDEN_SIZE=8192
+    NUM_ATTN_HEADS=32
+    NUM_LAYERS=50
+    BATCHSIZE=4
+
+    #HIDDEN_SIZE=4096
+    #NUM_LAYERS=24 # 50
+    #BATCHSIZE=16
+fi
+
+
+#BASE_DATA_PATH=/data/Megatron-LM/data
+#DATA_PATH=${BASE_DATA_PATH}/indexed_datasets/megatron
+#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+#MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+DATA_PATH=/data/DeepSpeed_data/Megatron_wikipedia/my-gpt2_text_document
+VOCAB_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-vocab.json
+MERGE_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2_345m_ds
+
+script_path=$(realpath "$0")
+script_dir=$(dirname "$script_path")
+if [[ -z $1 ]]; then
+    config_json="$script_dir/ds_zero_stage_3_config_8.3B_params.json"
+
+    # offloads to NVMe
+    #config_json="$script_dir/ds_zero_stage_infinity_config.json"
+else
+    config_json=$script_dir/$(basename "$1")
+fi
+
+# ZeRO configs
+stage=3
+reduce_scatter=true
+contigious_gradients=true   # note: "contigious" matches the flag name passed below
+rbs=50000000
+agbs=5000000000
+
+# Activation checkpointing and contiguous memory optimizations
+chkp_layers=1
+PA=true
+PA_CPU=true
+CC=true
+SYNCHRONIZE=true
+PROFILE=false
+
+# TiledLinear splits; "false" disables tiling, TILE_DIM sets the split count
+TILED_LINEAR="false"
+TILE_DIM=1
+
+
+# Megatron Model Parallelism
+LOGDIR="tboard-zero2/stage${stage}-lazyscatter-${NUM_LAYERS}l_${HIDDEN_SIZE}h_${NUM_WORKERS}n_${NUM_GPUS_PER_WORKER}g_${MP_SIZE}mp_${BATCHSIZE}b"
+
+#--load $CHECKPOINT_PATH \
+gpt_options=" \
+        --model-parallel-size ${MP_SIZE} \
+        --num-layers $NUM_LAYERS \
+        --hidden-size $HIDDEN_SIZE \
+        --num-attention-heads ${NUM_ATTN_HEADS} \
+        --seq-length 1024 \
+        --max-position-embeddings 1024 \
+        --batch-size $BATCHSIZE \
+        --train-iters 320000 \
+        --lr-decay-iters 320000 \
+        --save $CHECKPOINT_PATH \
+        --data-path $DATA_PATH \
+        --vocab-file $VOCAB_PATH \
+        --merge-file $MERGE_PATH \
+        --data-impl mmap \
+        --split 949,50,1 \
+        --distributed-backend nccl \
+        --lr 1.5e-4 \
+        --lr-decay-style cosine \
+        --min-lr 1.0e-5 \
+        --weight-decay 1e-2 \
+        --clip-grad 1.0 \
+        --warmup 0.01 \
+        --checkpoint-activations \
+        --log-interval 1 \
+        --save-interval 10000 \
+        --eval-interval 2000 \
+        --eval-iters 10 \
+        --fp16 \
+        --scattered-embeddings \
+        --split-transformers \
+"
+#--tensorboard-dir ${LOGDIR}
+
+deepspeed_options=" \
+        --deepspeed \
+        --deepspeed_config ${config_json} \
+        --zero-stage ${stage} \
+        --zero-reduce-bucket-size ${rbs} \
+        --zero-allgather-bucket-size ${agbs}
+"
+
+if [ "${contigious_gradients}" = "true" ]; then
+deepspeed_options="${deepspeed_options} \
+        --zero-contigious-gradients"
+fi
+
+if [ "${reduce_scatter}" = "true" ]; then
+deepspeed_options="${deepspeed_options} \
+        --zero-reduce-scatter"
+fi
+
+chkp_opt=" \
+--deepspeed-activation-checkpointing \
+--checkpoint-num-layers ${chkp_layers}"
+
+if [ "${PA}" = "true" ]; then
+chkp_opt="${chkp_opt} --partition-activations"
+fi
+
+if [ "${PA_CPU}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --checkpoint-in-cpu"
+fi
+
+if [ "${SYNCHRONIZE}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --synchronize-each-layer"
+fi
+
+if [ "${CC}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --contigious-checkpointing"
+fi
+
+if [ "${PROFILE}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --profile-backward"
+fi
+
+if [ "${TILED_LINEAR}" = "true" ]; then
+tile_opt="${tile_opt} \
+        --memory-centric-tiled-linear \
+        --tile-factor=${TILE_DIM}"
+fi
+
+
+full_options="${gpt_options} ${deepspeed_options} ${chkp_opt} ${tile_opt}"
+
+run_cmd="deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${@:2} ${full_options}"
+echo ${run_cmd}
+eval ${run_cmd}
+
+set +x
+
diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json
new file mode 100755
index 000000000..d9a90d84e
--- /dev/null
+++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json
@@ -0,0 +1,24 @@
+{
+    "train_batch_size": 64,
+    "gradient_accumulation_steps": 1,
+    "steps_per_print": 1,
+    "zero_optimization": {
+        "stage": 3,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_prefetch_bucket_size": 1e7,
+        "stage3_param_persistence_threshold": 1e5,
+        "reduce_bucket_size": 1e7,
+        "contiguous_gradients": true
+    },
+    "gradient_clipping": 1.0,
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "wall_clock_breakdown": true,
+    "zero_allow_untested_optimizer": false
+}
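
Note: with DEBUG=1 the script runs the 8.3B-parameter configuration from the
commit title (NUM_LAYERS=72, HIDDEN_SIZE=3072) on a single node with 8 GPUs
and MP_SIZE=1. As a quick sanity check of that figure, a minimal sketch using
the standard GPT-2/Megatron estimate of roughly 12*h^2 + 13*h weights per
transformer layer plus token and position embeddings; the 50257-entry GPT-2
BPE vocabulary is an assumption based on the gpt2-vocab.json file referenced
above, and 1024 positions follow from --max-position-embeddings:

    # minimal sketch; layers/hidden copied from the DEBUG branch of the script,
    # vocab size (50257) is an assumption based on the GPT-2 BPE vocab file
    layers=72; hidden=3072; vocab=50257; seq=1024
    per_layer=$(( 12*hidden*hidden + 13*hidden ))   # attention + MLP weights and biases
    emb=$(( (vocab + seq) * hidden ))               # token + position embeddings
    echo "total ~ $(( (layers*per_layer + emb) / 1000000 ))M params"
    # prints "total ~ 8314M params", i.e. ~8.3B, matching the commit title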