@@ -0,0 +1,166 @@
#!/bin/bash

# Change for multi-node configs
MP_SIZE=8

DEBUG=1
if [[ ${DEBUG} == 1 ]]; then
    MP_SIZE=8
    NUM_WORKERS=1
    NUM_GPUS_PER_WORKER=16
    HIDDEN_SIZE=3072
    NUM_ATTN_HEADS=24
    NUM_LAYERS=72
    BATCHSIZE=16
else
    NUM_WORKERS=${DLTS_NUM_WORKER}
    NUM_GPUS_PER_WORKER=${DLTS_NUM_GPU_PER_WORKER}
    HIDDEN_SIZE=8192
    NUM_ATTN_HEADS=32
    NUM_LAYERS=50
    BATCHSIZE=4

    #HIDDEN_SIZE=4096
    #NUM_LAYERS=24 # 50
    #BATCHSIZE=16
fi
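
# Rough size check (back-of-envelope estimate, not from this script): a dense
# transformer has ~12 * NUM_LAYERS * HIDDEN_SIZE^2 parameters plus embeddings.
# DEBUG:     12 * 72 * 3072^2 ≈ 8.15B (~8.3B with the GPT-2 embeddings),
#            matching the default "8.3B_params" config selected below.
# Non-debug: 12 * 50 * 8192^2 ≈ 40.3B.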


#BASE_DATA_PATH=/data/Megatron-LM/data
#DATA_PATH=${BASE_DATA_PATH}/indexed_datasets/megatron
#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
#MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DATA_PATH=/data/DeepSpeed_data/Megatron_wikipedia/my-gpt2_text_document
VOCAB_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-vocab.json
MERGE_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-merges.txt
CHECKPOINT_PATH=checkpoints/gpt2_345m_ds

script_path=$(realpath "$0")
script_dir=$(dirname "$script_path")
if [[ -z $1 ]]; then
    config_json="$script_dir/ds_zero_stage_2_config_8.3B_params_16GPUs.json"

    # offloads to NVMe
    #config_json="$script_dir/ds_zero_stage_infinity_config.json"
else
    config_json="$script_dir/$(basename "$1")"
fi
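
# Illustrative usage (the script's own filename is assumed here):
#   bash ds_pretrain_gpt2_zero2.sh [ds_config.json] [extra pretrain_gpt2.py args]
# $1 picks an alternate DeepSpeed config from this script's directory;
# everything from $2 on is forwarded untouched to pretrain_gpt2.py via
# ${@:2} in run_cmd at the bottom.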

# ZeRO configs
stage=2
reduce_scatter=true
contigious_gradients=true
rbs=50000000
agbs=5000000000
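
# rbs/agbs are the ZeRO reduce-scatter / allgather bucket sizes, passed below
# as --zero-reduce-bucket-size / --zero-allgather-bucket-size. Note the JSON
# config in this PR sets both reduce_bucket_size and allgather_bucket_size to
# 50000000, while agbs here is 5000000000.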

# Activation checkpointing and contiguous memory
chkp_layers=1
PA=true
PA_CPU=true
CC=true
SYNCHRONIZE=true
PROFILE=false

# TiledLinear tiling ("false" disables it; TILE_DIM is the split factor)
TILED_LINEAR="false"
TILE_DIM=1
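
# With TILED_LINEAR="true", the tile_opt block below adds
# --memory-centric-tiled-linear with --tile-factor=${TILE_DIM}; tiling is
# left disabled in this script.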


# TensorBoard log directory (the name encodes the key hyperparameters)
LOGDIR="tboard-zero2/stage${stage}-lazyscatter-${NUM_LAYERS}l_${HIDDEN_SIZE}h_${NUM_WORKERS}n_${NUM_GPUS_PER_WORKER}g_${MP_SIZE}mp_${BATCHSIZE}b"

#--load $CHECKPOINT_PATH \
gpt_options=" \
    --model-parallel-size ${MP_SIZE} \
    --num-layers $NUM_LAYERS \
    --hidden-size $HIDDEN_SIZE \
    --num-attention-heads ${NUM_ATTN_HEADS} \
    --seq-length 1024 \
    --max-position-embeddings 1024 \
    --batch-size $BATCHSIZE \
    --train-iters 320000 \
    --lr-decay-iters 320000 \
    --save $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_PATH \
    --merge-file $MERGE_PATH \
    --data-impl mmap \
    --split 949,50,1 \
    --distributed-backend nccl \
    --lr 1.5e-4 \
    --lr-decay-style cosine \
    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
    --clip-grad 1.0 \
    --warmup 0.01 \
    --checkpoint-activations \
    --log-interval 1 \
    --save-interval 10000 \
    --eval-interval 2000 \
    --eval-iters 10 \
    --fp16 \
    --scattered-embeddings \
    --split-transformers \
"
#--tensorboard-dir ${LOGDIR}
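
# Schedule recap: cosine decay from --lr 1.5e-4 to --min-lr 1.0e-5 across the
# full 320000 iterations, with --warmup 0.01 (3200 warmup steps), fp16, and
# activation checkpointing.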

deepspeed_options=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${stage} \
    --zero-reduce-bucket-size ${rbs} \
    --zero-allgather-bucket-size ${agbs}
"

if [ "${contigious_gradients}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--zero-contigious-gradients"
fi

if [ "${reduce_scatter}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--zero-reduce-scatter"
fi

chkp_opt=" \
    --deepspeed-activation-checkpointing \
    --checkpoint-num-layers ${chkp_layers}"

if [ "${PA}" = "true" ]; then
    chkp_opt="${chkp_opt} --partition-activations"
fi

if [ "${PA_CPU}" = "true" ]; then
    chkp_opt="${chkp_opt} \
        --checkpoint-in-cpu"
fi

if [ "${SYNCHRONIZE}" = "true" ]; then
    chkp_opt="${chkp_opt} \
        --synchronize-each-layer"
fi

if [ "${CC}" = "true" ]; then
    chkp_opt="${chkp_opt} \
        --contigious-checkpointing"
fi

if [ "${PROFILE}" = "true" ]; then
    chkp_opt="${chkp_opt} \
        --profile-backward"
fi

if [ "${TILED_LINEAR}" = "true" ]; then
    tile_opt="${tile_opt} \
        --memory-centric-tiled-linear \
        --tile-factor=${TILE_DIM}"
fi


full_options="${gpt_options} ${deepspeed_options} ${chkp_opt} ${tile_opt}"

run_cmd="deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${@:2} ${full_options}"
echo ${run_cmd}
eval ${run_cmd}

set +x
@@ -0,0 +1,32 @@
{
  "train_batch_size": 16,
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "reduce_scatter": true,
    "allgather_bucket_size": 50000000,
    "reduce_bucket_size": 50000000,
    "overlap_comm": true
  },
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.00015,
      "max_grad_norm": 1.0,
      "betas": [0.9, 0.95]
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "wall_clock_breakdown": true,
  "zero_allow_untested_optimizer": false
}
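
A minimal launch sketch (the shell script's filename does not appear in this diff, so the name below is assumed; the JSON filename is the script's built-in default):

    bash ds_pretrain_gpt2_zero2.sh ds_zero_stage_2_config_8.3B_params_16GPUs.json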