diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero2_8.3B_params_16GPUs.sh b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero2_8.3B_params_16GPUs.sh
new file mode 100644
index 000000000..ac44160f0
--- /dev/null
+++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero2_8.3B_params_16GPUs.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+
+# Change for multinode config
+MP_SIZE=8
+
+DEBUG=1
+if [[ ${DEBUG} == 1 ]]; then
+        MP_SIZE=8
+        NUM_WORKERS=1
+        NUM_GPUS_PER_WORKER=16
+        HIDDEN_SIZE=3072
+        NUM_ATTN_HEADS=24
+        NUM_LAYERS=72
+        BATCHSIZE=16
+else
+        NUM_WORKERS=${DLTS_NUM_WORKER}
+        NUM_GPUS_PER_WORKER=${DLTS_NUM_GPU_PER_WORKER}
+        HIDDEN_SIZE=8192
+        NUM_ATTN_HEADS=32
+        NUM_LAYERS=50
+        BATCHSIZE=4
+
+        #HIDDEN_SIZE=4096
+        #NUM_LAYERS=24 # 50
+        #BATCHSIZE=16
+fi
+
+
+#BASE_DATA_PATH=/data/Megatron-LM/data
+#DATA_PATH=${BASE_DATA_PATH}/indexed_datasets/megatron
+#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+#MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+DATA_PATH=/data/DeepSpeed_data/Megatron_wikipedia/my-gpt2_text_document
+VOCAB_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-vocab.json
+MERGE_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2_345m_ds
+
+script_path=$(realpath $0)
+script_dir=$(dirname $script_path)
+if [[ -z $1 ]]; then
+        config_json="$script_dir/ds_zero_stage_2_config_8.3B_params_16GPUs.json"
+
+        # offloads to NVMe
+        #config_json="$script_dir/ds_zero_stage_infinity_config.json"
+else
+        config_json=$script_dir/`basename $1`
+fi
+
+# ZeRO configs
+stage=2
+reduce_scatter=true
+contigious_gradients=true
+rbs=50000000
+agbs=5000000000
+
+# Activation checkpointing and contiguous memory
+chkp_layers=1
+PA=true
+PA_CPU=true
+CC=true
+SYNCHRONIZE=true
+PROFILE=false
+
+# TiledLinear splits ("false" disables tiling)
+TILED_LINEAR="false"
+TILE_DIM=1
+
+
+# Megatron Model Parallelism
+LOGDIR="tboard-zero2/stage${stage}-lazyscatter-${NUM_LAYERS}l_${HIDDEN_SIZE}h_${NUM_WORKERS}n_${NUM_GPUS_PER_WORKER}g_${MP_SIZE}mp_${BATCHSIZE}b"
+
+#--load $CHECKPOINT_PATH \
+gpt_options=" \
+        --model-parallel-size ${MP_SIZE} \
+        --num-layers $NUM_LAYERS \
+        --hidden-size $HIDDEN_SIZE \
+        --num-attention-heads ${NUM_ATTN_HEADS} \
+        --seq-length 1024 \
+        --max-position-embeddings 1024 \
+        --batch-size $BATCHSIZE \
+        --train-iters 320000 \
+        --lr-decay-iters 320000 \
+        --save $CHECKPOINT_PATH \
+        --data-path $DATA_PATH \
+        --vocab-file $VOCAB_PATH \
+        --merge-file $MERGE_PATH \
+        --data-impl mmap \
+        --split 949,50,1 \
+        --distributed-backend nccl \
+        --lr 1.5e-4 \
+        --lr-decay-style cosine \
+        --min-lr 1.0e-5 \
+        --weight-decay 1e-2 \
+        --clip-grad 1.0 \
+        --warmup 0.01 \
+        --checkpoint-activations \
+        --log-interval 1 \
+        --save-interval 10000 \
+        --eval-interval 2000 \
+        --eval-iters 10 \
+        --fp16 \
+        --scattered-embeddings \
+        --split-transformers \
+"
+        #--tensorboard-dir ${LOGDIR}
+
+deepspeed_options=" \
+        --deepspeed \
+        --deepspeed_config ${config_json} \
+        --zero-stage ${stage} \
+        --zero-reduce-bucket-size ${rbs} \
+        --zero-allgather-bucket-size ${agbs}
+"
+
+if [ "${contigious_gradients}" = "true" ]; then
+deepspeed_options="${deepspeed_options} \
+        --zero-contigious-gradients"
+fi
+
+if [ "${reduce_scatter}" = "true" ]; then
+deepspeed_options="${deepspeed_options} \
+        --zero-reduce-scatter"
+fi
+
+chkp_opt=" \
+        --deepspeed-activation-checkpointing \
+        --checkpoint-num-layers ${chkp_layers}"
+
+if [ "${PA}" = "true" ]; then
+chkp_opt="${chkp_opt} --partition-activations"
+fi
+
+if [ "${PA_CPU}" = "true" ]; then
+chkp_opt="${chkp_opt} \
--checkpoint-in-cpu" +fi + +if [ "${SYNCHRONIZE}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --synchronize-each-layer" +fi + +if [ "${CC}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --contigious-checkpointing" +fi + +if [ "${PROFILE}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --profile-backward" +fi + +if [ "${TILED_LINEAR}" = "true" ]; then +tile_opt="${tile_opt} \ + --memory-centric-tiled-linear \ + --tile-factor=${TILE_DIM}" +fi + + +full_options="${gpt_options} ${deepspeed_options} ${chkp_opt} ${tile_opt}" + +run_cmd="deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${@:2} ${full_options}" +echo ${run_cmd} +eval ${run_cmd} + +set +x diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config_8.3B_params_16GPUs.json b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config_8.3B_params_16GPUs.json new file mode 100755 index 000000000..a0eb30bf2 --- /dev/null +++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config_8.3B_params_16GPUs.json @@ -0,0 +1,32 @@ +{ + "train_batch_size": 16, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "reduce_scatter": true, + "allgather_bucket_size": 50000000, + "reduce_bucket_size": 50000000, + "overlap_comm": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "max_grad_norm": 1.0, + "betas": [0.9, 0.95] + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "wall_clock_breakdown": true, + "zero_allow_untested_optimizer": false +}