From 7f1f04ebb6e5382e849d1ea79562e4742cd0f3de Mon Sep 17 00:00:00 2001
From: rraminen
Date: Fri, 12 Nov 2021 22:55:34 +0000
Subject: [PATCH] Enabled Megatron-LM-v1.1.5-ZeRO3-8.3B_param model on 8 GPUs
 and MP_SIZE=1

---
 .../ds_pretrain_gpt2-zero3_8.3B_params.sh   | 167 ++++++++++++++++++
 .../ds_zero_stage_3_config_8.3B_params.json |  24 +++
 2 files changed, 191 insertions(+)
 create mode 100644 Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh
 create mode 100755 Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json

diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh
new file mode 100644
index 000000000..eb2776232
--- /dev/null
+++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh
@@ -0,0 +1,167 @@
+#! /bin/bash
+
+# Change for multinode config
+MP_SIZE=1
+
+DEBUG=1
+if [[ ${DEBUG} == 1 ]]; then
+    MP_SIZE=1
+    NUM_WORKERS=1
+    NUM_GPUS_PER_WORKER=8
+    HIDDEN_SIZE=3072
+    NUM_ATTN_HEADS=24
+    NUM_LAYERS=72
+    BATCHSIZE=4
+else
+    NUM_WORKERS=${DLTS_NUM_WORKER}
+    NUM_GPUS_PER_WORKER=${DLTS_NUM_GPU_PER_WORKER}
+    HIDDEN_SIZE=8192
+    NUM_ATTN_HEADS=32
+    NUM_LAYERS=50
+    BATCHSIZE=4
+
+    #HIDDEN_SIZE=4096
+    #NUM_LAYERS=24 # 50
+    #BATCHSIZE=16
+fi
+
+
+#BASE_DATA_PATH=/data/Megatron-LM/data
+#DATA_PATH=${BASE_DATA_PATH}/indexed_datasets/megatron
+#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+#MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+DATA_PATH=/data/DeepSpeed_data/Megatron_wikipedia/my-gpt2_text_document
+VOCAB_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-vocab.json
+MERGE_PATH=/data/DeepSpeed_data/Megatron_wikipedia/gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2_345m_ds
+
+script_path=$(realpath "$0")
+script_dir=$(dirname "$script_path")
+if [[ -z $1 ]]; then
+    config_json="$script_dir/ds_zero_stage_3_config_8.3B_params.json"
+
+    # offloads to NVMe
+    #config_json="$script_dir/ds_zero_stage_infinity_config.json"
+else
+    config_json=$script_dir/$(basename "$1")
+fi
+
+# ZeRO configs
+stage=3
+reduce_scatter=true
+contigious_gradients=true   # note: "contigious" matches the flag name passed below
+rbs=50000000
+agbs=5000000000
+
+# Activation checkpointing and contiguous memory optimizations
+chkp_layers=1
+PA=true
+PA_CPU=true
+CC=true
+SYNCHRONIZE=true
+PROFILE=false
+
+# TiledLinear splits; "false" disables tiling, TILE_DIM sets the split count
+TILED_LINEAR="false"
+TILE_DIM=1
+
+
+# Megatron Model Parallelism
+LOGDIR="tboard-zero2/stage${stage}-lazyscatter-${NUM_LAYERS}l_${HIDDEN_SIZE}h_${NUM_WORKERS}n_${NUM_GPUS_PER_WORKER}g_${MP_SIZE}mp_${BATCHSIZE}b"
+
+#--load $CHECKPOINT_PATH \
+gpt_options=" \
+        --model-parallel-size ${MP_SIZE} \
+        --num-layers $NUM_LAYERS \
+        --hidden-size $HIDDEN_SIZE \
+        --num-attention-heads ${NUM_ATTN_HEADS} \
+        --seq-length 1024 \
+        --max-position-embeddings 1024 \
+        --batch-size $BATCHSIZE \
+        --train-iters 320000 \
+        --lr-decay-iters 320000 \
+        --save $CHECKPOINT_PATH \
+        --data-path $DATA_PATH \
+        --vocab-file $VOCAB_PATH \
+        --merge-file $MERGE_PATH \
+        --data-impl mmap \
+        --split 949,50,1 \
+        --distributed-backend nccl \
+        --lr 1.5e-4 \
+        --lr-decay-style cosine \
+        --min-lr 1.0e-5 \
+        --weight-decay 1e-2 \
+        --clip-grad 1.0 \
+        --warmup 0.01 \
+        --checkpoint-activations \
+        --log-interval 1 \
+        --save-interval 10000 \
+        --eval-interval 2000 \
+        --eval-iters 10 \
+        --fp16 \
+        --scattered-embeddings \
+        --split-transformers \
+"
+#--tensorboard-dir ${LOGDIR}
+
+deepspeed_options=" \
+        --deepspeed \
+        --deepspeed_config ${config_json} \
+        --zero-stage ${stage} \
+        --zero-reduce-bucket-size ${rbs} \
+        --zero-allgather-bucket-size ${agbs}
+"
+
+if [ "${contigious_gradients}" = "true" ]; then
+deepspeed_options="${deepspeed_options} \
+        --zero-contigious-gradients"
+fi
+
+if [ "${reduce_scatter}" = "true" ]; then
+deepspeed_options="${deepspeed_options} \
+        --zero-reduce-scatter"
+fi
+
+chkp_opt=" \
+--deepspeed-activation-checkpointing \
+--checkpoint-num-layers ${chkp_layers}"
+
+if [ "${PA}" = "true" ]; then
+chkp_opt="${chkp_opt} --partition-activations"
+fi
+
+if [ "${PA_CPU}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --checkpoint-in-cpu"
+fi
+
+if [ "${SYNCHRONIZE}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --synchronize-each-layer"
+fi
+
+if [ "${CC}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --contigious-checkpointing"
+fi
+
+if [ "${PROFILE}" = "true" ]; then
+chkp_opt="${chkp_opt} \
+        --profile-backward"
+fi
+
+if [ "${TILED_LINEAR}" = "true" ]; then
+tile_opt="${tile_opt} \
+        --memory-centric-tiled-linear \
+        --tile-factor=${TILE_DIM}"
+fi
+
+
+full_options="${gpt_options} ${deepspeed_options} ${chkp_opt} ${tile_opt}"
+
+run_cmd="deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${@:2} ${full_options}"
+echo ${run_cmd}
+eval ${run_cmd}
+
+set +x
+
diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json
new file mode 100755
index 000000000..d9a90d84e
--- /dev/null
+++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json
@@ -0,0 +1,24 @@
+{
+    "train_batch_size": 64,
+    "gradient_accumulation_steps": 1,
+    "steps_per_print": 1,
+    "zero_optimization": {
+        "stage": 3,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_prefetch_bucket_size": 1e7,
+        "stage3_param_persistence_threshold": 1e5,
+        "reduce_bucket_size": 1e7,
+        "contiguous_gradients": true
+    },
+    "gradient_clipping": 1.0,
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "wall_clock_breakdown": true,
+    "zero_allow_untested_optimizer": false
+}
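
Note: with DEBUG=1 the script runs the 8.3B-parameter configuration from the
commit title (NUM_LAYERS=72, HIDDEN_SIZE=3072) on a single node with 8 GPUs
and MP_SIZE=1. As a quick sanity check of that figure, a minimal sketch using
the standard GPT-2/Megatron estimate of roughly 12*h^2 + 13*h weights per
transformer layer plus token and position embeddings; the 50257-entry GPT-2
BPE vocabulary is an assumption based on the gpt2-vocab.json file referenced
above, and 1024 positions follow from --max-position-embeddings:

    # minimal sketch; layers/hidden copied from the DEBUG branch of the script,
    # vocab size (50257) is an assumption based on the GPT-2 BPE vocab file
    layers=72; hidden=3072; vocab=50257; seq=1024
    per_layer=$(( 12*hidden*hidden + 13*hidden ))   # attention + MLP weights and biases
    emb=$(( (vocab + seq) * hidden ))               # token + position embeddings
    echo "total ~ $(( (layers*per_layer + emb) / 1000000 ))M params"
    # prints "total ~ 8314M params", i.e. ~8.3B, matching the commit title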