From 302a3dd7e4b256f80e35c9e2a185734cae857e35 Mon Sep 17 00:00:00 2001 From: rraminen Date: Tue, 7 Dec 2021 17:58:16 +0000 Subject: [PATCH] Set train_batch_size=8 --- .../ds_pretrain_gpt2-zero2_8.3B_params.sh | 2 +- .../ds_pretrain_gpt2-zero3_8.3B_params.sh | 4 +-- .../ds_zero_stage_2_config_8.3B_params.json | 32 +++++++++++++++++++ .../ds_zero_stage_3_config_8.3B_params.json | 2 +- 4 files changed, 36 insertions(+), 4 deletions(-) create mode 100755 Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config_8.3B_params.json diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero2_8.3B_params.sh b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero2_8.3B_params.sh index ebcbca843..93116d7ed 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero2_8.3B_params.sh +++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero2_8.3B_params.sh @@ -38,7 +38,7 @@ CHECKPOINT_PATH=checkpoints/gpt2_345m_ds script_path=$(realpath $0) script_dir=$(dirname $script_path) if [[ -z $1 ]]; then - config_json="$script_dir/ds_zero_stage_2_config.json" + config_json="$script_dir/ds_zero_stage_2_config_8.3B_params.json" # offloads to NVMe #config_json="$script_dir/ds_zero_stage_infinity_config.json" diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh index eb2776232..600e563c5 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh +++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_pretrain_gpt2-zero3_8.3B_params.sh @@ -5,13 +5,13 @@ MP_SIZE=1 DEBUG=1 if [[ ${DEBUG} == 1 ]]; then - MP_SIZE=1 + MP_SIZE=8 NUM_WORKERS=1 NUM_GPUS_PER_WORKER=8 HIDDEN_SIZE=3072 NUM_ATTN_HEADS=24 NUM_LAYERS=72 - BATCHSIZE=4 + BATCHSIZE=8 else NUM_WORKERS=${DLTS_NUM_WORKER} NUM_GPUS_PER_WORKER=${DLTS_NUM_GPU_PER_WORKER} diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config_8.3B_params.json b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config_8.3B_params.json new file mode 100755 index 000000000..78c633c23 --- /dev/null +++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config_8.3B_params.json @@ -0,0 +1,32 @@ +{ + "train_batch_size": 8, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "reduce_scatter": true, + "allgather_bucket_size": 50000000, + "reduce_bucket_size": 50000000, + "overlap_comm": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "max_grad_norm": 1.0, + "betas": [0.9, 0.95] + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "wall_clock_breakdown": true, + "zero_allow_untested_optimizer": false +} diff --git a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json index d9a90d84e..7825a54ad 100755 --- a/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json +++ b/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_3_config_8.3B_params.json @@ -1,5 +1,5 @@ { - "train_batch_size": 64, + "train_batch_size": 8, "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": {