From 354c16eed4d26024b3cd47b13827aa537f190374 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 13 Jan 2023 15:51:35 +0800 Subject: [PATCH 01/10] [ci] add CI for gpt, palm and opt --- examples/language/gpt/gemini/run_gemini.sh | 5 +- examples/language/gpt/gemini/test_ci.sh | 36 +++++++++++ .../language/gpt/gemini/train_gpt_demo.py | 10 ++- examples/language/gpt/test_ci.sh | 17 +---- examples/language/opt/test_ci.sh | 4 ++ examples/language/palm/run.sh | 2 +- examples/language/palm/test_ci.sh | 9 +++ examples/language/palm/train.py | 64 +++++++++++++------ 8 files changed, 106 insertions(+), 41 deletions(-) create mode 100644 examples/language/gpt/gemini/test_ci.sh create mode 100644 examples/language/opt/test_ci.sh create mode 100644 examples/language/palm/test_ci.sh diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh index 0c2ea660f1e0..8bc6583cb502 100644 --- a/examples/language/gpt/gemini/run_gemini.sh +++ b/examples/language/gpt/gemini/run_gemini.sh @@ -1,6 +1,6 @@ set -x # distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"] -export DISTPLAN=${DISTPLAN:-"colossalai"} +export DISTPLAN=${DISTPLAN:-"zero1"} # The following options only valid when DISTPLAN="colossalai" export GPUNUM=${GPUNUM:-1} @@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"} export USE_SHARD_INIT=${USE_SHARD_INIT:-False} export BATCH_SIZE=${BATCH_SIZE:-16} export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"} - +export TRAIN_STEP=${TRAIN_STEP:-10} # export PYTHONPATH=$PWD:$PYTHONPATH mkdir -p gemini_logs @@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \ --placement=${PLACEMENT} \ --shardinit=${USE_SHARD_INIT} \ --distplan=${DISTPLAN} \ +--train_step=${TRAIN_STEP} \ 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh new file mode 100644 index 000000000000..0528a2110d1e --- /dev/null +++ b/examples/language/gpt/gemini/test_ci.sh @@ -0,0 +1,36 @@ +$(cd `dirname $0`;pwd) + +export TRAIN_STEP=4 + +for MODEL_TYPE in "gpt2_medium"; do + for DISTPLAN in "colossalai"; do + for BATCH_SIZE in 2; do + for GPUNUM in 1 4; do + for TPDEGREE in 1 2; do + if [ ${TPDEGREE} -gt ${GPUNUM} ]; then + continue + fi + for PLACEMENT in "cpu" "auto"; do + MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \ + bash ./run_gemini.sh + done + done + done + done + done + + for DISTPLAN in "zero1" "zero2"; do + for BATCH_SIZE in 2; do + for GPUNUM in 1 4; do + for TPDEGREE in 1; do + if [ ${TPDEGREE} -gt ${GPUNUM} ]; then + continue + fi + MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE}\ + bash ./run_gemini.sh + done + done + done + done + done +done diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index 7bec980f95bd..88ecb4de2406 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -65,6 +65,12 @@ def parse_args(): default="gpt2_medium", help="model model scale", ) + parser.add_argument( + "--train_step", + type=int, + default=10, + help="training iterations for test", + ) args = parser.parse_args() return args @@ -236,7 +242,7 @@ def main(): SEQ_LEN = 1024 VOCAB_SIZE = 50257 - NUM_STEPS = 10 + NUM_STEPS = args.train_step WARMUP_STEPS = 1 assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps" assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median " @@ -290,14 +296,12 @@ def main(): from torch.distributed.optim import ZeroRedundancyOptimizer optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01) elif args.distplan.startswith("zero"): - pg = ProcessGroup() model = model.half() partition_flag = (args.distplan == "zero2") optimizer = torch.optim.Adam(model.parameters(), lr=0.01) optimizer = LowLevelZeroOptimizer( optimizer, - pg=pg, reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, partition_grad=partition_flag, diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index ad0cfa325d37..cb0bab213f34 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1,16 +1 @@ -pip install -r requirements.txt - -# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"] -export DISTPAN="colossalai" - -# The following options only valid when DISTPAN="colossalai" -export TPDEGREE=2 -export GPUNUM=4 -export PLACEMENT='cpu' -export USE_SHARD_INIT=False -export BATCH_SIZE=8 -export MODEL_TYPE="gpt2_medium" - - -mkdir -p logs -torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log +bash ./gemini/test_ci.shg diff --git a/examples/language/opt/test_ci.sh b/examples/language/opt/test_ci.sh new file mode 100644 index 000000000000..317f602cda3c --- /dev/null +++ b/examples/language/opt/test_ci.sh @@ -0,0 +1,4 @@ +for GPUNUM in 2 1 +do +env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh +done diff --git a/examples/language/palm/run.sh b/examples/language/palm/run.sh index 4aa868953f7b..7a533509e009 100644 --- a/examples/language/palm/run.sh +++ b/examples/language/palm/run.sh @@ -8,4 +8,4 @@ export PLACEMENT='cpu' export USE_SHARD_INIT=False export BATCH_SIZE=4 -env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train_new.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log \ No newline at end of file +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh new file mode 100644 index 000000000000..e1324a952502 --- /dev/null +++ b/examples/language/palm/test_ci.sh @@ -0,0 +1,9 @@ +$(cd `dirname $0`;pwd) + +for BATCH_SIZE in 2 +do +for GPUNUM in 1 4 +do +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --batch_size=${BATCH_SIZE} 2>&1 | tee run.log +done +done diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py index 6725c07dfac7..fa340e2a3e77 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -1,11 +1,12 @@ import gzip import random -from time import time from functools import partial +from time import time + import numpy as np import torch -import torch.optim as optim import torch.nn as nn +import torch.optim as optim import tqdm from packaging import version from palm_pytorch import PaLM @@ -23,7 +24,7 @@ # constants -NUM_BATCHES = int(100) +NUM_BATCHES = int(10) WARMUP_BATCHES = 1 GRADIENT_ACCUMULATE_EVERY = 1 LEARNING_RATE = 2e-4 @@ -66,9 +67,16 @@ def parse_args(): default=8, help="batch size per DP group of training.", ) + parser.add_argument( + "--dummy_data", + type=bool, + default=False, + help="use dummy dataset.", + ) args = parser.parse_args() return args + # helpers def cycle(loader): while True: @@ -79,12 +87,15 @@ def cycle(loader): def decode_token(token): return str(chr(max(32, token))) + def get_tflops(model_numel, batch_size, seq_len, step_time): return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) + def decode_tokens(tokens): return "".join(list(map(decode_token, tokens))) + def get_model_size(model: nn.Module): total_numel = 0 for module in model.modules(): @@ -92,6 +103,7 @@ def get_model_size(model: nn.Module): total_numel += p.numel() return total_numel + # Gemini + ZeRO DDP def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): cai_version = colossalai.__version__ @@ -115,6 +127,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: raise NotImplemented(f"CAI version {cai_version} is not supported") return model + ## Parameter Sharding Strategies for Tensor Parallelism def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup): spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) @@ -128,6 +141,7 @@ def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup): def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): split_param_single_dim_tp1d(-1, param, pg) + # Tensor Parallel def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): """tensor_parallelize @@ -159,15 +173,28 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): args = parse_args() if args.distplan not in ["colossalai", "pytorch"]: - raise TypeError(f"{args.distplan} is error") + raise TypeError(f"{args.distplan} is error") disable_existing_loggers() colossalai.launch_from_torch(config={}) logger = get_dist_logger() -with gzip.open("./data/enwik8.gz") as file: - X = np.fromstring(file.read(int(95e6)), dtype=np.uint8) - trX, vaX = np.split(X, [int(90e6)]) - data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX) + +def generate_dataset(from_file: bool = True): + if from_file: + with gzip.open("./data/enwik8.gz") as file: + X = np.fromstring(file.read(int(95e6)), dtype=np.uint8) + trX, vaX = np.split(X, [int(90e6)]) + data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX) + # print(f"data_train {data_train.shape} {data_train.dtype} {max(data_train)} {min(data_train)}") + # print(f"data_val {data_val.shape} {data_val.dtype} {max(data_val)} {min(data_val)}") + return data_train, data_val + else: + return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,)) + + +data_train, data_val = generate_dataset(args.dummy_data) + +print("generate dataset ready!") class TextSamplerDataset(Dataset): @@ -216,7 +243,7 @@ def __len__(self): model.cuda() optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) - # model is shared after TP +# model is shared after TP numel = get_model_size(model) get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN) @@ -251,7 +278,7 @@ def __len__(self): ) if i >= WARMUP_BATCHES: tflops_list.append(step_tflops) - + else: for __ in range(GRADIENT_ACCUMULATE_EVERY): loss = model(next(train_loader)) @@ -261,18 +288,17 @@ def __len__(self): torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) optim.step() optim.zero_grad() - + tflops_list.sort() median_index = ((NUM_BATCHES - WARMUP_BATCHES) >> 1) + WARMUP_BATCHES logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}") - - # TODO - # if i % VALIDATE_EVERY == 0: - # model.eval() - # with torch.no_grad(): - # loss = model(next(val_loader)) - # print(f"validation loss: {loss.item()}") +# TODO +# if i % VALIDATE_EVERY == 0: +# model.eval() +# with torch.no_grad(): +# loss = model(next(val_loader)) +# print(f"validation loss: {loss.item()}") # if i % GENERATE_EVERY == 0: # model.eval() @@ -282,4 +308,4 @@ def __len__(self): # sample = model.generate(inp[None, ...], GENERATE_LENGTH) # output_str = decode_tokens(sample[0]) - # print(output_str) \ No newline at end of file + # print(output_str) From bf2a6dfc5f9d6f967a25140ec1825ae40ecad10d Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 13 Jan 2023 15:54:01 +0800 Subject: [PATCH 02/10] polish code --- examples/language/palm/test_ci.sh | 2 +- examples/language/palm/train.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh index e1324a952502..d16136311c58 100644 --- a/examples/language/palm/test_ci.sh +++ b/examples/language/palm/test_ci.sh @@ -4,6 +4,6 @@ for BATCH_SIZE in 2 do for GPUNUM in 1 4 do -env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --batch_size=${BATCH_SIZE} 2>&1 | tee run.log +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data --batch_size=${BATCH_SIZE} 2>&1 | tee run.log done done diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py index fa340e2a3e77..a334ea9511fb 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -179,8 +179,8 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): logger = get_dist_logger() -def generate_dataset(from_file: bool = True): - if from_file: +def generate_dataset(dummy_data: bool = False): + if not dummy_data: with gzip.open("./data/enwik8.gz") as file: X = np.fromstring(file.read(int(95e6)), dtype=np.uint8) trX, vaX = np.split(X, [int(90e6)]) From 2732f34581276c37eeb0215f6ae2a60be2e20ea9 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 13 Jan 2023 15:55:58 +0800 Subject: [PATCH 03/10] polish code --- examples/language/gpt/gemini/run_gemini.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh index 8bc6583cb502..6f0710d54f01 100644 --- a/examples/language/gpt/gemini/run_gemini.sh +++ b/examples/language/gpt/gemini/run_gemini.sh @@ -1,6 +1,6 @@ set -x # distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"] -export DISTPLAN=${DISTPLAN:-"zero1"} +export DISTPLAN=${DISTPLAN:-"colossalai"} # The following options only valid when DISTPLAN="colossalai" export GPUNUM=${GPUNUM:-1} From c46178b771de617b9034cef61d6bfe7cf9aa4ca2 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 13 Jan 2023 16:00:19 +0800 Subject: [PATCH 04/10] polish code --- examples/language/gpt/test_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index cb0bab213f34..375662eae401 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1 +1 @@ -bash ./gemini/test_ci.shg +bash ./gemini/test_ci.sh From 80021a7ae12e58c198bc62dbe71a721d09f94172 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 13 Jan 2023 16:02:22 +0800 Subject: [PATCH 05/10] polish code --- examples/language/palm/test_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh index d16136311c58..f21095578077 100644 --- a/examples/language/palm/test_ci.sh +++ b/examples/language/palm/test_ci.sh @@ -4,6 +4,6 @@ for BATCH_SIZE in 2 do for GPUNUM in 1 4 do -env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data --batch_size=${BATCH_SIZE} 2>&1 | tee run.log +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data=True --batch_size=${BATCH_SIZE} 2>&1 | tee run.log done done From 93798eab66edc1b9d94abef95c3184a435820e7a Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 13 Jan 2023 16:07:30 +0800 Subject: [PATCH 06/10] polish code --- examples/language/gpt/gemini/test_ci.sh | 3 --- examples/language/gpt/test_ci.sh | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh index 0528a2110d1e..02f520036de5 100644 --- a/examples/language/gpt/gemini/test_ci.sh +++ b/examples/language/gpt/gemini/test_ci.sh @@ -1,5 +1,3 @@ -$(cd `dirname $0`;pwd) - export TRAIN_STEP=4 for MODEL_TYPE in "gpt2_medium"; do @@ -32,5 +30,4 @@ for MODEL_TYPE in "gpt2_medium"; do done done done - done done diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index 375662eae401..1e27e4f16a41 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1 +1,3 @@ -bash ./gemini/test_ci.sh +set -x +$(cd `dirname $0`;pwd) +bash gemini/test_ci.sh From fd7ed1f3d6138ed3bc223b0045a8004a8956d8bc Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 13 Jan 2023 16:15:26 +0800 Subject: [PATCH 07/10] polish code --- examples/language/gpt/gemini/test_ci.sh | 1 + examples/language/gpt/test_ci.sh | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh index 02f520036de5..98d79482622a 100644 --- a/examples/language/gpt/gemini/test_ci.sh +++ b/examples/language/gpt/gemini/test_ci.sh @@ -1,3 +1,4 @@ +$(cd `dirname $0`;pwd) export TRAIN_STEP=4 for MODEL_TYPE in "gpt2_medium"; do diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index 1e27e4f16a41..d67c17229e71 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1,3 +1,2 @@ set -x -$(cd `dirname $0`;pwd) -bash gemini/test_ci.sh +cd gemini && bash test_ci.sh From 316431688476e88feb905fda3fde862a6973c4a3 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Sat, 14 Jan 2023 11:14:08 +0800 Subject: [PATCH 08/10] polish code precommit --- examples/language/gpt/gemini/test_ci.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh index 98d79482622a..6079d5ed615b 100644 --- a/examples/language/gpt/gemini/test_ci.sh +++ b/examples/language/gpt/gemini/test_ci.sh @@ -1,3 +1,4 @@ +set -x $(cd `dirname $0`;pwd) export TRAIN_STEP=4 From 9ffa03b62ec90b1afd98a9b9f9a742f681c5ea73 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Mon, 16 Jan 2023 10:00:01 +0800 Subject: [PATCH 09/10] polish precommit --- examples/language/gpt/gemini/train_gpt_demo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index bd5ae4e3cecc..713de6f9fb45 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -243,7 +243,6 @@ def main(): SEQ_LEN = 1024 VOCAB_SIZE = 50257 - NUM_STEPS = args.train_step WARMUP_STEPS = 1 From 93ba83bd76c2ccefc1fcf2304df22bbe25dcaf64 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Mon, 16 Jan 2023 10:14:32 +0800 Subject: [PATCH 10/10] polish precommit --- examples/language/gpt/test_ci.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index 0e33a321e833..d67c17229e71 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1,4 +1,2 @@ - set -x cd gemini && bash test_ci.sh -