From 8d2e33d42630ca265844de3ade25931be56fe3f6 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 6 Jan 2023 11:30:36 +0800 Subject: [PATCH 1/6] [example] add google doc for benchmark results of GPT --- examples/language/gpt/README.md | 53 +-------------------------------- 1 file changed, 1 insertion(+), 52 deletions(-) diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md index 07905b0cb3d0..b3d2bb53eba8 100644 --- a/examples/language/gpt/README.md +++ b/examples/language/gpt/README.md @@ -62,58 +62,7 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the pla Testbed: a cluster of 8xA100 (80GB) and 1xAMD EPYC 7543 32-Core Processor (512 GB). GPUs are connected via PCI-e. ColossalAI version 0.1.13. -How dose Batch Size affect the efficency. - -| model | #GPU | policy | TP | batch per DP | Tflops | -| ---------- | --------- |--------- |--------- |--------- |--------- | -| gpt2_10b | 2 | cpu | 1 | 32 | 122.046 | -| gpt2_10b | 2 | cpu | 1 | 16 | 82.649 | -| gpt2_10b | 2 | cpu | 1 | 8 | 61.354 | - - -How dose the Placement Policy affect the efficency. - -| model | #GPU | policy | TP | batch per DP | Tflops | -| ---------- | --------- |--------- |--------- |--------- |--------- | -| gpt2_10b | 4 | auto | 1 | 8 | 88.657 | -| gpt2_10b | 4 | cuda | 1 | 8 | OOM | -| gpt2_10b | 4 | cpu | 1 | 8 | 61.354 | -| gpt2_10b | 4 | const | 1 | 8 | 82.137 | - -How dose the Tensor Parallel Degree affect the efficency. - -| model | #GPU | policy | TP | batch per DP | Tflops | -| ---------- | --------- |--------- |--------- |--------- |--------- | -| gpt2_10b | 4 | auto | 1 | 8 | 88.657 | -| gpt2_10b | 4 | auto | 2 | 8 | 56.687 | -| gpt2_10b | 4 | auto | 4 | 8 | 29.019 | -| gpt2_10b | 4 | auto | 4 | 64 | 50.411 | -| gpt2_20b | 1 | cpu | 1 | 8 | 43.102 | -| gpt2_20b | 4 | cpu | 4 | 8 | 28.491 | - - -Touch the bar of model scale and batch size. - -1. `cpu` is the most stable policy for large model and large batch size. 
One 8 GPU with TP=2, largest batch size of `auto`, `const` - `cpu` is 64, 32 and 16, respectively. - -2. Tensor parallel is necessary for 20B model to reduce model data memory requirement on each GPU. - -| model | #GPU | policy | TP | batch per DP | Tflops | -| ---------- | --------- |--------- |--------- |--------- |--------- | -| gpt2_20b | 4 | cpu | 1 | 64 | CUDA OOM | -| gpt2_20b | 4 | auto | 1/2 | 64 | CUDA OOM | -| gpt2_20b | 4 | cpu | 2 | 8 | 43.102 | -| gpt2_20b | 4 | cpu | 2 | 64 | 121.394 | -| gpt2_20b | 8 | auto | 2 | 16 | 99.871 | -| gpt2_20b | 8 | cpu | 2 | 64 | 125.170 | -| gpt2_20b | 8 | const | 2 | 32 | 105.415 | - - -| model | #GPU | policy | TP | batch per DP | Tflops | -| ---------- | --------- |--------- |--------- |--------- |--------- | -| gpt2_20b | 8 | cpu | 2 | 8 | 46.895 | - +[benchmark results on google doc](https://docs.google.com/spreadsheets/d/15A2j3RwyHh-UobAPv_hJgT4W_d7CnlPm5Fp4yEzH5K4/edit#gid=0) ### Experimental Features From b3199df9f56d84698cdbc925d1ad53a295560aa4 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 6 Jan 2023 11:36:14 +0800 Subject: [PATCH 2/6] add tencet doc --- examples/language/gpt/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md index b3d2bb53eba8..8fdf6be3b6d9 100644 --- a/examples/language/gpt/README.md +++ b/examples/language/gpt/README.md @@ -64,6 +64,8 @@ ColossalAI version 0.1.13. 
[benchmark results on google doc](https://docs.google.com/spreadsheets/d/15A2j3RwyHh-UobAPv_hJgT4W_d7CnlPm5Fp4yEzH5K4/edit#gid=0) +[benchmark results on Tencent doc (for china)](https://docs.qq.com/sheet/DUVpqeVdxS3RKRldk?tab=BB08J2) + ### Experimental Features #### [Pipeline Parallel](./experiments/pipeline_parallel/) From 41ad66da65a04efda232453d85fd1d08ebc1412f Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 6 Jan 2023 14:53:23 +0800 Subject: [PATCH 3/6] [example] gpt, shard init on all processes --- colossalai/tensor/colo_tensor.py | 8 +++---- examples/language/gpt/gemini/run_gemini.sh | 2 +- .../language/gpt/gemini/train_gpt_demo.py | 22 ++++++++++++------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py index 93ab982ccb8f..136afebac193 100644 --- a/colossalai/tensor/colo_tensor.py +++ b/colossalai/tensor/colo_tensor.py @@ -117,7 +117,7 @@ def get_process_group(self) -> 'ProcessGroup': def set_process_group(self, pg: ProcessGroup): """set_process_group change the pg of the ColoTensor. Note that the valid use cases is limited. - Only existing pg is DP and dist spec is REPLICaTE is valid. + Only existing pg is DP and dist spec is Replica is valid. 
Args: pg (ProcessGroup): target pg @@ -127,10 +127,10 @@ def set_process_group(self, pg: ProcessGroup): # if the new pg is the same as the old pg, just returns if self.process_group == pg: return - assert self.process_group.tp_world_size() == 1, \ - "Can not set_process_group on a ColoTensor whose process_group has tp world group" + assert self.process_group.tp_world_size() == 1 or self.process_group.dp_world_size() == 1, \ + "Can not set_process_group on a ColoTensor whose process_group is both tp > 1 and world group > 1" assert self.dist_spec.placement.value == 'r', \ - "Can not set_process_group on a ColoTensor whose dist spec is not REPLICATE" + "Can not set_process_group on a ColoTensor whose dist spec is not Replica" self.process_group = pg diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh index ad577c350d39..b0770e28e931 100644 --- a/examples/language/gpt/gemini/run_gemini.sh +++ b/examples/language/gpt/gemini/run_gemini.sh @@ -6,7 +6,7 @@ export DISTPAN=${DISTPAN:-"colossalai"} export GPUNUM=${GPUNUM:-1} export TPDEGREE=${TPDEGREE:-1} export PLACEMENT=${PLACEMENT:-"cpu"} -export USE_SHARD_INIT=${USE_SHARD_INIT:-False} +export USE_SHARD_INIT=${USE_SHARD_INIT:-True} export BATCH_SIZE=${BATCH_SIZE:-16} export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"} diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index 14200bff7b7e..29f8c8ef1215 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -148,10 +148,16 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): """ for mn, module in model.named_modules(): for pn, param in module.named_parameters(recurse=False): - # NOTE() a param maybe shared by tow modules + # NOTE() a param maybe shared by two modules if hasattr(param, 'visited'): continue + + # if shard init, then convert param to replica and use the dp-only ProcessGroup + param: 
ColoParameter = param param.set_dist_spec(ReplicaSpec()) + param.set_process_group(pg) + + # shard it w.r.t tp pattern if 'mlp.c_fc' in mn: if 'weight' in pn or 'bias' in pn: split_param_col_tp1d(param, pg) # colmn slice @@ -170,7 +176,6 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): split_param_col_tp1d(param, pg) # colmn slice else: param.set_dist_spec(ReplicaSpec()) - param.visited = True @@ -248,27 +253,28 @@ def main(): torch.manual_seed(123) if args.distplan == "colossalai": # all param must use the same process group. - default_pg = ProcessGroup(tp_degree=args.tp_degree) - default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None + world_size = torch.distributed.get_world_size() + shard_pg = ProcessGroup(tp_degree=world_size) + default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None # build GPT model if version.parse(CAI_VERSION) > version.parse("0.1.10"): with ColoInitContext(device=get_current_device(), dtype=torch.half, default_dist_spec=default_dist_spec, - default_pg=default_pg): + default_pg=shard_pg): model = model_builder(args.model_type)(checkpoint=True) else: with ColoInitContext(device=get_current_device()): model = model_builder(args.model_type)(checkpoint=True) - pg = default_pg + tp_pg = ProcessGroup(tp_degree=args.tp_degree) # Tensor Parallelism (TP) - tensor_parallelize(model, pg) + tensor_parallelize(model, tp_pg) # build a Gemini model and a highly optimized cpu optimizer # Gemini + ZeRO DP, Note it must be used after TP - model, optimizer = build_gemini(model, pg, args.placement) + model, optimizer = build_gemini(model, tp_pg, args.placement) logger.info(get_mem_info(prefix='After init optim, '), ranks=[0]) else: From 9d945aff401b467cdf6d3f39ea1a6340f2f9af1c Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 6 Jan 2023 14:58:18 +0800 Subject: [PATCH 4/6] polish comments --- colossalai/tensor/colo_tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py index 136afebac193..3712d6a0acea 100644 --- a/colossalai/tensor/colo_tensor.py +++ b/colossalai/tensor/colo_tensor.py @@ -117,7 +117,7 @@ def get_process_group(self) -> 'ProcessGroup': def set_process_group(self, pg: ProcessGroup): """set_process_group change the pg of the ColoTensor. Note that the valid use cases is limited. - Only existing pg is DP and dist spec is Replica is valid. + It only works when the ColoTensor's current pg is DP-only or TP-only and its current dist spec is Replica. Args: pg (ProcessGroup): target pg From 29c25d9ca9c1977e85ea14982593f754d3e6a8d5 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 6 Jan 2023 15:27:30 +0800 Subject: [PATCH 5/6] polish code --- examples/language/gpt/gemini/run_gemini.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh index b0770e28e931..ad577c350d39 100644 --- a/examples/language/gpt/gemini/run_gemini.sh +++ b/examples/language/gpt/gemini/run_gemini.sh @@ -6,7 +6,7 @@ export DISTPAN=${DISTPAN:-"colossalai"} export GPUNUM=${GPUNUM:-1} export TPDEGREE=${TPDEGREE:-1} export PLACEMENT=${PLACEMENT:-"cpu"} -export USE_SHARD_INIT=${USE_SHARD_INIT:-True} +export USE_SHARD_INIT=${USE_SHARD_INIT:-False} export BATCH_SIZE=${BATCH_SIZE:-16} export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"} From c811b516edd731d4c07ad8856e5a74ebce84c67e Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 6 Jan 2023 16:29:04 +0800 Subject: [PATCH 6/6] [builder] update readme --- README.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 6ffbc85ba9eb..1b0ca7e973e0 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,10 @@ Colossal-AI: A Unified Deep Learning System for Big Model Era -

Paper | - Documentation | - Examples | - Forum | +

Paper | + Documentation | + Examples | + Forum | Blog

[![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml) @@ -17,7 +17,7 @@ [![HuggingFace badge](https://img.shields.io/badge/%F0%9F%A4%97HuggingFace-Join-yellow)](https://huggingface.co/hpcai-tech) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w) [![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png) - + | [English](README.md) | [中文](README-zh-Hans.md) | @@ -35,7 +35,7 @@
  • Why Colossal-AI
  • Features
  • - Parallel Training Demo + Parallel Training Demo
  • - Single GPU Training Demo + Single GPU Training Demo
  • - Inference (Energon-AI) Demo + Inference (Energon-AI) Demo
  • - Colossal-AI for Real World Applications + Colossal-AI for Real World Applications
    • AIGC: Acceleration of Stable Diffusion
    • Biomedicine: Acceleration of AlphaFold Protein Structure
    • @@ -106,7 +106,7 @@ distributed training and inference in a few lines. - [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054) - [Auto-Parallelism](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt) -- Heterogeneous Memory Management +- Heterogeneous Memory Management - [PatrickStar](https://arxiv.org/abs/2108.05818) - Friendly Usage @@ -115,7 +115,7 @@ distributed training and inference in a few lines. - Inference - [Energon-AI](https://github.com/hpcaitech/EnergonAI) -- Colossal-AI in the Real World +- Colossal-AI in the Real World - Biomedicine: [FastFold](https://github.com/hpcaitech/FastFold) accelerates training and inference of AlphaFold protein structure

      (back to top)

      @@ -149,7 +149,7 @@ distributed training and inference in a few lines. - [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-Billion parameter AI language model released by Meta, which stimulates AI programmers to perform various downstream tasks and application deployments because public pretrained model weights. -- 45% speedup fine-tuning OPT at low cost in lines. [[Example]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[Online Serving]](https://service.colossalai.org/opt) +- 45% speedup fine-tuning OPT at low cost in lines. [[Example]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[Online Serving]](https://service.colossalai.org/opt) Please visit our [documentation](https://www.colossalai.org/) and [examples](https://github.com/hpcaitech/ColossalAI-Examples) for more details. @@ -277,10 +277,11 @@ pip install -r requirements/requirements.txt pip install . ``` -If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer): +By default, we do not compile CUDA/C++ kernels. ColossalAI will build them during runtime. +If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer): ```shell -NO_CUDA_EXT=1 pip install . +CUDA_EXT=1 pip install . ```

      (back to top)