From bd5428b5b7930c0e44ea9bc72b8b4885c7638a3e Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Fri, 26 May 2023 11:22:15 +0800 Subject: [PATCH 01/10] [doc] fix title of mixed precision --- .../zh-Hans/features/gradient_accumulation_with_booster.md | 2 +- .../zh-Hans/features/mixed_precision_training_with_booster.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index ab86f34f2dec..a8422060f0ea 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -1,4 +1,4 @@ -# 梯度累积 (最新版本) +# 梯度累积 (新版本) 作者: [Mingyan Jiang](https://github.com/jiangmingyan) diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md index 6954556a8e9a..187aef1a6c4a 100644 --- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md +++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md @@ -1,4 +1,4 @@ -# 自动混合精度训练 (最新版本) +# 自动混合精度训练 (新版本) 作者: [Mingyan Jiang](https://github.com/jiangmingyan) From 7708b8d7adbe46c39999d5fa470679f89f059423 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Fri, 26 May 2023 14:55:51 +0800 Subject: [PATCH 02/10] [doc]update document of zero with chunk --- docs/source/en/features/zero_with_chunk.md | 54 ++++++------------- .../zh-Hans/features/zero_with_chunk.md | 54 +++++++------------ 2 files changed, 35 insertions(+), 73 deletions(-) diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index d7a99f2fbbfd..a3acb6fedf4e 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -3,7 +3,7 @@ Author: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY) **Prerequisite:** -- [Define Your Configuration](../basics/define_your_config.md) +- [定义配置文件](../basics/define_your_config.md) **Example Code** @@ -97,6 +97,7 @@ For simplicity, we just use randomly generated data here. First we only need to import `GPT2LMHeadModel` from `Huggingface transformers` to define our model, which does not require users to define or modify the model, so that users can use it more conveniently. +Define a GPT model: ```python class GPTLMModel(nn.Module): @@ -182,34 +183,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): split_param_single_dim_tp1d(-1, param, pg) ``` -Define a model which uses Gemini + ZeRO DDP: - -```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - cai_version = colossalai.__version__ - if version.parse(cai_version) > version.parse("0.1.10"): - from colossalai.nn.parallel import GeminiDDP - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=32) - elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): - from colossalai.gemini import ChunkManager, GeminiManager - chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - chunk_manager = ChunkManager(chunk_size, - pg, - enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placement_policy)) - model = ZeroDDP(model, gemini_manager) - else: - raise NotImplemented(f"CAI version {cai_version} is not supported") - return model -``` - -As we pre-train GPT in this example, we just use a simple language model loss. - Write a function to get random inputs: ```python @@ -219,9 +192,15 @@ def get_data(batch_size, seq_len, vocab_size): return input_ids, attention_mask ``` -Finally, we can define our training loop: +Finally, we define a model which uses Gemini + ZeRO DDP and our training loop, As we pre-train GPT in this example, we just use a simple language model loss.: ```python +from torch.optim import Adam + +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin + def main(): args = parse_args() BATCH_SIZE = 8 @@ -232,22 +211,23 @@ def main(): # build criterion criterion = GPTLMLoss() + optimizer = Adam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) - default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None + default_dist_spec = ShardSpec([-1], [args.tp_degree]) # build GPT model with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg): model = gpt2_medium(checkpoint=True) pg = default_pg # Tensor Parallelism (TP) tensor_parallelize(model, pg) + # Gemini + ZeRO DP, Note it must be used after TP - model = gemini_zero_dpp(model, pg, args.placement) - # build optimizer - optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5) - numel = sum([p.numel() for p in model.parameters()]) - get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + torch.cuda.synchronize() model.train() for n in range(NUM_STEPS): @@ -256,7 +236,7 @@ def main(): optimizer.zero_grad() outputs = model(input_ids, attn_mask) loss = criterion(outputs, input_ids) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() torch.cuda.synchronize() diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index ba57ba4e8e61..0eef26bbf121 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -97,6 +97,8 @@ optimizer.step() 首先我们只需要引入`Huggingface transformers` 的 `GPT2LMHeadModel`来定义我们的模型,不需要用户进行模型的定义与修改,方便用户使用。 +定义模型: + ```python class GPTLMModel(nn.Module): @@ -182,34 +184,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): split_param_single_dim_tp1d(-1, param, pg) ``` -定义一个使用 Gemini + ZeRO DDP 的模型: - -```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - cai_version = colossalai.__version__ - if version.parse(cai_version) > version.parse("0.1.10"): - from colossalai.nn.parallel import GeminiDDP - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=32) - elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): - from colossalai.gemini import ChunkManager, GeminiManager - chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - chunk_manager = ChunkManager(chunk_size, - pg, - enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placement_policy)) - model = ZeroDDP(model, gemini_manager) - else: - raise NotImplemented(f"CAI version {cai_version} is not supported") - return model -``` - -由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数。 - 写一个获得随机输入的函数: ```python @@ -219,9 +193,16 @@ def get_data(batch_size, seq_len, vocab_size): return input_ids, attention_mask ``` -最后,我们可以定义我们的训练循环: + +最后,使用booster注入 Gemini + ZeRO DDP 特性, 并定义训练循环。由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数: ```python +from torch.optim import Adam + +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin + def main(): args = parse_args() BATCH_SIZE = 8 @@ -232,22 +213,23 @@ def main(): # build criterion criterion = GPTLMLoss() + optimizer = Adam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) - default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None + default_dist_spec = ShardSpec([-1], [args.tp_degree]) # build GPT model with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg): model = gpt2_medium(checkpoint=True) pg = default_pg # Tensor Parallelism (TP) tensor_parallelize(model, pg) + # Gemini + ZeRO DP, Note it must be used after TP - model = gemini_zero_dpp(model, pg, args.placement) - # build optimizer - optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5) - numel = sum([p.numel() for p in model.parameters()]) - get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + torch.cuda.synchronize() model.train() for n in range(NUM_STEPS): @@ -256,7 +238,7 @@ def main(): optimizer.zero_grad() outputs = model(input_ids, attn_mask) loss = criterion(outputs, input_ids) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() torch.cuda.synchronize() From 8547c76af68335b758fcff66bcc5f1daad287bfe Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Fri, 26 May 2023 14:59:48 +0800 Subject: [PATCH 03/10] [doc] update document of zero with chunk, fix --- docs/source/en/features/zero_with_chunk.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index a3acb6fedf4e..0ce64d25968b 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -192,7 +192,7 @@ def get_data(batch_size, seq_len, vocab_size): return input_ids, attention_mask ``` -Finally, we define a model which uses Gemini + ZeRO DDP and our training loop, As we pre-train GPT in this example, we just use a simple language model loss.: +Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss.: ```python from torch.optim import Adam From 3084e5f94a93b10a76c6b7cd61242d6d566b4cdc Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Fri, 26 May 2023 15:02:08 +0800 Subject: [PATCH 04/10] [doc] update document of zero with chunk, fix --- docs/source/en/features/zero_with_chunk.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index 0ce64d25968b..1fbdc0290121 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -192,7 +192,7 @@ def get_data(batch_size, seq_len, vocab_size): return input_ids, attention_mask ``` -Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss.: +Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss: ```python from torch.optim import Adam @@ -224,7 +224,7 @@ def main(): tensor_parallelize(model, pg) # Gemini + ZeRO DP, Note it must be used after TP - plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) From 3b3101515c7cbecc770e9eb38a46d50aabc2dd19 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Fri, 26 May 2023 15:04:37 +0800 Subject: [PATCH 05/10] [doc] update document of zero with chunk, fix --- docs/source/zh-Hans/features/zero_with_chunk.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 0eef26bbf121..84c7781b938d 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -97,7 +97,7 @@ optimizer.step() 首先我们只需要引入`Huggingface transformers` 的 `GPT2LMHeadModel`来定义我们的模型,不需要用户进行模型的定义与修改,方便用户使用。 -定义模型: +定义GPT模型: ```python class GPTLMModel(nn.Module): @@ -226,7 +226,7 @@ def main(): tensor_parallelize(model, pg) # Gemini + ZeRO DP, Note it must be used after TP - plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) From b559a64d644a6c50f59db3706bcf383cc6101638 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Mon, 29 May 2023 10:41:01 +0800 Subject: [PATCH 06/10] [doc] update document of zero with chunk, add doc test --- docs/source/en/features/zero_with_chunk.md | 2 ++ docs/source/zh-Hans/features/zero_with_chunk.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index 1fbdc0290121..bec075418aac 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -243,3 +243,5 @@ def main(): ``` > ⚠️ Note: If you want to use the Gemini module, please do not use the [Gradient Accumulation](../features/gradient_accumulation.md) we mentioned before。 The complete example can be found on [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt). + + diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 84c7781b938d..02569256a84b 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -245,3 +245,5 @@ def main(): ``` > ⚠️ 注意:如果你使用Gemini模块的话,请不要使用我们之前提到过的[梯度累加](../features/gradient_accumulation.md)。 完整的例子代码可以在 [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt). 获得。 + + From 15d053dfefbad25c4b407191f9575dc58f827757 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Mon, 29 May 2023 10:55:20 +0800 Subject: [PATCH 07/10] [doc] update document of zero with chunk, add doc test --- docs/source/en/features/zero_with_chunk.md | 2 +- docs/source/zh-Hans/features/zero_with_chunk.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index bec075418aac..f5692d92dca1 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -224,7 +224,7 @@ def main(): tensor_parallelize(model, pg) # Gemini + ZeRO DP, Note it must be used after TP - plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) + plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 02569256a84b..eb8516cd084e 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -226,7 +226,7 @@ def main(): tensor_parallelize(model, pg) # Gemini + ZeRO DP, Note it must be used after TP - plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) + plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) From e9d48fedb6f750751987bfca05003a4c6888ca5b Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Mon, 29 May 2023 12:57:59 +0800 Subject: [PATCH 08/10] [doc] update document of zero with chunk, fix installation --- docs/source/zh-Hans/get_started/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md index a32627db6f00..a6c88672b907 100755 --- a/docs/source/zh-Hans/get_started/installation.md +++ b/docs/source/zh-Hans/get_started/installation.md @@ -47,7 +47,7 @@ CUDA_EXT=1 pip install . pip install . ``` -如果您在使用CUDA 10.2,您仍然可以从源码安装ColossalA。但是您需要手动下载cub库并将其复制到相应的目录。 +如果您在使用CUDA 10.2,您仍然可以从源码安装ColossalAI。但是您需要手动下载cub库并将其复制到相应的目录。 ```bash # clone the repository From 4272483cc35cd4c6d813f4da95366f88771abbe6 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Tue, 30 May 2023 16:38:55 +0800 Subject: [PATCH 09/10] [doc] update document of zero with chunk, fix zero with chunk doc --- applications/Chat/examples/train_sft.sh | 4 ++-- docs/source/en/features/zero_with_chunk.md | 2 +- docs/source/zh-Hans/features/zero_with_chunk.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index c880f85825a7..19df8560c349 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -1,5 +1,5 @@ -torchrun --standalone --nproc_per_node=4 train_sft.py \ - --pretrain "/path/to/LLaMa-7B/" \ +python -m torch.distributed.run --standalone --nproc_per_node=4 train_sft.py \ + --pretrain "path/to/LLaMa-7B/" \ --model 'llama' \ --strategy colossalai_zero2 \ --log_interval 10 \ diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index f5692d92dca1..d6f6f611a64c 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -3,7 +3,7 @@ Author: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY) **Prerequisite:** -- [定义配置文件](../basics/define_your_config.md) +- [Train with booster](../basics/booster_api.md) **Example Code** diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index eb8516cd084e..9030464ddf9a 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -4,7 +4,7 @@ **前置教程:** -- [定义配置文件](../basics/define_your_config.md) +- [booster使用](../basics/booster_api.md) **示例代码** From cb518a6bd501d37ec2504d3c1f8e767d39e09fe9 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Tue, 30 May 2023 16:41:19 +0800 Subject: [PATCH 10/10] [doc] update document of zero with chunk, fix zero with chunk doc --- applications/Chat/examples/train_sft.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index 19df8560c349..c880f85825a7 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -1,5 +1,5 @@ -python -m torch.distributed.run --standalone --nproc_per_node=4 train_sft.py \ - --pretrain "path/to/LLaMa-7B/" \ +torchrun --standalone --nproc_per_node=4 train_sft.py \ + --pretrain "/path/to/LLaMa-7B/" \ --model 'llama' \ --strategy colossalai_zero2 \ --log_interval 10 \