From eb15c7c6f0fd3e34ccbe676b057daecb9b2f8231 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:16:37 +0800 Subject: [PATCH 1/7] [doc] update meet_gemini.md --- .../en/advanced_tutorials/meet_gemini.md | 25 +++++++++------- .../zh-Hans/advanced_tutorials/meet_gemini.md | 30 +++++++++---------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 8afb6705b6ae..1473b2f11e41 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -9,16 +9,21 @@ When you only have a few GPUs for large model training tasks, **heterogeneous tr ## Usage -At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini. Set attribute of zero model_config, i.e., tensor_placement_policy='auto'. - -``` -zero = dict( - model_config=dict( - tensor_placement_policy='auto', - shard_strategy=BucketTensorShardStrategy() - ), - optimizer_config=dict( - ...) +At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject feathures with `booster`. More instructions please refer to [usage of booster]((../basics/booster_api.md). 
+ +```python +from torchvision.models import resnet18 +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin +plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) +booster = Booster(plugin=plugin) +ctx = ColoInitContext() +with ctx: + model = resnet18() +optimizer = HybridAdam(model.parameters(), lr=1e-3) +criterion = lambda x: x.mean() +model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) ) ``` diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index 2bf0a9c98c3f..780712dd73fd 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -8,21 +8,21 @@ ## 用法 -目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单,在训练策略的配置文件里设置zero的model_config属性tensor_placement_policy='auto' - -``` -zero = dict( - model_config=dict( - reduce_scatter_bucket_size_mb=25, - fp32_reduce_scatter=False, - gradient_predivide_factor=1.0, - tensor_placement_policy="auto", - shard_strategy=TensorShardStrategy(), - ... - ), - optimizer_config=dict( - ... 
- ) +目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单:使用booster将`GeminiPlugin`注入训练组件中。更多细节请参考[booster使用](../basics/booster_api.md)。 + +```python +from torchvision.models import resnet18 +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin +plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) +booster = Booster(plugin=plugin) +ctx = ColoInitContext() +with ctx: + model = resnet18() +optimizer = HybridAdam(model.parameters(), lr=1e-3) +criterion = lambda x: x.mean() +model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) ) ``` From 3e1c260da7555ad0eee60a6834a9b23690dde717 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:20:47 +0800 Subject: [PATCH 2/7] [doc] update meet_gemini.md --- docs/source/en/advanced_tutorials/meet_gemini.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 1473b2f11e41..2a2233a8faad 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -9,7 +9,7 @@ When you only have a few GPUs for large model training tasks, **heterogeneous tr ## Usage -At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject feathures with `booster`. More instructions please refer to [usage of booster]((../basics/booster_api.md). +At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the feathures of `GeminiPlugin` into training components with `booster`. More instructions please refer to [usage of booster]((../basics/booster_api.md). 
```python from torchvision.models import resnet18 From 5027893dfda041d6f358e84f7c5931c00da46b3b Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:23:00 +0800 Subject: [PATCH 3/7] [doc] fix parentheses --- docs/source/en/advanced_tutorials/meet_gemini.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 2a2233a8faad..cab76dc9f6f7 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -9,7 +9,7 @@ When you only have a few GPUs for large model training tasks, **heterogeneous tr ## Usage -At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the feathures of `GeminiPlugin` into training components with `booster`. More instructions please refer to [usage of booster]((../basics/booster_api.md). +At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the feathures of `GeminiPlugin` into training components with `booster`. More instructions please refer to [usage of booster](../basics/booster_api.md). 
```python from torchvision.models import resnet18 From cb2185f9639e4c792a3f5debf36744325d8ed6c8 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:23:56 +0800 Subject: [PATCH 4/7] [doc] fix parentheses --- docs/source/zh-Hans/advanced_tutorials/meet_gemini.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index 780712dd73fd..d9dccea9b31c 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -8,7 +8,7 @@ ## 用法 -目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单:使用booster将`GeminiPlugin`注入训练组件中。更多细节请参考[booster使用](../basics/booster_api.md)。 +目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单:使用booster将`GeminiPlugin`中的特性注入到训练组件中。更多细节请参考[booster使用](../basics/booster_api.md)。 ```python from torchvision.models import resnet18 From f8cd531e47e0c6bef693fc49a7c1d2cdb18c941a Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:29:35 +0800 Subject: [PATCH 5/7] [doc] fix doc test --- docs/source/en/advanced_tutorials/meet_gemini.md | 2 ++ docs/source/zh-Hans/advanced_tutorials/meet_gemini.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index cab76dc9f6f7..64b1ee730fda 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -91,3 +91,5 @@ The important duty of MSC is to adjust the tensor layout position. For example, In the warmup stage, since we haven't finished a complete iteration yet, we don't know actual memory occupation. At this time, we limit the upper bound of memory usage of the model data. For example, only 30% of the GPU memory can be used. This ensures that we can successfully complete the warmup state. 
In the non-warmup stage, we need to use the memory information of non-model data collected in the warm-up stage to reserve the peak memory required by the computing device for the next Period, which requires us to move some model tensors. In order to avoid frequent replacement of the same tensor in and out of the CPU-GPU, causing a phenomenon similar to [cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science)). Using the iterative characteristics of DNN training, we design the OPT cache swap out strategy. Specifically, in the warmup stage, we record the sampling time required by each tensor computing device. If we need to expel some HOLD tensors, we will choose the latest tensor needed on this device as the victim. + + diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index d9dccea9b31c..b6f13e756c40 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -50,6 +50,8 @@ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefuleTensorMgr(STM)。 + + 我们利用了深度学习网络训练过程的迭代特性。我们将迭代分为warmup和non-warmup两个阶段,开始时的一个或若干迭代步属于预热阶段,其余的迭代步属于正式阶段。在warmup阶段我们为MSC收集信息,而在non-warmup阶段STM入去MSC收集的信息来移动tensor,以达到最小化CPU-GPU数据移动volume的目的。 From 1c4b08f4d189d25b92a5ec18568da412e625b53c Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:30:12 +0800 Subject: [PATCH 6/7] [doc] fix doc test --- docs/source/zh-Hans/advanced_tutorials/meet_gemini.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index b6f13e756c40..12e2fd7dc776 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ 
b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -50,8 +50,6 @@ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefuleTensorMgr(STM)。 - - 我们利用了深度学习网络训练过程的迭代特性。我们将迭代分为warmup和non-warmup两个阶段,开始时的一个或若干迭代步属于预热阶段,其余的迭代步属于正式阶段。在warmup阶段我们为MSC收集信息,而在non-warmup阶段STM入去MSC收集的信息来移动tensor,以达到最小化CPU-GPU数据移动volume的目的。 @@ -96,3 +94,5 @@ MSC的重要职责是在调整tensor layout位置,比如在上图S2时刻, 在non-warmup阶段,我们需要利用预热阶段采集的非模型数据内存信息,预留出下一个Period在计算设备上需要的峰值内存,这需要我们移动出一些模型张量。 为了避免频繁在CPU-GPU换入换出相同的tensor,引起类似[cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science))的现象。我们利用DNN训练迭代特性,设计了OPT cache换出策略。具体来说,在warmup阶段,我们记录每个tensor被计算设备需要的采样时刻。如果我们需要驱逐一些HOLD tensor,那么我们选择在本设备上最晚被需要的tensor作为受害者。 + + From fcc62a9b2fdd19de246c212b7acf910709763650 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:45:03 +0800 Subject: [PATCH 7/7] [doc] fix doc --- docs/source/en/advanced_tutorials/meet_gemini.md | 2 +- docs/source/zh-Hans/advanced_tutorials/meet_gemini.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 64b1ee730fda..c1c23a355efa 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -9,7 +9,7 @@ When you only have a few GPUs for large model training tasks, **heterogeneous tr ## Usage -At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the feathures of `GeminiPlugin` into training components with `booster`. More instructions please refer to [usage of booster](../basics/booster_api.md). 
+At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the features of `GeminiPlugin` into training components with `booster`. For more instructions on `booster`, please refer to [**usage of booster**](../basics/booster_api.md). ```python from torchvision.models import resnet18 diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index 12e2fd7dc776..341cdd1b442e 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -8,7 +8,7 @@ ## 用法 -目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单:使用booster将`GeminiPlugin`中的特性注入到训练组件中。更多细节请参考[booster使用](../basics/booster_api.md)。 +目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单:使用booster将`GeminiPlugin`中的特性注入到训练组件中。更多`booster`介绍请参考[booster使用](../basics/booster_api.md)。 ```python from torchvision.models import resnet18