From 9baa42c7c10b6f1521778cd180da521391eff4b4 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 19 Apr 2021 11:00:54 -0700 Subject: [PATCH] assert no Z2/Z3 with pipeline and fix some docs links --- deepspeed/runtime/pipe/engine.py | 2 ++ deepspeed/runtime/pipe/module.py | 3 +++ docs/_tutorials/pipeline.md | 6 ------ docs/code-docs/source/optimizers.rst | 23 +++++++++++++---------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 573dccce78a5..1a401a27e36f 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -52,6 +52,8 @@ def __init__(self, *super_args, **super_kwargs): super().__init__(*super_args, **super_kwargs) assert isinstance(self.module, PipelineModule), "model must base PipelineModule" + assert self.zero_optimization_stage() < 2, "ZeRO-2 and ZeRO-3 are incompatible with pipeline parallelism" + # We schedule the all-reduces, so disable it in super().backward() self.enable_backward_allreduce = False assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index 6d24ed469f3a..dcd4be0ea342 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -112,6 +112,9 @@ def forward(self, inputs): x = layer(x) return x + .. note:: + Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3. + Args: layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module. num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided. 
diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 529da7880f94..1751846830ef 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -276,15 +276,9 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim DeepSpeed provides a `LayerSpec` class that delays the construction of modules until the model layers have been partitioned across workers. -<<<<<<< HEAD Then each worker will allocate only the layers it's assigned to. So, compared to the example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to allocate a total of 1x model size on its CPU memory and not 16x. -======= -Then each worker will allocate only the layers it's assigned to. So, continuing the -example from the previous paragraph, a machine with 16 GPUs will need to allocate a -total of 1x model size on its CPU, compared to 16x in the LayerSpec example. ->>>>>>> [squash] Staging zero infinity v1 (#168) Here is an example of the abbreviated AlexNet model, but expressed only with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index 53024d161b3e..04416486d954 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -1,24 +1,27 @@ Optimizers -=================== +========== DeepSpeed offers high-performance implementations of ``Adam`` optimizer on CPU; ``FusedAdam``, ``FusedLamb``, ``OneBitAdam`` optimizers on GPU. Adam (CPU) ----------------------------- +---------- + .. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam + FusedAdam (GPU) ----------------------------- +--------------- + .. autoclass:: deepspeed.ops.adam.FusedAdam + FusedLamb (GPU) ----------------------------- +--------------- + .. autoclass:: deepspeed.ops.lamb.FusedLamb + OneBitAdam (GPU) ----------------------------- -<<<<<<< HEAD -.. 
autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam -======= -.. autoclass:: deepspeed.runtime.fp16.OneBitAdam ->>>>>>> [squash] Staging zero infinity v1 (#168) +---------------- + +.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OnebitAdam