From 9baa42c7c10b6f1521778cd180da521391eff4b4 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 19 Apr 2021 11:00:54 -0700 Subject: [PATCH] assert no Z2/Z3 with pipeline and fix some docs links --- deepspeed/runtime/pipe/engine.py | 2 ++ deepspeed/runtime/pipe/module.py | 3 +++ docs/_tutorials/pipeline.md | 6 ------ docs/code-docs/source/optimizers.rst | 23 +++++++++++++---------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 573dccce78a5..1a401a27e36f 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -52,6 +52,8 @@ def __init__(self, *super_args, **super_kwargs): super().__init__(*super_args, **super_kwargs) assert isinstance(self.module, PipelineModule), "model must base PipelineModule" + assert self.zero_optimization_stage() < 2, "ZeRO-2 and ZeRO-3 are incompatible with pipeline parallelism" + # We schedule the all-reduces, so disable it in super().backward() self.enable_backward_allreduce = False assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index 6d24ed469f3a..dcd4be0ea342 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -112,6 +112,9 @@ def forward(self, inputs): x = layer(x) return x + .. note:: + Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3. + Args: layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module. num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided. 
diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 529da7880f94..1751846830ef 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -276,15 +276,9 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim DeepSpeed provides a `LayerSpec` class that delays the construction of modules until the model layers have been partitioned across workers. -<<<<<<< HEAD Then each worker will allocate only the layers it's assigned to. So, compared to the example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to allocate a total of 1x model size on its CPU memory and not 16x. -======= -Then each worker will allocate only the layers it's assigned to. So, continuing the -example from the previous paragraph, a machine with 16 GPUs will need to allocate a -total of 1x model size on its CPU, compared to 16x in the LayerSpec example. ->>>>>>> [squash] Staging zero infinity v1 (#168) Here is an example of the abbreviated AlexNet model, but expressed only with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index 53024d161b3e..04416486d954 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -1,24 +1,27 @@ Optimizers -=================== +========== DeepSpeed offers high-performance implementations of ``Adam`` optimizer on CPU; ``FusedAdam``, ``FusedLamb``, ``OneBitAdam`` optimizers on GPU. Adam (CPU) ----------------------------- +---------- + .. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam + FusedAdam (GPU) ----------------------------- +--------------- + .. autoclass:: deepspeed.ops.adam.FusedAdam + FusedLamb (GPU) ----------------------------- +--------------- + .. autoclass:: deepspeed.ops.lamb.FusedLamb + OneBitAdam (GPU) ----------------------------- -<<<<<<< HEAD -.. 
autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam -======= -.. autoclass:: deepspeed.runtime.fp16.OneBitAdam ->>>>>>> [squash] Staging zero infinity v1 (#168) +---------------- + +.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OnebitAdam