From d190f1cad62ce54f3d2b075892604a08d35c3078 Mon Sep 17 00:00:00 2001 From: sid Date: Fri, 12 Mar 2021 00:56:58 +0100 Subject: [PATCH 01/78] test sparse self_attn fix --- .../ops/sparse_attention/sparse_self_attention.py | 13 +++++++++++-- deepspeed/runtime/pipe/topology.py | 6 ++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/deepspeed/ops/sparse_attention/sparse_self_attention.py b/deepspeed/ops/sparse_attention/sparse_self_attention.py index 6e7d8905e0a8..b37f1e01c7b1 100644 --- a/deepspeed/ops/sparse_attention/sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/sparse_self_attention.py @@ -24,7 +24,8 @@ def __init__( sparsity_config=SparsityConfig(num_heads=4), key_padding_mask_mode='add', attn_mask_mode='mul', - max_seq_length=2048): + max_seq_length=2048, + mpu=None): """Initialize the sparse self attention layer. Arguments: sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class. @@ -45,13 +46,21 @@ def __init__( # mask modes self.key_padding_mask_mode = key_padding_mask_mode self.attn_mask_mode = attn_mask_mode + self.mpu = mpu ops = dict() def get_layout(self, L): # if layout is never synchronized across GPUs, broadcast the layout from global rank 0 if self._need_layout_synchronization and dist.is_initialized(): - dist.broadcast(self.master_layout, src=0) + if self.mpu is not None: + data_parallel_group = self.mpu.get_data_parallel_group() + src_rank = self.mpu.get_data_parallel_src_rank() + else: + src_rank = 0 + data_parallel_group = None + dist.broadcast(self.master_layout, src=src_rank, + group=data_parallel_group) self._need_layout_synchronization = False if (L % self.sparsity_config.block != 0): diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index b1e11ee0e492..a4a3541c8767 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -426,6 +426,12 @@ def 
get_data_parallel_world_size(self): """ The number of pipelines. """ return self.data_parallel_size + def get_data_parallel_src_rank(self): + """Calculate the global rank corresponding to a local rank zero + in the data parallel group.""" + global_rank = dist.get_rank() + return (global_rank // self.data_parallel_size) * self.data_parallel_size + def get_data_parallel_group(self): """ The group of ranks within the same stage of all pipelines. """ return self.dp_proc_group From 18a26f3f6051aa0622971bd71fa00f35ec006c2f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 11 Mar 2021 17:55:54 -0800 Subject: [PATCH 02/78] [WarmupDecayLR] fix log(0) & 1/log(1) bugs (#772) * fix log(0) & 1/log(1) bugs * simplify Co-authored-by: Jeff Rasley Co-authored-by: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Co-authored-by: Cheng Li --- deepspeed/runtime/lr_schedules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index 515233851a1d..7846da12fdbd 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -706,8 +706,8 @@ def __init__(self, self.min_lrs = self._format_param(self.optimizer, warmup_min_lr, "min_lr") self.max_lrs = self._format_param(self.optimizer, warmup_max_lr, "max_lr") self.delta_lrs = [big - small for big, small in zip(self.max_lrs, self.min_lrs)] - self.warmup_num_steps = warmup_num_steps - self.inverse_log_warm_up = 1.0 / math.log(warmup_num_steps) + self.warmup_num_steps = max(2, warmup_num_steps) + self.inverse_log_warm_up = 1.0 / math.log(self.warmup_num_steps) self.last_batch_iteration = last_batch_iteration def get_lr(self): From 35fd7ccd862adcb93febd546cb5b9fa7cb883d8f Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 12 Mar 2021 08:52:14 -0800 Subject: [PATCH 03/78] bump to v0.3.12 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 
208059121dde..0b9c0199636e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.11 +0.3.12 From 458ff028d929b6abbf7d0955ad4b796dbf9a5156 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 12 Mar 2021 14:53:56 -0800 Subject: [PATCH 04/78] Bug fix: Remove client optimizer param_group list item that does not have 'params' (#827) Co-authored-by: Jeff Rasley --- deepspeed/runtime/engine.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 8a9d5abf173f..f5737c07ea04 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -585,6 +585,12 @@ def _configure_distributed_model(self, model): def _configure_optimizer(self, client_optimizer, model_parameters): if client_optimizer is not None: + client_optimizer.param_groups[:] = [ + pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0 + ] + logger.info( + "Removing param_group that has no 'params'in the client Optimizer") + basic_optimizer = client_optimizer if self.global_rank == 0: logger.info('Using client Optimizer as basic optimizer') From 73d762c8d96525872b8090ffa9b1f5a5e86e4295 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 14 Mar 2021 10:58:06 -0700 Subject: [PATCH 05/78] [doc] pipeline doc typos/improvements (#659) Admin merging for pure-doc PR that does not trigger build. --- docs/_tutorials/pipeline.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index e7730ebe2661..46546066ab1a 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -132,7 +132,7 @@ net = PipelineModule(layers=net.to_layers(), num_stages=2) ``` **Note:** -the `lamda` in the middle of `layers` above is not a `torch.nn.Module` +the `lambda` in the middle of `layers` above is not a `torch.nn.Module` type. 
Any object that implements `__call__()` can be a layer in a `PipelineModule`: this allows for convenient data transformations in the pipeline. @@ -165,7 +165,7 @@ These modifications can be accomplished with a short subclass: class TransformerBlockPipe(TransformerBlock) def forward(self, inputs): hidden, mask = inputs - outputs = super().forward(hidden, mask) + output = super().forward(hidden, mask) return (output, mask) stack = [ TransformerBlockPipe() for _ in range(num_layers) ] ``` @@ -269,17 +269,18 @@ by DeepSpeed: * `partition_method="uniform"` balances the number of layers per stage. ### Memory-Efficient Model Construction -Building a `Sequential` and providing it `PipelineModule` is a convenient way -of specifying a pipeline parallel model. However, this approach encounters -scalability issues for massive models. Starting from a `Sequential` allocates -the model in CPU memory redundantly by every worker. A machine with 16 GPUs -must have as much local CPU memory as 16 times the model size. +Building a `Sequential` container and providing it to a `PipelineModule` is a convenient way +of specifying a pipeline parallel model. However, this approach encounters scalability issues +for massive models because each worker replicates the whole model in CPU memory. +For example, a machine with 16 GPUs must have as much local CPU memory as 16 times the model size. DeepSpeed provides a `LayerSpec` class that delays the construction of -modules until the model layers have been partitioned across workers. Then, -the modules are built on the GPU that owns the layer. +modules until the model layers have been partitioned across workers. +Then each worker will allocate only the layers it's assigned to. So, continuing the +example from the previous paragraph, a machine with 16 GPUs will need to allocate a +total of 1x model size on its CPU, compared to 16x in the LayerSpec example. 
-Here's an example of the abbreviated AlexNet model, but expressed only +Here is an example of the abbreviated AlexNet model, but expressed only with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` simply becomes `LayerSpec(nn.ReLU, inplace=True)`. ```python From 4601885972be96373066662084ce1bf9c49448b8 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Mon, 15 Mar 2021 13:07:20 -0700 Subject: [PATCH 06/78] Samyamr/inference hook fix (#851) * Fix mis-aligned-grad When a parameter is not divisible by world size, the partitioned gradients are mis-aligned due to incorrect padding handling. This PR should fix for that. * Formatting fix * Adding static_scale test back for Z3, and also changing hidden size to be not divisile by world_size * also removing alignment from flat fp16 buffers * Testing for hidden dim alignment * inference hook fix * Update stage3.py * formatting * [bug-fix] move params to gpu if offload params is turned off Co-authored-by: Samyam Rajbhandari Co-authored-by: Shaden Smith Co-authored-by: Jeff Rasley --- .../runtime/zero/partition_parameters.py | 8 +++++-- deepspeed/runtime/zero/stage3.py | 21 +++++++++++++------ tests/unit/test_fp16.py | 13 ++++++------ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 05825fc90688..e6cb9199899a 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -807,8 +807,12 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): if start < param.ds_numel: elements = min(param.ds_numel - start, partition_size) - dest_tensor = partition_buffer.view(-1).narrow(0, 0, elements) + dest_tensor_full_buffer = partition_buffer.view(-1).narrow( + 0, + 0, + partition_size) + dest_tensor = dest_tensor_full_buffer.narrow(0, 0, elements) src_tensor = param.grad.view(-1).narrow(0, start, elements) # 
just copy the grad partition to the buffer @@ -841,7 +845,7 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): # elements)) #print("after partition gradients") - param.grad.data = dest_tensor.data + param.grad.data = dest_tensor_full_buffer.data see_memory_usage("After partitioning gradients", force=False) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index f840de15c57d..99b4916aef3c 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -961,10 +961,9 @@ def _create_fp16_partitions_with_defragmentation(self): #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned( - self.fp16_partitioned_groups[i], - dist.get_world_size(group=self.dp_process_group)).cuda( - torch.cuda.current_device())) + flatten_dense_tensors_aligned(self.fp16_partitioned_groups[i], + 1).cuda( + torch.cuda.current_device())) see_memory_usage( f"After flattening and moving param group {i} to GPU", force=False) @@ -976,10 +975,12 @@ def _create_fp16_partitions_with_defragmentation(self): flat_offset, total_elements) self.fp16_partitioned_groups_flat.append(fp16_partitioned_group_flat) - self._move_to_flat_buffer(self.fp16_partitioned_groups[i], - self.fp16_partitioned_groups_flat[i]) flat_offset += total_elements + # move param to flat buffer for both param offload on/off + self._move_to_flat_buffer(self.fp16_partitioned_groups[i], + self.fp16_partitioned_groups_flat[i]) + see_memory_usage(f"After Flattening param group {i}", force=False) def _create_fp32_partitions(self): @@ -1036,6 +1037,14 @@ def setup_zero_stage3_hooks(self): self.hierarchy = 0 self._register_hooks_recursively(self.module) + #reset step if in inference mode + def _end_of_forward_hook(module, *args): + + if not torch._C.is_grad_enabled(): + self.param_coordinator.reset_step() + + self.module.register_forward_hook(_end_of_forward_hook) + def persistent_parameters(self): 
persistent_params = [] total_persistent_parameters = 0 diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 5012614f97b0..dbd40c322be9 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -347,9 +347,6 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") - if zero_stage == 3: - pytest.skip("skip for now") - config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -371,8 +368,9 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): args = args_from_dict(tmpdir, config_dict) @distributed_test(world_size=2) - def _test_zero_static_scale(args, zero_stage): - hidden_dim = 10 + def _test_zero_static_scale(args, zero_stage, hidden_dim): + #making hidden size not divisible by DP for covering this scenario + hidden_dim = hidden_dim model = SimpleModel(hidden_dim) model, optim, _, _ = deepspeed.initialize(args=args, @@ -393,7 +391,10 @@ def _test_zero_static_scale(args, zero_stage): model.backward(loss) model.step() - _test_zero_static_scale(args=args, zero_stage=zero_stage) + #test when hidden_dim is not aligned with world size + _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9) + #test when hidden_dim is aligned with world size + _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=10) def test_zero_static_scale_deprecated_format(tmpdir): From a75d971bc2f1300c10331ed3b5f6026ecabe1821 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 15 Mar 2021 16:18:27 -0700 Subject: [PATCH 07/78] ZeRO Stage 2: Clear reduced gradients (#856) * Ensure gradients of other partitions are cleared after reduction * Remove redundant code Co-authored-by: Jeff Rasley --- deepspeed/runtime/zero/stage2.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py 
index bdd1de4cbdda..e0ca4f025957 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -37,7 +37,7 @@ def split_half_float_double(tensors): ] buckets = [] for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t is not None and t.type() == dtype] + bucket = [t for t in tensors if t.type() == dtype] if bucket: buckets.append(bucket) return buckets @@ -477,6 +477,8 @@ def independent_gradient_partition_epilogue(self): if self.overlap_comm: torch.cuda.synchronize() + # It is safe to clear previously reduced grads of other partitions + self._clear_previous_reduced_grads() if self.cpu_offload is False: for i, _ in enumerate(self.fp16_groups): @@ -638,6 +640,9 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): param.grad.data = new_grad_tensor.data.view_as(param.grad) self.elements_in_ipg_bucket += param.numel() + + assert param.grad is not None, f"rank {dist.get_rank()} - Invalid to reduce Param {param_id} with None gradient" + self.grads_in_ipg_bucket.append(param.grad) self.params_in_ipg_bucket.append((i, param, param_id)) @@ -965,7 +970,7 @@ def reduce_ipg_grads(self): if not self.is_param_in_current_partition[param_id]: if self.overlap_comm and self.contiguous_gradients is False: - # Clear the previous grads during the next reduction + # Clear grads of other partitions during the next reduction # to avoid clearing them before the reduction is complete. 
if self.previous_reduced_grads is None: self.previous_reduced_grads = [] @@ -1078,16 +1083,18 @@ def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=N return tensor + def _clear_previous_reduced_grads(self): + if self.previous_reduced_grads is not None: + for param in self.previous_reduced_grads: + param.grad = None + self.previous_reduced_grads = None + #if rank is specified do a reduction instead of an allreduce def allreduce_and_copy(self, small_bucket, rank=None, log=None): if self.overlap_comm: torch.cuda.synchronize() - if self.previous_reduced_grads is not None: - # previous_reduced_grads has the previous reduced grads, - # now it is safe to clear. - for param in self.previous_reduced_grads: - param.grad = None - self.previous_reduced_grads = None + # It is safe to clear the previously reduced grads of other partitions + self._clear_previous_reduced_grads() stream = self.reduction_stream else: stream = torch.cuda.current_stream() From 24335d49cec0be439ba4f311c6e7e802e834b84d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 16 Mar 2021 09:23:45 -0700 Subject: [PATCH 08/78] [runner/launch] propagate the error (#854) Co-authored-by: Jeff Rasley --- deepspeed/launcher/runner.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 6ce482060358..ac873f4ca3f4 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -359,6 +359,12 @@ def main(args=None): result = subprocess.Popen(cmd, env=env) result.wait() + # In case of failure must propagate the error-condition back to the caller (usually shell). 
The + # actual error and traceback should have been printed in the subprocess, so in order to avoid + # unnecessary noise we just quietly exit here with the same code as the subprocess + if result.returncode > 0: + sys.exit(result.returncode) + if __name__ == "__main__": main() From 547d1c5f8f3f5a3a01bb13b286e7686c5364f9b8 Mon Sep 17 00:00:00 2001 From: brett koonce Date: Tue, 16 Mar 2021 11:27:07 -0500 Subject: [PATCH 09/78] docs: minor spelling tweaks (#858) --- deepspeed/profiling/flops_profiler/profiler.py | 2 +- docs/_pages/features.md | 6 +++--- docs/_tutorials/flops-profiler.md | 18 +++++++++--------- docs/_tutorials/pipeline.md | 2 +- docs/_tutorials/sparse-attention.md | 2 +- docs/_tutorials/zero.md | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index 7e225fc20f2b..be7d772782f2 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -265,7 +265,7 @@ def del_extra_repr(module): "Each module profile is listed after its name in the following order: \nnumber of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency)." ) print( - "Note: \n1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'.\n2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throught.\n" + "Note: \n1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'.\n2. 
Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput.\n" ) print(self.model) diff --git a/docs/_pages/features.md b/docs/_pages/features.md index 08f2bf221672..ba955fd574db 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -37,7 +37,7 @@ and communication- efficient training. DeepSpeed supports a hybrid combination of data, model, and pipeline parallelism and has scaled to over [one trillion parameters using 3D parallelism]({{ site.press_release_v3 }}). Pipeline parallelism can also improve communication efficiency and has -accelerated training by up to 7x on low-banwdith clusters. +accelerated training by up to 7x on low-bandwidth clusters. ## Model Parallelism @@ -256,9 +256,9 @@ This can be enabled by setting the following in the `deepspeed_config` file. ``` -### Timing Activiation Checkpoint Functions +### Timing Activation Checkpoint Functions -When activiation checkpoingint is enabled, profiling the forward and backward time of each checkpoint function can be enabled in the `deepspeed_config` file. +When activation checkpointing is enabled, profiling the forward and backward time of each checkpoint function can be enabled in the `deepspeed_config` file. 
```json { diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index 3ccd8a45929f..39d0015dd4fe 100644 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -37,11 +37,11 @@ Top 3 modules in params at depth 2 are {'Conv2d': '50.69 k', 'Linear': '11.01 k' Top 3 modules in latency at depth 2 are {'Conv2d': '11.37 ms', 'Linear': '5.27 ms', 'AvgPool2d': '5.02 ms'} ------------------------------ Detailed Profile ------------------------------ -Each module profile is listed after its name in the follwing order: +Each module profile is listed after its name in the following order: number of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency). Note: 1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'. -2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throught. +2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput. LeNet5( 61.71 k, 100.00% Params, 439.56 MMACs, 100.00% MACs, 25.7 ms, 100.00% latency, 34.2 GFLOPS, @@ -92,7 +92,7 @@ The DeepSpeed flops profiler can be used with the DeepSpeed runtime or as a stan ### Usage With the DeepSpeed Runtime -When using DeepSpeed for model training, the flops profiler can be configured in the `deepspeed_config` file. No explict API calls are needed to use the profiler. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details. +When using DeepSpeed for model training, the flops profiler can be configured in the `deepspeed_config` file. 
No explicit API calls are needed to use the profiler. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details. #### Example: Megatron-LM @@ -131,11 +131,11 @@ Top 3 modules in params at depth 8 are {'ColumnParallelLinear': '7.35 M', 'RowPa Top 3 modules in latency at depth 8 are {'ColumnParallelLinear': '659.23 us', 'RowParallelLinear': '587.94 us', 'FusedScaleMaskSoftmax': '370.98 us'} ------------------------------ Detailed Profile ------------------------------ -Each module profile is listed after its name in the follwing order: +Each module profile is listed after its name in the following order: number of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency). Note: 1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'. -2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throught. +2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput. 
DistributedDataParallel( 38.89 M, 100.00% Params, 314.61 GMACs, 100.00% MACs, 33.81 ms, 100.00% latency, 18.61 TFLOPS, @@ -235,11 +235,11 @@ Top 3 modules in params at depth 2 are {'Linear': '58.63 M', 'Conv2d': '2.47 M', Top 3 modules in latency at depth 2 are {'Conv2d': '13.96 ms', 'Linear': '6.23 ms', 'ReLU': '730.75 us'} ------------------------------ Detailed Profile ------------------------------ -Each module profile is listed after its name in the follwing order: +Each module profile is listed after its name in the following order: number of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency). Note: 1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'. -2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throught. +2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput. 
AlexNet( 61.1 M, 100.00% Params, 183.18 GMACs, 100.00% MACs, 22.13 ms, 100.00% latency, 16.56 TFLOPS, @@ -335,11 +335,11 @@ Top 3 modules in params at depth 7 are {'Linear': '28.35 M', 'LayerNorm': '18.43 Top 3 modules in latency at depth 7 are {'Linear': '153.7 ms', 'LayerNorm': '4.74 ms', 'Dropout': '597.95 us'} ------------------------------ Detailed Profile ------------------------------ -Each module profile is listed after its name in the follwing order: +Each module profile is listed after its name in the following order: number of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency). Note: 1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'. -2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throught. +2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput. BertForSequenceClassification( 109.48 M, 100.00% Params, 43.5 GMACs, 100.00% MACs, 393.7 ms, 100.00% latency, 220.97 GFLOPS, diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 46546066ab1a..70790c82b301 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -230,7 +230,7 @@ pipeline. Each worker should load micro-batches of size a total of `engine.gradient_accumulation_steps()` times per `train_batch()`. **Watch out!** -The pipeline engine *pulls* data from an iteratior instead of iterating over +The pipeline engine *pulls* data from an iterator instead of iterating over it. It's critical that the data stream does not empty in the middle of a training batch. 
Each invocation of `train_batch()` will pull a total of `engine.gradient_accumulation_steps()` micro-batches of data from diff --git a/docs/_tutorials/sparse-attention.md b/docs/_tutorials/sparse-attention.md index 915fd524e1fd..184d3e621e2d 100644 --- a/docs/_tutorials/sparse-attention.md +++ b/docs/_tutorials/sparse-attention.md @@ -154,7 +154,7 @@ This module, is the parent class for all sparsity structures and contains the sh * `block`: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such square blocks; `Block X Block`. * `different_layout_per_head`: a boolean determining if each head should be assigned a different sparsity layout; default is false and this will be satisfied based on availability. -* **Fixed** (FixedSparistyConfig): +* **Fixed** (FixedSparsityConfig): This structure is based on [Generative Modeling with Sparse Transformers](https://arxiv.org/abs/1904.10509) from OpenAI, in which local and global attention is fixed by the given parameters: * `num_local_blocks`: an integer determining the number of blocks in local attention window. As it is illustrated in the below figure (adapted from original paper), tokens in a local window, attend to all tokens local to them. In the case of autoregressive model, as in the figure, tokens attend to tokens appearing before them in the local window. And in the case of Masked model such as BERT, attention is bidirectional. * `num_global_blocks`: an integer determining how many consecutive blocks in a local window is used as the representative of the window for global attention; illustrated in the figure below as well. 
diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index e594427f460f..ad6e222707e0 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -3,7 +3,7 @@ title: "Zero Redundancy Optimizer (ZeRO)" --- If you have not done so already, we advise that you read the DeepSpeed tutorials on [Getting Started](/getting-started/) and [Megatron-LM GPT-2](/tutorials/megatron/) before stepping through this tutorial. -In this tutorial, we will apply the ZeRO optimizer to the [Megatron-LM GPT-2](https://github.com/NVIDIA/Megatron-LM) model. ZeRO is a powerful set of memory optimization techniques that enable effective FP16 training of large models with trillons of parameters, such as [GPT-2](https://openai.com/blog/better-language-models/) and [Turing-NLG 17B](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/). Compared to the alternative model parallelism approaches for training large models, a key appeal of ZeRO is that no model code modifications are required. As this tutorial will demonstrate, *using ZeRO in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration JSON*. No code changes are needed. +In this tutorial, we will apply the ZeRO optimizer to the [Megatron-LM GPT-2](https://github.com/NVIDIA/Megatron-LM) model. ZeRO is a powerful set of memory optimization techniques that enable effective FP16 training of large models with trillions of parameters, such as [GPT-2](https://openai.com/blog/better-language-models/) and [Turing-NLG 17B](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/). Compared to the alternative model parallelism approaches for training large models, a key appeal of ZeRO is that no model code modifications are required. 
As this tutorial will demonstrate, *using ZeRO in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration JSON*. No code changes are needed. ## ZeRO Overview ZeRO leverages the aggregate computation and memory resources of data parallelism to reduce the memory and compute requirements of each device (GPU) used for model training. ZeRO reduces the memory consumption of each GPU by partitioning the various model training states (weights, gradients, and optimizer states) across the available devices (GPUs and CPUs) in the distributed training hardware. Concretely, ZeRO is being implemented as incremental stages of optimizations, where optimizations in earlier stages are available in the later stages. To deep dive into ZeRO, please see our [paper](https://arxiv.org/abs/1910.02054v3). @@ -226,7 +226,7 @@ class ParallelTransformerLayer(MegatronModule): #### Allocating Massive Megatron-LM Models -We make two further changes to model initalization in order to support models +We make two further changes to model initialization in order to support models that exceed *local* system memory, but not *total* system memory. 1. Allocate the model in a memory-scalable fashion. 
The model parameters will From 871f3048ad0d05e79f8835849b7a00656a14b3f4 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 16 Mar 2021 12:38:08 -0700 Subject: [PATCH 10/78] Allow args to be optional in deepspeed.initialize (#825) --- deepspeed/__init__.py | 12 ++++-- deepspeed/runtime/engine.py | 20 +++++----- tests/unit/test_config.py | 80 +++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 14 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index c4c2acf0b0d7..3401f121bca0 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -49,8 +49,8 @@ def _parse_version(version_str): sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler -def initialize(args, - model, +def initialize(args=None, + model=None, optimizer=None, model_parameters=None, training_data=None, @@ -62,8 +62,7 @@ def initialize(args, """Initialize the DeepSpeed Engine. Arguments: - args: a dictionary containing local_rank and deepspeed_config - file location + args: an object containing local_rank and deepspeed_config fields. This is optional if `config_params` is passed. model: Required: nn.module class before apply any wrappers @@ -88,6 +87,9 @@ def initialize(args, mini-batch of Tensor(s). Used when using batched loading from a map-style dataset. + config_params: Optional: Instead of requiring args.deepspeed_config you can pass your deepspeed config + as a dictionary instead. 
+ Returns: A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler`` @@ -108,6 +110,8 @@ def initialize(args, __git_branch__), ranks=[0]) + assert model is not None, "deepspeed.initialize requires a model" + if not isinstance(model, PipelineModule): engine = DeepSpeedEngine(args=args, model=model, diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index f5737c07ea04..e11e2c1d7afc 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -495,9 +495,10 @@ def _configure_with_arguments(self, args, mpu): # After the distributed backend is initialized we are guaranteed the LOCAL_RANK # environment variable is set. We must align args.local_rank to this value for # backwards compatability with scripts relying on [args|self].local_rank containing - # the correct local rank info. - args.local_rank = int(os.environ['LOCAL_RANK']) - self.local_rank = args.local_rank + # the correct local rank info. _do_args_sanity_check will ensure this is the case. + self.local_rank = int(os.environ['LOCAL_RANK']) + if hasattr(args, 'local_rank'): + args.local_rank = self.local_rank config_file = args.deepspeed_config if hasattr(args, 'deepspeed_config') else None @@ -513,15 +514,14 @@ def _do_args_sanity_check(self, args): assert args.deepspeed_config is None, "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" args.deepspeed_config = args.deepscale_config - local_rank_err = "DeepSpeed requires a command line parameter of --local_rank [int] and/or setting the LOCAL_RANK environment variable." - if hasattr(args, 'local_rank'): - assert type(args.local_rank) == int, local_rank_err - if "LOCAL_RANK" in os.environ and args.local_rank >= 0: - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + assert "LOCAL_RANK" in os.environ, "DeepSpeed requires the LOCAL_RANK environment variable, it is set by the deepspeed launcher, " \ + "deepspeed.init_distributed, or the torch.distributed launcher. 
If using a different launcher please ensure LOCAL_RANK is set prior to initializing deepspeed." + if hasattr(args, 'local_rank') and args.local_rank != None: + assert isinstance(args.local_rank, int), f"args.local_rank of {args.local_rank} is an unknown type {type(args.local_rank)}" + if args.local_rank >= 0: + env_local_rank = int(os.environ.get("LOCAL_RANK")) assert env_local_rank == args.local_rank, \ f"Mismatch in local rank setting, args.local_rank={args.local_rank} but env['LOCAL_RANK']={env_local_rank}." - else: - assert "LOCAL_RANK" in os.environ, local_rank_err if self.config_params is None: assert hasattr(args, 'deepspeed_config') and args.deepspeed_config is not None, \ diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 4cabefe71a33..7de3a40fabeb 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -226,3 +226,83 @@ def _helper(): model.step() _helper() + + +def test_none_args(tmpdir): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + @distributed_test(world_size=1) + def _helper(): + model = SimpleModel(hidden_dim=10) + model, _, _, _ = deepspeed.initialize(args=None, model=model, config_params=config_dict) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=10, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + + _helper() + + +def test_no_args(tmpdir): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + @distributed_test(world_size=1) + def _helper(): + model = SimpleModel(hidden_dim=10) + model, _, _, _ = deepspeed.initialize(model=model, config_params=config_dict) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=10, + device=model.device) + for n, batch in enumerate(data_loader): + loss = 
model(batch[0], batch[1]) + + _helper() + + +def test_no_model(tmpdir): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + @distributed_test(world_size=1) + def _helper(): + model = SimpleModel(hidden_dim=10) + with pytest.raises(AssertionError): + model, _, _, _ = deepspeed.initialize(model=None, config_params=config_dict) + + with pytest.raises(AssertionError): + model, _, _, _ = deepspeed.initialize(model, config_params=config_dict) From fa87a73a8a3bead24ad9ea52090646fa620d74e8 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 16 Mar 2021 13:06:39 -0700 Subject: [PATCH 11/78] Fix ZeRO3 save_checkpoint (#857) Co-authored-by: Jeff Rasley --- deepspeed/runtime/zero/stage3.py | 12 ++++----- tests/unit/test_checkpointing.py | 44 +++++++++++++++++--------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 99b4916aef3c..ea4653578616 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -2269,7 +2269,7 @@ def _prepare_fp32_grad_for_sub_group(self, sub_group_id): assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[sub_group_id].numel(), \ "averaged gradients have different number of elements that partition size {} {} {} {}".format( - single_grad_partition.numel(), self.partition_size[sub_group_id], sub_group_id, partition_id) + single_grad_partition.numel(), self.fp32_partitioned_groups_flat[sub_group_id].numel(), sub_group_id, partition_id) self.fp32_partitioned_groups_flat[sub_group_id].grad = single_grad_partition @@ -2638,14 +2638,12 @@ def get_groups_without_padding(self, groups_with_padding): def _set_fp32_optimizer_param_groups(self): for sub_group_id, _ in enumerate(self.fp16_groups): param_group_id = self.sub_group_to_group_id[sub_group_id] - self.optimizer.param_groups[param_group_id]['params'] = [ - 
self.fp32_partitioned_groups_flat[sub_group_id] - ] + self.optimizer.param_groups[param_group_id]['params'].append( + self.fp32_partitioned_groups_flat[sub_group_id]) def _clear_fp32_optimizer_param_groups(self): - for sub_group_id, _ in enumerate(self.fp16_groups): - param_group_id = self.sub_group_to_group_id[sub_group_id] - self.optimizer.param_groups[param_group_id]['params'] = [] + for param_group in self.optimizer.param_groups: + param_group['params'] = [] def _rigid_state_dict(self): state_dict = {} diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py index 0fbe354933c4..765c44c8e551 100755 --- a/tests/unit/test_checkpointing.py +++ b/tests/unit/test_checkpointing.py @@ -47,7 +47,7 @@ def compare_model_states(saved_model, loaded_model, compare_optimizer=True): if FP16_DeepSpeedZeroOptimizer_Stage3 is not None and isinstance( saved_model.optimizer, FP16_DeepSpeedZeroOptimizer_Stage3): - for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat): + for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat): assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer): @@ -303,12 +303,13 @@ def _test_checkpoint_fused_optimizer(args, 'deepspeed_adam'), (3, False, - 'Adam')]) + 'Adam'), + (3, + True, + 'deepspeed_adam')]) def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") - if zero_stage == 3: - pytest.skip('Skip checkpointing tests for ZeRO3') config_dict = { "train_batch_size": 2, @@ -324,8 +325,10 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_opt } }, "fp16": { - "enabled": True + "enabled": True, + "initial_scale_power": 8 }, + 
"wall_clock_breakdown": True, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload @@ -340,9 +343,7 @@ def _test_checkpoint_zero_optimizer(args, hidden_dim, load_optimizer_states): if zero_stage == 3: - global FP16_DeepSpeedZeroOptimizer_Stage3 - from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3 - with deepspeed.ScatteredParameters(zero_modules=True): + with deepspeed.zero.Init(): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @@ -371,15 +372,16 @@ def _test_checkpoint_zero_optimizer(args, 'deepspeed_adam'), (3, False, - 'Adam')]) + 'Adam'), + (3, + True, + 'deepspeed_adam')]) def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") - if zero_stage == 3: - pytest.skip('Skip checkpointing tests for ZeRO3') config_dict = { "train_batch_size": 2, @@ -413,7 +415,7 @@ def _test_checkpoint_zero_no_optimizer(args, if zero_stage == 3: global FP16_DeepSpeedZeroOptimizer_Stage3 from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3 - with deepspeed.ScatteredParameters(zero_modules=True): + with deepspeed.zero.Init(): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @@ -445,12 +447,13 @@ def _test_checkpoint_zero_no_optimizer(args, 'deepspeed_adam'), (3, False, - 'Adam')]) + 'Adam'), + (3, + True, + 'deepspeed_adam')]) def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") - if zero_stage == 3: - pytest.skip('Skip checkpointing tests for ZeRO3') config_dict = { "train_batch_size": 2, @@ -493,7 +496,7 
@@ def _test_checkpoint_lr_scheduler(args, if zero_stage == 3: global FP16_DeepSpeedZeroOptimizer_Stage3 from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3 - with deepspeed.ScatteredParameters(zero_modules=True): + with deepspeed.zero.Init(): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @@ -526,14 +529,15 @@ def _test_checkpoint_lr_scheduler(args, (2, True, 'deepspeed_adam'), + (3, + False, + 'Adam'), (3, True, - 'Adam')]) + 'deepspeed_adam')]) def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") - if zero_stage == 3: - pytest.skip('Skip checkpointing tests for ZeRO3') config_dict = { "train_batch_size": 2, @@ -570,7 +574,7 @@ def _test_checkpoint_no_lr_scheduler(args, load_optimizer_states, load_lr_scheduler_states): if zero_stage == 3: - with deepspeed.ScatteredParameters(zero_modules=True): + with deepspeed.zero.Init(): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] From 7bcd72a2789a647ff45e03081d6ade721947524b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 16 Mar 2021 13:07:32 -0700 Subject: [PATCH 12/78] Make config objects json serializable (#862) Co-authored-by: Jeff Rasley --- deepspeed/profiling/config.py | 10 ++++++++-- .../runtime/activation_checkpointing/config.py | 11 ++--------- deepspeed/runtime/config_utils.py | 13 ++++++++++++- deepspeed/runtime/zero/config.py | 15 ++------------- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index 0e389baba18b..807802670654 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -3,12 +3,15 @@ Licensed under the MIT 
license. """ -from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject from deepspeed.profiling.constants import * -class DeepSpeedFlopsProfilerConfig(object): +class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): def __init__(self, param_dict): + """ + docstring + """ super(DeepSpeedFlopsProfilerConfig, self).__init__() self.enabled = None @@ -24,6 +27,9 @@ def __init__(self, param_dict): self._initialize(flops_profiler_dict) def _initialize(self, flops_profiler_dict): + """ + docstring + """ self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT) diff --git a/deepspeed/runtime/activation_checkpointing/config.py b/deepspeed/runtime/activation_checkpointing/config.py index 30ac5157f843..19e904980da7 100755 --- a/deepspeed/runtime/activation_checkpointing/config.py +++ b/deepspeed/runtime/activation_checkpointing/config.py @@ -3,7 +3,7 @@ Licensed under the MIT license. 
""" -from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject ######################################### # DeepSpeed Activation Checkpointing @@ -56,7 +56,7 @@ } -class DeepSpeedActivationCheckpointingConfig(object): +class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject): def __init__(self, param_dict): super(DeepSpeedActivationCheckpointingConfig, self).__init__() @@ -74,13 +74,6 @@ def __init__(self, param_dict): self._initialize(act_chkpt_config_dict) - """ - For json serialization - """ - - def repr(self): - return self.__dict__ - def _initialize(self, act_chkpt_config_dict): self.partition_activations = get_scalar_param( act_chkpt_config_dict, diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 37f35692369b..62782852a3d2 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -5,10 +5,21 @@ """ Collection of DeepSpeed configuration utilities """ - +import json from collections import Counter +class DeepSpeedConfigObject(object): + """ + For json serialization + """ + def repr(self): + return self.__dict__ + + def __repr__(self): + return json.dumps(self.__dict__, sort_keys=True, indent=4) + + def get_scalar_param(param_dict, param_name, param_default_value): return param_dict.get(param_name, param_default_value) diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 63a0e4292bd2..ac61a9dd52b3 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -3,13 +3,12 @@ Licensed under the MIT license. 
""" -from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject from deepspeed.utils import logger from deepspeed.runtime.zero.constants import * -import json -class DeepSpeedZeroConfig(object): +class DeepSpeedZeroConfig(DeepSpeedConfigObject): def __init__(self, param_dict): super(DeepSpeedZeroConfig, self).__init__() @@ -66,16 +65,6 @@ def read_zero_config_deprecated(self, param_dict): .format(ZERO_FORMAT)) return zero_config_dict - """ - For json serialization - """ - - def repr(self): - return self.__dict__ - - def __repr__(self): - return json.dumps(self.__dict__, sort_keys=True, indent=4) - def _initialize(self, zero_config_dict): self.stage = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_STAGE, From 12a53b43833b7bea279a205e313f2bd3f0cdfd99 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 16 Mar 2021 15:22:55 -0700 Subject: [PATCH 13/78] bump version 0.3.13 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 0b9c0199636e..e4737652ca5a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.12 +0.3.13 From 68c8481bcf63f6fa8481c631eb63fecc5173ee89 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 16 Mar 2021 16:27:20 -0700 Subject: [PATCH 14/78] 1-bit Adam v2 (#817) Authors: @awan-10 @conglongli @samyam @jeffra What's new: NCCL-based implementation which provides better performance and usability compared to the MPI-based implementation. Add support to momentum masks for those parameters with constant zero gradients during training. Bug fixes (e.g., #813). * NCCL-based 1-bit Adam + Code Refactor for Comm. Backends (#594) * NCCL based 1-bit Implementation + Refactor to add communication backends (#593) * add nccl 1-bit optim. * temporary commit to save stuff. * Use dist collectives instead of mpi routines. * remove old code for comm. * Fix bugs. still does not work. 
* modify to test the nccl side code path * Initial gather impl. Works intra-node. * Updates to comm. phase 2. nccl comm. passed the tests. * refactor code to introduce nccl/mpi as backends for onebit adam. * Refactor updates to test/engine. * Fix compile/runtime errors. * simplify support for nccl/mpi backends. * Add missing file * Add compression backend in constructor. Revert later. * modify test with some perf counting. * Implement a true non-blocking gather for nccl side. * Revert "Add compression backend in constructor. Revert later." This reverts commit df8c40d3105e9f2542a8aa6619e80d675a09753f. * improve the 1-bit adam test. * Refactor comm. and compression backend in 1-bit adam. * Fix the test. * Fix runtime errors and typos in nccl backend * fix mpi backend. modify tests. * modify nccl perf test. * fix mpi side errors. * Add an mpi perf test * Sync DSE. * Remove old collectives file. * Undo a typo. * Graceful failure for torch versions that don't support nccl pt2pt. * Revert "Merge branch 'master' into staging-1bit-nccl-v2" This reverts commit 78400850703b4b2d84f11b73c109f56919e748ea, reversing changes made to a6dba72aeafad63661dfe566d3accd03d00be78c. * Revert "Revert "Merge branch 'master' into staging-1bit-nccl-v2"" This reverts commit 6dbdd9858bafef4d340c089fdc0e3ddde3706f47. * comm optimization + 1-bit lamb * Saving/debugging commit. * finalizing 1-bit lamb * finalizing 1-bit lamb * add momentum mask and chkpt handling for 1-bit adam * Cleanup and modify nccl test to be runnable with deepspeed launcher. * Fix format. * fix formatting again. * make test runnable without mpi4py * Add dist.alltoall and dist.allgather instead of custom functions. * remove debug prints. 
* formatting and renaming * renaming * renaming * add unit test, fix existing tests * skip unit test when torch < 1.8 * revert 1-bit lamb * flatten momentum when dimension is more than 1 * add warning message for 1-bit adam under fp32 * improve version check * add fp32 test * 1-bit adam doc * fix file name * doc fix * torch 1.8 is released * doc fix * fix tests * update news * add doc for momentum mask * fix checkpoing handling, add unit test * checkpoint handling doc * doc final cleanup * bump dates * update tests * url change * doc fix * fix test * doc update Co-authored-by: Ammar Ahmad Awan Co-authored-by: Jeff Rasley --- README.md | 1 + deepspeed/runtime/comm/__init__.py | 0 deepspeed/runtime/comm/mpi.py | 290 ++++++++++++++ deepspeed/runtime/comm/nccl.py | 178 +++++++++ deepspeed/runtime/compression/__init__.py | 0 deepspeed/runtime/compression/cupy.py | 24 ++ deepspeed/runtime/custom_collectives.py | 154 -------- deepspeed/runtime/engine.py | 6 +- deepspeed/runtime/fp16/onebit/__init__.py | 0 .../fp16/{onebit_adam.py => onebit/adam.py} | 253 +++++------- docs/_pages/config-json.md | 13 +- docs/_tutorials/onebit-adam.md | 98 +++-- docs/code-docs/source/optimizers.rst | 2 +- docs/index.md | 1 + .../test_mpi_backend.py} | 46 ++- tests/onebit/test_mpi_perf.py | 74 ++++ .../test_nccl_backend.py} | 64 +-- tests/onebit/test_nccl_perf.py | 94 +++++ tests/onebitadam/test_server_error.py | 87 ----- tests/unit/test_onebit.py | 368 ++++++++++++++++++ 20 files changed, 1263 insertions(+), 490 deletions(-) create mode 100644 deepspeed/runtime/comm/__init__.py create mode 100644 deepspeed/runtime/comm/mpi.py create mode 100644 deepspeed/runtime/comm/nccl.py create mode 100644 deepspeed/runtime/compression/__init__.py create mode 100644 deepspeed/runtime/compression/cupy.py delete mode 100644 deepspeed/runtime/custom_collectives.py create mode 100644 deepspeed/runtime/fp16/onebit/__init__.py rename deepspeed/runtime/fp16/{onebit_adam.py => onebit/adam.py} (55%) rename 
tests/{onebitadam/test_com_reduce_cuda.py => onebit/test_mpi_backend.py} (65%) create mode 100644 tests/onebit/test_mpi_perf.py rename tests/{onebitadam/test_com_reduce_host.py => onebit/test_nccl_backend.py} (59%) create mode 100644 tests/onebit/test_nccl_perf.py delete mode 100644 tests/onebitadam/test_server_error.py create mode 100644 tests/unit/test_onebit.py diff --git a/README.md b/README.md index 768cfc50c4dd..da8bccc383d4 100755 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News +* [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) diff --git a/deepspeed/runtime/comm/__init__.py b/deepspeed/runtime/comm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py new file mode 100644 index 000000000000..9e112bccc71d --- /dev/null +++ b/deepspeed/runtime/comm/mpi.py @@ -0,0 +1,290 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import torch +import cupy +import time +import numpy as np +from mpi4py import MPI + +from deepspeed.runtime.compression.cupy import CupyBackend + + +class MpiBackend(object): + def __init__(self, cuda_aware): + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + self.cuda_aware = cuda_aware + self.compression_backend = CupyBackend() + + def my_igather(self, rank, size, comm, sendbuf, 
recbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(comm.Irecv(recbuf[idx], source=idx)) + else: + recbuf[rank] = sendbuf + else: + req.append(comm.Isend(sendbuf, dest=root)) + return req + + def gather_cuda(self, + rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + # We do in-place operations on cupy buffers so we do not return any buffers + requests = [] + for idx in range(world_size): + req_sign = self.my_igather(rank, + world_size, + comm, + cupy_sign_list_packed[idx], + cupy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = self.my_igather(rank, + world_size, + comm, + cupy_worker_scale, + cupy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + def gather_host(self, + rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + + # In-place operations are not possible for newly created cupy arrays + # so we need to return the new buffers + numpy_recvbuf_sign = np.zeros([world_size, + cupy_sign_list_packed[rank].size], + dtype=cupy_sign_list_packed[0].dtype) + numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + + # 1. convert from cupy to numpy + numpy_sign_list_packed = cupy_sign_list_packed + + for idx in range(world_size): + numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx]) + + numpy_worker_scale = cupy.asnumpy(cupy_worker_scale) + numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale) + + cupy.cuda.get_current_stream().synchronize() + + # 2. 
use numpy buffers for communication + requests = [] + + for idx in range(world_size): + req_sign = self.my_igather(rank, + world_size, + comm, + numpy_sign_list_packed[idx], + numpy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = self.my_igather(rank, + world_size, + comm, + numpy_worker_scale, + numpy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + # 3. Convert back from numpy to cupy + cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign) + for idx in range(world_size): + cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx]) + + cupy_worker_scale = cupy.asarray(numpy_worker_scale) + cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale) + cupy.cuda.get_current_stream().synchronize() + + return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale + + def allgather_cuda(self, + comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) + comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) + + def allgather_host(self, + comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + + # 1. Convert cupy to numpy + numpy_recvbuf_sign_server = np.zeros( + [comm.Get_size(), + cupy_server_sign_packed.size], + dtype=cupy_server_sign_packed.dtype) + numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), + 1], + dtype=cupy_server_scale.dtype) + + numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) + numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) + numpy_server_scale = cupy.asnumpy(cupy_server_scale) + numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + # 2. 
Communicate numpy buffers + comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server) + comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server) + comm.Barrier() + + # 3. Convert numpy back to cupy + cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed) + cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server) + cupy_server_scale = cupy.asarray(numpy_server_scale) + cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server + + def compressed_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + all_start_time = time.time() + original_shape = buffer_m.size() + if len(original_shape) > 1: + buffer_m = torch.flatten(buffer_m) + original_size = buffer_m.numel() + worker_error_size = worker_error.numel() + cupy.cuda.Device(local_rank).use() + + if original_size != worker_error_size: + empty_tensor = torch.zeros(worker_error_size - original_size, + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + worker_error.set_(buffer_m - worker_scale * + buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + cupy_sign_list_packed = self.compression_backend.compress_by_chunk( + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), + self.size) + cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) + + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + + # Communication Phase 1 + gather_start = time.time() + if self.cuda_aware: + self.gather_cuda(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + 
cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + else: + _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + gather_end = time.time() + + # cupy_sign_list_packed, cupy_worker_scale, worker_scale = None, None, None + cupy_sign_list_packed = None + + compensated_server_m = self.compression_backend.cupy2torch( + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( + 1 / self.size)).sum(0) + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + server_error.set_( + compensated_server_m - server_scale * + compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + cupy_server_scale = self.compression_backend.torch2cupy(server_scale) + + cupy_server_sign_packed = self.compression_backend.compress_by_chunk( + self.compression_backend.torch2cupy( + compensated_server_m.sign_().add_(1).bool()), + 1) + compensated_server_m = None + + cupy_recvbuf_sign_server = cupy.zeros( + [self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_recvbuf_sign.dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + dtype=cupy_recvbuf_scale.dtype) + # cupy_recvbuf_sign, cupy_recvbuf_scale = None, None + cupy_recvbuf_sign = None + + # Communication Phase 2 + if self.cuda_aware: + self.allgather_cuda(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + else: + _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + + # cupy_server_sign_packed, cupy_server_scale, server_scale = None, None, None + 
cupy_server_sign_packed = None + + buffer_m.data.copy_( + self.compression_backend.cupy2torch( + (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch( + cupy_recvbuf_scale_server)).flatten().data) + if original_size != worker_error_size: + buffer_m = buffer_m[0:original_size] + if len(original_shape) > 1: + buffer_m = buffer_m.reshape(original_shape) + + # cupy_recvbuf_sign_server, cupy_recvbuf_scale_server = None, None + + return buffer_m diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py new file mode 100644 index 000000000000..0ac2646bd0d7 --- /dev/null +++ b/deepspeed/runtime/comm/nccl.py @@ -0,0 +1,178 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import torch +import torch.distributed as dist +import time +import cupy +import numpy as np + +from deepspeed.runtime.compression.cupy import CupyBackend + + +class NcclBackend(object): + def __init__(self): + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + self.rank = dist.get_rank(group=self.world_group) + self.size = dist.get_world_size(group=self.world_group) + self.compression_backend = CupyBackend() + + def my_igather(self, rank, size, group, sendbuf, recvbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(dist.irecv(recvbuf[idx], src=idx, group=group)) + else: + recvbuf[rank] = sendbuf + else: + req.append(dist.isend(sendbuf, group=group, dst=root)) + return req + + def my_gather(self, rank, size, group, sendbuf, recvbuf, root): + if rank == root: + for idx in range(size): + if idx != rank: + dist.recv(recvbuf[idx], src=idx, group=group) + else: + recvbuf[rank] = sendbuf + else: + dist.send(sendbuf, group=group, dst=root) + + def compressed_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + # all_start_time = time.time() + original_shape = buffer_m.size() + 
if len(original_shape) > 1: + buffer_m = torch.flatten(buffer_m) + original_size = buffer_m.numel() + worker_error_size = worker_error.numel() + cupy.cuda.Device(local_rank).use() + + if original_size != worker_error_size: + empty_tensor = torch.zeros(worker_error_size - original_size, + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + worker_error.set_(buffer_m - worker_scale * + buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + cupy_sign_list_packed = self.compression_backend.compress_by_chunk( + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), + self.size) + cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) + + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + + sign_list_packed = [ + self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) + for idx in range(self.size) + ] + + # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) + recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) + #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) + recvbuf_scale = [ + torch.zeros(1, + dtype=worker_scale.dtype, + device=torch.device(local_rank)) for i in range(self.size) + ] + + # communication phase 1 + # gather_start = time.time() + # Alltoall for sign + dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed)) + # Allgather for scale + dist.all_gather(recvbuf_scale, worker_scale) + + # gather_end = time.time() + + # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None + cupy_sign_list_packed = None + + cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign) + #cupy_recvbuf_scale = 
self.compression_backend.torch2cupy(torch.stack(recvbuf_scale)) + + compensated_server_m = self.compression_backend.cupy2torch( + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + server_error.set_( + compensated_server_m - server_scale * + compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + # cupy_server_scale = self.compression_backend.torch2cupy(server_scale) + + cupy_server_sign_packed = self.compression_backend.compress_by_chunk( + self.compression_backend.torch2cupy( + compensated_server_m.sign_().add_(1).bool()), + 1) + compensated_server_m = None + + cupy_recvbuf_sign_server = cupy.zeros( + [self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_recvbuf_sign.dtype) + # cupy_recvbuf_sign, recvbuf_sign = None, None + cupy_recvbuf_sign = None + + server_sign_packed = [ + self.compression_backend.cupy2torch(cupy_server_sign_packed[0]) + ] + recvbuf_sign_server = [ + self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) + for idx in range(self.size) + ] + + # server_scale = self.compression_backend.cupy2torch(cupy_server_scale) + cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + dtype=cupy_worker_scale.dtype) + # cupy_recvbuf_scale, recvbuf_scale = None, None + + recvbuf_scale_server = [ + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) + for idx in range(self.size) + ] + + # Communication Phase 2 + dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) + dist.all_gather(recvbuf_scale_server, server_scale) + + cupy_server_sign_packed = None + + # need to convert from a tensor list to a single tensor + # dist.all_gather only provides a tensor list as the recv/output buffer + recvbuf_sign_server = torch.stack(recvbuf_sign_server) + + 
cupy_recvbuf_sign_server = self.compression_backend.torch2cupy( + recvbuf_sign_server) + + buffer_m.data.copy_( + self.compression_backend.cupy2torch( + (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch( + cupy_recvbuf_scale_server)).flatten().data) + if original_size != worker_error_size: + buffer_m = buffer_m[0:original_size] + if len(original_shape) > 1: + buffer_m = buffer_m.reshape(original_shape) + + return buffer_m diff --git a/deepspeed/runtime/compression/__init__.py b/deepspeed/runtime/compression/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/compression/cupy.py b/deepspeed/runtime/compression/cupy.py new file mode 100644 index 000000000000..68e56c68e9d0 --- /dev/null +++ b/deepspeed/runtime/compression/cupy.py @@ -0,0 +1,24 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import cupy +from torch.utils.dlpack import to_dlpack +from torch.utils.dlpack import from_dlpack + + +class CupyBackend(object): + def __init__(self): + pass + + def torch2cupy(self, tensor): + return cupy.fromDlpack(to_dlpack(tensor)) + + def cupy2torch(self, cupy_tensor): + return from_dlpack(cupy_tensor.toDlpack()) + + def compress_by_chunk(self, cupy_bool_tensor, num_chunks): + packed_sign = cupy.packbits(cupy_bool_tensor) + sign_list_packed = cupy.split(packed_sign, num_chunks) + cupy.cuda.get_current_stream().synchronize() + return sign_list_packed diff --git a/deepspeed/runtime/custom_collectives.py b/deepspeed/runtime/custom_collectives.py deleted file mode 100644 index cb77edcaf60d..000000000000 --- a/deepspeed/runtime/custom_collectives.py +++ /dev/null @@ -1,154 +0,0 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' - -from mpi4py import MPI -import numpy as np -import cupy - - -def my_igather(rank, size, comm, sendbuf, recbuf, root): - req = [] - if rank == root: - for idx in range(size): - if idx != 
rank: - req.append(comm.Irecv(recbuf[idx], source=idx)) - else: - recbuf[rank] = sendbuf - else: - req.append(comm.Isend(sendbuf, dest=root)) - return req - - -def gather_cuda(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale): - # We do in-place operations on cupy buffers so we do not return any buffers - requests = [] - for idx in range(world_size): - req_sign = my_igather(rank, - world_size, - comm, - cupy_sign_list_packed[idx], - cupy_recvbuf_sign, - root=idx) - requests += req_sign - - for idx in range(world_size): - req_scale = my_igather(rank, - world_size, - comm, - cupy_worker_scale, - cupy_recvbuf_scale, - root=idx) - requests += req_scale - - MPI.Request.Waitall(requests) - - -def gather_host(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale): - # In-place operations are not possible for newly created cupy arrays - # so we need to return the new buffers - numpy_recvbuf_sign = np.zeros([world_size, - cupy_sign_list_packed[rank].size], - dtype=cupy_sign_list_packed[0].dtype) - numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) - - # 1. convert from cupy to numpy - numpy_sign_list_packed = cupy_sign_list_packed - - for idx in range(world_size): - numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx]) - - numpy_worker_scale = cupy.asnumpy(cupy_worker_scale) - numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale) - - cupy.cuda.get_current_stream().synchronize() - - # 2. 
use numpy buffers for communication - requests = [] - - for idx in range(world_size): - req_sign = my_igather(rank, - world_size, - comm, - numpy_sign_list_packed[idx], - numpy_recvbuf_sign, - root=idx) - requests += req_sign - - for idx in range(world_size): - req_scale = my_igather(rank, - world_size, - comm, - numpy_worker_scale, - numpy_recvbuf_scale, - root=idx) - requests += req_scale - - MPI.Request.Waitall(requests) - - # 3. Convert back from numpy to cupy - cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign) - for idx in range(world_size): - cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx]) - - cupy_worker_scale = cupy.asarray(numpy_worker_scale) - cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale) - cupy.cuda.get_current_stream().synchronize() - - return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale - - -def allgather_cuda(comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server): - comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) - comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) - - -def allgather_host(comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server): - - # 1. Convert cupy to numpy - numpy_recvbuf_sign_server = np.zeros([comm.Get_size(), - cupy_server_sign_packed.size], - dtype=cupy_server_sign_packed.dtype) - numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), - 1], - dtype=cupy_server_scale.dtype) - - numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) - numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) - numpy_server_scale = cupy.asnumpy(cupy_server_scale) - numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server) - cupy.cuda.get_current_stream().synchronize() - - # 2. 
Communicate numpy buffers - comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server) - comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server) - comm.Barrier() - - # 3. Convert numpy back to cupy - cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed) - cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server) - cupy_server_scale = cupy.asarray(numpy_server_scale) - cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server) - cupy.cuda.get_current_stream().synchronize() - - return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index e11e2c1d7afc..5b3295e99fc8 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -675,8 +675,12 @@ def _configure_basic_optimizer(self, model_parameters): from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + from deepspeed.runtime.fp16.onebit.adam import OnebitAdam optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) + if not self.fp16_enabled(): + logger.warning( + f'Currently the convergence of 1-bit Adam is only verified under FP16' + ) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) diff --git a/deepspeed/runtime/fp16/onebit/__init__.py b/deepspeed/runtime/fp16/onebit/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/fp16/onebit_adam.py b/deepspeed/runtime/fp16/onebit/adam.py similarity index 55% rename from deepspeed/runtime/fp16/onebit_adam.py rename to deepspeed/runtime/fp16/onebit/adam.py index 215bffb0c453..e3417fea9d6f 100644 --- a/deepspeed/runtime/fp16/onebit_adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -6,19 
+6,15 @@ import importlib import numpy as np import time -import cupy -from torch.utils.dlpack import to_dlpack -from torch.utils.dlpack import from_dlpack -from deepspeed.utils.logging import logger +import torch.distributed as dist -from mpi4py import MPI -from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host +from deepspeed.utils.logging import logger class OnebitAdam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. - For usage example please see, TODO DeepSpeed Tutorial - It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) + For usage example please see https://www.deepspeed.ai/tutorials/onebit-adam/ + For technical details please read https://arxiv.org/abs/2102.02888 Arguments: params (iterable): iterable of parameters to optimize or dicts defining @@ -31,8 +27,6 @@ class OnebitAdam(torch.optim.Optimizer): eps (float, optional): term added to the denominator to improve numerical stability. (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) - min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) NOT SUPPORTED in 1-bit Adam! @@ -42,6 +36,7 @@ class OnebitAdam(torch.optim.Optimizer): second moment estimate as in the original paper. (default: False) cuda_aware (boolean, required): Set True if the underlying MPI implementation supports CUDA-Aware communication. (default: False) + comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. 
_On the Convergence of Adam and Beyond: @@ -60,10 +55,12 @@ def __init__(self, weight_decay=0., max_grad_norm=0., amsgrad=False, - cuda_aware=False): + cuda_aware=False, + comm_backend_name='nccl'): if amsgrad: raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') + defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, @@ -72,160 +69,40 @@ def __init__(self, max_grad_norm=max_grad_norm) super(OnebitAdam, self).__init__(params, defaults) - from mpi4py import MPI self.eps_mode = 0 if eps_inside_sqrt else 1 + assert (dist.is_initialized()) - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() self.comm_time = 0.0 self.step_time = 0.0 self.ave_step = 1 self.bk_time = 0.0 - self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + self.deepspeed = deepspeed self.adam_freeze_key = False self.initialize = False self.freeze_step = freeze_step self.cuda_aware = cuda_aware - def torch2cupy(self, tensor): - return cupy.fromDlpack(to_dlpack(tensor)) - - def cupy2torch(self, cupy_tensor): - return from_dlpack(cupy_tensor.toDlpack()) - - def compress_by_chunk(self, cupy_bool_tensor, num_chunks): - packed_sign = cupy.packbits(cupy_bool_tensor) - sign_list_packed = cupy.split(packed_sign, num_chunks) - cupy.cuda.get_current_stream().synchronize() - return sign_list_packed - - def Compressed_Allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - rank, - world_size, - comm, - local_rank): - - all_start_time = time.time() - original_size = buffer_m.numel() - cupy.cuda.Device(local_rank).use() - - if torch.numel(buffer_m) != torch.numel(worker_error): - empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), - device=buffer_m.device) - buffer_m = torch.cat([buffer_m, empty_tensor]) - - buffer_m.add_(worker_error) - worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - sign_buffer_m = buffer_m.sign().add_(1).bool() - sign_buffer_m = 
sign_buffer_m.float() - sign_buffer_m.add_(-0.5).mul_(2.0) - worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) - sign_buffer_m = None - - compensated_buffer_m = buffer_m - compensated_buffer_m.sign_() - compensated_buffer_m = compensated_buffer_m.add_(1).bool() - cupy_worker_scale = self.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) - compensated_buffer_m = None - - cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, - world_size) - cupy_compensated_buffer_m = None - - cupy_recvbuf_sign = cupy.zeros([world_size, - cupy_sign_list_packed[rank].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) - - # Communication Phase 1 - gather_start = time.time() - if self.cuda_aware: - gather_cuda(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - else: - cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - gather_end = time.time() - - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - world_size, - -1) - cupy_recvbuf_sign = None - unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() - cupy_unpacked_sign = None - unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size) - compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) - unpacked_sign = None - - compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - sign_server_m = compensated_server_m.sign().add_(1).bool() - sign_server_m = sign_server_m.float() - sign_server_m.add_(-0.5).mul_(2.0) - server_error.set_(compensated_server_m - server_scale * sign_server_m) - 
sign_server_m = None - - compensated_server_m.sign_() - compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.torch2cupy(server_scale) - cupy_compensated_server_m = self.torch2cupy(compensated_server_m) - compensated_server_m = None - - cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) - - cupy_recvbuf_sign_server = cupy.zeros( - [world_size, - cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale_server = cupy.zeros([world_size, - 1], - dtype=cupy_worker_scale.dtype) - - # Communication Phase 2 - if self.cuda_aware: - allgather_cuda(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) - else: - cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) + self.comm_backend_name = comm_backend_name + + # Empty initializer. Set handle based on the comm backend as follows. + self.comm_backend_handle = None - cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(world_size, - -1) - cupy_recvbuf_sign_server = None + if self.comm_backend_name == 'nccl': + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
+ from deepspeed.runtime.comm.nccl import NcclBackend + self.comm_backend_handle = NcclBackend() - server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) - cupy_server_unpacked_sign = None + elif self.comm_backend_name == 'mpi': + from deepspeed.runtime.comm.mpi import MpiBackend + self.comm_backend_handle = MpiBackend(cuda_aware) - server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.cupy2torch(cupy_recvbuf_scale_server) - buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + self.size = self.comm_backend_handle.size - return buffer_m + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) def step(self, closure=None, grads=None): """Performs a single optimization step. @@ -275,9 +152,7 @@ def step(self, closure=None, grads=None): if grad is None: grad = p.grad.data if grad.is_sparse: - raise RuntimeError( - 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' - ) + raise RuntimeError('1-bit Adam does not support sparse gradients') state = self.state[p] @@ -337,13 +212,24 @@ def step(self, closure=None, grads=None): if self.size > 1: exp_avg.set_( - self.Compressed_Allreduce(exp_avg, - state['worker_error'], - state['server_error'], - self.rank, - self.size, - self.comm, - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce( + exp_avg, + state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) + # Because 1-bit compression cannot represent exact zero, it is required to + # provide a momentum mask for those params that have constant exact zeros in their + # momentums, otherwise the compression error would keep accumulating. + # For example, for BERT pre-training seq 128, bert.embeddings.position_embeddings.weight + # always have exact zeros in its momentum for row 129 to 512, because it only + # learns up to seq length 128 while the model supports up to 512 seq length. 
+ # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) + if 'exp_avg_mask' in group: + if exp_avg.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to( + device=exp_avg.device) + exp_avg.mul_(group['exp_avg_mask']) + if self.initialize: update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) @@ -372,3 +258,52 @@ def step(self, closure=None, grads=None): self.deepspeed.enable_backward_allreduce = False return loss + + def load_state_dict(self, state_dict): + """ + Overrides load_state_dict() to add special handling when loading checkpoints + """ + # Because at different stage exp_avg_mask may change (e.g., + # BERT pre-training seqlen 128 and 512 ), we don't use the exp_avg_mask + # in checkpoints but always use the one user provided in training script. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) + # Thus here we keep the exp_avg_mask unchanged when loading checkpoint + for i, group in enumerate(self.param_groups): + if 'exp_avg_mask' in group: + state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ + 'param_groups'][i]: + state_dict['param_groups'][i].pop('exp_avg_mask') + super().load_state_dict(state_dict) + if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: + if torch.distributed.get_rank() == 0: + print("Checkpoint loaded and 1-bit Adam warmup stage starts/continues.") + if self.adam_freeze_key is True: + self.adam_freeze_key = False + self.deepspeed.enable_backward_allreduce = True + else: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and 1-bit Adam compression stage starts/continues." 
+ ) + if self.adam_freeze_key is False: + self.adam_freeze_key = True + self.deepspeed.enable_backward_allreduce = False + # We reset the compression errors when loading checkpoints for 3 reasons: + # 1) The worker and server error at each GPU are distinct, so in current implementation + # only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. + # If we want to save them correctly we need O(num_gpu*model_size) memory in order to + # gather all the error, which is a very large memory requirement. It's possible to save + # them in a distributed way, but it will make the checkpoint saving/loading much more complicated. + # 2) Even if we are able to save the compression errors correctly, you need to have the + # exact same number of GPUs in order to load them correctly. + # 3) We verified on BERT pre-training that occasionally resetting the compression error + # at checkpoint loading does not affect the convergence. + # However, please avoid frequent checkpoint loading which could break the error + # compensation mechanism thus affect the convergence. + for group in self.param_groups: + for p in group['params']: + if 'worker_error' in self.state[p]: + self.state[p].pop('worker_error') + if 'server_error' in self.state[p]: + self.state[p].pop('server_error') diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 40f31310d57e..9a9554cbd75f 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -60,7 +60,7 @@ The Adam optimizer also supports the following two params keys/values in additio | torch\_adam | Use torch's implementation of adam instead of our fused adam implementation | false | | adam\_w\_mode | Apply L2 regularization (also known as AdamW) | true | - Another example of ***optimizer*** with 1-bit Adam specific parameters is as follows. 
+ Another example of ***optimizer*** with 1-bit Adam ```json "optimizer": { @@ -74,11 +74,20 @@ The Adam optimizer also supports the following two params keys/values in additio "eps": 1e-8, "weight_decay": 3e-7, "freeze_step": 400, - "cuda_aware": true + "cuda_aware": false, + "comm_backend_name": "nccl" } } ``` +The 1-bit Adam optimizer supports the following three params keys/values in addition to the standard Adam (learn more in our [tutorial](/tutorials/onebit-adam/)): + +| "params" key | Description | Default | +| ------------- | --------------------------------------------------------------------------- | ------- | +| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | +| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | +| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | + ### Scheduler Parameters ***scheduler***: [dictionary] diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index c8eee07586aa..1a15000135c9 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -1,7 +1,15 @@ --- -title: "1-bit Adam: Up to 5x less communication volume and up to 2x faster training" +title: "1-bit Adam: Up to 5x less communication volume and up to 3.4x faster training" --- +**Note:** +This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes include: 1) NCCL-based implementation which provides better performance and usability compared to the MPI-based implementation. 2) Add support to momentum masks for those parameters with constant zero gradients during training. 3) Bug fixes. See details below. +{: .notice--info} + +**Watch out!** +1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 
2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. +{: .notice--warning} + In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. To illustrate the benefits and usage of 1-bit Adam optimizer in DeepSpeed, we use the following two training tasks as examples: @@ -13,7 +21,7 @@ For more details on these tasks, please refer to the tutorial posts on [BingBert ## 1. Overview -### Pre-requisites for installing DeepSpeed +### 1.1 Pre-requisites for installing DeepSpeed If you don't already have a copy of the DeepSpeed repository, please clone in now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples. @@ -25,9 +33,19 @@ git submodule update --init --recursive cd DeepSpeedExamples/ ``` -### Pre-requisites for 1-bit Adam +### 1.2 Pre-requisites for 1-bit Adam + +#### 1.2.1 (New in v2) NCCL-based implementation + +In 1-bit Adam v2, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed. This significantly improves the usability due to NCCL’s integration with PyTorch distributed. 
The performance of our new NCCL-based implementation is also better than our earlier MPI-based implementation for Ethernet-based systems and on-par for InfiniBand-based systems. Thus we highly recommend users to choose this implementation. + +**Watch out!** +This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. +{: .notice--warning} -1-bit Adam uses advanced communication schemes that are not yet supported by PyTorch distributed and NCCL. We rely on Message Passing Interface (MPI) for these advanced communication primitives. +#### 1.2.2 MPI-based implementation + +For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives. We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run: @@ -43,31 +61,32 @@ An example launch command for 1-bit Adam using the `deepspeed` launcher is as fo deepspeed --launcher=[mvapich|openmpi] script.py ``` -Please note that because 1-bit Adam uses MPI backend to communicate during the compression stage, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher. 
+Please note that for MPI-based implementation of 1-bit Adam, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher. Alternatively, the standard mpirun launcher can also be used as follows: ```shell -mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] bash [training_script.sh] +mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py] ``` -### 1-bit Algorithm +### 1.3 1-bit Algorithm -The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html). +The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). -### Configuration of 1-bit Adam +### 1.4 Configuration of 1-bit Adam The 1-bit Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below. ```json { "train_batch_size": 4096, - "train_micro_batch_size_per_gpu": 64, + "train_micro_batch_size_per_gpu": 16, "optimizer": { "type": "OneBitAdam", "params": { - "lr": 2e-4, - "freeze_step": 400, - "cuda_aware": true + "lr": 4e-4, + "freeze_step": 23000, + "cuda_aware": false, + "comm_backend_name": "nccl" } }, "fp16": { @@ -75,12 +94,20 @@ The 1-bit Adam feature can be used by setting the optimizer configuration option } } ``` -Please note two new parameters `freeze_step` and `cuda_aware` that have been added to support the 1-bit Adam feature. +Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_name` that have been added to support the 1-bit Adam feature. + +`freeze_step` is the number of warm up steps before 1-bit compression gets applied to the communication. 
In order to determine the number of warm up steps, one strategy is to set 15-25% of the total training steps for a given model (This is related to Adam's variance/second moment term. See detailed analysis in our [paper](https://arxiv.org/abs/2102.02888)). If it provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In future, we plan to introduce a threshold that can automatically search and decide for the number of warm up steps for different models. The examples below have been tuned for the number of warm up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts. -`cuda_aware` is used to indicate that the underlying MPI library support CUDA-Aware communication. -This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. +`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. -`freeze_step` is the number of warm up steps before 1-bit compression gets applied to the communication. 
In order to determine the number of warm up steps, one strategy is to set 15-25% of the total training steps for a given model. If it provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In future, we plan to introduce a threshold that can automatically search and decide for the number of warm up steps for different models. The examples below have been tuned for the number of warm up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts. +(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`. + +#### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients +Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support for a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script.
+ + **Watch out!** 1-bit Adam relies on a compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism and thus affect the convergence. +{: .notice--warning} ## 2. BingBertSQuAD Fine-tuning with 1-bit Adam @@ -93,9 +120,13 @@ This feature is only supported on systems with InfiniBand interconnect and a CUD You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [HuggingFace](https://github.com/huggingface/transformers), or [TensorFlow](https://github.com/google-research/bert#pre-trained-models) to run the fine-tuning. +**Note:** For details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial.
+ ### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam -The main part of training is done in `nvidia_run_squad_deepspeed.py`, which has +We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. + + ### 2.2 Configuration for BingBertSQuAD with DeepSpeed and 1-bit Adam enabled @@ -148,18 +179,16 @@ Table 1 shows the fine-tuning configuration we used in our experiments. | ------------------------------ | ---------------------| | Total batch size | 96 | | Train micro batch size per GPU | 3 | -| Optimizer | **OnebitAdam** | +| Optimizer | **"OnebitAdam"** | | Learning rate | 3e-5 | | Sequence-length | 384 | | Weight-decay | 0.0 | | Epoch count | 2 | | **freeze_step** | 400 | -| **cuda_aware** | True | +| **comm_backend_name** | "nccl" | Table 1. Fine-tuning configuration -**Note:** For more details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial. - ### 2.3 Performance Results for BingBertSQuAD Fine-tuning ***Accuracy:*** @@ -174,19 +203,24 @@ We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scor ***Training Speed and Scalability:*** -1-bit Adam enables up to 2.7x overall speedup in training speed for SQuAD fine-tuning. This is made possible by up to 6.2x faster throughput during the compressed stage of the algorithm as shown in Figure 1. 
+ + +Performance results of SQuAD Fine-tuning can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). + ## 3. BERT Pre-training with 1-bit Adam -For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) post. +For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) tutorial. ### 3.1 Running Pre-training with DeepSpeed and 1-bit Adam -The main part of training is done in `deepspeed_train.py`, which has +We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. + + ### 3.2 Configuration for BERT Pre-training with DeepSpeed and 1-bit Adam enabled -The `deepspeed_bsz4k_onebit_config_seq128.json` file gives the user the ability to specify DeepSpeed +The `deepspeed_bsz4k_onebit_config_seq128_*.json` file gives the user the ability to specify DeepSpeed options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters. Below is the DeepSpeed configuration file for running BERT-large pre-training with sequence length of 128 using the 1-bit Adam optimizer. 
@@ -240,7 +274,7 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi "weight_decay": 0.01, "bias_correction": false, "freeze_step": 23000, - "cuda_aware": true + "comm_backend_name": "nccl" } }, "gradient_clipping": 1.0, @@ -251,8 +285,8 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi } } ``` -The above file is for BERT-large but for BERT-base training (sequence length 128), the suggested `freeze_step` will need to be changed to 16000. For the rest of the pre-training using sequence 512, we suggest to use a `freeze_step` of 1500. And make sure to set the `cuda_aware` correctly as described above. +The above file is for BERT-large. For BERT-base training (sequence length 128), the suggested `freeze_step` is 16000. For sequence 512 pre-training, we suggest to use a `freeze_step` of 1500 for both BERT-base and BERT-large. And make sure to set the `comm_backend_name` and `cuda_aware` correctly as described above. ### 3.3 Performance Results for BERT Pre-training -Performance results of BERT Pre-training can be seen from our detailed [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html). +Performance results of BERT Pre-training can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index 89fc47ac547b..d7b338561b96 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -17,4 +17,4 @@ FusedLamb (GPU) OneBitAdam (GPU) ---------------------------- -.. autoclass:: deepspeed.runtime.fp16.OneBitAdam +.. 
autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam diff --git a/docs/index.md b/docs/index.md index ee21bd3928fb..a30848246e07 100755 --- a/docs/index.md +++ b/docs/index.md @@ -28,6 +28,7 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? +* [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebit/test_mpi_backend.py similarity index 65% rename from tests/onebitadam/test_com_reduce_cuda.py rename to tests/onebit/test_mpi_backend.py index a5a87ce67232..785021cf0935 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebit/test_mpi_backend.py @@ -4,26 +4,22 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +from deepspeed.runtime.comm.mpi import MpiBackend comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() -#TODO: Detect the hostname we are running on automatically -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-1:2245', - world_size=size, - rank=rank) - -dummy_model = [torch.nn.Parameter(torch.ones(10))] +deepspeed.init_distributed(dist_backend='nccl') -# Set cuda_aware to True to use CUDA buffers for communication -dummy_optim = OnebitAdam(dummy_model, cuda_aware=True) +# Change cuda_aware to True 
to test out CUDA-Aware MPI communication +backend = MpiBackend(cuda_aware=False) device = torch.device('cuda', rank % torch.cuda.device_count()) +# A simulated compression function using torch.distributed def torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) scale = a.norm() / np.sqrt(a.numel()) @@ -52,21 +48,20 @@ def torch_sim(a): else: right_tensor_size = tensor_size right_server_size = right_tensor_size // size + # Adding bias to the initialization of the gradient we are communicating # In order to get rid of the case where some elements in the gradient are too small a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) + a_torch, worker_error_torch, server_error_torch = torch_sim(a) torch.cuda.empty_cache() local_rank = rank % torch.cuda.device_count() -a_after = dummy_optim.Compressed_Allreduce(a, - worker_error, - server_error, - rank, - size, - comm, - local_rank) + +a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + threshold = 1e-6 magnitude_threshold = 1e-6 diff_mask = (a_after - a_torch) > threshold @@ -74,13 +69,16 @@ def torch_sim(a): mpi_server = torch.chunk(a_after, size)[rank] + server_error torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch +test_correctness = True + # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic # The test would skip those numbers that are too small in compensated_server_m -if torch.sum(diff_server_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) -else: - check_mag_mask = mpi_server[diff_mask] > magnitude_threshold - if torch.sum(check_mag_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) +if test_correctness: + if torch.sum(diff_server_mask) == 0: + print('Successfully passed the 
test for MPI Backend at Rank {}'.format(rank)) else: - print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print('Successfully passed the test for MPI Backend at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebit/test_mpi_perf.py b/tests/onebit/test_mpi_perf.py new file mode 100644 index 000000000000..6017ec873c21 --- /dev/null +++ b/tests/onebit/test_mpi_perf.py @@ -0,0 +1,74 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed + +from deepspeed.runtime.comm.mpi import MpiBackend + +# Configure wall clock timer +from deepspeed.utils.timer import SynchronizedWallClockTimer + +from statistics import mean + +timers = SynchronizedWallClockTimer() + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +deepspeed.init_distributed(dist_backend='nccl') +# Change cuda_aware to True to test out CUDA-Aware MPI communication +backend = MpiBackend(cuda_aware=False) + +device = torch.device('cuda', rank % torch.cuda.device_count()) + +tensor_size = 300 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +warmup = 10 +iters = 10 + +local_rank = rank % torch.cuda.device_count() + +# Warmup +for i in range(warmup): + backend.compressed_allreduce(a, 
worker_error, server_error, local_rank) + +time_list = [] + +for i in range(iters): + timers('compressed_allreduce').start() + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + timers('compressed_allreduce').stop() + time_list.append(timers('compressed_allreduce').elapsed()) + +timer_names = ['compressed_allreduce'] +timers.log(names=timer_names, normalizer=1, memory_breakdown=None) + +places = 2 +convert = 1e3 +float_size = 4 + +if rank == 0: + for i in range(iters): + lat = time_list[i] + print("latency = ", lat * convert) + +minlat = round(min(time_list) * convert) +maxlat = round(max(time_list) * convert) +meanlat = round(mean(time_list) * convert, places) +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) diff --git a/tests/onebitadam/test_com_reduce_host.py b/tests/onebit/test_nccl_backend.py similarity index 59% rename from tests/onebitadam/test_com_reduce_host.py rename to tests/onebit/test_nccl_backend.py index 1507abc44f24..16de37174c10 100644 --- a/tests/onebitadam/test_com_reduce_host.py +++ b/tests/onebit/test_nccl_backend.py @@ -1,29 +1,31 @@ -from mpi4py import MPI import time import torch import torch.distributed as dist import numpy as np +import argparse import deepspeed -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam +import os -comm = MPI.COMM_WORLD -size = comm.Get_size() -rank = comm.Get_rank() +from deepspeed.runtime.comm.nccl import NcclBackend -#TODO: Detect the hostname we are running on automatically -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-1:2245', - world_size=size, - rank=rank) +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1) +args = parser.parse_args() -dummy_model = [torch.nn.Parameter(torch.ones(10))] +deepspeed.init_distributed(dist_backend='nccl') +args.local_rank = int(os.environ['LOCAL_RANK']) -# Set cuda_aware to False to use host buffers for communication -dummy_optim = 
OnebitAdam(dummy_model, cuda_aware=False) +torch.cuda.set_device(args.local_rank) +device = torch.device("cuda", args.local_rank) -device = torch.device('cuda', rank % torch.cuda.device_count()) +size = dist.get_world_size() +rank = dist.get_rank() +backend = NcclBackend() +local_rank = args.local_rank + +# A simulated compression function using torch.distributed def torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) scale = a.norm() / np.sqrt(a.numel()) @@ -45,28 +47,26 @@ def torch_sim(a): return a_server_compressed, worker_error, server_error -tensor_size = 100 * 2**20 +tensor_size = 300 * 2**20 server_size = int(tensor_size / size) if tensor_size % (8 * size) != 0: right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) else: right_tensor_size = tensor_size right_server_size = right_tensor_size // size + # Adding bias to the initialization of the gradient we are communicating # In order to get rid of the case where some elements in the gradient are too small a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) + a_torch, worker_error_torch, server_error_torch = torch_sim(a) torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() -a_after = dummy_optim.Compressed_Allreduce(a, - worker_error, - server_error, - rank, - size, - comm, - local_rank) + +a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + threshold = 1e-6 magnitude_threshold = 1e-6 diff_mask = (a_after - a_torch) > threshold @@ -74,13 +74,17 @@ def torch_sim(a): mpi_server = torch.chunk(a_after, size)[rank] + server_error torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch +test_correctness = True + # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic # The test would skip those numbers that are too small 
in compensated_server_m -if torch.sum(diff_server_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) -else: - check_mag_mask = mpi_server[diff_mask] > magnitude_threshold - if torch.sum(check_mag_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) +if test_correctness: + if torch.sum(diff_server_mask) == 0: + print('Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) else: - print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print( + 'Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebit/test_nccl_perf.py b/tests/onebit/test_nccl_perf.py new file mode 100644 index 000000000000..1374cda4ddce --- /dev/null +++ b/tests/onebit/test_nccl_perf.py @@ -0,0 +1,94 @@ +import time +import torch +import torch.distributed as dist +import numpy as np +import argparse +import deepspeed +import os + +from deepspeed.runtime.comm.nccl import NcclBackend +from deepspeed.utils.timer import SynchronizedWallClockTimer +from statistics import mean + +timers = SynchronizedWallClockTimer() + +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1) +args = parser.parse_args() + +deepspeed.init_distributed(dist_backend='nccl') +args.local_rank = int(os.environ['LOCAL_RANK']) + +torch.cuda.set_device(args.local_rank) +device = torch.device("cuda", args.local_rank) + +size = dist.get_world_size() +rank = dist.get_rank() + +backend = NcclBackend() +local_rank = args.local_rank + +# Setting tensor_size (BERT-Large) +tensor_size = 300 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = 
tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +warmup = 10 +iters = 10 + +# Warmup +for i in range(warmup): + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +time_list = [] + +a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) +scale = a.norm() / np.sqrt(a.numel()) +a_compressed = scale * a_sign + +print("Shape of the compressed buffer:", a_compressed.shape) if rank == 0 else None + +for i in range(iters): + timers('compressed_allreduce').start() + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + #torch.distributed.all_reduce(a_compressed) + timers('compressed_allreduce').stop() + time_list.append(timers('compressed_allreduce').elapsed()) + +#timer_names = ['compressed_allreduce'] +#timers.log(names=timer_names, normalizer=1, memory_breakdown=None) + +places = 2 +convert = 1e3 +float_size = 4 + +if rank == 0: + for i in range(iters): + lat = time_list[i] + print("latency = ", lat * convert) + +minlat = round(min(time_list) * convert) +maxlat = round(max(time_list) * convert) +meanlat = round(mean(time_list) * convert, places) +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, + maxlat, + meanlat)) if rank == 0 else None +#print("tensor shape", a.shape) +duration = meanlat / 1e3 +tput = ((tensor_size * 4) / duration) +print("algo throughput: %f Bytes/s, %f GB/s" % (tput, tput / 1e9)) if rank == 0 else None +size = tensor_size * 4 +n = dist.get_world_size() +busbw = (size / duration) * (2 * (n - 1) / n) +print("busbw: %f GB/s" % (busbw / 1e9)) if rank == 0 else None diff --git a/tests/onebitadam/test_server_error.py 
b/tests/onebitadam/test_server_error.py deleted file mode 100644 index 075145f84915..000000000000 --- a/tests/onebitadam/test_server_error.py +++ /dev/null @@ -1,87 +0,0 @@ -from mpi4py import MPI -import time -import torch -import torch.distributed as dist -import numpy as np -import deepspeed -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam - -comm = MPI.COMM_WORLD -size = comm.Get_size() -rank = comm.Get_rank() - -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-0:2245', - world_size=size, - rank=rank) - -dummy_model = [torch.nn.Parameter(torch.ones(10))] -dummy_optim = OnebitAdam(dummy_model, cuda_aware=False) - -device = torch.device('cuda', rank % torch.cuda.device_count()) - - -def torch_sim(a): - a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) - scale = a.norm() / np.sqrt(a.numel()) - a_compressed = scale * a_sign - a_sign = None - worker_error = a - a_compressed - dist.all_reduce(a_compressed) - a_compressed.mul_(1 / dist.get_world_size()) - a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) - a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) - server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] - a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) - rank = dist.get_rank() - server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() - torch.distributed.barrier() - return a_server_compressed, worker_error, server_error - - -# Input Tensor size -tensor_size = 100 * 2**20 - -server_size = int(tensor_size / size) -if tensor_size % (8 * size) != 0: - right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) -else: - right_tensor_size = tensor_size - -right_server_size = right_tensor_size // size - -# The -0.5 is required for avoiding sign flips/errors -a = 
torch.rand(tensor_size, device=device) - 0.5 - -worker_error = torch.zeros(right_tensor_size, device=device) -server_error = torch.zeros(right_server_size, device=device) -a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() - -# Test the 1-bit Adam optimizer -a_after = dummy_optim.Compressed_Allreduce(a, - worker_error, - server_error, - rank, - size, - comm, - local_rank) - -# If the error is below the threshold, it is acceptable for training -threshold = 1e-6 - -diff_pos = ((a_after - a_torch) > threshold) - -if rank == 0: - before_diff = torch.chunk(a_after - a_torch, - size)[rank] + server_error - server_error_torch - if torch.norm(before_diff) / torch.norm(torch.chunk(a_after, - size)[rank]) < threshold: - print('Successfully passed the test') - else: - print('The difference for the tensor before allgather is {}'.format( - torch.norm(before_diff))) diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py new file mode 100644 index 000000000000..8e0056be0cff --- /dev/null +++ b/tests/unit/test_onebit.py @@ -0,0 +1,368 @@ +import torch +import torch.distributed as dist +import deepspeed +import argparse +import pytest +import json +import os +import numpy as np +import time +from common import distributed_test +from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +if TORCH_MAJOR < 1 or TORCH_MINOR < 8: + pytest.skip("NCCL-based 1-bit compression requires torch 1.8 or higher", + allow_module_level=True) + + +def test_onebitadam_fp16_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + 
"gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitadam_fp16_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitadam_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitadam_fp32_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitadam_fp32_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitadam_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitadam_exp_avg_mask(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 
1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask1 = torch.flatten(mask1) + optimizer_grouped_parameters = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitadam_exp_avg_mask(args, model, hidden_dim): + model, optimizer, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + # Test whether the momentum mask works + for v in optimizer.state.values(): + if v['exp_avg'].size() == mask1.size(): + assert torch.allclose(v['exp_avg'], v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)), atol=1e-07), f"Momentum mask is not working properly" + + _test_onebitadam_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitadam_checkpointing(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + 
mask2 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask2[1][col] += 1 + mask1 = torch.flatten(mask1) + mask2 = torch.flatten(mask2) + + optimizer_grouped_parameters_1 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_2 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask2 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_3 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): + model_1, optimizer_1, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_1) + data_loader = random_dataloader(model=model_1, + total_samples=10, + hidden_dim=hidden_dim, + device=model_1.device) + for n, batch in enumerate(data_loader): + loss = model_1(batch[0], batch[1]) + model_1.backward(loss) + model_1.step() + # Test whether momentum mask still exist after saving checkpoint + assert optimizer_1.optimizer.adam_freeze_key is True + mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + # optimizer_1.optimizer.gather_compression_errors() + model_1.save_checkpoint(save_folder, tag=None) + time.sleep(5) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" + + + model_2, optimizer_2, _, _ = deepspeed.initialize(args=args, + model=model, + 
model_parameters=optimizer_grouped_parameters_2) + # Test whether momentum mask stays the same after loading checkpoint + mask2 = mask2.to(device=optimizer_2.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Incorrect momentum mask" + model_2.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + for v in optimizer_2.state.values(): + assert 'worker_error' not in v, f"Incorrect worker error" + assert 'server_error' not in v, f"Incorrect server error" + assert optimizer_2.optimizer.adam_freeze_key is True + + model_3, optimizer_3, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_3) + optimizer_3.optimizer.freeze_step = 20 + data_loader = random_dataloader(model=model_3, + total_samples=50, + hidden_dim=hidden_dim, + device=model_3.device) + for n, batch in enumerate(data_loader): + loss = model_3(batch[0], batch[1]) + model_3.backward(loss) + model_3.step() + assert optimizer_3.optimizer.adam_freeze_key is True + # Test whether momentum mask stays the same after loading checkpoint + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Incorrect momentum mask" + model_3.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + for v in optimizer_3.state.values(): + assert 'worker_error' not in v, f"Incorrect worker error" + assert 'server_error' not in v, f"Incorrect server error" + assert optimizer_3.optimizer.adam_freeze_key is False + + 
_test_onebitadam_checkpointing(mask1, + mask2, + args=args, + model=model, + hidden_dim=hidden_dim) + + +def test_compressed_allreduce_basic(tmpdir): + @distributed_test(world_size=[1, 2]) + def _test_compressed_allreduce_basic(): + from deepspeed.runtime.comm.nccl import NcclBackend + size = dist.get_world_size() + rank = dist.get_rank() + backend = NcclBackend() + local_rank = dist.get_rank() + device = torch.device("cuda", dist.get_rank()) + + # A simulated compression function using torch.distributed + def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_( + 2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [ + chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list + ] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + torch.cuda.synchronize() + torch.distributed.barrier() + return a_server_compressed, worker_error, server_error + + tensor_size = 300 * 2**20 + server_size = int(tensor_size / size) + if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) + else: + right_tensor_size = tensor_size + right_server_size = right_tensor_size // size + + # Adding bias to the initialization of the gradient we are communicating + # In order to get rid of the case where some elements in the gradient are too small + a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + + worker_error = torch.zeros(right_tensor_size, device=device) + 
server_error = torch.zeros(right_server_size, device=device) + + a_torch, worker_error_torch, server_error_torch = torch_sim(a) + torch.cuda.empty_cache() + + a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + + threshold = 1e-6 + magnitude_threshold = 1e-6 + diff_mask = (a_after - a_torch) > threshold + diff_server_mask = torch.chunk(diff_mask, size)[rank] + mpi_server = torch.chunk(a_after, size)[rank] + server_error + torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + + # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic + # The test would skip those numbers that are too small in compensated_server_m + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold + if torch.sum(check_mag_mask) != 0: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) + assert torch.sum(diff_server_mask) == 0 or torch.sum(check_mag_mask) == 0 + + _test_compressed_allreduce_basic() From 10c0bea6021045c917f8e80eb9f4d8e59ae28dee Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 18 Mar 2021 08:54:44 -0700 Subject: [PATCH 15/78] consistent checkpoint filenaming (#865) * consistent checkpoint filenaming * backward compatible rename Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/engine.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 5b3295e99fc8..f965eb688d16 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1352,7 +1352,7 @@ def _get_rank_zero_ckpt_name(self, checkpoints_path, tag, mp_rank, dp_rank): zero_ckpt_name = os.path.join( checkpoints_path, str(tag), - filename + '_mp_rank_{:02d}'.format(mp_rank) + 'optim_states.pt') + filename + '_mp_rank_{:02d}'.format(mp_rank) + '_optim_states.pt') return zero_ckpt_name def _get_zero_ckpt_name(self, checkpoints_path, tag): @@ -1529,13 +1529,20 @@ def 
_get_all_zero_checkpoints(self, load_dir, tag): mp_rank=mp_rank, dp_world_size=self.loaded_checkpoint_dp_world_size) invalid_zero_ckpt_paths = [] - for ckpt_name in zero_ckpt_names: + for i, ckpt_name in enumerate(zero_ckpt_names): if not os.path.exists(ckpt_name): + # transparently handle the old file pattern for optim_states + if 'optim_states.pt' in ckpt_name: + ckpt_name_try = ckpt_name.replace("_optim_states.pt", + "optim_states.pt") + if os.path.exists(ckpt_name_try): + zero_ckpt_names[i] = ckpt_name_try + continue invalid_zero_ckpt_paths.append(ckpt_name) if len(invalid_zero_ckpt_paths) > 0: logger.warn( - f"Client provided zero checkpoint load paths: {invalid_zero_ckpt_paths} does not exist" + f"The following zero checkpoints paths are missing: {invalid_zero_ckpt_paths}" ) return None From 9e9f8cbed04ef8708f49316000f1fddc93ee4639 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 18 Mar 2021 08:58:23 -0700 Subject: [PATCH 16/78] [doc] launcher (#868) As discussed in https://github.com/microsoft/DeepSpeed/issues/662 this PR modifies the doc: * explains what to use instead of CUDA_VISIBLE_DEVICES * puts the `--hostfile` cl arg in the correct place in the invocation script Fixes: https://github.com/microsoft/DeepSpeed/issues/662 Co-authored-by: Jeff Rasley --- docs/_tutorials/getting-started.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 37f104f0739e..e12388aaf973 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -186,8 +186,8 @@ slots available. 
The following command launches a PyTorch training job across all available nodes and GPUs specified in `myhostfile`: ```bash -deepspeed \ - --deepspeed --deepspeed_config ds_config.json --hostfile=myhostfile +deepspeed --hostfile=myhostfile \ + --deepspeed --deepspeed_config ds_config.json ``` Alternatively, DeepSpeed allows you to restrict distributed training of your model to a @@ -264,3 +264,10 @@ not detected or passed in then DeepSpeed will query the number of GPUs on the local machine to discover the number of slots available. The `--include` and `--exclude` arguments work as normal, but the user should specify 'localhost' as the hostname. + +Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control +which devices should be used. For example, to use only gpu1 of the current +node, do: +```bash +deepspeed --include localhost:1 ... +``` From 22d5a1f3e8727d5d444a4d4e4fa6290441851c4a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 23 Mar 2021 22:46:46 -0700 Subject: [PATCH 17/78] [doc] pipeline (#888) * [doc] pipeline As @g-karthik flagged in https://github.com/microsoft/DeepSpeed/pull/659#discussion_r600132598 my previous correction PR had one sentence that said the wrong thing. So this PR attempts to rectify that. Thank you! * tweak --- docs/_tutorials/pipeline.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 70790c82b301..0d847ea18752 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -276,9 +276,9 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim DeepSpeed provides a `LayerSpec` class that delays the construction of modules until the model layers have been partitioned across workers. -Then each worker will allocate only the layers it's assigned to. 
So, continuing the -example from the previous paragraph, a machine with 16 GPUs will need to allocate a -total of 1x model size on its CPU, compared to 16x in the LayerSpec example. +Then each worker will allocate only the layers it's assigned to. So, comparing to the +example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to +allocate a total of 1x model size on its CPU memory and not 16x. Here is an example of the abbreviated AlexNet model, but expressed only with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` From 7f03282c5170ffca2d1c000776b0334d7fd5c97e Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 25 Mar 2021 05:49:11 -0700 Subject: [PATCH 18/78] [debug utils] see_memory_usage fixes (#890) * see_memory_usage fixes * didn't expect pt-1.2 * fix the order of things * fix the order of things --- deepspeed/runtime/utils.py | 8 ++++++++ docs/_tutorials/getting-started.md | 4 ++-- docs/_tutorials/pipeline.md | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index bfacc0af512a..b1a7a4b0aae1 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -8,6 +8,7 @@ import os import psutil +import gc from math import ceil from math import floor from bisect import bisect_left, bisect_right @@ -551,6 +552,9 @@ def see_memory_usage(message, force=False): if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0: return + # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports + gc.collect() + # Print message except when distributed but not rank 0 logger.info(message) logger.info( @@ -564,6 +568,10 @@ def see_memory_usage(message, force=False): logger.info( f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%') + # get the peak memory to report correct data, so reset the counter for the next call + if hasattr(torch.cuda, 
"reset_peak_memory_stats"): # pytorch 1.4+ + torch.cuda.reset_peak_memory_stats() + def call_to_str(base, *args, **kwargs): """Construct a string representation of a call. diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index e12388aaf973..e9b9aa0e627e 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -265,8 +265,8 @@ local machine to discover the number of slots available. The `--include` and `--exclude` arguments work as normal, but the user should specify 'localhost' as the hostname. -Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control -which devices should be used. For example, to use only gpu1 of the current +Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control +which devices should be used. For example, to use only gpu1 of the current node, do: ```bash deepspeed --include localhost:1 ... diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 0d847ea18752..1751846830ef 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -277,7 +277,7 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim DeepSpeed provides a `LayerSpec` class that delays the construction of modules until the model layers have been partitioned across workers. Then each worker will allocate only the layers it's assigned to. So, comparing to the -example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to +example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to allocate a total of 1x model size on its CPU memory and not 16x. 
Here is an example of the abbreviated AlexNet model, but expressed only From 7531c6bf530e9f48b6920acf13427748c48814ac Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 26 Mar 2021 14:52:25 -0700 Subject: [PATCH 19/78] full fp32 weights reconstruction for zero 2+3 (#892) --- deepspeed/runtime/engine.py | 27 +++++- deepspeed/utils/zero_to_fp32.py | 151 ++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 deepspeed/utils/zero_to_fp32.py diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index f965eb688d16..667826d363d2 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -3,10 +3,13 @@ ''' import os +import stat import torch import warnings import hashlib import torch.distributed as dist +from collections import OrderedDict +from shutil import copyfile from torch.nn.modules import Module from torch.distributed.distributed_c10d import _get_global_rank @@ -1684,8 +1687,30 @@ def _save_checkpoint(self, save_dir, tag, client_state={}): torch.save(state, save_path) self._curr_save_path = None + def _get_param_shapes(self): + param_shapes = OrderedDict() + for name, param in self.module.named_parameters(): + param_shapes[name] = param.ds_shape if hasattr(param, + "ds_shape") else param.shape + # print(f"saving param {name} {param_shapes[name]}") + return param_shapes + + def _copy_recovery_script(self, save_path): + base_dir = os.path.dirname(os.path.dirname(__file__)) + script = "zero_to_fp32.py" + src = os.path.join(base_dir, "utils", script) + dst = os.path.join(save_path, script) + logger.info(f"creating recovery script {dst}") + copyfile(src, dst) + # make executable + os.chmod(dst, os.stat(dst).st_mode | stat.S_IEXEC) + def _save_zero_checkpoint(self, save_path, tag): zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag) - zero_sd = {'optimizer_state_dict': self.optimizer.state_dict()} + zero_sd = dict( + optimizer_state_dict=self.optimizer.state_dict(), + 
param_shapes=self._get_param_shapes(), + ) torch.save(zero_sd, zero_checkpoint_name) + self._copy_recovery_script(save_path) logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name)) diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py new file mode 100644 index 000000000000..3401fd635e7c --- /dev/null +++ b/deepspeed/utils/zero_to_fp32.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +# This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py global_step1 pytorch_model.bin + +import argparse +import torch +import glob +import os +from collections import OrderedDict +import deepspeed + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+ + +def get_optim_files(checkpoint_dir): + + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # XXX: need to test that this simple glob rule works for multi-node setup too + optim_files = sorted(glob.glob(f"{checkpoint_dir}/*_optim_states.pt")) + + if len(optim_files) == 0: + raise FileNotFoundError( + f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'") + + return optim_files + + +def parse_optim_states(files): + state_dicts = [] + for f in files: + state_dicts.append(torch.load(f)) + + if not "zero_stage" in state_dicts[0]['optimizer_state_dict']: + raise ValueError(f"non zero checkpoint") + zero_stage = state_dicts[0]['optimizer_state_dict']["zero_stage"] + + # the groups are named differently in each stage + if zero_stage == 2: + fp32_groups_key = "single_partition_of_fp32_groups" + elif zero_stage == 3: + fp32_groups_key = "fp32_flat_groups" + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + param_shapes = state_dicts[0]["param_shapes"] + fp32_flat_groups = [ + state_dicts[i]['optimizer_state_dict'][fp32_groups_key][0] + for i in range(len(state_dicts)) + ] + world_size = state_dicts[0]['optimizer_state_dict']["partition_count"] + + return zero_stage, world_size, param_shapes, fp32_flat_groups + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = int(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def convert_zero_chkpt_to_fp32_consolid_state_dict(checkpoint_dir, output_file): + """ + Convert zero 2 or 3 checkpoint into a single fp32 consolidated state_dict file that can be + loaded with ``torch.load(file)`` and used for training without DeepSpeed. 
+ + Args: + - ``checkpoint_dir``: path to the deepspeed checkpoint folder + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + + """ + print(f"Processing zero checkpoint '{checkpoint_dir}'") + + optim_files = get_optim_files(checkpoint_dir) + zero_stage, world_size, param_shapes, fp32_flat_groups = parse_optim_states(optim_files) + print( + f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + # Reconstruction protocol: + # + # - for zero2 we just need to concat the partitions back to back and reconsolidate over one huge + # flat buffer - no need to deal with padding since if there is any it will be only in the tail + # of the last partition so there it will be just left out + # + # - for zero3 we need to zip the partitions together at boundary of each param, re-consolidating + # each param, while dealing with padding if any + + if zero_stage == 2: + # XXX: memory usage doubles here (zero2) + full_single_fp32_vector = torch.cat(fp32_flat_groups, 0) + + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + state_dict = OrderedDict() + offset = 0 + total_numel = 0 + for name, shape in param_shapes.items(): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + if zero_stage == 2: + # print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow( + 0, + offset, + unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + elif zero_stage == 3: + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + # print(f"{name} full shape: {shape} partition0 numel {partitioned_numel} partitioned_padding_numel {partitioned_padding_numel}") + # XXX: memory usage doubles here (zero3) + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, + offset, + 
partitioned_numel) + for i in range(world_size)), + 0).view(shape) + offset += partitioned_numel + partitioned_padding_numel + + # the job is done + print(f"Saving fp32 state dict to {output_file} (total_numel={total_numel})") + + torch.save(state_dict, output_file) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + "checkpoint_dir", + type=str, + help= + "path to the deepspeed checkpoint folder, e.g., path/checkpoint-1/global_step1") + parser.add_argument( + "output_file", + type=str, + help= + "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-1/pytorch_model.bin)" + ) + args = parser.parse_args() + + convert_zero_chkpt_to_fp32_consolid_state_dict(args.checkpoint_dir, args.output_file) From 39013dd2b89d471520be17b405519dfd7f94c1c4 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 26 Mar 2021 18:08:00 -0700 Subject: [PATCH 20/78] save_fp16_model consolidated for zero3 (#893) Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/engine.py | 98 +++++++++++++++++++++++++++++ deepspeed/runtime/zero/config.py | 6 ++ deepspeed/runtime/zero/constants.py | 8 ++- 3 files changed, 111 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 667826d363d2..32301c08a5c7 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -388,6 +388,9 @@ def zero_prefetch_bucket_size(self): def zero_param_persistence_threshold(self): return self._config.zero_config.param_persistence_threshold + def zero_gather_fp16_weights_on_model_save(self): + return self._config.zero_config.gather_fp16_weights_on_model_save + def fp16_enabled(self): return self._config.fp16_enabled @@ -1714,3 +1717,98 @@ def _save_zero_checkpoint(self, save_path, tag): torch.save(zero_sd, zero_checkpoint_name) self._copy_recovery_script(save_path) logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name)) + + def _zero3_consolidated_fp16_state_dict(self): + """ + + Get a 
full non-partitioned state_dict with fp16 weights on cpu. + + This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but: + + 1. consolidates the weights from different partitions on gpu0 + 2. works on one layer at a time to require as little gpu0 memory as possible, by + moving the already consolidated weights to cpu + 3. takes care to keep the shared params shared when gradually copying the params to cpu + + Returns: + a consolidated fp16 ``state_dict`` on cpu on rank 0, ``None`` on other ranks + + """ + import deepspeed + + if not self.zero_optimization_partition_weights(): + raise ValueError("this function requires ZeRO-3 mode") + + state_dict = OrderedDict() if torch.distributed.get_rank() == 0 else None + shared_weights = {} + + def get_layer_state_dict(module, prefix=""): + # gather one layer at a time to be memory-efficient + with deepspeed.zero.GatheredParameters(list( + module.parameters(recurse=False))): + if torch.distributed.get_rank() == 0: + for name, param in module.named_parameters(recurse=False): + if param is None: + continue + key = prefix + name + # for shared weights we want to make sure not to unshare them when copying to cpu + data_ptr_id = param.storage().data_ptr() + if data_ptr_id in shared_weights: + # shared weights + # print(f"`{key}` is shared with `{shared_weights[data_ptr_id]}`") + state_dict[key] = state_dict[shared_weights[data_ptr_id]] + else: + state_dict[key] = param.detach().cpu() + shared_weights[data_ptr_id] = key + #print(f"param {name} {param.shape}") + #print(f"param {key} {param.shape} {state_dict[key].storage().data_ptr()}") + + # now buffers - not sure if need to take care of potentially shared weights here + for name, buf in module.named_buffers(recurse=False): + if buf is not None and name not in module._non_persistent_buffers_set: + state_dict[prefix + name] = buf.detach().cpu() + + for name, child in module.named_children(): + if child is not None: + get_layer_state_dict(child, prefix + name 
+ ".") + + see_memory_usage("before get_layer_state_dict", force=False) + get_layer_state_dict(self.module, prefix="") + see_memory_usage("after get_layer_state_dict", force=False) + + return state_dict + + def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"): + r"""Save fp16 model weights + + This method saves the fp16 model weights at the desired destination. + + Arguments: + save_dir: Required. Directory for saving the model + save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin`` + + Important: all processes must call this method and not just the process with rank 0. It is + because the processes need to work in sync to gather the weights. This method will hang + waiting to synchronize with other processes if it's called just for the process with rank 0. + + """ + + path = os.path.join(save_dir, save_filename) + + if self.zero_optimization_partition_weights(): + if self.zero_gather_fp16_weights_on_model_save(): + # consolidation is expensive in time and memory and therefore isn't a default + state_dict = self._zero3_consolidated_fp16_state_dict() + else: + # the model will be bogus if not consolidated so don't confuse the user by saving it + logger.info( + f"Did not save the model {path} because `stage3_gather_fp16_weights_on_model_save` is False" + ) + return + else: + state_dict = self.module.state_dict() + + if torch.distributed.get_rank() == 0: + os.makedirs(save_dir, exist_ok=True) + logger.info(f"Saving model weights to {path}") + torch.save(state_dict, path) diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index ac61a9dd52b3..622ffa9ba1cb 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -34,6 +34,7 @@ def __init__(self, param_dict): self.param_persistence_threshold = None self.max_live_parameters = None self.max_reuse_distance = None + self.gather_fp16_weights_on_model_save = None #Stage3 Specific Parameters self.prefetch_bucket_size = None 
@@ -150,3 +151,8 @@ def _initialize(self, zero_config_dict): zero_config_dict, ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD, ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT) + + self.gather_fp16_weights_on_model_save = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE, + ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT) diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index 8d4cf2c5d293..e5812980a337 100755 --- a/deepspeed/runtime/zero/constants.py +++ b/deepspeed/runtime/zero/constants.py @@ -99,6 +99,10 @@ ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD = 'stage3_param_persistence_threshold' ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT = 100000 +# gathers params for saving a model - inefficient but is required in certain situations +ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save' +ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False + ZERO_OPTIMIZATION_DEFAULT = { ZERO_OPTIMIZATION_STAGE: ZERO_OPTIMIZATION_STAGE_DEFAULT, @@ -133,5 +137,7 @@ ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE: ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT, ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD: - ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT + ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT, + ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE: + ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT } From 7fcc8911cbced68a045063a4edc0507688fc78e6 Mon Sep 17 00:00:00 2001 From: hamlet Date: Sat, 27 Mar 2021 14:59:29 +0800 Subject: [PATCH 21/78] Fix zero stage2 cpu_offload when some model trainable parameters skipped in training (#861) * Fix zero stage2 cpu_offload when some model trainable parameters skipped in training, as in https://github.com/microsoft/DeepSpeed/issues/707 As some model trainable parameters skipped in training, their backward hooks in self.create_reduce_and_remove_grad_hooks() 
will not run, so they have no norm_for_param_grads * Trim space * Trim space Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/stage2.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index e0ca4f025957..cd29625958c9 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -883,8 +883,12 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): for p in params: if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): param_id = self.get_param_id(p) - param_norm = self.norm_for_param_grads[param_id] - total_norm += param_norm.item()**2 + # as some model have trainable parameters but skipped in training, + # their backward hooks in self.create_reduce_and_remove_grad_hooks() will not run, + # so they have no norm_for_param_grads + if param_id in self.norm_for_param_grads: + param_norm = self.norm_for_param_grads[param_id] + total_norm += param_norm.item()**2 # Sum across all model parallel GPUs. total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) From b4ac3b600467af0acd0e02116950aafc9ddb4c96 Mon Sep 17 00:00:00 2001 From: sid Date: Mon, 29 Mar 2021 12:23:36 +0200 Subject: [PATCH 22/78] mlperf attn initial commit --- deepspeed/runtime/engine.py | 51 ++++++++-------- deepspeed/runtime/pipe/engine.py | 6 +- deepspeed/runtime/utils.py | 100 ++++++++++++++++++++++++------- 3 files changed, 110 insertions(+), 47 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index d57299d52340..22a213454091 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -97,6 +97,7 @@ def print_configuration(args, name): class DeepSpeedEngine(Module): r"""DeepSpeed engine for training. 
""" + def __init__(self, args, model, @@ -147,7 +148,7 @@ def __init__(self, if mpu is not None: assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ - " with model parallelism." + " with model parallelism." self._set_distributed_vars() @@ -452,7 +453,7 @@ def _configure_checkpointing(self, dist_init_required): # only the first data parallel process needs to store the model checkpoint self.save_non_zero_checkpoint = ( - dp_rank == 0) or self.zero_optimization_partition_weights() + dp_rank == 0) or self.zero_optimization_partition_weights() if self.zero_optimization(): param_rank = torch.distributed.get_rank( @@ -532,7 +533,7 @@ def _do_args_sanity_check(self, args): def _is_supported_optimizer(self, optimizer_name): return optimizer_name in DEEPSPEED_OPTIMIZERS or \ - getattr(torch.optim, optimizer_name, None) is not None + getattr(torch.optim, optimizer_name, None) is not None # Validate configuration based on command line arguments def _do_sanity_check(self): @@ -697,7 +698,7 @@ def _configure_fp16_optimizer(self, optimizer): else: log_dist('Creating fp16 optimizer with static loss scale: {}'.format( self.loss_scale()), - ranks=[0]) + ranks=[0]) optimizer = FP16_Optimizer( optimizer, static_loss_scale=self.loss_scale(), @@ -926,11 +927,11 @@ def forward(self, *inputs, **kwargs): return loss def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - #Zero stage 2 communicates during non gradient accumulation boundaries as well + # Zero stage 2 communicates during non gradient accumulation boundaries as well if self.zero_optimization_partition_gradients(): self.optimizer.overlapping_partition_gradients_reduce_epilogue() - #Communicate only at gradient accumulation boundaries + # Communicate only at gradient accumulation boundaries elif self.is_gradient_accumulation_boundary(): if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: assert self.zero_reduce_scatter() @@ -1031,7 +1032,7 @@ def 
is_gradient_accumulation_boundary(self): bool: if the current step is a gradient accumulation boundary. """ return (self.micro_steps + 1) % \ - self.gradient_accumulation_steps() == 0 + self.gradient_accumulation_steps() == 0 def zero_grad(self): """ @@ -1065,8 +1066,8 @@ def _take_model_step(self, lr_kwargs): self.timers('_step_step').stop() self.timers('_step_zero_grad').start() - #zero grad in basic optimizer could be unreliable and may not exhibit - #the behaviour that we want + # zero grad in basic optimizer could be unreliable and may not exhibit + # the behaviour that we want if not self.zero_optimization() and not self.fp16_enabled( ) and not self.amp_enabled(): self.zero_grad() @@ -1414,7 +1415,7 @@ def load_checkpoint(self, tag = fd.read().strip() else: logger.warning(f"Unable to find latest file at {latest_path}, if trying to load latest " \ - "checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.") + "checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.") return None, None load_path, client_states = self._load_checkpoint(load_dir, @@ -1442,7 +1443,7 @@ def _load_checkpoint(self, if not os.path.exists(load_path): logger.warn( 'Client provided checkpoint load path: {} does not exist ... 
skip checkpoint load' - .format(load_path)) + .format(load_path)) return None, None logger.info(f'rank: {self.global_rank} loading checkpoint: {load_path}') @@ -1485,7 +1486,7 @@ def _load_checkpoint(self, client_state = { key: value for key, - value in checkpoint.items() if not key in deepspeed_states + value in checkpoint.items() if not key in deepspeed_states } return load_path, client_state @@ -1568,8 +1569,8 @@ def _checkpoint_tag_validation(self, tag): dist.all_reduce(min_bhash, op=torch.distributed.ReduceOp.MIN) valid = all(min_bhash == bhash) and all(max_bhash == bhash) msg = f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across " \ - "all ranks. Including rank unique information in checkpoint tag could cause issues when " \ - "restoring with different world sizes." + "all ranks. Including rank unique information in checkpoint tag could cause issues when " \ + "restoring with different world sizes." if self.checkpoint_tag_validation_fail(): assert valid, msg elif not valid: @@ -1661,29 +1662,29 @@ def _save_checkpoint(self, save_dir, tag, client_state={}): state = { 'module': - self.module_state_dict(), + self.module_state_dict(), 'optimizer': - self.optimizer.state_dict() - if self.optimizer and not self.zero_optimization() else None, + self.optimizer.state_dict() + if self.optimizer and not self.zero_optimization() else None, 'lr_scheduler': - self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, + self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, 'csr_tensor_module_names': - self.csr_tensor_module_names, + self.csr_tensor_module_names, 'skipped_steps': - self.skipped_steps, + self.skipped_steps, 'global_steps': - self.global_steps, + self.global_steps, 'global_samples': - self.global_samples, + self.global_samples, 'dp_world_size': - self.dp_world_size, + self.dp_world_size, 'mp_world_size': - self.mp_world_size + self.mp_world_size } state.update(client_state) 
log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0]) - #logger.info('Saving model checkpoint: {}'.format(save_path)) + # logger.info('Saving model checkpoint: {}'.format(save_path)) torch.save(state, save_path) self._curr_save_path = None diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index ec40da0a2082..f0529a839f60 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -594,14 +594,16 @@ def _exec_backward_pass(self, buffer_id): grad_tensors = self.grad_layer if self.is_grad_partitioned: - # print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') + # print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[ + # 1].size()}') part_grad = PartitionedTensor.from_meta( meta=self.grad_layer[0], local_part=self.grad_layer[1], group=self.grid.get_slice_parallel_group()) grad_tensors = tuple([part_grad.full(), self.grad_layer[2]]) part_grad = None - # print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') + # print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[ + # 1].size()}') # This handles either a single tensor or tuple of tensors. 
if isinstance(outputs, tuple): diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index bfacc0af512a..a47b4f22f774 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -63,6 +63,7 @@ def move_to_device(item, device): class CheckOverflow(object): '''Checks for overflow in gradient across parallel process''' + def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False): self.mpu = mpu self.params = [] if param_groups else None @@ -204,12 +205,12 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): for p in parameters: if mpu is not None: if (mpu.get_model_parallel_rank() == 0 - ) or is_model_parallel_parameter(p): + ) or is_model_parallel_parameter(p): param_norm = p.grad.data.float().norm(norm_type) - total_norm += param_norm.item()**norm_type + total_norm += param_norm.item() ** norm_type else: param_norm = p.grad.data.float().norm(norm_type) - total_norm += param_norm.item()**norm_type + total_norm += param_norm.item() ** norm_type # Sum across all model parallel GPUs. total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) @@ -217,7 +218,7 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): torch.distributed.all_reduce(total_norm_cuda, op=torch.distributed.ReduceOp.SUM, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + total_norm = total_norm_cuda[0].item() ** (1. 
/ norm_type) if total_norm == float( 'inf') or total_norm == -float('inf') or total_norm != total_norm: @@ -261,21 +262,21 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): for p in parameters: if mpu is not None: if (mpu.get_model_parallel_rank() == 0 - ) or is_model_parallel_parameter(p): + ) or is_model_parallel_parameter(p): try: param_norm = float(torch.norm(p, norm_type, dtype=torch.float32)) except TypeError as err: param_norm = float(torch.norm(p.float(), norm_type)) - #param_norm = p.data.float().norm(norm_type) - total_norm += param_norm**norm_type + # param_norm = p.data.float().norm(norm_type) + total_norm += param_norm ** norm_type else: try: param_norm = float(torch.norm(p, norm_type, dtype=torch.float32)) except TypeError as err: param_norm = float(torch.norm(p.float(), norm_type)) - #param_norm = p.data.float().norm(norm_type) - total_norm += param_norm**norm_type + # param_norm = p.data.float().norm(norm_type) + total_norm += param_norm ** norm_type # Sum across all model parallel GPUs. total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) @@ -283,7 +284,7 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): torch.distributed.all_reduce(total_norm_cuda, op=torch.distributed.ReduceOp.SUM, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + total_norm = total_norm_cuda[0].item() ** (1. 
/ norm_type) if total_norm == float( 'inf') or total_norm == -float('inf') or total_norm != total_norm: @@ -529,12 +530,12 @@ def memory_status(msg, print_rank=-1, reset_max=False): max_cached = torch.cuda.max_memory_cached() # convert to GB for printing - new_alloced /= 1024**3 - new_cached /= 1024**3 - delta_alloced /= 1024**3 - delta_cached /= 1024**3 - max_alloced /= 1024**3 - max_cached /= 1024**3 + new_alloced /= 1024 ** 3 + new_cached /= 1024 ** 3 + delta_alloced /= 1024 ** 3 + delta_cached /= 1024 ** 3 + max_alloced /= 1024 ** 3 + max_cached /= 1024 ** 3 print( f'RANK={rank} MEMSTATS', @@ -554,13 +555,13 @@ def see_memory_usage(message, force=False): # Print message except when distributed but not rank 0 logger.info(message) logger.info( - f"MA {round(torch.cuda.memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ - Max_MA {round(torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \ - CA {round(torch.cuda.memory_cached() / (1024 * 1024 * 1024),2)} GB \ + f"MA {round(torch.cuda.memory_allocated() / (1024 * 1024 * 1024), 2)} GB \ + Max_MA {round(torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), 2)} GB \ + CA {round(torch.cuda.memory_cached() / (1024 * 1024 * 1024), 2)} GB \ Max_CA {round(torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))} GB ") vm_stats = psutil.virtual_memory() - used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2) + used_GB = round(((vm_stats.total - vm_stats.available) / (1024 ** 3)), 2) logger.info( f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%') @@ -585,3 +586,62 @@ def call_to_str(base, *args, **kwargs): name += ', '.join(f'{key}={repr(arg)}' for key, arg in kwargs.items()) name += ')' return name + + +class GradientNoiseScale: + + def __init__(self, model, batch_size_small, n_batches, beta): + self.batch_size_small, self.batch_size_large = batch_size_small, batch_size_small * n_batches + self.n_batches = n_batches + self.beta = beta + self.model = model + self.buffer 
= [] + self.ema_scale = None + self.ema_noise = None + self.scale = None + self.noise = None + self.noise_scale = None + self.n_updates = 0 + + def ema(self, avg, yi, i): + if avg is None: avg = 0 + avg = self.beta * avg + (1 - self.beta) * yi + return avg, avg / (1 - self.beta ** (i + 1)) + + def _flatten_grads(self): + grads = [param.grad.flatten().view(-1, 1) for param in self.model.parameters()] + grads = torch.cat(grads) + return grads + + def _get_scale(self, grads_small, grads_big): + return (grads_small - grads_big) / ((1 / self.batch_size_small) - (1 / self.batch_size_large)) + + def _get_noise(self, grads_small, grads_big): + return (self.batch_size_large * grads_big - self.batch_size_small * grads_small) / ( + self.batch_size_large - self.batch_size_small) + + def update(self): + + curr_grad = self._flatten_grads() + self.buffer.append(curr_grad) + if self.n_updates % self.n_batches == self.n_batches - 1: + # gather prev n batches and empty buffer + past_grads = torch.cat(self.buffer, dim=1) + self.buffer = [] + + past_grads = past_grads.mean(dim=1) + + g_big = (past_grads ** 2).mean() + g_small = (curr_grad ** 2).mean() + + noise = self._get_noise(g_small, g_big) + scale = self._get_scale(g_small, g_big) + + self.ema_scale, scale = self.ema(self.ema_scale, scale, self.n_updates) + self.ema_noise, noise = self.ema(self.ema_noise, noise, self.n_updates) + + self.scale = scale.item() + self.noise = noise.item() + self.noise_scale = scale / noise + + self.n_updates += 1 From af2d8fc58747e3a6494060177ff730cbedac86a0 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 30 Mar 2021 11:02:27 -0700 Subject: [PATCH 23/78] update kramdown (#901) security alert related to older kramdown version --- docs/Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 94dad7c80bc0..dca4bec5def3 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -68,7 +68,7 @@ GEM jekyll-theme-time-machine (= 0.1.1) 
jekyll-titles-from-headings (= 0.5.3) jemoji (= 0.12.0) - kramdown (= 2.3.0) + kramdown (= 2.3.1) kramdown-parser-gfm (= 1.1.0) liquid (= 4.0.3) mercenary (~> 0.3) From 23ff6cb718bbedd97acf549644b5a5706aae3f1d Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 30 Mar 2021 11:02:51 -0700 Subject: [PATCH 24/78] update backward api doc (#903) --- deepspeed/runtime/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 32301c08a5c7..dd77413c82a2 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -955,7 +955,7 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False): Arguments: loss: Torch tensor on which to execute backward propagation - allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True. + allreduce_gradients: is deprecated, ignored, and will soon be removed' """ if not allreduce_gradients: From c0422642632d39bf6aefaab4b4a1e9572241a8a9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Mar 2021 11:06:22 -0700 Subject: [PATCH 25/78] Bump kramdown from 2.3.0 to 2.3.1 in /docs (#905) Bumps [kramdown](https://github.com/gettalong/kramdown) from 2.3.0 to 2.3.1. 
- [Release notes](https://github.com/gettalong/kramdown/releases) - [Changelog](https://github.com/gettalong/kramdown/blob/master/doc/news.page) - [Commits](https://github.com/gettalong/kramdown/commits) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jeff Rasley --- docs/Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index dca4bec5def3..81646671de47 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -196,7 +196,7 @@ GEM gemoji (~> 3.0) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.0) + kramdown (2.3.1) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) From 8c9e16eb27431847bd04bcd43ffff262f4438dc9 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 30 Mar 2021 17:28:19 -0700 Subject: [PATCH 26/78] We're hiring! + integration posts --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index da8bccc383d4..f96808bd9cad 100755 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) [![Docker Pulls](https://img.shields.io/docker/pulls/deepspeed/deepspeed)](https://hub.docker.com/r/deepspeed/deepspeed) +### 03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher) + [DeepSpeed](https://www.deepspeed.ai/) is a deep learning optimization library that makes distributed training easy, efficient, and effective. 
@@ -31,8 +33,10 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News +* [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) +* [2021/01/19] [[🤗Hugging Face Blog] Fit More and Train Faster With ZeRO via DeepSpeed and FairScale](https://huggingface.co/blog/zero-deepspeed-fairscale) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) @@ -40,7 +44,6 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html) * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html) * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html) -* [2020/08/07] [DeepSpeed Microsoft Research Webinar](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html) is now available on-demand # Table of Contents From 
c6b497df99a9049cb5b2aa0b915fcce346fa2ae8 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 30 Mar 2021 17:30:27 -0700 Subject: [PATCH 27/78] [website] We're hiring! + integration posts --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index a30848246e07..d2e2606af22c 100755 --- a/docs/index.md +++ b/docs/index.md @@ -4,6 +4,8 @@ toc: true toc_label: "Contents" --- +### 03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher) + DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective. @@ -28,8 +30,10 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? 
+* [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) +* [2021/01/19] [[🤗Hugging Face Blog] Fit More and Train Faster With ZeRO via DeepSpeed and FairScale](https://huggingface.co/blog/zero-deepspeed-fairscale) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }}) From c814abdadd31d7a92ba2b77c48b2f042318a5a7f Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 30 Mar 2021 17:34:44 -0700 Subject: [PATCH 28/78] [website] we're hiring! --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index d2e2606af22c..31755b8e3201 100755 --- a/docs/index.md +++ b/docs/index.md @@ -4,7 +4,7 @@ toc: true toc_label: "Contents" --- -### 03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher) +03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. 
SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher) DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective. From 5d721e092d60eef027ae4af773c0acab8ae44828 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 1 Apr 2021 16:52:54 -0700 Subject: [PATCH 29/78] zero.Init() clarification (#880) * zero.Init() clarification clarify that if `model.half()` can't fit into gpu memory `zero.Init()` is a must. this proposal is via @samyam's clarification shared elsewhere. Thank you. * style * add clarity * style Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/partition_parameters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index e6cb9199899a..4465adfd7c16 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -279,6 +279,9 @@ def __init__(self, For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion parameter model with 4 nodes and 32 GPUs. + Important: If the fp16 weights of the model can't fit onto a single GPU memory + this feature must be used. + .. note:: Initializes ``torch.distributed`` if it has not already been done so. See :meth:`deepseed.init_distributed` for more information. From 8db4fdf8157c688b7f1cca082020684024329e19 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 2 Apr 2021 13:20:21 -0700 Subject: [PATCH 30/78] disable pipe test (#915) This test has been giving us trouble for a bit, seeing nondeterministic failures, skipping for now to not break out CI. Need to revisit soon though. 
--- tests/unit/test_pipe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_pipe.py b/tests/unit/test_pipe.py index 30d4314a8441..65ae0023b8ec 100755 --- a/tests/unit/test_pipe.py +++ b/tests/unit/test_pipe.py @@ -169,6 +169,7 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s return losses +@pytest.mark.skip(reason="been seeing nondeterministic failures, skipping for now") @pytest.mark.parametrize('topo', [ PipeTopo(num_pp=1, From ab5534fc4c0f8ca21ada321f9730d723aa31288b Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Fri, 2 Apr 2021 13:53:30 -0700 Subject: [PATCH 31/78] Add link to AML examples. (#916) Co-authored-by: Jeff Rasley --- README.md | 1 + docs/_tutorials/azure.md | 2 ++ docs/_tutorials/getting-started.md | 1 + docs/index.md | 1 + 4 files changed, 5 insertions(+) diff --git a/README.md b/README.md index f96808bd9cad..c7bde12dd0ea 100755 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News +* [2021/04/01] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) diff --git a/docs/_tutorials/azure.md b/docs/_tutorials/azure.md index 3644b4621f8f..45d41a618a23 100644 --- a/docs/_tutorials/azure.md +++ b/docs/_tutorials/azure.md @@ -10,6 +10,8 @@ 
benefit all your large model training jobs. If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/). +To use DeepSpeed on [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/), please take a look at easy-to-use examples for Transformers and CIFAR training from [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed). + To help with launching Azure instances we suggest using the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created several helper scripts to get you quickly started using DeepSpeed with Azure. diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index e9b9aa0e627e..ecd3159df8c9 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -9,6 +9,7 @@ date: 2020-05-15 * Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/). * Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure! +* To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies. ## Writing DeepSpeed Models diff --git a/docs/index.md b/docs/index.md index 31755b8e3201..497f88bab5c3 100755 --- a/docs/index.md +++ b/docs/index.md @@ -30,6 +30,7 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? 
+* [2021/04/02] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) From c334c852edca0c5d06ed2bfb54e87bf41d9f1f5c Mon Sep 17 00:00:00 2001 From: sid Date: Tue, 6 Apr 2021 22:17:34 +0200 Subject: [PATCH 32/78] add inference_batch fn --- deepspeed/runtime/pipe/engine.py | 159 +++++++++++++++++++++++++++++-- 1 file changed, 153 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index f0529a839f60..6daa4a5c9f02 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -391,6 +391,145 @@ def eval_batch(self, data_iter): return self.agg_eval_loss + def inference_batch(self, data_iter): + """Evaluate the pipeline on a batch of data from ``data_iter``. The + engine will evaluate ``self.train_batch_size()`` total samples + collectively across all workers. + + This method is equivalent to: + + .. code-block:: python + + module.eval() + with torch.no_grad(): + output = module(batch) + + .. warning:: + A total of ``self.gradient_accumulation_steps()`` entries will be pulled + from ``data_iter`` by each pipeline. There must be sufficient + data left in ``data_iter`` or else a ``StopIteration`` will halt training. 
+ + DeepSpeed provides a convenience class :class:`deepspeed.utils.RepeatingLoader` + that wraps data loaders to automatically restart upon a ``StopIteration``. + + Args: + data_iter (Iterator): Iterator of data to evaluate. + + Returns: + The arithmetic mean of the losses computed this batch. + """ + print('ENTERING') + self.module.eval() + self.total_loss = None + + # Use the provided data iterator + train_iterator = self.data_iterator + self.set_dataiterator(data_iter) + + # Do the work + print('DOING SCHED') + sched = schedule.InferenceSchedule(micro_batches=self.micro_batches, + stages=self.num_stages, + stage_id=self.stage_id) + with torch.no_grad(): + self._exec_schedule(sched) + print(f'DONE SCHED {self.global_rank}') + + # the shapes are variable so we need to first broadcast the shapes, then the tensors themselves + + if self.is_last_stage(): + print(f'1. SENDING FROM SRC RANK: {self.global_rank}') + + logits, presents = self.total_loss + logits = logits.clone().detach() + presents = presents.clone().detach() + logits_shape = list(logits.shape) # [4, 1, 50304] + presents_shape = list(presents.shape) # [12, 2, 1, 4, 12, 64] + print(f'1. SENDING CONSTRUCT TENSORS') + + logits_shape_tensor = torch.LongTensor(logits_shape).to(self.device) + presents_shape_tensor = torch.LongTensor(presents_shape).to(self.device) + dist.broadcast(tensor=logits_shape_tensor, + src=self.global_rank) + dist.broadcast(tensor=presents_shape_tensor, + src=self.global_rank) + print(f'1. DONE SENDING FROM SRC RANK: {self.global_rank}') + + else: + src_rank = self.grid.stage_to_global(self.num_stages - 1) + print(f'1. RECVING FROM SRC RANK: {src_rank}') + print(f'1. CONSTRUCTING TENSORS') + a = 1 + print(f'1. CONSTRUCTED A') + + misc = torch.LongTensor([0]).to(self.device) + print(f'1. CONSTRUCTED MISC') + + logits_shape_tensor = torch.LongTensor([0] * 3).to(self.device) + print(f'1. CONSTRUCTED LOGITS') + + presents_shape_tensor = torch.LongTensor([0] * 6).to(self.device) + print(f'1. 
DONE CONSTRUCTING TENSORS') + + + dist.broadcast(tensor=logits_shape_tensor, + src=src_rank) + dist.broadcast(tensor=presents_shape_tensor, + src=src_rank) + print(f'1. DONE RECVING FROM SRC RANK: {src_rank}') + logits_shape_tensor = logits_shape_tensor.clone().detach() + presents_shape_tensor = presents_shape_tensor.clone().detach() + + logits_shape = logits_shape_tensor.tolist() + presents_shape = presents_shape_tensor.tolist() + + if self.is_last_stage(): + print(f'SENDING FROM SRC RANK: {self.global_rank}') + + # outputs = torch.Tensor([logits, presents]).to(self.device) + dist.broadcast(tensor=logits, + src=self.global_rank, + group=self.mpu.get_pipe_parallel_group()) + dist.broadcast(tensor=presents, + src=self.global_rank, + group=self.mpu.get_pipe_parallel_group()) + print(f'DONE SENDING FROM SRC RANK: {self.global_rank}') + + else: + logits = torch.zeros(logits_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) + presents = torch.zeros(presents_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) + src_rank = self.grid.stage_to_global(self.num_stages - 1) + assert src_rank in self.grid.pp_group + print(f'RECVING FROM SRC RANK: {src_rank}') + dist.broadcast(tensor=logits, + src=src_rank, + group=self.grid.get_pipe_parallel_group()) + dist.broadcast(tensor=presents, + src=src_rank, + group=self.grid.get_pipe_parallel_group()) + logits = logits.clone().detach() + presents = presents.clone().detach() + print(f'DONE RECVING FROM SRC RANK: {src_rank}') + + print(f'LOGITS: {logits.shape}, PRESENTS: {presents.shape}, IS_DATA_PARALLEL: {self.is_data_parallel}') + # self.agg_eval_loss = self._aggregate_total_loss() + if self.tensorboard_enabled(): + if self.global_rank == 0: + self.summary_events = [(f'Train/Samples/eval_loss', + self.agg_eval_loss.mean().item(), + self.global_samples)] + for event in self.summary_events: # write_summary_events + self.summary_writer.add_scalar(event[0], event[1], 
event[2]) + self.summary_writer.flush() + + # Restore the training iterator + self.set_dataiterator(train_iterator) + + # Reset any buffers that may have been populated during the forward passes. + # ds_checkpointing.reset() + print('RETURNING LOGITS / PRESENTS') + return logits, presents + def is_first_stage(self): """True if this process is in the first stage in the pipeline.""" return self.stage_id == 0 @@ -594,16 +733,14 @@ def _exec_backward_pass(self, buffer_id): grad_tensors = self.grad_layer if self.is_grad_partitioned: - # print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[ - # 1].size()}') + # print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') part_grad = PartitionedTensor.from_meta( meta=self.grad_layer[0], local_part=self.grad_layer[1], group=self.grid.get_slice_parallel_group()) grad_tensors = tuple([part_grad.full(), self.grad_layer[2]]) part_grad = None - # print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[ - # 1].size()}') + # print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') # This handles either a single tensor or tuple of tensors. if isinstance(outputs, tuple): @@ -663,6 +800,7 @@ def _exec_load_micro_batch(self, buffer_id): if self.wall_clock_breakdown(): self.timers('batch_input').stop() + # print('DONE LOADING MICROBATCH: ', loaded) def _send_tensor_meta(self, buffer, recv_stage): """ Communicate metadata about upcoming p2p transfers. @@ -781,7 +919,7 @@ def _exec_send_activations(self, buffer_id): if self.wall_clock_breakdown(): self.timers('pipe_send_output').start() self.timers('comms').start() - + print('send 1') outputs = self.pipe_buffers['outputs'][buffer_id] # NCCL does not like to send torch.BoolTensor types, so cast the mask to half(). 
@@ -791,10 +929,12 @@ def _exec_send_activations(self, buffer_id): outputs = list(outputs) outputs[-1] = outputs[-1].half() outputs = tuple(outputs) + print('send 2') if self.first_output_send: self.first_output_send = False self._send_tensor_meta(outputs, self.next_stage) + print('send 3') if isinstance(outputs, torch.Tensor): p2p.send(outputs, self.next_stage) @@ -804,16 +944,19 @@ def _exec_send_activations(self, buffer_id): else: raise NotImplementedError('Could not send output of type ' f'{type(outputs)}') + print('send 4') # Restore the boolean tensor if self.module.__class__.__name__ == 'GPT2ModelPipe': outputs = list(outputs) outputs[-1] = outputs[-1].bool() outputs = tuple(outputs) + print('send 5') if self.wall_clock_breakdown(): self.timers('pipe_send_output').stop() self.timers('comms').stop() + print('send done') def _exec_send_grads(self, buffer_id): if self.wall_clock_breakdown(): @@ -872,10 +1015,11 @@ def _exec_recv_activations(self, buffer_id): self.timers('pipe_recv_input').start() recvd = None - + print('1') # Allocate the buffer if necessary if self.pipe_recv_buf is None: self.pipe_recv_buf = self._recv_tensor_meta(self.prev_stage) + print('2') if isinstance(self.pipe_recv_buf, torch.Tensor): p2p.recv(self.pipe_recv_buf, self.prev_stage) @@ -906,11 +1050,13 @@ def _exec_recv_activations(self, buffer_id): for buffer in recvd: buffer.requires_grad = buffer.is_floating_point() + print('3') self.pipe_buffers['inputs'][buffer_id] = recvd if self.wall_clock_breakdown(): self.timers('pipe_recv_input').stop() + print('DONE RECV ACTIVATION') def _exec_recv_grads(self, buffer_id): if self.wall_clock_breakdown(): @@ -1145,6 +1291,7 @@ def _exec_schedule(self, pipe_schedule): for step_cmds in pipe_schedule: # For each instruction in the step for cmd in step_cmds: + print(cmd, self.global_rank) if type(cmd) not in self._INSTRUCTION_MAP: raise RuntimeError( f'{self.__class__.__name__} does not understand instruction {repr(cmd)}' From 
ce14cf1af6b31fea0f8cc4d9cc09aac19a61c6a3 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Wed, 7 Apr 2021 13:30:15 +0900 Subject: [PATCH 33/78] Add space in help string (#926) --- deepspeed/launcher/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index ac873f4ca3f4..5295b6859599 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -95,7 +95,7 @@ def parse_args(args=None): parser.add_argument("--launcher", default=PDSH_LAUNCHER, type=str, - help="(optional) choose launcher backend for multi-node" + help="(optional) choose launcher backend for multi-node " "training. Options currently include PDSH, OpenMPI, MVAPICH.") parser.add_argument("--launcher_args", From b5f56b2c6bade2b0bfc80c5fb550d6597f43a17a Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Wed, 7 Apr 2021 13:00:56 -0700 Subject: [PATCH 34/78] =?UTF-8?q?Fix=20for=20fragmented=20linear=20inputs?= =?UTF-8?q?=20in=20ZeRO=203=20Linear=20layers=20where=20reshap=E2=80=A6=20?= =?UTF-8?q?(#881)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Olatunji Ruwase Co-authored-by: Jeff Rasley --- deepspeed/runtime/zero/linear.py | 8 ++++---- deepspeed/runtime/zero/partition_parameters.py | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index f29fcda2bb19..23f97d5a542a 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -77,10 +77,10 @@ def backward(ctx, grad_output): #print("Computing grad weight") dim = grad_output.dim() if dim > 2: - grad_weight = grad_output.view(-1, - grad_output.shape[-1]).t().matmul( - input.view(-1, - input.shape[-1])) + grad_weight = grad_output.reshape(-1, + grad_output.shape[-1]).t().matmul( + input.reshape(-1, + input.shape[-1])) else: grad_weight = grad_output.t().matmul(input) 
#print(f"Computed grad weight grad_weight {grad_weight.shape}") diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 4465adfd7c16..5a1a40460e16 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -190,6 +190,9 @@ def _init_subclass(cls, **kwargs): torch.empty = empty_cuda_tensor if self.mem_efficient_linear: + print_rank_0( + f"Your linear layers are being patched with more memory efficient version. This will persit unless manually reset.", + force=True) self.linear_bk = torch.nn.functional.linear torch.nn.functional.linear = LinearFunctionForZeroStage3.apply @@ -210,8 +213,9 @@ def _disable_class(cls): torch.Tensor.__new__ = torch.Tensor.__old_new__ torch.empty = _orig_torch_empty - if self.mem_efficient_linear: - torch.nn.functional.linear = self.linear_bk + #un doing it here will undo it during training + #if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk # Now that we cleaned up the metaclass injection, raise the exception. if exc_type is not None: @@ -357,6 +361,13 @@ def get_model(): self._convert_to_deepspeed_param(param) param.partition() + if mem_efficient_linear: + print_rank_0( + f"Your linear layers are being patched with more memory efficient version. 
This will persit unless manually turned reset.", + force=True) + self.linear_bk = torch.nn.functional.linear + torch.nn.functional.linear = LinearFunctionForZeroStage3.apply + def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False) From 6d94afb5487be2861e948d1198f42fcbdeab36e4 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Apr 2021 13:01:32 -0700 Subject: [PATCH 35/78] [zero3] GatheredParameters can now handle a list of params (#884) Co-authored-by: Olatunji Ruwase Co-authored-by: Jeff Rasley --- .../runtime/zero/partition_parameters.py | 77 ++++++++++++++----- 1 file changed, 58 insertions(+), 19 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 5a1a40460e16..c654e66abc59 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -864,19 +864,20 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): class GatheredParameters: - def __init__(self, param, modifier_rank=None, fwd_module=None, enabled=True): - """A context that collects a parameter that was partitioned via a - :class:`deepspeed.zero.Init` context. The parameter is partitioned + def __init__(self, params, modifier_rank=None, fwd_module=None, enabled=True): + """A context that collects parameters that were partitioned via a + :class:`deepspeed.zero.Init` context. The parameters are partitioned again upon exit. Args: - param (``torch.nn.Parameter``): The parameter to collect. + params (``torch.nn.Parameter``): A single parameter or a list of parameters to collect. + It's assumed that all parameters are zero params. modifier_rank (int, optional): If specified, this rank's parameter will be - broadcasted after the context. 
This argument is required if ``param`` is - modified all processes should have a consistent view of the data. Defaults + broadcasted on exit from the context. This argument is required if ``params`` are + modified, so that all processes have a consistent view of the data. Defaults to ``None``. - fwd_module (``torch.nn.Module``, optional): If specified, ``param`` will be - registered as an external parameter of ``fwd_module``. See :meth:`deepspeed.zero.register_external_parameter`. + fwd_module (``torch.nn.Module``, optional): If specified, ``params`` will be + registered as external parameters of ``fwd_module``. See :meth:`deepspeed.zero.register_external_parameter`. enabled (bool, optional): If ``False``, this context is a no-op. Defaults to ``True``. Examples @@ -911,41 +912,79 @@ def forward(self, input): fwd_module=self): y = self.layer2(x, self.layer1.weight) return y + + + #. Pretrained model loading + + .. code-block:: python + + with deepspeed.zero.Init(): + model = MyModel() + + state_dict = torch.load(model_path, map_location="cpu") + + def load(module: nn.Module, prefix=""): + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(state_dict, prefix) + + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + load(model, prefix="") + + If this approach is not used, then the full model will first get copied to each GPU. For models + bigger than the memory of a single gpu this method is required. """ self.enabled = enabled if not enabled: return - # This is a no-op, just return. 
- if not is_zero_param(param): + if not isinstance(params, list): + params = [params] + + # enable if at least one is zero-param, otherwise a noop + if not any(is_zero_param(p) for p in params): self.enabled = False return - self.param = param + self.params = params self.src_rank = None if modifier_rank is not None: - if self.param.ds_process_group == torch.distributed.group.WORLD: + if self.params[0].ds_process_group == torch.distributed.group.WORLD: self.src_rank = modifier_rank else: # A group was specified; convert DP rank to global rank - self.src_rank = _get_global_rank(self.param.ds_process_group, + self.src_rank = _get_global_rank(self.params[0].ds_process_group, modifier_rank) self.fwd_module = fwd_module if self.fwd_module is not None: # is a no-op if already registered - register_external_parameter(self.fwd_module, self.param) + for p in self.params: + register_external_parameter(self.fwd_module, p) def __enter__(self): if not self.enabled: return - self.param.all_gather() + self.params[0].all_gather(param_list=self.params) def __exit__(self, *exc): if not self.enabled: return - if self.src_rank is not None: - torch.distributed.broadcast(self.param, + if self.src_rank is None: + return + + handles = [ + torch.distributed.broadcast(p, self.src_rank, - group=self.param.ds_process_group) - self.param.partition(has_been_updated=self.src_rank is not None) + group=p.ds_process_group, + async_op=True) for p in self.params + ] + for h in handles: + h.wait() + self.params[0].partition(param_list=self.params, has_been_updated=True) From c79184ebcc501ad64a8628afe18c0659d1bc4963 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Apr 2021 13:02:19 -0700 Subject: [PATCH 36/78] fix cpu_adam memory leak on deepspeed re-use in the same process (#896) Co-authored-by: Olatunji Ruwase Co-authored-by: Reza Yazdani Co-authored-by: Jeff Rasley --- csrc/adam/cpu_adam.cpp | 8 ++++++++ deepspeed/ops/adam/cpu_adam.py | 5 +++++ 2 files changed, 13 insertions(+) diff --git 
a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index d425dc3169ef..6726b895f12c 100644 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -672,6 +672,13 @@ int ds_adam_step_plus_copy(int optimizer_id, return 0; } +int destroy_adam_optimizer(int optimizer_id) +{ + s_optimizers.erase(optimizer_id); + + return 0; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); @@ -679,4 +686,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) &ds_adam_step_plus_copy, "DeepSpeed CPU Adam update and param copy (C++)"); m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); + m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); } diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 7977d232b1fa..35eeedb86b5d 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -85,6 +85,11 @@ def __init__(self, weight_decay, adamw_mode) + def __del__(self): + # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize + # is used multiple times in the same process (notebook or pytest worker) + self.ds_opt_adam.destroy_adam(self.opt_id) + def __setstate__(self, state): super(DeepSpeedCPUAdam, self).__setstate__(state) for group in self.param_groups: From a128f34e7d6404a18f00861a9c3b700c50e22010 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Apr 2021 13:06:28 -0700 Subject: [PATCH 37/78] [benchmarks] flatten/unflatten benchmarks (#919) Co-authored-by: Jeff Rasley --- tests/benchmarks/flatten_bench.py | 134 ++++++++++++++++++++++++++ tests/benchmarks/unflatten_bench.py | 143 ++++++++++++++++++++++++++++ 2 files changed, 277 insertions(+) create mode 100755 tests/benchmarks/flatten_bench.py create mode 100755 tests/benchmarks/unflatten_bench.py diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py new file mode 100755 index 000000000000..b3ed3c601492 
--- /dev/null +++ b/tests/benchmarks/flatten_bench.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) +# +# usage: +# ./flatten_bench.py -t +# ./flatten_bench.py -c +# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof + +import argparse + +import gc + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.ops.op_builder import UtilsBuilder + +from apex_C import flatten as flatten_apex + +util_ops = UtilsBuilder().load() +flatten = util_ops.flatten +unflatten = util_ops.unflatten + +torch.manual_seed(0) +# emulate a small typical model weights +x = [ + torch.rand((512, + 512)).cuda(), + torch.rand((512, + 1024)).cuda(), + torch.rand((512, + 30000)).cuda() +] +t = x * 30 + +# warm up and check that the same output is produced +flat_py = _flatten_dense_tensors(t) +flat_cpp = flatten(t) +flat_apex = flatten_apex(t) +#numel = flat_cpp.numel() +assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor" +assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor" + +TIMES = 1000 + + +# the programs being tested +def py(): + for i in range(TIMES): + flat = _flatten_dense_tensors(t) + + +def cpp(): + for i in range(TIMES): + flat = flatten(t) + + +def apex(): + for i in range(TIMES): + flat = flatten_apex(t) + + +#### cProfile #### + +import cProfile + + +def cprofileme(): + print("--------------- cProfile -----------------") + print("py") + cProfile.run("py()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("cpp") + cProfile.run("cpp()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("apex") + cProfile.run("apex()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + + +#### timeit #### + +import timeit + + +def timeme(): + print("--------------- timeit -----------------") + print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') + gc.collect() + 
torch.cuda.empty_cache() + print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + + +#### line_profiler #### +# this one requires a special way to be called +# pip install line_profiler +# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof + + +def line_profileme(): + print("--------------- line_profier -----------------") + print("py") + profile(py)() + gc.collect() + torch.cuda.empty_cache() + print("cpp") + profile(cpp)() + gc.collect() + torch.cuda.empty_cache() + print("apex") + profile(apex)() + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-l", action='store_true') + parser.add_argument("-c", action='store_true') + parser.add_argument("-t", action='store_true') + args = parser.parse_args() + if args.l: + line_profileme() + elif args.c: + cprofileme() + elif args.t: + timeme() diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py new file mode 100755 index 000000000000..85baf751ad9c --- /dev/null +++ b/tests/benchmarks/unflatten_bench.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python + +# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) +# +# usage: +# ./unflatten_bench.py -t +# ./unflatten_bench.py -c +# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof + +import argparse +import gc +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.ops.op_builder import UtilsBuilder + +from apex_C import flatten as flatten_apex +from apex_C import unflatten as unflatten_apex + +util_ops = UtilsBuilder().load() +flatten = util_ops.flatten +unflatten = util_ops.unflatten + +torch.manual_seed(0) +# emulate a small typical model weights +x = [ + 
torch.rand((512, + 512)).cuda(), + torch.rand((512, + 1024)).cuda(), + torch.rand((512, + 30000)).cuda() +] +unflat_t = x * 30 + +# warm up and check that the same output is produced +flat_py = _flatten_dense_tensors(unflat_t) +flat_cpp = flatten(unflat_t) +flat_apex = flatten_apex(unflat_t) +#numel = flat_cpp.numel() +assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor" +assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor" + +flat_t = flat_py +unflat_py = _unflatten_dense_tensors(flat_py, unflat_t) +for i in range(len(unflat_t)): + assert torch.eq(unflat_t[i], unflat_py[i]).all() +unflat_cpp = _unflatten_dense_tensors(flat_cpp, unflat_t) +for i in range(len(unflat_t)): + assert torch.eq(unflat_t[i], unflat_cpp[i]).all() +unflat_apex = _unflatten_dense_tensors(flat_apex, unflat_t) +for i in range(len(unflat_t)): + assert torch.eq(unflat_t[i], unflat_apex[i]).all() + + +# the programs being tested +def py(): + for i in range(1000): + unflat = _unflatten_dense_tensors(flat_t, unflat_t) + + +def cpp(): + for i in range(1000): + unflat = unflatten(flat_t, unflat_t) + + +def apex(): + for i in range(1000): + unflat = unflatten_apex(flat_t, unflat_t) + + +#### cProfile #### + +import cProfile + + +def cprofileme(): + print("--------------- cProfile -----------------") + print("py") + cProfile.run("py()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("cpp") + cProfile.run("cpp()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("apex") + cProfile.run("apex()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + + +#### timeit #### + +import timeit + + +def timeme(): + print("--------------- timeit -----------------") + print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + print(f'apex={timeit.Timer("apex()", 
globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + + +#### line_profiler #### +# this one requires a special way to be called +# pip install line_profiler +# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof + + +def line_profileme(): + print("--------------- line_profier -----------------") + print("py") + profile(py)() + gc.collect() + torch.cuda.empty_cache() + print("cpp") + profile(cpp)() + gc.collect() + torch.cuda.empty_cache() + print("apex") + profile(apex)() + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-l", action='store_true') + parser.add_argument("-c", action='store_true') + parser.add_argument("-t", action='store_true') + args = parser.parse_args() + if args.l: + line_profileme() + elif args.c: + cprofileme() + elif args.t: + timeme() From 5ca86ae4eda0d332495721419e92e71ecad32a5a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Apr 2021 13:08:59 -0700 Subject: [PATCH 38/78] improved readability + typos (#895) Co-authored-by: Olatunji Ruwase Co-authored-by: Jeff Rasley --- deepspeed/runtime/engine.py | 32 +++++++------------ deepspeed/runtime/zero/config.py | 6 ---- .../runtime/zero/partition_parameters.py | 4 +-- deepspeed/runtime/zero/stage3.py | 2 +- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index dd77413c82a2..6d8040fe8ec2 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1662,27 +1662,19 @@ def _save_checkpoint(self, save_dir, tag, client_state={}): # then instead just returns None. 
self._curr_ckpt_path = os.path.join(save_dir, tag) - state = { - 'module': - self.module_state_dict(), - 'optimizer': - self.optimizer.state_dict() + state = dict( + module=self.module_state_dict(), + optimizer=self.optimizer.state_dict() if self.optimizer and not self.zero_optimization() else None, - 'lr_scheduler': - self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, - 'csr_tensor_module_names': - self.csr_tensor_module_names, - 'skipped_steps': - self.skipped_steps, - 'global_steps': - self.global_steps, - 'global_samples': - self.global_samples, - 'dp_world_size': - self.dp_world_size, - 'mp_world_size': - self.mp_world_size - } + lr_scheduler=self.lr_scheduler.state_dict() + if self.lr_scheduler is not None else None, + csr_tensor_module_names=self.csr_tensor_module_names, + skipped_steps=self.skipped_steps, + global_steps=self.global_steps, + global_samples=self.global_samples, + dp_world_size=self.dp_world_size, + mp_world_size=self.mp_world_size, + ) state.update(client_state) log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0]) diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 622ffa9ba1cb..c179d01f2988 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -36,12 +36,6 @@ def __init__(self, param_dict): self.max_reuse_distance = None self.gather_fp16_weights_on_model_save = None - #Stage3 Specific Parameters - self.prefetch_bucket_size = None - self.param_persistence_threshold = None - self.max_live_parameters = None - self.max_reuse_distance = None - if ZERO_OPTIMIZATION in param_dict.keys(): zero_config_dict = param_dict[ZERO_OPTIMIZATION] if type(zero_config_dict) is bool: diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index c654e66abc59..0acc675985ca 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -399,10 +399,10 @@ 
def _convert_to_deepspeed_param(self, param): # Stores the shape of the original tensor param.ds_shape = param.shape - # Stores the number of elements in the original parmaeter without padding + # Stores the number of elements in the original parameter without padding param.ds_numel = param.numel() - # Stores the paritioned copy of the tensor + # Stores the partitioned copy of the tensor param.ds_tensor = None # Keeps track of how many active sub-modules need this param at any given point in time diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index ea4653578616..e5299949fcf6 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -580,7 +580,7 @@ def __init__(self, gradient_accumulation_steps=1, elastic_checkpoint=False): - see_memory_usage("Stage 3 intialize beginning", force=True) + see_memory_usage("Stage 3 initialize beginning", force=True) if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") From f19cf6714faf867a5d728b7f4e5c0e2f5c25f127 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Apr 2021 13:10:14 -0700 Subject: [PATCH 39/78] [zero doc] fix misspelled param (#878) We really really really need those params to be validated... 
Co-authored-by: Olatunji Ruwase Co-authored-by: Jeff Rasley --- docs/_tutorials/zero.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index ad6e222707e0..1e9f97b98a11 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -122,7 +122,7 @@ configurations is available [here](/docs/config-json/#zero-optimizations-for-fp1 "stage3_max_live_parameters": 6000000, "stage3_max_reuse_distance": 100000000, "stage3_prefetch_bucket_size": 200000, - "stage3_param_persitance_threshold": 100000, + "stage3_param_persistence_threshold": 100000, "reduce_bucket_size": 3000000, "sub_group_size": 1e6 } From 7b46d11fa4da8023eb54fbcd13ec717c2b20da8d Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Wed, 7 Apr 2021 13:16:02 -0700 Subject: [PATCH 40/78] Samyamr/stage 3 skip modules without parameters (#867) Co-authored-by: Jeff Rasley --- deepspeed/runtime/zero/stage3.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index e5299949fcf6..493106e93239 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -98,8 +98,8 @@ def move_to_cpu(tensor_list): tensor.data = tensor.data.cpu() -def get_all_parameters(sub_module): - return itertools.chain(sub_module.named_parameters(recurse=False), +def get_all_parameters(sub_module, recurse=False): + return itertools.chain(sub_module.named_parameters(recurse=recurse), sub_module.ds_external_parameters()) @@ -1037,13 +1037,19 @@ def setup_zero_stage3_hooks(self): self.hierarchy = 0 self._register_hooks_recursively(self.module) + #reset step at the beginning of forward + def _pre_forward_hook(module, *args): + self.param_coordinator.reset_step() + #reset step if in inference mode def _end_of_forward_hook(module, *args): if not torch._C.is_grad_enabled(): self.param_coordinator.reset_step() + #likely one of them should be enough but just to 
be safe self.module.register_forward_hook(_end_of_forward_hook) + self.module.register_forward_pre_hook(_pre_forward_hook) def persistent_parameters(self): persistent_params = [] From 316992913d3218c0d88364ea4478caa4d53bc7e5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Apr 2021 13:17:20 -0700 Subject: [PATCH 41/78] docs (#909) Co-authored-by: Olatunji Ruwase Co-authored-by: Jeff Rasley --- deepspeed/runtime/engine.py | 2 ++ docs/_pages/config-json.md | 8 +++++- docs/_tutorials/advanced-install.md | 12 +++++++++ docs/_tutorials/zero.md | 38 +++++++++++++++++++++++++++++ docs/code-docs/source/training.rst | 8 ++++++ 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 6d8040fe8ec2..5081adb0b021 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1715,6 +1715,8 @@ def _zero3_consolidated_fp16_state_dict(self): Get a full non-partitioned state_dict with fp16 weights on cpu. + Important: this function must be called on all ranks and not just rank 0. + This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but: 1. consolidates the weights from different partitions on gpu0 diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 9a9554cbd75f..81c7d5bd62b9 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -256,7 +256,8 @@ Enabling and configuring ZeRO memory optimizations "stage3_prefetch_bucket_size" : 5e8, "stage3_param_persistence_threshold" : 1e6, "sub_group_size" : 1e12, - "elastic_checkpoint" : [true|false] + "elastic_checkpoint" : [true|false], + "stage3_gather_fp16_weights_on_model_save": [true|false] } ``` @@ -351,6 +352,11 @@ Enabling and configuring ZeRO memory optimizations | Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). 
| `1e6` | +***stage3_gather_fp16_weights_on_model_save***: [boolean] +| Description | Default | +| -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Consolidate the weights before saving the model by `save_fp16_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gather the weights when this option is enabled and then saves the fp16 model weights. | `False` | + ### Logging ***steps\_per\_print***: [integer] diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index 18f60e864039..4ff02bf7ec84 100644 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -73,6 +73,18 @@ DS_BUILD_OPS=1 pip install deepspeed --global-option="build_ext" --global-option This should complete the full build 2-3 times faster. You can adjust `-j` to specify how many cpu-cores are to be used during the build. In the example it is set to 8 cores. +You can also build a binary wheel and install it on multiple machines that have the same type of GPUs and the same software environment (CUDA toolkit, pytorch, python, etc.) + +```bash +DS_BUILD_OPS=1 python setup.py build_ext -j8 bdist_wheel +``` + +This will create a pypi binary wheel under `dist`, e.g., ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` and then you can install it directly on multiple machines, in our example: + +```bash +pip install dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl +``` + ## Install DeepSpeed from source diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index 1e9f97b98a11..8f506d25babe 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -260,5 +260,43 @@ for more details. 
self.init_method(self.position_embeddings.weight) ``` +## Extracting weights + +If you need to take the pretrained weights out of Deepspeed here is what you can do for getting fp16 weights: + +- under ZeRO-2 `state_dict` contains the fp16 model weights and these can be saved normally with `torch.save`. +- under ZeRO-3 `state_dict` contains just the placeholders since the model weights are partitioned across multiple GPUs. If you want to get to these weights enable: + +``` + "zero_optimization": { + "stage3_gather_fp16_weights_on_model_save": true + }, +``` +And then save the model using: + +``` + if self.deepspeed: + self.deepspeed.save_fp16_model(output_dir, output_file) +``` + +Because it requires consolidation of the weights on one GPU it can be slow and memory demanding, so only use this feature when needed. + +Note that if `stage3_gather_fp16_weights_on_model_save` is `False`, no weights will be saved (again, because `state_dict` doesn't have them. +You can use this method to save ZeRO-2 weights as well. + +If you'd like to get the fp32 weights, we supply a special script that can do offline consolidation. It requires no configuration files or GPUs. Here is an example of its usage: + +``` +$ cd /path/to/checkpoints_dir +$ ./zero_to_fp32.py global_step1 pytorch_model.bin +Processing zero checkpoint at global_step1 +Detected checkpoint of type zero stage 3, world_size: 2 +Saving fp32 state dict to pytorch_model.bin (total_numel=60506624) +``` + +The `zero_to_fp32.py` gets created automatically when you save a checkpoint. + +Note: currently this script uses 2x memory (general RAM) of the size of the final checkpoint. + Congratulations! You have completed the ZeRO tutorial. diff --git a/docs/code-docs/source/training.rst b/docs/code-docs/source/training.rst index d88d755f39cb..52e124fc3b40 100644 --- a/docs/code-docs/source/training.rst +++ b/docs/code-docs/source/training.rst @@ -31,3 +31,11 @@ Optimizer Step Gradient Accumulation --------------------- .. 
autofunction:: deepspeed.DeepSpeedEngine.is_gradient_accumulation_boundary + + +Model Saving +------------ +.. autofunction:: deepspeed.DeepSpeedEngine.save_fp16_model + + +Additionally when a DeepSpeed checkpoint is created, a script ``zero_to_fp32.py`` is added there which can be used to reconstruct fp32 master weights into a single pytorch ``state_dict`` file. From e721cb691f6a86d59bf82f1d383fbfebd313c0df Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Wed, 7 Apr 2021 17:06:41 -0700 Subject: [PATCH 42/78] Supporting different hidden dimensions for transformer kernels-v2 (#934) Co-authored-by: Jeff Rasley --- csrc/transformer/ds_transformer_cuda.cpp | 1 + csrc/transformer/gelu_kernels.cu | 43 +++++++++++------------- tests/unit/test_cuda_backward.py | 11 +++--- tests/unit/test_cuda_forward.py | 22 +++++++----- 4 files changed, 40 insertions(+), 37 deletions(-) mode change 100644 => 100755 csrc/transformer/ds_transformer_cuda.cpp diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp old mode 100644 new mode 100755 index 8e605d7558d5..6844fa563705 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -29,6 +29,7 @@ size_t get_workspace_size(int maxBatchSize, { size_t workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size); if (training) { + workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size); workSpacesize += ((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size), 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); if (gelu_checkpoint) diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu index 209b64a90902..12048006266e 100644 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -35,13 +35,11 @@ the intermediate hyperbolic tangent, since there's no intrinsic that computes it directly. 
*/ -__global__ void gelu_kernel(const float* input, float* vals, int intermediate_size) +__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations) { int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float4* input_cast = reinterpret_cast(input); float4* vals_cast = reinterpret_cast(vals); @@ -60,14 +58,12 @@ __global__ void gelu_kernel(const float* input, float* vals, int intermediate_si } } -__global__ void gelu_kernel(const __half* input, __half* vals, int intermediate_size) +__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations) { #if __CUDA_ARCH__ >= 700 int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float2* input_cast = reinterpret_cast(input); float2* vals_cast = reinterpret_cast(vals); @@ -98,13 +94,12 @@ __global__ void gelu_kernel(const __half* input, __half* vals, int intermediate_ __global__ void fused_bias_gelu(const float* input, const float* bias, float* vals, - int intermediate_size) + int row_stride, + int iterations) { int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float4* input_cast = reinterpret_cast(input); float4* vals_cast = reinterpret_cast(vals); @@ -133,14 +128,13 @@ __global__ void fused_bias_gelu(const float* input, __global__ void fused_bias_gelu(const __half* input, const __half* bias, __half* vals, - int intermediate_size) + int row_stride, + int iterations) { #if __CUDA_ARCH__ >= 700 int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float2* input_cast = 
reinterpret_cast(input); float2* vals_cast = reinterpret_cast(vals); @@ -182,13 +176,12 @@ __global__ void fused_bias_gelu(const __half* input, __global__ void d_gelu_func(float* d_output, const float* gelu_input, const float* bias, - int intermediate_size) + int row_stride, + int iterations) { int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; float4* d_output_cast = reinterpret_cast(d_output); const float4* gelu_input_cast = reinterpret_cast(gelu_input); @@ -218,14 +211,13 @@ __global__ void d_gelu_func(float* d_output, __global__ void d_gelu_func(__half* d_output, const __half* gelu_input, const __half* bias, - int intermediate_size) + int row_stride, + int iterations) { #if __CUDA_ARCH__ >= 700 int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; float2* d_output_cast = reinterpret_cast(d_output); const float2* gelu_input_cast = reinterpret_cast(gelu_input); @@ -282,11 +274,12 @@ void launch_bias_gelu(const T* input, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; - int threads = intermediate_size / iterations / 4; + int threads = (intermediate_size - 1) / (iterations * 4) + 1; dim3 block_dims(threads); dim3 grid_dims(batch_size); - fused_bias_gelu<<>>(input, bias, output, intermediate_size); + fused_bias_gelu<<>>( + input, bias, output, intermediate_size / 4, iterations); } template @@ -297,11 +290,12 @@ void launch_gelu(const T* input, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; - int threads = intermediate_size / iterations / 4; + int threads = (intermediate_size - 1) / (iterations * 4) + 1; dim3 block_dims(threads); dim3 grid_dims(batch_size); - gelu_kernel<<>>(input, output, intermediate_size); + gelu_kernel<<>>( + input, output, intermediate_size / 4, iterations); 
} template void launch_bias_gelu(const float*, const float*, float*, int, int, cudaStream_t); @@ -324,11 +318,12 @@ void launch_d_gelu(T* d_output, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; - int threads = intermediate_size / iterations / 4; + int threads = (intermediate_size - 1) / (iterations * 4) + 1; dim3 block_dims(threads); dim3 grid_dims(batch_size); - d_gelu_func<<>>(d_output, input, bias, intermediate_size); + d_gelu_func<<>>( + d_output, input, bias, intermediate_size / 4, iterations); } template void launch_d_gelu(float*, const float*, const float*, int, int, cudaStream_t); diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 2c7e07aa8b31..e05cb1190dde 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -17,9 +17,9 @@ import sys #if not deepspeed.ops.__installed_ops__['transformer']: -pytest.skip( - "transformer kernels are temporarily disabled because of unexplained failures", - allow_module_level=True) +#pytest.skip( +# "transformer kernels are temporarily disabled because of unexplained failures", +# allow_module_level=True) def check_equal(first, second, atol=1e-2, verbose=False): @@ -258,6 +258,9 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): # 3-128-54-2-24-False-True-0.2 @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', [ + (8,1600,128,25,3,True,True, 0.05), + (8,160,128,2,3,True,True, 0.1), + (8,1600,128,2,3,True,True, 0.05), (3,1024,119,16,24,True,False, 0.05), (3,1024,115,16,24,True,True, 0.05), (1024,128,10,2,2,False,False, 0.1), @@ -291,7 +294,7 @@ def test_backward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_backward(ds_config, seq_len, atol=atol) + run_backward(ds_config, seq_len, atol=atol, verbose=False) #@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', diff --git 
a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 5add5e152a91..73e847aa3ac4 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -199,7 +199,11 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # FP16 test cases can only run on the devices support FP16. @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (8,256,53,4,3,True,False), + (8,160,128,2,3,True,True), + (8,160,128,2,3,False,True), + (8,1600,128,2,3,True,True), + (8,1600,128,25,3,True,True), + (8,1600,128,25,3,False,True), (8,256,52,4,3,True,True), (3,1024,51,16,3,True,False), (3,1024,54,16,3,True,True), @@ -259,10 +263,10 @@ def test_forward(batch_size, @pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (8,3,1024,512,16,3,True,False), - (8,7,1024,512,16,3,True,True), - (8,3,1024,512,16,3,False,False), - (8,7,1024,512,16,3,False,True), + #(8,3,1024,512,16,3,True,False), + #(8,7,1024,512,16,3,True,True), + #(8,3,1024,512,16,3,False,False), + #(8,7,1024,512,16,3,False,True), ]) # yapf: disable def test_forward_with_small_bsz(batch_size, small_bsz, @@ -294,10 +298,10 @@ def test_forward_with_small_bsz(batch_size, @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (64,1024,128,16,3,True,False), - (64,1024,128,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), + #(64,1024,128,16,3,True,False), + #(64,1024,128,16,3,True,True), + #(64,1024,128,16,3,False,False), + #(64,1024,128,16,3,False,True), ]) # yapf: disable def test_forward_stochastic(batch_size, hidden_size, From dba52bc9dd307f7ee36bdbad45ff3508b443a705 Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 2021 13:26:55 +0200 Subject: [PATCH 43/78] Pull changes from DeepSpeed --- deepspeed/runtime/pipe/engine.py | 131 ++++++++++++++----------------- 1 file changed, 58 
insertions(+), 73 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 6daa4a5c9f02..161d693b7744 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -437,81 +437,67 @@ def inference_batch(self, data_iter): # the shapes are variable so we need to first broadcast the shapes, then the tensors themselves - if self.is_last_stage(): - print(f'1. SENDING FROM SRC RANK: {self.global_rank}') - - logits, presents = self.total_loss - logits = logits.clone().detach() - presents = presents.clone().detach() - logits_shape = list(logits.shape) # [4, 1, 50304] - presents_shape = list(presents.shape) # [12, 2, 1, 4, 12, 64] - print(f'1. SENDING CONSTRUCT TENSORS') - - logits_shape_tensor = torch.LongTensor(logits_shape).to(self.device) - presents_shape_tensor = torch.LongTensor(presents_shape).to(self.device) - dist.broadcast(tensor=logits_shape_tensor, - src=self.global_rank) - dist.broadcast(tensor=presents_shape_tensor, - src=self.global_rank) - print(f'1. DONE SENDING FROM SRC RANK: {self.global_rank}') - + if not self.is_last_stage(): + # print(f'1. SENDING FROM SRC RANK: {self.global_rank}') + logits, presents = None else: - src_rank = self.grid.stage_to_global(self.num_stages - 1) - print(f'1. RECVING FROM SRC RANK: {src_rank}') - print(f'1. CONSTRUCTING TENSORS') - a = 1 - print(f'1. CONSTRUCTED A') - - misc = torch.LongTensor([0]).to(self.device) - print(f'1. CONSTRUCTED MISC') - - logits_shape_tensor = torch.LongTensor([0] * 3).to(self.device) - print(f'1. CONSTRUCTED LOGITS') - - presents_shape_tensor = torch.LongTensor([0] * 6).to(self.device) - print(f'1. DONE CONSTRUCTING TENSORS') - - - dist.broadcast(tensor=logits_shape_tensor, - src=src_rank) - dist.broadcast(tensor=presents_shape_tensor, - src=src_rank) - print(f'1. 
DONE RECVING FROM SRC RANK: {src_rank}') - logits_shape_tensor = logits_shape_tensor.clone().detach() - presents_shape_tensor = presents_shape_tensor.clone().detach() - - logits_shape = logits_shape_tensor.tolist() - presents_shape = presents_shape_tensor.tolist() - - if self.is_last_stage(): - print(f'SENDING FROM SRC RANK: {self.global_rank}') + logits, presents = self.total_loss - # outputs = torch.Tensor([logits, presents]).to(self.device) - dist.broadcast(tensor=logits, - src=self.global_rank, - group=self.mpu.get_pipe_parallel_group()) - dist.broadcast(tensor=presents, - src=self.global_rank, - group=self.mpu.get_pipe_parallel_group()) - print(f'DONE SENDING FROM SRC RANK: {self.global_rank}') + # src_rank = self.grid.stage_to_global(self.num_stages - 1) + # print(f'1. RECVING FROM SRC RANK: {src_rank}') + # print(f'1. CONSTRUCTING TENSORS') + # a = 1 + # print(f'1. CONSTRUCTED A') + # + # misc = torch.LongTensor([0]).to(self.device) + # print(f'1. CONSTRUCTED MISC') + # + # logits_shape_tensor = torch.LongTensor([0] * 3).to(self.device) + # print(f'1. CONSTRUCTED LOGITS') + # + # presents_shape_tensor = torch.LongTensor([0] * 6).to(self.device) + # print(f'1. DONE CONSTRUCTING TENSORS') + # + # + # dist.broadcast(tensor=logits_shape_tensor, + # src=src_rank) + # dist.broadcast(tensor=presents_shape_tensor, + # src=src_rank) + # print(f'1. 
DONE RECVING FROM SRC RANK: {src_rank}') + # logits_shape_tensor = logits_shape_tensor.clone().detach() + # presents_shape_tensor = presents_shape_tensor.clone().detach() + # + # logits_shape = logits_shape_tensor.tolist() + # presents_shape = presents_shape_tensor.tolist() + # + # if self.is_last_stage(): + # print(f'SENDING FROM SRC RANK: {self.global_rank}') + # + # # outputs = torch.Tensor([logits, presents]).to(self.device) + # dist.broadcast(tensor=logits, + # src=self.global_rank, + # group=self.mpu.get_pipe_parallel_group()) + # dist.broadcast(tensor=presents, + # src=self.global_rank, + # group=self.mpu.get_pipe_parallel_group()) + # print(f'DONE SENDING FROM SRC RANK: {self.global_rank}') + # + # else: + # logits = torch.zeros(logits_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) + # presents = torch.zeros(presents_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) + # src_rank = self.grid.stage_to_global(self.num_stages - 1) + # assert src_rank in self.grid.pp_group + # print(f'RECVING FROM SRC RANK: {src_rank}') + # dist.broadcast(tensor=logits, + # src=src_rank, + # group=self.grid.get_pipe_parallel_group()) + # dist.broadcast(tensor=presents, + # src=src_rank, + # group=self.grid.get_pipe_parallel_group()) + # logits = logits.clone().detach() + # presents = presents.clone().detach() + # print(f'DONE RECVING FROM SRC RANK: {src_rank}') - else: - logits = torch.zeros(logits_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) - presents = torch.zeros(presents_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) - src_rank = self.grid.stage_to_global(self.num_stages - 1) - assert src_rank in self.grid.pp_group - print(f'RECVING FROM SRC RANK: {src_rank}') - dist.broadcast(tensor=logits, - src=src_rank, - group=self.grid.get_pipe_parallel_group()) - dist.broadcast(tensor=presents, - src=src_rank, - 
group=self.grid.get_pipe_parallel_group()) - logits = logits.clone().detach() - presents = presents.clone().detach() - print(f'DONE RECVING FROM SRC RANK: {src_rank}') - - print(f'LOGITS: {logits.shape}, PRESENTS: {presents.shape}, IS_DATA_PARALLEL: {self.is_data_parallel}') # self.agg_eval_loss = self._aggregate_total_loss() if self.tensorboard_enabled(): if self.global_rank == 0: @@ -527,7 +513,6 @@ def inference_batch(self, data_iter): # Reset any buffers that may have been populated during the forward passes. # ds_checkpointing.reset() - print('RETURNING LOGITS / PRESENTS') return logits, presents def is_first_stage(self): From d9641fd667100f7203f0126f4df711ed54684c4f Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 2021 13:30:35 +0200 Subject: [PATCH 44/78] Pull changes from DeepSpeed --- deepspeed/runtime/pipe/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 161d693b7744..059fcd3b8251 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -439,7 +439,7 @@ def inference_batch(self, data_iter): if not self.is_last_stage(): # print(f'1. 
SENDING FROM SRC RANK: {self.global_rank}') - logits, presents = None + logits, presents = None, None else: logits, presents = self.total_loss From dbc3b13ae7867feb1ec636b31b87a2bd130d5ecb Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 2021 13:42:01 +0200 Subject: [PATCH 45/78] Pull changes from DeepSpeed --- deepspeed/runtime/pipe/engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 059fcd3b8251..19a4f249f1ef 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -1038,6 +1038,7 @@ def _exec_recv_activations(self, buffer_id): print('3') self.pipe_buffers['inputs'][buffer_id] = recvd + print('3.1') if self.wall_clock_breakdown(): self.timers('pipe_recv_input').stop() From 0f5faf9be7d72a9d6d5ae9c2b15092e05b3a9334 Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 2021 15:03:33 +0200 Subject: [PATCH 46/78] Pull changes from DeepSpeed --- deepspeed/runtime/pipe/engine.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 19a4f249f1ef..0e57031f2362 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -904,7 +904,7 @@ def _exec_send_activations(self, buffer_id): if self.wall_clock_breakdown(): self.timers('pipe_send_output').start() self.timers('comms').start() - print('send 1') + print(f'send 1 -> {self.local_rank}') outputs = self.pipe_buffers['outputs'][buffer_id] # NCCL does not like to send torch.BoolTensor types, so cast the mask to half(). 
@@ -914,12 +914,12 @@ def _exec_send_activations(self, buffer_id): outputs = list(outputs) outputs[-1] = outputs[-1].half() outputs = tuple(outputs) - print('send 2') + print(f'send 2 -> {self.local_rank}') if self.first_output_send: self.first_output_send = False self._send_tensor_meta(outputs, self.next_stage) - print('send 3') + print(f'send 3 -> {self.local_rank}') if isinstance(outputs, torch.Tensor): p2p.send(outputs, self.next_stage) @@ -929,19 +929,19 @@ def _exec_send_activations(self, buffer_id): else: raise NotImplementedError('Could not send output of type ' f'{type(outputs)}') - print('send 4') + print(f'send 4 -> {self.local_rank}') # Restore the boolean tensor if self.module.__class__.__name__ == 'GPT2ModelPipe': outputs = list(outputs) outputs[-1] = outputs[-1].bool() outputs = tuple(outputs) - print('send 5') + print(f'send 5 -> {self.local_rank}') if self.wall_clock_breakdown(): self.timers('pipe_send_output').stop() self.timers('comms').stop() - print('send done') + print(f'send done -> {self.local_rank}') def _exec_send_grads(self, buffer_id): if self.wall_clock_breakdown(): From f6fc1af4839c2395e5a12dce3b5b6046e6284ac5 Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 2021 15:11:30 +0200 Subject: [PATCH 47/78] Pull changes from DeepSpeed --- deepspeed/runtime/pipe/engine.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 0e57031f2362..4cdfa5be6f35 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -418,7 +418,7 @@ def inference_batch(self, data_iter): Returns: The arithmetic mean of the losses computed this batch. 
""" - print('ENTERING') + print(f'ENTERING, rank: {self.local_rank}', flush=True) self.module.eval() self.total_loss = None @@ -427,7 +427,7 @@ def inference_batch(self, data_iter): self.set_dataiterator(data_iter) # Do the work - print('DOING SCHED') + print(f'DOING SCHED, rank: {self.local_rank}', flush=True) sched = schedule.InferenceSchedule(micro_batches=self.micro_batches, stages=self.num_stages, stage_id=self.stage_id) @@ -904,7 +904,7 @@ def _exec_send_activations(self, buffer_id): if self.wall_clock_breakdown(): self.timers('pipe_send_output').start() self.timers('comms').start() - print(f'send 1 -> {self.local_rank}') + print(f'send 1 -> {self.local_rank}', flush=True) outputs = self.pipe_buffers['outputs'][buffer_id] # NCCL does not like to send torch.BoolTensor types, so cast the mask to half(). @@ -914,12 +914,12 @@ def _exec_send_activations(self, buffer_id): outputs = list(outputs) outputs[-1] = outputs[-1].half() outputs = tuple(outputs) - print(f'send 2 -> {self.local_rank}') + print(f'send 2 -> {self.local_rank}', flush=True) if self.first_output_send: self.first_output_send = False self._send_tensor_meta(outputs, self.next_stage) - print(f'send 3 -> {self.local_rank}') + print(f'send 3 -> {self.local_rank}', flush=True) if isinstance(outputs, torch.Tensor): p2p.send(outputs, self.next_stage) @@ -929,19 +929,19 @@ def _exec_send_activations(self, buffer_id): else: raise NotImplementedError('Could not send output of type ' f'{type(outputs)}') - print(f'send 4 -> {self.local_rank}') + print(f'send 4 -> {self.local_rank}', flush=True) # Restore the boolean tensor if self.module.__class__.__name__ == 'GPT2ModelPipe': outputs = list(outputs) outputs[-1] = outputs[-1].bool() outputs = tuple(outputs) - print(f'send 5 -> {self.local_rank}') + print(f'send 5 -> {self.local_rank}', flush=True) if self.wall_clock_breakdown(): self.timers('pipe_send_output').stop() self.timers('comms').stop() - print(f'send done -> {self.local_rank}') + print(f'send done -> 
{self.local_rank}', flush=True) def _exec_send_grads(self, buffer_id): if self.wall_clock_breakdown(): @@ -1000,11 +1000,11 @@ def _exec_recv_activations(self, buffer_id): self.timers('pipe_recv_input').start() recvd = None - print('1') + print(f'1 rank: {self.local_rank}', flush=True) # Allocate the buffer if necessary if self.pipe_recv_buf is None: self.pipe_recv_buf = self._recv_tensor_meta(self.prev_stage) - print('2') + print(f'2 rank: {self.local_rank}', flush=True) if isinstance(self.pipe_recv_buf, torch.Tensor): p2p.recv(self.pipe_recv_buf, self.prev_stage) @@ -1035,14 +1035,14 @@ def _exec_recv_activations(self, buffer_id): for buffer in recvd: buffer.requires_grad = buffer.is_floating_point() - print('3') + print(f'3 rank: {self.local_rank}', flush=True) self.pipe_buffers['inputs'][buffer_id] = recvd - print('3.1') + print(f'3.1 rank: {self.local_rank}', flush=True) if self.wall_clock_breakdown(): self.timers('pipe_recv_input').stop() - print('DONE RECV ACTIVATION') + print(f'DONE RECV ACTIVATION rank: {self.local_rank}', flush=True) def _exec_recv_grads(self, buffer_id): if self.wall_clock_breakdown(): From 03371ea89633f8e932678df8d9897b68ea5c5352 Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 2021 16:45:17 +0200 Subject: [PATCH 48/78] Pull changes from DeepSpeed --- deepspeed/runtime/pipe/engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 4cdfa5be6f35..27791d1addbd 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -917,6 +917,7 @@ def _exec_send_activations(self, buffer_id): print(f'send 2 -> {self.local_rank}', flush=True) if self.first_output_send: + print(f'SEND FIRST OUTPUT (RANK: {self.local_rank})') self.first_output_send = False self._send_tensor_meta(outputs, self.next_stage) print(f'send 3 -> {self.local_rank}', flush=True) From f90dc479c3b78ef6699a4a664e57a097183fc47e Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 
2021 17:23:49 +0200 Subject: [PATCH 49/78] cleanup, reinstantiate sending of logits / layer_past --- deepspeed/runtime/pipe/engine.py | 133 +++++++++++++------------------ 1 file changed, 54 insertions(+), 79 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 27791d1addbd..7a62b51245d0 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -392,7 +392,7 @@ def eval_batch(self, data_iter): return self.agg_eval_loss def inference_batch(self, data_iter): - """Evaluate the pipeline on a batch of data from ``data_iter``. The + """Inference the pipeline on a batch of data from ``data_iter``. The engine will evaluate ``self.train_batch_size()`` total samples collectively across all workers. @@ -418,85 +418,76 @@ def inference_batch(self, data_iter): Returns: The arithmetic mean of the losses computed this batch. """ - print(f'ENTERING, rank: {self.local_rank}', flush=True) self.module.eval() self.total_loss = None + if self.micro_batches > 1: + print_rank_0('WARNING: setting g.a.s to 1 in inference') + self.micro_batches = 1 + if self.is_data_parallel: + raise NotImplementedError('Inference not yet implemented for pipeline + data parellel') # Use the provided data iterator train_iterator = self.data_iterator self.set_dataiterator(data_iter) # Do the work - print(f'DOING SCHED, rank: {self.local_rank}', flush=True) sched = schedule.InferenceSchedule(micro_batches=self.micro_batches, stages=self.num_stages, stage_id=self.stage_id) with torch.no_grad(): self._exec_schedule(sched) - print(f'DONE SCHED {self.global_rank}') # the shapes are variable so we need to first broadcast the shapes, then the tensors themselves - - if not self.is_last_stage(): - # print(f'1. 
SENDING FROM SRC RANK: {self.global_rank}') - logits, presents = None, None - else: + if self.is_last_stage(): logits, presents = self.total_loss + logits = logits.clone().detach() + presents = presents.clone().detach() + logits_shape = list(logits.shape) + presents_shape = list(presents.shape) + + logits_shape_tensor = torch.LongTensor(logits_shape).to(self.device) + presents_shape_tensor = torch.LongTensor(presents_shape).to(self.device) + dist.broadcast(tensor=logits_shape_tensor, + src=self.global_rank) + dist.broadcast(tensor=presents_shape_tensor, + src=self.global_rank) + else: + src_rank = self.grid.stage_to_global(self.num_stages - 1) + logits_shape_tensor = torch.LongTensor([0] * 3).to(self.device) + presents_shape_tensor = torch.LongTensor([0] * 6).to(self.device) + dist.broadcast(tensor=logits_shape_tensor, + src=src_rank) + dist.broadcast(tensor=presents_shape_tensor, + src=src_rank) + logits_shape_tensor = logits_shape_tensor.clone().detach() + presents_shape_tensor = presents_shape_tensor.clone().detach() + + logits_shape = logits_shape_tensor.tolist() + presents_shape = presents_shape_tensor.tolist() - # src_rank = self.grid.stage_to_global(self.num_stages - 1) - # print(f'1. RECVING FROM SRC RANK: {src_rank}') - # print(f'1. CONSTRUCTING TENSORS') - # a = 1 - # print(f'1. CONSTRUCTED A') - # - # misc = torch.LongTensor([0]).to(self.device) - # print(f'1. CONSTRUCTED MISC') - # - # logits_shape_tensor = torch.LongTensor([0] * 3).to(self.device) - # print(f'1. CONSTRUCTED LOGITS') - # - # presents_shape_tensor = torch.LongTensor([0] * 6).to(self.device) - # print(f'1. DONE CONSTRUCTING TENSORS') - # - # - # dist.broadcast(tensor=logits_shape_tensor, - # src=src_rank) - # dist.broadcast(tensor=presents_shape_tensor, - # src=src_rank) - # print(f'1. 
DONE RECVING FROM SRC RANK: {src_rank}') - # logits_shape_tensor = logits_shape_tensor.clone().detach() - # presents_shape_tensor = presents_shape_tensor.clone().detach() - # - # logits_shape = logits_shape_tensor.tolist() - # presents_shape = presents_shape_tensor.tolist() - # - # if self.is_last_stage(): - # print(f'SENDING FROM SRC RANK: {self.global_rank}') - # - # # outputs = torch.Tensor([logits, presents]).to(self.device) - # dist.broadcast(tensor=logits, - # src=self.global_rank, - # group=self.mpu.get_pipe_parallel_group()) - # dist.broadcast(tensor=presents, - # src=self.global_rank, - # group=self.mpu.get_pipe_parallel_group()) - # print(f'DONE SENDING FROM SRC RANK: {self.global_rank}') - # - # else: - # logits = torch.zeros(logits_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) - # presents = torch.zeros(presents_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to(self.device) - # src_rank = self.grid.stage_to_global(self.num_stages - 1) - # assert src_rank in self.grid.pp_group - # print(f'RECVING FROM SRC RANK: {src_rank}') - # dist.broadcast(tensor=logits, - # src=src_rank, - # group=self.grid.get_pipe_parallel_group()) - # dist.broadcast(tensor=presents, - # src=src_rank, - # group=self.grid.get_pipe_parallel_group()) - # logits = logits.clone().detach() - # presents = presents.clone().detach() - # print(f'DONE RECVING FROM SRC RANK: {src_rank}') + if self.is_last_stage(): + dist.broadcast(tensor=logits, + src=self.global_rank, + group=self.mpu.get_pipe_parallel_group()) + dist.broadcast(tensor=presents, + src=self.global_rank, + group=self.mpu.get_pipe_parallel_group()) + + else: + logits = torch.zeros(logits_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to( + self.device) + presents = torch.zeros(presents_shape, dtype=torch.half if self.fp16_enabled() else torch.float32).to( + self.device) + src_rank = self.grid.stage_to_global(self.num_stages - 1) + assert src_rank in 
self.grid.pp_group + dist.broadcast(tensor=logits, + src=src_rank, + group=self.grid.get_pipe_parallel_group()) + dist.broadcast(tensor=presents, + src=src_rank, + group=self.grid.get_pipe_parallel_group()) + logits = logits.clone().detach() + presents = presents.clone().detach() # self.agg_eval_loss = self._aggregate_total_loss() if self.tensorboard_enabled(): @@ -511,8 +502,6 @@ def inference_batch(self, data_iter): # Restore the training iterator self.set_dataiterator(train_iterator) - # Reset any buffers that may have been populated during the forward passes. - # ds_checkpointing.reset() return logits, presents def is_first_stage(self): @@ -785,7 +774,6 @@ def _exec_load_micro_batch(self, buffer_id): if self.wall_clock_breakdown(): self.timers('batch_input').stop() - # print('DONE LOADING MICROBATCH: ', loaded) def _send_tensor_meta(self, buffer, recv_stage): """ Communicate metadata about upcoming p2p transfers. @@ -904,7 +892,6 @@ def _exec_send_activations(self, buffer_id): if self.wall_clock_breakdown(): self.timers('pipe_send_output').start() self.timers('comms').start() - print(f'send 1 -> {self.local_rank}', flush=True) outputs = self.pipe_buffers['outputs'][buffer_id] # NCCL does not like to send torch.BoolTensor types, so cast the mask to half(). 
@@ -914,13 +901,10 @@ def _exec_send_activations(self, buffer_id): outputs = list(outputs) outputs[-1] = outputs[-1].half() outputs = tuple(outputs) - print(f'send 2 -> {self.local_rank}', flush=True) if self.first_output_send: - print(f'SEND FIRST OUTPUT (RANK: {self.local_rank})') self.first_output_send = False self._send_tensor_meta(outputs, self.next_stage) - print(f'send 3 -> {self.local_rank}', flush=True) if isinstance(outputs, torch.Tensor): p2p.send(outputs, self.next_stage) @@ -930,19 +914,16 @@ def _exec_send_activations(self, buffer_id): else: raise NotImplementedError('Could not send output of type ' f'{type(outputs)}') - print(f'send 4 -> {self.local_rank}', flush=True) # Restore the boolean tensor if self.module.__class__.__name__ == 'GPT2ModelPipe': outputs = list(outputs) outputs[-1] = outputs[-1].bool() outputs = tuple(outputs) - print(f'send 5 -> {self.local_rank}', flush=True) if self.wall_clock_breakdown(): self.timers('pipe_send_output').stop() self.timers('comms').stop() - print(f'send done -> {self.local_rank}', flush=True) def _exec_send_grads(self, buffer_id): if self.wall_clock_breakdown(): @@ -1001,11 +982,9 @@ def _exec_recv_activations(self, buffer_id): self.timers('pipe_recv_input').start() recvd = None - print(f'1 rank: {self.local_rank}', flush=True) # Allocate the buffer if necessary if self.pipe_recv_buf is None: self.pipe_recv_buf = self._recv_tensor_meta(self.prev_stage) - print(f'2 rank: {self.local_rank}', flush=True) if isinstance(self.pipe_recv_buf, torch.Tensor): p2p.recv(self.pipe_recv_buf, self.prev_stage) @@ -1036,14 +1015,11 @@ def _exec_recv_activations(self, buffer_id): for buffer in recvd: buffer.requires_grad = buffer.is_floating_point() - print(f'3 rank: {self.local_rank}', flush=True) self.pipe_buffers['inputs'][buffer_id] = recvd - print(f'3.1 rank: {self.local_rank}', flush=True) if self.wall_clock_breakdown(): self.timers('pipe_recv_input').stop() - print(f'DONE RECV ACTIVATION rank: {self.local_rank}', 
flush=True) def _exec_recv_grads(self, buffer_id): if self.wall_clock_breakdown(): @@ -1278,7 +1254,6 @@ def _exec_schedule(self, pipe_schedule): for step_cmds in pipe_schedule: # For each instruction in the step for cmd in step_cmds: - print(cmd, self.global_rank) if type(cmd) not in self._INSTRUCTION_MAP: raise RuntimeError( f'{self.__class__.__name__} does not understand instruction {repr(cmd)}' From 23d0f78faa566b20a99ab2d69f0d26c888f7c975 Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 8 Apr 2021 17:34:12 +0200 Subject: [PATCH 50/78] cleanup, reinstantiate sending of logits / layer_past --- deepspeed/runtime/pipe/engine.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 7a62b51245d0..950f1d76ccbf 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -392,9 +392,7 @@ def eval_batch(self, data_iter): return self.agg_eval_loss def inference_batch(self, data_iter): - """Inference the pipeline on a batch of data from ``data_iter``. The - engine will evaluate ``self.train_batch_size()`` total samples - collectively across all workers. + """Inference the pipeline on a single batch of data from ``data_iter``. This method is equivalent to: @@ -405,24 +403,29 @@ def inference_batch(self, data_iter): output = module(batch) .. warning:: - A total of ``self.gradient_accumulation_steps()`` entries will be pulled - from ``data_iter`` by each pipeline. There must be sufficient - data left in ``data_iter`` or else a ``StopIteration`` will halt training. - - DeepSpeed provides a convenience class :class:`deepspeed.utils.RepeatingLoader` - that wraps data loaders to automatically restart upon a ``StopIteration``. + we're assuming that in inference we a) don't want to calculate loss and b) gradient_accum_steps = 0 Args: data_iter (Iterator): Iterator of data to evaluate. 
+ data_iter should have dummy labels as deepspeed expects it this way Returns: - The arithmetic mean of the losses computed this batch. + logits, presents (NB this is not a general purpose function, it's designed specifically to run with + gpt-neox, which will return logits + presents in inference. This is a massive hack.) """ self.module.eval() self.total_loss = None if self.micro_batches > 1: print_rank_0('WARNING: setting g.a.s to 1 in inference') self.micro_batches = 1 + train_batch_fn = self.batch_fn + self.set_batch_fn(lambda x: x) # we just want to return `data_iter` as is + # deepspeed sends metadata across pipeline stages only once in the first step, then assumes it will stay + # constant in inference, the metadata of the tensors being sent across pipe stages may change we need to set + # these two flags in order for deepspeed to send the metadata every step, otherwise torch.distributed hangs + # silently. + self.first_output_send = True + self.pipe_recv_buf = None if self.is_data_parallel: raise NotImplementedError('Inference not yet implemented for pipeline + data parellel') @@ -499,8 +502,9 @@ def inference_batch(self, data_iter): self.summary_writer.add_scalar(event[0], event[1], event[2]) self.summary_writer.flush() - # Restore the training iterator + # Restore the training iterator & batch_fn self.set_dataiterator(train_iterator) + self.set_batch_fn(train_batch_fn) return logits, presents From 0cd9abf20416c1fce091e7f79694f02fd21b3687 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 8 Apr 2021 09:44:16 -0700 Subject: [PATCH 51/78] bump to 0.3.14 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index e4737652ca5a..0b69c00c5f5a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.13 +0.3.14 From 112ebff6e86a177608e52f7e36595ea2926ae37b Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 8 Apr 2021 10:03:54 -0700 Subject: [PATCH 52/78] add pypi badge --- README.md | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7bde12dd0ea..8323084ac6fe 100755 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest) [![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) -[![Docker Pulls](https://img.shields.io/docker/pulls/deepspeed/deepspeed)](https://hub.docker.com/r/deepspeed/deepspeed) +[![Downloads](https://pepy.tech/badge/deepspeed/month)](https://pepy.tech/project/deepspeed) ### 03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher) From e6999ebd1657d6dc0826d1c51f2986d8ede016a6 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Wed, 14 Apr 2021 02:53:45 +0900 Subject: [PATCH 53/78] Delete check of pdsh (#941) --- deepspeed/launcher/runner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 5295b6859599..a4a49dca6bf3 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -306,9 +306,6 @@ def main(args=None): multi_node_exec = len(active_resources) > 1 - if multi_node_exec and not shutil.which('pdsh'): - raise RuntimeError("pdsh is not installed, unable to proceed") - if not multi_node_exec: deepspeed_launch = [ sys.executable, From adac058ac0f6422eed24042f280db93a6841da59 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 14 Apr 2021 04:30:12 -0700 Subject: [PATCH 54/78] fix double linear override; spelling (#954) --- deepspeed/runtime/zero/partition_parameters.py | 9 
+-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 0acc675985ca..0f2741251bb6 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -191,7 +191,7 @@ def _init_subclass(cls, **kwargs): if self.mem_efficient_linear: print_rank_0( - f"Your linear layers are being patched with more memory efficient version. This will persit unless manually reset.", + "nn.functional.linear has been overridden with a more memory efficient version. This will persist unless manually reset.", force=True) self.linear_bk = torch.nn.functional.linear torch.nn.functional.linear = LinearFunctionForZeroStage3.apply @@ -361,13 +361,6 @@ def get_model(): self._convert_to_deepspeed_param(param) param.partition() - if mem_efficient_linear: - print_rank_0( - f"Your linear layers are being patched with more memory efficient version. This will persit unless manually turned reset.", - force=True) - self.linear_bk = torch.nn.functional.linear - torch.nn.functional.linear = LinearFunctionForZeroStage3.apply - def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False) From c87118b0c5c4cfc47446ccd37c22fdc063143fa4 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 14 Apr 2021 07:46:46 -0700 Subject: [PATCH 55/78] [config] turn exponential notation back on for config dump (#955) * e-notation for large floats * handle ints too * readability * handle bool Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/config.py | 3 ++- deepspeed/runtime/config_utils.py | 42 +++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 11e1d4037c8e..b9ba6cf37cba 100755 --- 
a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -9,7 +9,7 @@ from .constants import * from .fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE -from .config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys +from .config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys, ScientificNotationEncoder from .zero.config import DeepSpeedZeroConfig from .zero.constants import * from .activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig @@ -744,6 +744,7 @@ def print(self, name): json.dumps(self._param_dict, sort_keys=True, indent=4, + cls=ScientificNotationEncoder, separators=(',', ':')))) diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 62782852a3d2..12711d56f7f6 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -6,7 +6,40 @@ Collection of DeepSpeed configuration utilities """ import json -from collections import Counter +from collections import Counter, Mapping, Sequence + + +# adapted from https://stackoverflow.com/a/50701137/9201239 +class ScientificNotationEncoder(json.JSONEncoder): + """ + This class overrides ``json.dumps`` default formatter. + + This version keeps everything as normal except formats numbers bigger than 1e3 using scientific notation. 
+ + Just pass ``cls=ScientificNotationEncoder`` to ``json.dumps`` to activate it + + """ + def iterencode(self, o, _one_shot=False, level=0): + indent = self.indent if self.indent is not None else 4 + prefix_close = " " * level * indent + level += 1 + prefix = " " * level * indent + if isinstance(o, bool): + return "true" if o else "false" + elif isinstance(o, float) or isinstance(o, int): + if o > 1e3: + return f"{o:e}" + else: + return f"{o}" + elif isinstance(o, Mapping): + x = [ + f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, + v in o.items() + ] + return "{" + ', '.join(x) + f"\n{prefix_close}" + "}" + elif isinstance(o, Sequence) and not isinstance(o, str): + return f"[{ f', '.join(map(self.iterencode, o)) }]" + return "\n, ".join(super().iterencode(o, _one_shot)) class DeepSpeedConfigObject(object): @@ -17,7 +50,12 @@ def repr(self): return self.__dict__ def __repr__(self): - return json.dumps(self.__dict__, sort_keys=True, indent=4) + return json.dumps( + self.__dict__, + sort_keys=True, + indent=4, + cls=ScientificNotationEncoder, + ) def get_scalar_param(param_dict, param_name, param_default_value): From 7003d447208f0c2c4c07bd6df202f8c52bd68fdf Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 14 Apr 2021 11:52:55 -0700 Subject: [PATCH 56/78] document how to override ~/.cache/torch_extensions (#959) --- docs/_tutorials/advanced-install.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index 4ff02bf7ec84..3b9449d368fc 100644 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -103,6 +103,23 @@ script in the repo. This will build a python wheel locally and copy it to all the nodes listed in your hostfile (either given via --hostfile, or defaults to /job/hostfile). 
+When the code using DeepSpeed is used for the first time it'll automatically build only the CUDA
+extensions, required for the run, and by default it'll place them under
+`~/.cache/torch_extensions/`. The next time the same program is executed these now precompiled
+extensions will be loaded from that directory.
+
+If you use multiple virtual environments this could be a problem, since by default there is only one
+extensions directory, but different virtual environments may use different setups (e.g. different
+python or cuda versions) and then the loading of a CUDA extension built by another environment will
+fail. Therefore, if you need to you can override the default location with the help of the
+ `TORCH_EXTENSIONS_DIR` environment variable. So in each virtual environment you can point it to a
+ unique directory and DeepSpeed will use it to save and load CUDA extensions.
+
+ You can also change it just for a specific run with:
+
+```bash
+ TORCH_EXTENSIONS_DIR=./torch-extensions deepspeed ...
+``` ## Building for the correct architectures From 8b8ed2a7d936c3c95e2dd2e7005b23cc6aaa8f16 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 14 Apr 2021 12:36:31 -0700 Subject: [PATCH 57/78] [zero] faster flatten/unflatten (cpp version) (#910) * faster flatten/unflatten with apex * switch to cpp flatten/unflatten * style * better comment * missing import * switch to build ops at run time * fixes Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/stage1.py | 124 +++++++++++++++-------------- deepspeed/runtime/zero/stage2.py | 82 +++++++++---------- deepspeed/runtime/zero/stage3.py | 132 +++++++++++++------------------ 3 files changed, 161 insertions(+), 177 deletions(-) diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py index 7cd37f904faa..dde8424ceaad 100755 --- a/deepspeed/runtime/zero/stage1.py +++ b/deepspeed/runtime/zero/stage1.py @@ -1,7 +1,6 @@ import math import torch import torch.distributed as dist -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from collections import defaultdict from deepspeed.runtime.zero.utils import _initialize_parameter_parallel_groups @@ -9,6 +8,7 @@ from deepspeed.runtime.utils import get_grad_norm, CheckOverflow from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_OPTIMIZER_STATES from deepspeed.utils import logger, log_dist +from deepspeed.ops.op_builder import UtilsBuilder def get_alignment_padding(flattened_lean_size, sub_partition_id, sub_partition_size): @@ -29,54 +29,6 @@ def get_group_alignment_padding(tensor_list, sub_partition_size, sub_partition_c return group_paddings -def flatten_dense_tensors_sub_partition_aligned(tensor_list, - dp, - max_elements_per_comm, - pg): - assert max_elements_per_comm >= dp, f"max_elements_per_comm {max_elements_per_comm} < dp {dp}" - - num_elements = sum(t.numel() for t in tensor_list) - log_dist("Total number of elements in model: {}, max elements per com: {}".format( - num_elements, - max_elements_per_comm), - 
ranks=[0]) - - # Compute aligned partition size based on parameter count - aligned_param_partition_size = math.ceil(num_elements / dp) - - # Compute aligned partition size based on communication size - aligned_comm_partition_size = int(max_elements_per_comm // dp) - - if aligned_param_partition_size <= aligned_comm_partition_size: - sub_partition_count = 1 - sub_partition_size = aligned_param_partition_size - else: - sub_partition_count = math.ceil(aligned_param_partition_size / - aligned_comm_partition_size) - sub_partition_size = aligned_comm_partition_size - - # Compute required padding for alignment to dp and max_elements_per_comm - padding = (sub_partition_count * sub_partition_size * dp) - num_elements - - log_dist( - f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}", - ranks=[0]) - log_dist( - f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}", - ranks=[0]) - - if padding == 0: - aligned_tensor_list = tensor_list - else: - pad_tensor = torch.zeros(padding, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - aligned_tensor_list = tensor_list + [pad_tensor] - - flat_tensors = _flatten_dense_tensors(aligned_tensor_list) - return flat_tensors - - def _single_range_check(current_index, start_index, end_index, tensor_size): offset = 0 if (current_index >= start_index) and (current_index < end_index): @@ -127,6 +79,11 @@ def __init__(self, max_elements_per_comm=5e8, elastic_checkpoint=True): + # Load pre-built or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten + self.unflatten = util_ops.unflatten + if dp_process_group is not None and partition_size is not None: raise ValueError("Cannot specify both dp_process_group " "and partition size") @@ -209,7 +166,7 @@ def __init__(self, # flattens all tensors into single 1d tensor aligned with sub-partition size for later dividing # RS: create aligned sub-partitions - 
flat_aligned_params = flatten_dense_tensors_sub_partition_aligned( + flat_aligned_params = self.flatten_dense_tensors_sub_partition_aligned( tensor_list=self.fp16_groups[i], dp=dist.get_world_size(group=self.dp_process_group), max_elements_per_comm=self.max_elems_per_comm[i], @@ -218,8 +175,8 @@ def __init__(self, # TODO: I don't think this does anything? # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data @@ -455,8 +412,8 @@ def get_all_sub_partition_info(tensor_list, return params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, params_not_local - @staticmethod - def get_flat_sub_partitions(comm_tensor_list, + def get_flat_sub_partitions(self, + comm_tensor_list, comm_param_offsets, sub_partition_size, dtype, @@ -527,7 +484,7 @@ def get_flat_sub_partitions(comm_tensor_list, partition_params.append(my_params) #flat_tensor_list) final_param_offsets.append(my_offsets) assert len(flat_tensor_list) == len(my_offsets), "{} {}".format(len(flat_tensor_list), len(my_offsets)) - flat_sub_partitions.append(_flatten_dense_tensors(flat_tensor_list)) + flat_sub_partitions.append(self.flatten(flat_tensor_list)) if num_comm_intervals is not None and len( flat_sub_partitions) < num_comm_intervals: # logger.info("padding w. 
sub partitions to ensure uniform communication") @@ -569,6 +526,55 @@ def free_grad_in_param_list(self, param_list): else: p.grad = None + def flatten_dense_tensors_sub_partition_aligned(self, + tensor_list, + dp, + max_elements_per_comm, + pg): + assert max_elements_per_comm >= dp, f"max_elements_per_comm {max_elements_per_comm} < dp {dp}" + + num_elements = sum(t.numel() for t in tensor_list) + log_dist( + "Total number of elements in model: {}, max elements per com: {}".format( + num_elements, + max_elements_per_comm), + ranks=[0]) + + # Compute aligned partition size based on parameter count + aligned_param_partition_size = math.ceil(num_elements / dp) + + # Compute aligned partition size based on communication size + aligned_comm_partition_size = int(max_elements_per_comm // dp) + + if aligned_param_partition_size <= aligned_comm_partition_size: + sub_partition_count = 1 + sub_partition_size = aligned_param_partition_size + else: + sub_partition_count = math.ceil(aligned_param_partition_size / + aligned_comm_partition_size) + sub_partition_size = aligned_comm_partition_size + + # Compute required padding for alignment to dp and max_elements_per_comm + padding = (sub_partition_count * sub_partition_size * dp) - num_elements + + log_dist( + f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}", + ranks=[0]) + log_dist( + f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}", + ranks=[0]) + + if padding == 0: + aligned_tensor_list = tensor_list + else: + pad_tensor = torch.zeros(padding, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + aligned_tensor_list = tensor_list + [pad_tensor] + + flat_tensors = self.flatten(aligned_tensor_list) + return flat_tensors + def reduce_scatter_gradients(self, postscale_gradients, gradient_predivide_factor, @@ -699,8 +705,8 @@ def step(self, closure=None): # TODO: we probably don't need this? 
just to be safe for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data @@ -903,7 +909,7 @@ def _retrieve_group_sub_partition_weights(self, sub_partition_idx = (comm_idx * num_partitions) + rank all_sub_partition_weights[sub_partition_idx] = sub_partition_weights - flat_merged_weights = flatten_dense_tensors_sub_partition_aligned( + flat_merged_weights = self.flatten_dense_tensors_sub_partition_aligned( tensor_list=all_sub_partition_weights, dp=dist.get_world_size(group=self.dp_process_group), max_elements_per_comm=max_elems_per_comm, @@ -951,7 +957,7 @@ def _partition_base_optimizer_state(self, return all_partition_states[0] alignment = dist.get_world_size(group=self.dp_process_group) - flat_merged_partitions = flatten_dense_tensors_sub_partition_aligned( + flat_merged_partitions = self.flatten_dense_tensors_sub_partition_aligned( tensor_list=all_partition_states, dp=dist.get_world_size(group=self.dp_process_group), max_elements_per_comm=max_elems_per_comm, diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index cd29625958c9..39d780e55574 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -3,7 +3,6 @@ ''' import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.distributed.distributed_c10d import _get_global_rank import torch.distributed as dist import math @@ -16,9 +15,8 @@ from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS from deepspeed.ops.adam import DeepSpeedCPUAdam - +from deepspeed.ops.op_builder import UtilsBuilder from deepspeed.utils import logger -from ...ops.op_builder import UtilsBuilder #Toggle this to true to enable 
correctness test #with gradient partitioning and without @@ -52,28 +50,6 @@ def lcm(x, y): return x * y // gcd(x, y) -# create a flat tensor aligned at the alignment boundary -def flatten_dense_tensors_aligned(tensor_list, alignment): - num_elements = 0 - for tensor in tensor_list: - num_elements = num_elements + tensor.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return _flatten_dense_tensors(padded_tensor_list) - - def get_alignment_padding(tensor_list, alignment): num_elements = sum([tensor.numel() for tensor in tensor_list]) remainder = num_elements % alignment @@ -121,11 +97,6 @@ def __init__(self, gradient_predivide_factor=1.0, gradient_accumulation_steps=1): - # Load pre-installed or JIT compile (un)flatten ops - util_ops = UtilsBuilder().load() - self.flatten = util_ops.flatten - self.unflatten = util_ops.unflatten - if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") logger.info(f"Allgather bucket size {allgather_bucket_size}") @@ -143,6 +114,11 @@ def __init__(self, raise SystemError("Cannot use fp16 without CUDA.") self.optimizer = init_optimizer + # Load pre-built or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten + self.unflatten = util_ops.unflatten + self.timers = timers self.reduce_scatter = reduce_scatter @@ -236,7 +212,7 @@ def __init__(self, #create flat buffer in CPU and move to GPU self.fp16_groups_flat.append( - flatten_dense_tensors_aligned( + self.flatten_dense_tensors_aligned( self.fp16_groups[i], dist.get_world_size(group=self.dp_process_group)).cuda( torch.cuda.current_device())) @@ -247,8 +223,8 @@ def __init__(self, f"After Flattening and after 
emptying param group {i} cache") # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data @@ -611,6 +587,27 @@ def report_ipg_memory_usage(self, tag, param_elems): f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" ) + # create a flat tensor aligned at the alignment boundary + def flatten_dense_tensors_aligned(self, tensor_list, alignment): + num_elements = 0 + for tensor in tensor_list: + num_elements = num_elements + tensor.numel() + + remaining = num_elements % alignment + + if remaining: + elements_to_add = alignment - remaining + pad_tensor = torch.zeros(elements_to_add, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + padded_tensor_list = tensor_list + [pad_tensor] + + num_elements = num_elements + elements_to_add + else: + padded_tensor_list = tensor_list + + return self.flatten(padded_tensor_list) + ############### Independent Partition Gradient ######################## def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: @@ -1004,7 +1001,7 @@ def are_all_related_partitions_reduced(params_id): self.param_dict[params_id].grad = None def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = _flatten_dense_tensors(tensors) + flatten_tensor = self.flatten(tensors) def print_func(): logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) @@ -1327,7 +1324,7 @@ def get_flat_partition(self, if return_tensor_list: return flat_tensor_list - return _flatten_dense_tensors(flat_tensor_list) + return self.flatten(flat_tensor_list) def free_grad_in_param_list(self, param_list): for p in param_list: @@ -1419,14 +1416,13 @@ def 
step(self, closure=None): #create a flat gradients for parameters updated by this process # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - single_grad_partition = flatten_dense_tensors_aligned( + single_grad_partition = self.flatten_dense_tensors_aligned( self.averaged_gradients[i], int(self.partition_size[i])).to( self.single_partition_of_fp32_groups[i].dtype) else: - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.single_partition_of_fp32_groups[i].dtype) + single_grad_partition = self.flatten(self.averaged_gradients[i]).to( + self.single_partition_of_fp32_groups[i].dtype) assert single_grad_partition.numel() == self.partition_size[i], \ "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) @@ -1507,8 +1503,8 @@ def step(self, closure=None): # TODO: we probably don't need this? 
just to be safe for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data @@ -1749,7 +1745,7 @@ def _restore_from_fp32_weights(self, all_state_dict): merged_partitions = [ sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict ] - flat_merged_partitions = flatten_dense_tensors_aligned( + flat_merged_partitions = self.flatten_dense_tensors_aligned( merged_partitions, dist.get_world_size(group=self.dp_process_group)) dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) @@ -1773,7 +1769,7 @@ def _partition_base_optimizer_state(self, state_key, all_partition_states): partition_id = dist.get_rank(group=self.dp_process_group) alignment = dist.get_world_size(group=self.dp_process_group) if torch.is_tensor(all_partition_states[0]): - flat_merged_partitions = flatten_dense_tensors_aligned( + flat_merged_partitions = self.flatten_dense_tensors_aligned( all_partition_states, alignment) dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 493106e93239..c7eb4b5cfc7b 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -6,7 +6,6 @@ import os import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.distributed.distributed_c10d import _get_global_rank import torch.distributed as dist import math @@ -18,26 +17,13 @@ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, ZeroParamType, _init_external_params, Init, is_zero_param from deepspeed.runtime.zero.constants import ZERO_OPTIMIZATION_WEIGHTS from deepspeed.ops.adam import DeepSpeedCPUAdam +from deepspeed.ops.op_builder import UtilsBuilder import itertools # Toggle this to true 
to enable correctness test # with gradient partitioning and without pg_correctness_test = False -try: - from apex_C import flatten - from apex_C import unflatten -except ImportError: - try: - _ = warned_flatten - except NameError: - logger.warning( - "apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." - ) - warned_flatten = True - from torch._utils import _flatten_dense_tensors as flatten - from torch._utils import _unflatten_dense_tensors as unflatten - def print_rank_0(message, debug=False, force=False): if torch.distributed.get_rank() == 0 and (debug or force): @@ -71,28 +57,6 @@ def lcm(x, y): return x * y // gcd(x, y) -# create a flat tensor aligned at the alignment boundary -def flatten_dense_tensors_aligned(tensor_list, alignment): - num_elements = 0 - for tens in tensor_list: - num_elements = num_elements + tens.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return _flatten_dense_tensors(padded_tensor_list) - - def move_to_cpu(tensor_list): for tensor in tensor_list: tensor.data = tensor.data.cpu() @@ -598,6 +562,11 @@ def __init__(self, raise SystemError("Cannot use fp16 without CUDA.") self.optimizer = init_optimizer + # Load pre-built or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten + self.unflatten = util_ops.unflatten + if not all(is_zero_param(p) for p in module.parameters()): group = None if mpu: @@ -872,7 +841,7 @@ def _create_fp16_partitions(self): #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned( + self.flatten_dense_tensors_aligned( self.fp16_partitioned_groups[i], 
dist.get_world_size(group=self.dp_process_group)).cuda( torch.cuda.current_device())) @@ -883,7 +852,7 @@ def _create_fp16_partitions(self): #Without the detach, seems like the flattening becomes part of the #model graph causing errors downstream self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned( + self.flatten_dense_tensors_aligned( self.fp16_partitioned_groups[i], dist.get_world_size( group=self.dp_process_group)).detach().pin_memory()) @@ -893,9 +862,8 @@ def _create_fp16_partitions(self): see_memory_usage(f"After Flattening param group {i}", force=False) #set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors( - self.fp16_partitioned_groups_flat[i], - self.fp16_partitioned_groups[i]) + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[i], + self.fp16_partitioned_groups[i]) for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): partitioned_param.data = q.data @@ -961,9 +929,9 @@ def _create_fp16_partitions_with_defragmentation(self): #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned(self.fp16_partitioned_groups[i], - 1).cuda( - torch.cuda.current_device())) + self.flatten_dense_tensors_aligned( + self.fp16_partitioned_groups[i], + 1).cuda(torch.cuda.current_device())) see_memory_usage( f"After flattening and moving param group {i} to GPU", force=False) @@ -1741,7 +1709,7 @@ def are_all_related_partitions_reduced(params_id): self.param_dict[params_id].grad = None def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = _flatten_dense_tensors(tensors) + flatten_tensor = self.flatten(tensors) def print_func(): logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) @@ -1799,7 +1767,7 @@ def set_none_gradients_to_zero(self, i, partition_id): def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): rank = None - tensor = 
flatten(bucket) + tensor = self.flatten(bucket) tensor_to_allreduce = tensor @@ -1829,7 +1797,7 @@ def allreduce_and_copy(self, small_bucket, rank=None, log=None): with torch.cuda.stream(self.reduction_stream): allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) if rank is None or rank == dist.get_rank(group=self.dp_process_group): - for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): + for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) def allreduce_no_retain(self, @@ -2048,7 +2016,7 @@ def get_flat_partition(self, if return_tensor_list: return flat_tensor_list - return _flatten_dense_tensors(flat_tensor_list) + return self.flatten(flat_tensor_list) def free_grad_in_param_list(self, param_list): for p in param_list: @@ -2158,9 +2126,8 @@ def old_step(self, closure=None): # create a flat gradients for parameters updated by this process # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.fp32_partitioned_groups_flat[i].dtype) + single_grad_partition = self.flatten(self.averaged_gradients[i]).to( + self.fp32_partitioned_groups_flat[i].dtype) assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[i].numel(), \ "averaged gradients have different number of elements that partition size {} {} {} {}".format( @@ -2174,11 +2141,10 @@ def old_step(self, closure=None): self.averaged_gradients[i] = None single_partition_grad_groups.append(single_grad_partition) - debug_fp32_grads[i] = [ - (t.clone().detach(), - t) for t in _unflatten_dense_tensors(single_grad_partition, - group) - ] + debug_fp32_grads[i] = [(t.clone().detach(), + t) + for t in self.unflatten(single_grad_partition, + group)] self.stop_timers([OPTIMIZER_FP32_GRADIENT]) @@ -2213,9 +2179,8 @@ def old_step(self, closure=None): #for p in self.fp16_groups[i]: # 
p.data=p.ds_tensor - updated_params = _unflatten_dense_tensors( - self.fp16_partitioned_groups_flat[i], - self.fp16_partitioned_groups[i]) + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[i], + self.fp16_partitioned_groups[i]) for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): # print(f"Grad fn: {p.grad_fn}") # p.data = torch.ones(1).half().cuda() @@ -2269,9 +2234,8 @@ def _prepare_fp32_grad_for_sub_group(self, sub_group_id): partition_id = dist.get_rank(group=self.dp_process_group) - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[sub_group_id]).to( - self.fp32_partitioned_groups_flat[sub_group_id].dtype) + single_grad_partition = self.flatten(self.averaged_gradients[sub_group_id]).to( + self.fp32_partitioned_groups_flat[sub_group_id].dtype) assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[sub_group_id].numel(), \ "averaged gradients have different number of elements that partition size {} {} {} {}".format( @@ -2302,10 +2266,30 @@ def _release_sub_group(self, sub_group_id, timer_names=set()): see_memory_usage(f'After release optimizer sub group {sub_group_id}', force=False) + # create a flat tensor aligned at the alignment boundary + def flatten_dense_tensors_aligned(self, tensor_list, alignment): + num_elements = 0 + for tens in tensor_list: + num_elements = num_elements + tens.numel() + + remaining = num_elements % alignment + + if remaining: + elements_to_add = alignment - remaining + pad_tensor = torch.zeros(elements_to_add, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + padded_tensor_list = tensor_list + [pad_tensor] + + num_elements = num_elements + elements_to_add + else: + padded_tensor_list = tensor_list + + return self.flatten(padded_tensor_list) + def _unflatten_partitioned_parameters(self, sub_group_id): - updated_params = _unflatten_dense_tensors( - self.fp16_partitioned_groups_flat[sub_group_id], - 
self.fp16_partitioned_groups[sub_group_id]) + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]) for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params): partitioned_param.data = q.data @@ -2411,10 +2395,9 @@ def dump_post_step_gradients(self): for i, group in enumerate(self.fp16_groups): print( f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') - unflat_fp16 = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - unflat_fp32 = _unflatten_dense_tensors(self.fp32_partitioned_groups_flat[i], - self.fp16_groups[i]) + unflat_fp16 = self.unflatten(self.fp16_groups_flat[i], self.fp16_groups[i]) + unflat_fp32 = self.unflatten(self.fp32_partitioned_groups_flat[i], + self.fp16_groups[i]) for j, p in enumerate(self.fp16_groups[i]): param_id = self.get_param_id(p) param_norm = float(p.data.float().norm(2)) @@ -2599,8 +2582,7 @@ def _set_loss_scale(self, value): def _get_lean_tensors(self, padded_flattened_tensor, group_tensors, paddings): # Remove paddings from flattened tensor - individual_tensors = _unflatten_dense_tensors(padded_flattened_tensor, - group_tensors) + individual_tensors = self.unflatten(padded_flattened_tensor, group_tensors) lean_lengths = [t.numel() - pad for t, pad in zip(group_tensors, paddings)] lean_tensors = [t[:len] for t, len in zip(individual_tensors, lean_lengths)] #logger.info(f'rank {dist.get_rank()}: lean_tensors = {[t.numel() for t in lean_tensors]}') @@ -2721,14 +2703,14 @@ def _get_flattened_partition(self, all_partition_states): local_state_partitions = [] for param_index, param_slices in enumerate(param_partitions): - flattened_merged_tensor = flatten_dense_tensors_aligned( + flattened_merged_tensor = self.flatten_dense_tensors_aligned( param_slices, alignment) new_partitions = self.get_data_parallel_partitions(flattened_merged_tensor) 
local_state_partitions.append(new_partitions[partition_id]) if torch.is_tensor(local_state_partitions[0]): - return flatten_dense_tensors_aligned(local_state_partitions, alignment) + return self.flatten_dense_tensors_aligned(local_state_partitions, alignment) # Assume non-tensor states are not partitioned and equal across ranks, so return first one return local_state_partitions[0] @@ -2783,7 +2765,7 @@ def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): # update fp16 unflattened params for sub_group_id in range(len(self.fp16_partitioned_groups_flat)): - updated_params = _unflatten_dense_tensors( + updated_params = self.unflatten( self.fp16_partitioned_groups_flat[sub_group_id], self.fp16_partitioned_groups[sub_group_id]) From c83e49f9ed075175cfd81621fa59efe43b85114b Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Wed, 14 Apr 2021 14:31:39 -0700 Subject: [PATCH 58/78] update lr scheduler doc for doing per step or epoch update (#913) * update lr scheduler doc for doing per step or epoch update * work * trigger build Co-authored-by: Olatunji Ruwase --- deepspeed/profiling/flops_profiler/README.md | 2 +- docs/_pages/config-json.md | 34 +++++----- docs/_tutorials/getting-started.md | 69 ++++++++++++-------- docs/code-docs/source/schedulers.rst | 5 +- 4 files changed, 63 insertions(+), 47 deletions(-) diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md index 179a0b134756..e3343e7852fa 100644 --- a/deepspeed/profiling/flops_profiler/README.md +++ b/deepspeed/profiling/flops_profiler/README.md @@ -9,7 +9,7 @@ ## Overview -The DeepSpeed flops profiler profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. +This profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. 
It shows the parameters, latency, and number of floating point operations of the modules within the model to identify potential bottlenecks. It also outputs the names of the top `k` modules in terms of aggregated time, flops, and number of parameters at depth `l` with `k` and `l` specified by the user. The DeepSpeed flops profiler can be used with the DeepSpeed runtime or as a standalone package. diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 81c7d5bd62b9..4ec491e1de3f 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -82,14 +82,16 @@ The Adam optimizer also supports the following two params keys/values in additio The 1-bit Adam optimizer supports the following three params keys/values in addition to the standard Adam (learn more in our [tutorial](/tutorials/onebit-adam/)): -| "params" key | Description | Default | -| ------------- | --------------------------------------------------------------------------- | ------- | -| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | -| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | -| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | +| "params" key | Description | Default | +| ------------------- | ---------------------------------------------------------------------------------- | ------- | +| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | +| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | +| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | ### Scheduler Parameters +DeepSpeed calls the `step()` method of the scheduler at every training step when `model_engine.step()` is executed. 
+ ***scheduler***: [dictionary] | Fields | Value | Example | @@ -269,8 +271,8 @@ Enabling and configuring ZeRO memory optimizations ***stage***: [integer] -| Description | Default | -| --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Chooses different stages of ZeRO Optimizer. Stage 0, 1, 2, and 3 refer to disabled, optimizer state partitioning, and optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning, respectively. | `0` | ***allgather_partitions***: [boolean] @@ -323,26 +325,26 @@ Enabling and configuring ZeRO memory optimizations ***cpu_offload_use_pin_memory***: [boolean] -| Description | Default | -| ----------------------------------------------------------------------------------------- | ------- | -| Use pinned CPU memory when offloading. Can improve performance. Valid only with stage 3. | `False` | +| Description | Default | +| ---------------------------------------------------------------------------------------- | ------- | +| Use pinned CPU memory when offloading. Can improve performance. Valid only with stage 3. | `False` | ***stage3_max_live_parameters***: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ----------------------------------------------------------------------------------------------------------------------------------- | ------- | | The maximum number of parameters resident per GPU before releasing. 
Smaller values use less memory, but perform more communication. | `1e9` | ***stage3_max_reuse_distance***: [integer] -| Description | Default | -| ---------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ---------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. | `1e9` | ***stage3_prefetch_bucket_size***: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| -------------------------------------------------------------------------------------------------------------------------------------- | ------- | | The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. | `5e8` | diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index ecd3159df8c9..16fb5035cc97 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -1,7 +1,7 @@ --- -title: "Getting Started" +title: 'Getting Started' permalink: /getting-started/ -excerpt: "First steps with DeepSpeed" +excerpt: 'First steps with DeepSpeed' date: 2020-05-15 --- @@ -13,12 +13,14 @@ date: 2020-05-15 * If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies. ## Writing DeepSpeed Models + DeepSpeed model training is accomplished using the DeepSpeed engine. 
The engine can wrap any arbitrary model of type `torch.nn.module` and has a minimal set of APIs for training and checkpointing the model. Please see the tutorials for detailed examples. To initialize the DeepSpeed engine: + ```python model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, @@ -27,10 +29,10 @@ model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, `deepspeed.initialize` ensures that all of the necessary setup required for distributed data parallel or mixed precision training are done -appropriately under the hood. In addition to wrapping the model, DeepSpeed can +appropriately under the hood. In addition to wrapping the model, DeepSpeed can construct and manage the training optimizer, data loader, and the learning rate scheduler based on the parameters passed to `deepspeed.initialize` and the -DeepSpeed [configuration file](#deepspeed-configuration). +DeepSpeed [configuration file](#deepspeed-configuration). Note that DeepSpeed automatically executes the learning rate schedule at every training step. If you already have a distributed environment setup, you'd need to replace: @@ -48,7 +50,6 @@ The default is to use the NCCL backend, which DeepSpeed has been thoroughly test But if you don't need the distributed environment setup until after `deepspeed.initialize()` you don't have to use this function, as DeepSpeed will automatically initialize the distributed environment during its `initialize`. Regardless, you will need to remove `torch.distributed.init_process_group` if you already had it in place. 
- ### Training Once the DeepSpeed engine has been initialized, it can be used to train the @@ -67,32 +68,31 @@ for step, batch in enumerate(data_loader): model_engine.step() ``` - Under the hood, DeepSpeed automatically performs the necessary operations required for distributed data parallel training, in mixed precision, with a -pre-defined learning rate schedule: +pre-defined learning rate scheduler: -* **Gradient Averaging**: in distributed data parallel training, `backward` +- **Gradient Averaging**: in distributed data parallel training, `backward` ensures that gradients are averaged across data parallel processes after training on an `train_batch_size`. -* **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed +- **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed engine automatically handles scaling the loss to avoid precision loss in the gradients. -* **Learning Rate Schedule**: if using DeepSpeed's learning rate - schedule, then DeepSpeed automatically handles any updates to the learning - rate when `step` is executed. - - +- **Learning Rate Scheduler**: when using a DeepSpeed's learning rate scheduler (specified in the `ds_config.json` file), DeepSpeed calls the `step()` method of the scheduler at every training step (when `model_engine.step()` is executed). When not using a DeepSpeed's learning rate scheduler: + - if the schedule is supposed to execute at every training step, then the user can pass the scheduler to `deepspeed.initialize` when initializing the DeepSpeed engine and let DeepSpeed manage it for update or save/restore. + - if the schedule is supposed to execute at any other interval (e.g., training epochs), then the user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly. 
### Model Checkpointing + Saving and loading the training state is handled via the `save_checkpoint` and `load_checkpoint` API in DeepSpeed which takes two arguments to uniquely identify a checkpoint: - * `ckpt_dir`: the directory where checkpoints will be saved. - * `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory. - In the following code snippet, we use the loss value as the checkpoint identifier. + +- `ckpt_dir`: the directory where checkpoints will be saved. +- `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory. + In the following code snippet, we use the loss value as the checkpoint identifier. ```python #load checkpoint @@ -133,6 +133,7 @@ each process needs to save its master weights and scheduler+optimizer states. Th waiting to synchronize with other processes if it's called just for the process with rank 0. ## DeepSpeed Configuration + DeepSpeed features can be enabled, disabled, or configured using a config JSON file that should be specified as `args.deepspeed_config`. A sample config file is shown below. For a full set of features see [ API @@ -156,6 +157,7 @@ doc](/docs/config-json/). ``` # Launching DeepSpeed Training + DeepSpeed installs the entry point `deepspeed` to launch distributed training. We illustrate an example usage of DeepSpeed with the following assumptions: @@ -164,18 +166,20 @@ We illustrate an example usage of DeepSpeed with the following assumptions: 3. `client args` is the `argparse` command line arguments 4. `ds_config.json` is the configuration file for DeepSpeed - ## Resource Configuration (multi-node) + DeepSpeed configures multi-node compute resources with hostfiles that are compatible with [OpenMPI](https://www.open-mpi.org/) and [Horovod](https://github.com/horovod/horovod). -A hostfile is a list of *hostnames* (or SSH aliases), which are machines accessible via passwordless -SSH, and *slot counts*, which specify the number of GPUs available on the system. 
For +A hostfile is a list of _hostnames_ (or SSH aliases), which are machines accessible via passwordless +SSH, and _slot counts_, which specify the number of GPUs available on the system. For example, + ``` worker-1 slots=4 worker-2 slots=4 ``` -specifies that two machines named *worker-1* and *worker-2* each have four GPUs to use + +specifies that two machines named _worker-1_ and _worker-2_ each have four GPUs to use for training. Hostfiles are specified with the `--hostfile` command line option. If no hostfile is @@ -183,9 +187,9 @@ specified, DeepSpeed searches for `/job/hostfile`. If no hostfile is specified o DeepSpeed queries the number of GPUs on the local machine to discover the number of local slots available. - The following command launches a PyTorch training job across all available nodes and GPUs specified in `myhostfile`: + ```bash deepspeed --hostfile=myhostfile \ --deepspeed --deepspeed_config ds_config.json @@ -195,20 +199,25 @@ Alternatively, DeepSpeed allows you to restrict distributed training of your mod subset of the available nodes and GPUs. This feature is enabled through two command line arguments: `--num_nodes` and `--num_gpus`. For example, distributed training can be restricted to use only two nodes with the following command: + ```bash deepspeed --num_nodes=2 \ \ --deepspeed --deepspeed_config ds_config.json ``` + You can instead include or exclude specific resources using the `--include` and `--exclude` flags. For example, to use all available resources **except** GPU 0 on node -*worker-2* and GPUs 0 and 1 on *worker-3*: +_worker-2_ and GPUs 0 and 1 on _worker-3_: + ```bash deepspeed --exclude="worker-2:0@worker-3:0,1" \ \ --deepspeed --deepspeed_config ds_config.json ``` -Similarly, you can use **only** GPUs 0 and 1 on *worker-2*: + +Similarly, you can use **only** GPUs 0 and 1 on _worker-2_: + ```bash deepspeed --include="worker-2:0,1" \ \ @@ -228,24 +237,26 @@ executing from and also in your home directory (`~/`). 
As a concrete example, some clusters require special NCCL variables to set prior to training. The user can simply add these variables to a `.deepspeed_env` file in their home directory that looks like this: + ``` NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 ``` + DeepSpeed will then make sure that these environment variables are set when launching each process on every node across their training job. - ### MPI and AzureML Compatibility + As described above, DeepSpeed provides its own parallel launcher to help launch multi-node/multi-gpu training jobs. If you prefer to launch your training job using MPI (e.g., mpirun), we provide support for this. It should be noted that -DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI +DeepSpeed will still use the torch distributed NCCL backend and _not_ the MPI backend. To launch your training job with mpirun + DeepSpeed or with AzureML (which uses mpirun as a launcher backend) you simply need to install the -[mpi4py](https://pypi.org/project/mpi4py/) python package. DeepSpeed will use +[mpi4py](https://pypi.org/project/mpi4py/) python package. DeepSpeed will use this to discover the MPI environment and pass the necessary state (e.g., world size, rank) to the torch distributed backend. @@ -259,8 +270,9 @@ deepspeed.init_distributed() ``` ## Resource Configuration (single-node) + In the case that we are only running on a single node (with one or more GPUs) -DeepSpeed *does not* require a hostfile as described above. If a hostfile is +DeepSpeed _does not_ require a hostfile as described above. If a hostfile is not detected or passed in then DeepSpeed will query the number of GPUs on the local machine to discover the number of slots available. The `--include` and `--exclude` arguments work as normal, but the user should specify 'localhost' @@ -269,6 +281,7 @@ as the hostname. Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control which devices should be used. 
For example, to use only gpu1 of the current node, do: + ```bash deepspeed --include localhost:1 ... ``` diff --git a/docs/code-docs/source/schedulers.rst b/docs/code-docs/source/schedulers.rst index 6be3112164ef..c7b67cbb20d8 100755 --- a/docs/code-docs/source/schedulers.rst +++ b/docs/code-docs/source/schedulers.rst @@ -1,8 +1,9 @@ Learning Rate Schedulers =================== -DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedulers. - +DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedulers. When using a DeepSpeed's learning rate scheduler (specified in the `ds_config.json` file), DeepSpeed calls the `step()` method of the scheduler at every training step (when `model_engine.step()` is executed). When not using a DeepSpeed's learning rate scheduler: + * if the schedule is supposed to execute at every training step, then the user can pass the scheduler to `deepspeed.initialize` when initializing the DeepSpeed engine and let DeepSpeed manage it for update or save/restore. + * if the schedule is supposed to execute at any other interval (e.g., training epochs), then the user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly. 
LRRangeTest --------------------------- From 2805c3935ae02558f18a71a115185632b087da70 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 16 Apr 2021 14:40:23 -0700 Subject: [PATCH 59/78] Fix ZeRO-3 UnboundLocalError (#968) * Fix UnboundLocalError * Get full partition size --- deepspeed/runtime/zero/partition_parameters.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 0f2741251bb6..42cdcd645f4d 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -808,17 +808,12 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): start = partition_size * rank end = start + partition_size - dest_tensor = partition_buffer.view(-1).narrow(0, 0, partition_size) + dest_tensor_full_buffer = partition_buffer.view(-1).narrow(0, 0, partition_size) #print("before partition gradients") if start < param.ds_numel: elements = min(param.ds_numel - start, partition_size) - dest_tensor_full_buffer = partition_buffer.view(-1).narrow( - 0, - 0, - partition_size) - dest_tensor = dest_tensor_full_buffer.narrow(0, 0, elements) src_tensor = param.grad.view(-1).narrow(0, start, elements) From 0d4a54a04d658db40a120bc10c6f1f1a4478f6f1 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Sun, 18 Apr 2021 23:45:37 -0700 Subject: [PATCH 60/78] ZeRO-Infinity (#976) Co-authored-by: Olatunji Ruwase Co-authored-by: Samyam Rajbhandari Co-authored-by: Shaden Smith --- DeepSpeedExamples | 2 +- README.md | 2 + csrc/aio/common/deepspeed_aio_common.cpp | 333 ++++++ csrc/aio/common/deepspeed_aio_common.h | 36 + csrc/aio/common/deepspeed_aio_types.cpp | 74 ++ csrc/aio/common/deepspeed_aio_types.h | 57 + csrc/aio/common/deepspeed_aio_utils.cpp | 123 ++ csrc/aio/common/deepspeed_aio_utils.h | 77 ++ csrc/aio/py_lib/deepspeed_aio_thread.cpp | 84 ++ csrc/aio/py_lib/deepspeed_aio_thread.h | 57 + 
csrc/aio/py_lib/deepspeed_py_aio.cpp | 121 ++ csrc/aio/py_lib/deepspeed_py_aio.h | 27 + csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 282 +++++ csrc/aio/py_lib/deepspeed_py_aio_handle.h | 68 ++ csrc/aio/py_lib/deepspeed_py_copy.cpp | 133 +++ csrc/aio/py_lib/deepspeed_py_copy.h | 40 + csrc/aio/py_lib/py_ds_aio.cpp | 41 + csrc/aio/py_test/ds_aio_basic.py | 144 +++ csrc/aio/py_test/ds_aio_handle.py | 176 +++ csrc/aio/py_test/parse_aio_stats.py | 169 +++ csrc/aio/py_test/run_read_sweep.sh | 59 + csrc/aio/py_test/run_write_sweep.sh | 58 + csrc/aio/py_test/test_ds_aio.py | 120 ++ csrc/aio/py_test/test_ds_aio_utils.py | 42 + csrc/includes/cpu_adam.h | 2 +- deepspeed/launcher/runner.py | 2 +- deepspeed/ops/aio/__init__.py | 6 + deepspeed/profiling/config.py | 6 - .../activation_checkpointing/checkpointing.py | 33 +- deepspeed/runtime/config.py | 4 + deepspeed/runtime/config_utils.py | 8 +- deepspeed/runtime/engine.py | 49 +- deepspeed/runtime/swap_tensor/__init__.py | 4 + deepspeed/runtime/swap_tensor/aio_config.py | 44 + .../runtime/swap_tensor/async_swapper.py | 173 +++ deepspeed/runtime/swap_tensor/constants.py | 27 + .../runtime/swap_tensor/optimizer_utils.py | 526 ++++++++ .../partitioned_optimizer_swapper.py | 260 ++++ .../swap_tensor/partitioned_param_swapper.py | 308 +++++ .../pipelined_optimizer_swapper.py | 284 +++++ deepspeed/runtime/swap_tensor/utils.py | 241 ++++ deepspeed/runtime/zero/__init__.py | 8 + deepspeed/runtime/zero/config.py | 59 +- deepspeed/runtime/zero/constants.py | 36 +- deepspeed/runtime/zero/linear.py | 6 + deepspeed/runtime/zero/offload_config.py | 63 + deepspeed/runtime/zero/offload_constants.py | 67 ++ .../runtime/zero/partition_parameters.py | 213 +++- deepspeed/runtime/zero/stage3.py | 1060 ++++++++++++----- deepspeed/runtime/zero/tiling.py | 293 +++++ docs/_data/navigation.yml | 4 + docs/_pages/config-json.md | 126 +- docs/_tutorials/pipeline.md | 6 + docs/_tutorials/zero.md | 111 +- docs/code-docs/source/optimizers.rst | 4 + 
docs/code-docs/source/zero3.rst | 70 +- docs/index.md | 1 + op_builder/__init__.py | 4 +- op_builder/async_io.py | 56 + tests/unit/modelingpreln.py | 3 - tests/unit/test_pipe_module.py | 2 +- tests/unit/test_zero_context.py | 195 ++- tests/unit/test_zero_tiled.py | 169 +++ 63 files changed, 6251 insertions(+), 607 deletions(-) create mode 100644 csrc/aio/common/deepspeed_aio_common.cpp create mode 100644 csrc/aio/common/deepspeed_aio_common.h create mode 100644 csrc/aio/common/deepspeed_aio_types.cpp create mode 100644 csrc/aio/common/deepspeed_aio_types.h create mode 100644 csrc/aio/common/deepspeed_aio_utils.cpp create mode 100644 csrc/aio/common/deepspeed_aio_utils.h create mode 100644 csrc/aio/py_lib/deepspeed_aio_thread.cpp create mode 100644 csrc/aio/py_lib/deepspeed_aio_thread.h create mode 100644 csrc/aio/py_lib/deepspeed_py_aio.cpp create mode 100644 csrc/aio/py_lib/deepspeed_py_aio.h create mode 100644 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp create mode 100644 csrc/aio/py_lib/deepspeed_py_aio_handle.h create mode 100644 csrc/aio/py_lib/deepspeed_py_copy.cpp create mode 100644 csrc/aio/py_lib/deepspeed_py_copy.h create mode 100755 csrc/aio/py_lib/py_ds_aio.cpp create mode 100755 csrc/aio/py_test/ds_aio_basic.py create mode 100755 csrc/aio/py_test/ds_aio_handle.py create mode 100755 csrc/aio/py_test/parse_aio_stats.py create mode 100755 csrc/aio/py_test/run_read_sweep.sh create mode 100755 csrc/aio/py_test/run_write_sweep.sh create mode 100755 csrc/aio/py_test/test_ds_aio.py create mode 100755 csrc/aio/py_test/test_ds_aio_utils.py create mode 100755 deepspeed/ops/aio/__init__.py create mode 100644 deepspeed/runtime/swap_tensor/__init__.py create mode 100644 deepspeed/runtime/swap_tensor/aio_config.py create mode 100644 deepspeed/runtime/swap_tensor/async_swapper.py create mode 100644 deepspeed/runtime/swap_tensor/constants.py create mode 100644 deepspeed/runtime/swap_tensor/optimizer_utils.py create mode 100644 
deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py create mode 100644 deepspeed/runtime/swap_tensor/partitioned_param_swapper.py create mode 100644 deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py create mode 100644 deepspeed/runtime/swap_tensor/utils.py create mode 100644 deepspeed/runtime/zero/offload_config.py create mode 100644 deepspeed/runtime/zero/offload_constants.py create mode 100644 deepspeed/runtime/zero/tiling.py create mode 100644 op_builder/async_io.py create mode 100644 tests/unit/test_zero_tiled.py diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 20ea07a2a069..bdf8e59aede8 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 20ea07a2a069696abec212e25476a9bf76aced70 +Subproject commit bdf8e59aede8c8e0577e8d4d557298ca8515268f diff --git a/README.md b/README.md index 8323084ac6fe..2700b7175fe0 100755 --- a/README.md +++ b/README.md @@ -193,6 +193,8 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. 
[arXiv:2104.07857](https://arxiv.org/abs/2104.07857). + # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp new file mode 100644 index 000000000000..11927969c50f --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -0,0 +1,333 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deepspeed_aio_common.h" + +using namespace std; +using namespace std::chrono; + +#define DEBUG_DS_AIO_PERF 0 +#define DEBUG_DS_AIO_SUBMIT_PERF 0 + +static const std::string c_library_name = "deepspeed_aio"; + +static void _report_aio_statistics(const char* tag, + const std::vector>& latencies) + __attribute__((unused)); + +static void _report_aio_statistics(const char* tag, + const std::vector>& latencies) +{ + std::vector lat_usec; + for (auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); } + const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end())); + const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end())); + const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); + + std::cout << c_library_name << ": latency statistics(usec) " << tag + << " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl; +} + +static void _get_aio_latencies(std::vector>& raw_latencies, + struct deepspeed_aio_latency_t& summary_latencies) +{ + std::vector lat_usec; + for (auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); } + summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end())); + summary_latencies._max_usec = 
*(std::max_element(lat_usec.begin(), lat_usec.end())); + summary_latencies._avg_usec = + std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); +} + +static void _do_io_submit_singles(const long long int n_iocbs, + const long long int iocb_index, + std::unique_ptr& aio_ctxt, + std::vector>& submit_times) +{ + for (auto i = 0; i < n_iocbs; ++i) { + const auto st = std::chrono::high_resolution_clock::now(); + const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, aio_ctxt->_iocbs.data() + i); + submit_times.push_back(std::chrono::high_resolution_clock::now() - st); +#if DEBUG_DS_AIO_SUBMIT_PERF + printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n", + submit_times.back().count() * 1e6, + iocb_index, + aio_ctxt->_iocbs[i]->u.c.buf, + aio_ctxt->_iocbs[i]->u.c.nbytes, + aio_ctxt->_iocbs[i]->u.c.offset); +#endif + assert(submit_ret > 0); + } +} + +static void _do_io_submit_block(const long long int n_iocbs, + const long long int iocb_index, + std::unique_ptr& aio_ctxt, + std::vector>& submit_times) +{ + const auto st = std::chrono::high_resolution_clock::now(); + const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, aio_ctxt->_iocbs.data()); + submit_times.push_back(std::chrono::high_resolution_clock::now() - st); +#if DEBUG_DS_AIO_SUBMIT_PERF + printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n", + submit_times.back().count() * 1e6, + iocb_index, + n_iocbs, + aio_ctxt->_iocbs[0]->u.c.buf, + aio_ctxt->_iocbs[0]->u.c.nbytes, + aio_ctxt->_iocbs[0]->u.c.offset); +#endif + assert(submit_ret > 0); +} + +static int _do_io_complete(const long long int min_completes, + const long long int max_completes, + std::unique_ptr& aio_ctxt, + std::vector>& reap_times) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + const auto n_completes = io_getevents( + aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr); + 
reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); + + assert(n_completes >= min_completes); + return n_completes; +} + +void do_aio_operation_sequential(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf) +{ + struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs); + + const auto num_io_blocks = static_cast( + ceil(static_cast(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size)); +#if DEBUG_DS_AIO_PERF + const auto io_op_name = std::string(read_op ? "read" : "write"); + std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes with " << num_io_blocks << " io blocks" << std::endl; +#endif + + std::vector> submit_times; + std::vector> reap_times; + const auto max_queue_bytes = + static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); + + auto start = std::chrono::high_resolution_clock::now(); + for (long long iocb_index = 0; iocb_index < num_io_blocks; + iocb_index += aio_ctxt->_queue_depth) { + const auto start_offset = iocb_index * aio_ctxt->_block_size; + const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset; + const auto n_iocbs = + min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); + const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset)); + prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset); + + if (config->_single_submit) { + _do_io_submit_singles(n_iocbs, iocb_index, aio_ctxt, submit_times); + } else { + _do_io_submit_block(n_iocbs, iocb_index, aio_ctxt, submit_times); + } + + _do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times); + } + const std::chrono::duration elapsed = std::chrono::high_resolution_clock::now() - start; + + if (perf) { + _get_aio_latencies(submit_times, perf->_submit); + _get_aio_latencies(reap_times, perf->_complete); + perf->_e2e_usec = elapsed.count() 
* 1e6; + perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); + } + +#if DEBUG_DS_AIO_PERF + _report_aio_statistics("submit", submit_times); + _report_aio_statistics("complete", reap_times); +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 + << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes " << std::endl; +#endif +} + +void do_aio_operation_overlap(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf) +{ + struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size); + +#if DEBUG_DS_AIO_PERF + const auto io_op_name = std::string(read_op ? "read" : "write"); + std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl; +#endif + + std::vector> submit_times; + std::vector> reap_times; + + auto request_iocbs = aio_ctxt->_queue_depth; + auto n_pending_iocbs = 0; + const auto min_completes = 1; + auto start = std::chrono::high_resolution_clock::now(); + while (true) { + const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs); + if (n_iocbs > 0) { + if (config->_single_submit) { + _do_io_submit_singles( + n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); + } else { + _do_io_submit_block( + n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); + } + } + + n_pending_iocbs += n_iocbs; + assert(n_pending_iocbs <= aio_ctxt->_queue_depth); + + if (n_pending_iocbs == 0) { break; } + + const auto n_complete = + _do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times); + n_pending_iocbs -= n_complete; + } + + const std::chrono::duration elapsed 
= std::chrono::high_resolution_clock::now() - start; + + if (perf) { + _get_aio_latencies(submit_times, perf->_submit); + _get_aio_latencies(reap_times, perf->_complete); + perf->_e2e_usec = elapsed.count() * 1e6; + perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); + } + +#if DEBUG_DS_AIO_PERF + _report_aio_statistics("submit", submit_times); + _report_aio_statistics("complete", reap_times); +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 + << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes " << std::endl; +#endif +} + +void report_file_error(const char* filename, const std::string file_op, const int error_code) +{ + std::string err_msg = file_op + std::string(" failed on ") + std::string(filename) + + " error = " + std::to_string(error_code); + std::cerr << c_library_name << ": " << err_msg << std::endl; +} + +int open_file(const char* filename, const bool read_op) +{ + const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT); + const int mode = 0600; + const auto fd = open(filename, flags, mode); + if (fd == -1) { + const auto error_code = errno; + const auto error_msg = read_op ? 
" open for read " : " open for write "; + report_file_error(filename, error_msg, error_code); + return -1; + } + return fd; +} + +int regular_read(const char* filename, std::vector& buffer) +{ + long long int num_bytes; + const auto f_size = get_file_size(filename, num_bytes); + assert(f_size != -1); + buffer.resize(num_bytes); + const auto fd = open(filename, O_RDONLY, 0600); + assert(fd != -1); + long long int read_bytes = 0; + auto r = 0; + do { + const auto buffer_ptr = buffer.data() + read_bytes; + const auto bytes_to_read = num_bytes - read_bytes; + r = read(fd, buffer_ptr, bytes_to_read); + read_bytes += r; + } while (r > 0); + + if (read_bytes != num_bytes) { + std::cerr << "read error " + << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes + << std::endl; + } + assert(read_bytes == num_bytes); + close(fd); + return 0; +} + +static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes) +{ + std::vector regular_buffer; + const auto reg_ret = regular_read(filename, regular_buffer); + assert(0 == reg_ret); + std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes" + << std::endl; + + if (static_cast(regular_buffer.size()) != num_bytes) { return false; } + + return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size())); +} + +bool validate_aio_operation(const bool read_op, + const char* filename, + void* aio_buffer, + const long long int num_bytes) +{ + const auto msg_suffix = std::string("deepspeed_aio_") + + std::string(read_op ? 
"read()" : "write()") + + std::string("using read()"); + + if (false == _validate_buffer(filename, aio_buffer, num_bytes)) { + std::cout << "Fail: correctness of " << msg_suffix << std::endl; + return false; + } + + std::cout << "Pass: correctness of " << msg_suffix << std::endl; + return true; +} diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h new file mode 100644 index 000000000000..1f32fc8f794f --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_common.h @@ -0,0 +1,36 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include +#include + +using namespace std; + +void do_aio_operation_sequential(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf); + +void do_aio_operation_overlap(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf); + +int open_file(const char* filename, const bool read_op); + +void report_file_error(const char* filename, const std::string file_op, const int error_code); + +int regular_read(const char* filename, std::vector& buffer); + +bool validate_aio_operation(const bool read_op, + const char* filename, + void* aio_buffer, + const long long int num_bytes); diff --git a/csrc/aio/common/deepspeed_aio_types.cpp b/csrc/aio/common/deepspeed_aio_types.cpp new file mode 100644 index 000000000000..5f717c3b5658 --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_types.cpp @@ -0,0 +1,74 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include + +#include "deepspeed_aio_utils.h" + +using namespace std; + +const int c_block_size = 128 * 1024; +const int c_io_queue_depth = 8; + +deepspeed_aio_config_t::deepspeed_aio_config_t() + : _block_size(c_block_size), + _queue_depth(c_io_queue_depth), + _single_submit(false), + _overlap_events(false), + _lock_memory(false) +{ +} + +deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool lock_memory) + : _block_size(block_size), + _queue_depth(queue_depth), + _single_submit(single_submit), + _overlap_events(overlap_events), + _lock_memory(lock_memory) +{ +} + +void deepspeed_aio_latency_t::dump(const std::string tag) +{ + std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl; +} + +void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& other) +{ + _min_usec += other._min_usec; + _max_usec += other._max_usec; + _avg_usec += other._avg_usec; +} + +void deepspeed_aio_latency_t::scale(const float scaler) +{ + _min_usec *= scaler; + _max_usec *= scaler; + _avg_usec *= scaler; +} + +aio_context::aio_context(const int block_size, const int queue_depth) +{ + _block_size = block_size; + _queue_depth = queue_depth; + for (auto i = 0; i < queue_depth; ++i) { + _iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb))); + } + _io_events.resize(queue_depth); + io_queue_init(queue_depth, &_io_ctxt); +} + +aio_context::~aio_context() +{ + for (auto& iocb : _iocbs) { free(iocb); } + _io_events.resize(0); + io_queue_release(_io_ctxt); +} diff --git a/csrc/aio/common/deepspeed_aio_types.h b/csrc/aio/common/deepspeed_aio_types.h new file mode 100644 index 000000000000..5c5dcdf0b559 --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_types.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. 
+ +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include + +#include +#include + +using namespace std; + +struct deepspeed_aio_latency_t { + double _min_usec; + double _max_usec; + double _avg_usec; + + void dump(const std::string tag); + void accumulate(const deepspeed_aio_latency_t&); + void scale(const float value); +}; + +struct deepspeed_aio_perf_t { + deepspeed_aio_latency_t _submit; + deepspeed_aio_latency_t _complete; + double _e2e_usec; + double _e2e_rate_GB; +}; + +struct deepspeed_aio_config_t { + const int _block_size; + const int _queue_depth; + const bool _single_submit; + const bool _overlap_events; + const bool _lock_memory; + + deepspeed_aio_config_t(); + deepspeed_aio_config_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool lock_memory); +}; + +struct aio_context { + io_context_t _io_ctxt; + std::vector _io_events; + std::vector _iocbs; + int _block_size; + int _queue_depth; + + aio_context(const int block_size, const int queue_depth); + ~aio_context(); +}; diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp new file mode 100644 index 000000000000..a3d89be5ad3e --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -0,0 +1,123 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include + +#include "deepspeed_aio_utils.h" + +using namespace std; + +const int c_block_size = 128 * 1024; +const int c_io_queue_depth = 8; + +io_xfer_ctxt::io_xfer_ctxt(const int fd, + const long long int file_offset, + const long long int num_bytes, + const void* buffer) + : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) +{ +} + +io_prep_context::io_prep_context(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size, + const std::vector* iocbs) + : _read_op(read_op), _xfer_ctxt(xfer_ctxt), _block_size(block_size), _iocbs(iocbs) +{ +} + +void io_prep_context::prep_iocbs(const int n_iocbs, + const size_t num_bytes, + const void* start_buffer, + const long long int start_offset) +{ + assert(static_cast(n_iocbs) <= _iocbs->size()); + for (auto i = 0; i < n_iocbs; ++i) { + const auto shift = i * _block_size; + const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift; + const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift; + auto byte_count = _block_size; + if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; } + + if (_read_op) { + io_prep_pread(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); + } else { + io_prep_pwrite(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); + } + } +} + +io_prep_generator::io_prep_generator(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size) + : _read_op(read_op), + _xfer_ctxt(xfer_ctxt), + _block_size(block_size), + _remaining_bytes(xfer_ctxt->_num_bytes), + _next_iocb_index(0) +{ + _num_io_blocks = + static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); + _remaining_io_blocks = _num_io_blocks; +} + +int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* iocbs) +{ + if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) { + assert(static_cast(_remaining_bytes) == _remaining_io_blocks); + return 0; + } + + 
assert(static_cast(n_iocbs) <= iocbs->size()); + + auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); + for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { + const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); + const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; + const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); + + if (_read_op) { + io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); + } else { + io_prep_pwrite(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); + } + _remaining_bytes -= num_bytes; + } + _remaining_io_blocks -= actual_n_iocbs; + + return actual_n_iocbs; +} + +int get_file_size(const char* filename, long long int& size) +{ + struct stat st; + if (stat(filename, &st) == -1) { return -1; } + size = st.st_size; + return 0; +} + +void* ds_page_aligned_alloc(const size_t size, const bool lock) +{ + void* ptr; + int retval; + + retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size); + if (retval) { return nullptr; } + + if (lock == false) { return ptr; } + + auto mlock_ret = mlock(ptr, size); + if (mlock_ret != 0) { + auto mlock_error = errno; + printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error)); + + free(ptr); + return nullptr; + } + + return ptr; +} diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h new file mode 100644 index 000000000000..f37a95c5149a --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_utils.h @@ -0,0 +1,77 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct io_xfer_ctxt { + const int _fd; + const long long int _base_offset; + const void* _mem_buffer; + const long long int _num_bytes; + + io_xfer_ctxt(const int fd, + const long long int file_offset, + const long long int num_bytes, + const void* buffer); +}; + +struct io_prep_context { + const bool _read_op; + const std::unique_ptr& _xfer_ctxt; + const size_t _block_size; + const std::vector* _iocbs; + + io_prep_context(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size, + const std::vector* iocbs); + + void prep_iocbs(const int n_iocbs, + const size_t num_bytes, + const void* start_buffer, + const long long int start_offset); +}; + +struct io_prep_generator { + const bool _read_op; + const std::unique_ptr& _xfer_ctxt; + const size_t _block_size; + + long long int _remaining_bytes; + long long int _num_io_blocks; + long long int _remaining_io_blocks; + long long int _next_iocb_index; + + io_prep_generator(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size); + + int prep_iocbs(const int n_iocbs, std::vector* iocbs); +}; + +void* ds_page_aligned_alloc(const size_t size, const bool lock = false); + +int get_file_size(const char* filename, long long int& size); diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp new file mode 100644 index 000000000000..2c7509cb3ba0 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp @@ -0,0 +1,84 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include "deepspeed_aio_thread.h" + +using namespace std; + +io_op_desc_t::io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int num_bytes, + const bool validate) + : _read_op(read_op), + _buffer(buffer), + _fd(fd), + _filename(filename), + _num_bytes(num_bytes), + _validate(validate) +{ + _cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer; + _contiguous_buffer = _cpu_buffer.contiguous(); +} + +char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void io_op_desc_t::fini() +{ + if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } +} + +deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config) + : _tid(tid), + _aio_config(aio_config), + _aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)), + _time_to_exit(false) +{ +} + +deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {} + +void deepspeed_aio_thread_t::run() +{ + while (true) { + std::shared_ptr next_io_op = nullptr; + + { + std::unique_lock lock(_work_sync._mutex); + _work_sync._cond_var.wait(lock, + [this] { return (!_work_queue.empty() || _time_to_exit); }); + if (!_work_queue.empty()) { + next_io_op = _work_queue.front(); + _work_queue.pop(); + } + } + + if (next_io_op) { + const auto base_offset = next_io_op->_num_bytes * _tid; + + std::unique_ptr xfer_ctxt(new io_xfer_ctxt( + next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr())); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap( + next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential( + next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + { + std::lock_guard lock(_complete_sync._mutex); + _complete_queue.push(next_io_op); + } + _complete_sync._cond_var.notify_one(); + } + + if (_time_to_exit) { break; } + } 
+} diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h new file mode 100644 index 000000000000..ee099dd2d16c --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_thread.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include +#include "deepspeed_py_aio.h" + +struct io_op_desc_t { + const bool _read_op; + torch::Tensor _buffer; + int _fd; + const std::string _filename; + const long long int _num_bytes; + torch::Tensor _cpu_buffer; + torch::Tensor _contiguous_buffer; + const bool _validate; + + io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int num_bytes, + const bool validate); + + char* data_ptr() const; + void fini(); +}; + +struct thread_sync_t { + std::mutex _mutex; + std::condition_variable _cond_var; +}; + +struct deepspeed_aio_thread_t { + const int _tid; + deepspeed_aio_config_t& _aio_config; + + std::unique_ptr _aio_ctxt; + std::queue> _work_queue; + std::queue> _complete_queue; + + bool _time_to_exit; + + struct thread_sync_t _work_sync; + struct thread_sync_t _complete_sync; + + deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config); + + ~deepspeed_aio_thread_t(); + + void run(); +}; diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp new file mode 100644 index 000000000000..cc2895cc74b3 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -0,0 +1,121 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deepspeed_py_aio.h" + +using namespace std; +using namespace std::chrono; + +#define DEBUG_DS_AIO_READ 0 +#define DEBUG_DS_AIO_WRITE 0 + +static const std::string c_library_name = "deepspeed_aio"; + +int deepspeed_py_aio_write(const torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); + + if (config._overlap_events) { + do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr); + } else { + do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +int deepspeed_py_aio_read(torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate) +{ + const auto 
start_time = std::chrono::high_resolution_clock::now(); + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + + deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); + + if (config._overlap_events) { + do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr); + } else { + do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h new file mode 100644 index 000000000000..a78d5734009d --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio.h @@ -0,0 +1,27 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include +#include + +int deepspeed_py_aio_write(const torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate); + +int deepspeed_py_aio_read(torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp new file mode 100644 index 000000000000..8d4cfd9e8636 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -0,0 +1,282 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_py_aio_handle.h" + +using namespace std; + +static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } + +deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads) + : _aio_ctxt(new aio_context(block_size, queue_depth)), + _single_submit(single_submit), + _overlap_events(overlap_events), + _num_threads(num_threads), + _aio_config(block_size, queue_depth, single_submit, overlap_events, false), + _num_pending_ops(0) +{ + for (auto i = 0; i < num_threads; ++i) { + _thread_contexts.push_back(std::make_shared(i, _aio_config)); + } + + for (auto& ctxt : _thread_contexts) { + _threads.push_back(std::thread(_start_aio_thread, ctxt)); + } +} + +deepspeed_aio_handle_t::~deepspeed_aio_handle_t() +{ + _stop_threads(); + for (auto& thr : _threads) { thr.join(); } +} + +const int deepspeed_aio_handle_t::get_block_size() const +{ + return _aio_ctxt ? _aio_ctxt->_block_size : -1; +} + +const int deepspeed_aio_handle_t::get_queue_depth() const +{ + return _aio_ctxt ? 
_aio_ctxt->_queue_depth : -1; +} + +const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; } + +const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } + +const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } + +int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + + assert(_aio_ctxt); + + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + close(fd); + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, + const char* filename, + const bool validate) +{ + assert(_aio_ctxt); + + const auto start_time = std::chrono::high_resolution_clock::now(); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = 
static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) +{ + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_work_queue.push(scheduled_op); + } + ctxt->_work_sync._cond_var.notify_one(); + } + _num_pending_ops++; +} + +std::shared_ptr deepspeed_aio_handle_t::_wait_for_aio_work() +{ + std::shared_ptr completed_op = nullptr; + for (auto& ctxt : _thread_contexts) { + std::unique_lock lock(ctxt->_complete_sync._mutex); + ctxt->_complete_sync._cond_var.wait(lock, + [ctxt] { return !ctxt->_complete_queue.empty(); }); + completed_op = ctxt->_complete_queue.front(); + ctxt->_complete_queue.pop(); + } + return completed_op; +} + +void deepspeed_aio_handle_t::_stop_threads() +{ + assert(0 == _num_pending_ops); + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_time_to_exit = true; + } + ctxt->_work_sync._cond_var.notify_one(); + } +} + +int deepspeed_aio_handle_t::wait() +{ + assert(_num_pending_ops > 0); + auto num_completed_ops = 0; + + while (_num_pending_ops > 0) { + auto completed_op = _wait_for_aio_work(); + + completed_op->fini(); + + close(completed_op->_fd); + + if 
(completed_op->_validate) { + validate_aio_operation(completed_op->_read_op, + completed_op->_filename.c_str(), + completed_op->data_ptr(), + completed_op->_num_bytes); + } + --_num_pending_ops; + ++num_completed_ops; + } + + return num_completed_ops; +} + +bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, + const long long int num_bytes) +{ + const auto op_string = read_op ? "Read" : "Write"; + if (num_bytes % get_thread_count()) { + std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + << " not divisible by thread count = " << get_thread_count() << std::endl; + return false; + } + + return true; +} + +int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + const auto buffer_bytes = static_cast(buffer.nbytes()); + if (buffer_bytes != num_file_bytes) { + std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes + << " != " << num_file_bytes << std::endl; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert((num_file_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto scheduled_op = std::make_shared( + true, buffer, fd, filename, (num_file_bytes / _num_threads), validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + const auto num_write_bytes = static_cast(buffer.nbytes()); + assert((num_write_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { 
return -1; } + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto scheduled_op = std::make_shared( + false, buffer, fd, filename, (num_write_bytes / _num_threads), validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, false); +} + +int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, false); +} + +int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, true); +} + +int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, true); +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h new file mode 100644 index 000000000000..09358f4d927b --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -0,0 +1,68 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include +#include "deepspeed_aio_thread.h" + +struct deepspeed_aio_handle_t { + std::unique_ptr _aio_ctxt; + const bool _single_submit; + const bool _overlap_events; + const int _num_threads; + deepspeed_aio_config_t _aio_config; + + std::vector> _thread_contexts; + std::vector _threads; + int _num_pending_ops; + + deepspeed_aio_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads); + + ~deepspeed_aio_handle_t(); + + const int get_block_size() const; + const int get_queue_depth() const; + const bool get_single_submit() const; + const bool get_overlap_events() const; + const int get_thread_count() const; + + int read(torch::Tensor& buffer, const char* filename, const bool validate); + + int write(const torch::Tensor& buffer, const char* filename, const bool validate); + + int pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int sync_pread(torch::Tensor& buffer, const char* filename); + + int sync_pwrite(const torch::Tensor& buffer, const char* filename); + + int async_pread(torch::Tensor& buffer, const char* filename); + + int async_pwrite(const torch::Tensor& buffer, const char* filename); + + int wait(); + + void _stop_threads(); + + void _schedule_aio_work(std::shared_ptr scheduled_op); + + std::shared_ptr _wait_for_aio_work(); + + bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); +}; diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp new file mode 100644 index 000000000000..3cdb5ed344bf --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -0,0 +1,133 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include "deepspeed_py_copy.h" +#include + +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) + +#if defined(__AVX512__) or defined(__AVX256__) +union AVX_Data { +#if defined(__AVX512__) + __m512 data; +#else + __m256 data; +#endif +}; +#endif + +static void helper_memcpy_1(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH) { + AVX_Data src_4; + src_4.data = SIMD_LOAD(src + i); + + SIMD_STORE(dest + i, src_4.data); + } + } + +#endif + + if (param_size > rounded_size) { +#pragma omp parallel for + for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; } + } +} + +static void helper_memcpy_4(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { + AVX_Data src_4[4]; + src_4[0].data = SIMD_LOAD(src + i); + src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); + src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); + src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); + + SIMD_STORE(dest + i, src_4[0].data); + SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); + } + } +#endif + if (param_size > rounded_size) + helper_memcpy_1((dest + rounded_size), (src + rounded_size), 
(param_size - rounded_size)); +} + +static void helper_mempcy_8(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { + AVX_Data src_4[8]; + src_4[0].data = SIMD_LOAD(src + i); + src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); + src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); + src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); + src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2)); + src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5); + src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6); + src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7); + + SIMD_STORE(dest + i, src_4[0].data); + SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data); + } + } +#endif + if (param_size > rounded_size) + helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size)); +} + +int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src) +{ + auto dest_c = dest.contiguous(); + auto src_c = src.contiguous(); + + float* dest_ptr = (float*)dest_c.data_ptr(); + float* src_ptr = (float*)src_c.data_ptr(); + + helper_mempcy_8(dest_ptr, src_ptr, dest_c.size(0)); + + return 0; +} diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h new file mode 100644 index 000000000000..823c298f6431 --- 
/dev/null +++ b/csrc/aio/py_lib/deepspeed_py_copy.h @@ -0,0 +1,40 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include + +#include +#include +#include + +#define TILE (1024 * 1024 * 1024) + +#if defined(__AVX512__) +#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm512_loadu_ps(x) +#define SIMD_SET(x) _mm512_set1_ps(x) +#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm512_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm512_div_ps(x, y) +#define SIMD_WIDTH 16 +#else +#if defined(__AVX256__) +#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm256_loadu_ps(x) +#define SIMD_SET(x) _mm256_set1_ps(x) +#define SIMD_MUL(x, y) _mm256_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm256_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm256_div_ps(x, y) +#define SIMD_WIDTH 8 +#endif +#endif + +int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src); diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp new file mode 100755 index 000000000000..eee2cba0a962 --- /dev/null +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -0,0 +1,41 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include "deepspeed_py_aio_handle.h" +#include "deepspeed_py_copy.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read"); + + m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write"); + + m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); + + py::class_(m, "aio_handle") + .def(py::init()) + + .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) + .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) + .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) + .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) + .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) + + .def("read", &deepspeed_aio_handle_t::read) + .def("write", &deepspeed_aio_handle_t::write) + + .def("pread", &deepspeed_aio_handle_t::pread) + .def("pwrite", &deepspeed_aio_handle_t::pwrite) + + .def("sync_pread", &deepspeed_aio_handle_t::sync_pread) + .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite) + .def("async_pread", &deepspeed_aio_handle_t::async_pread) + .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) + + .def("wait", &deepspeed_aio_handle_t::wait); +} diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py new file mode 100755 index 000000000000..305cacc54e18 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -0,0 +1,144 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import torch +import os +import time +from deepspeed.ops.aio import aio_read, aio_write +from multiprocessing import Pool, Barrier +from test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_basic(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log( + tid, + f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' + ) + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_basic_read(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, True) + return ctxt + + +def pre_basic_write(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, False) + return ctxt + + +def post_basic(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_basic_read(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + aio_read(ctxt['buffer'], + ctxt['file'], + args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_basic_write(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + aio_write(ctxt['buffer'], + ctxt['file'], + args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_basic_read + schedule['post'] = post_basic + 
schedule['main'] = main_basic_read + else: + schedule['pre'] = pre_basic_write + schedule['post'] = post_basic + schedule['main'] = main_basic_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, args.threads) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, args.threads) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_takslet(b): + global aio_barrier + aio_barrier = b + + +def aio_basic_multiprocessing(args, read_op): + b = Barrier(args.threads) + pool_params = [(args, p, read_op) for p in range(args.threads)] + with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py new file mode 100755 index 000000000000..fe1459c89013 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -0,0 +1,176 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import torch +import os +import time +from deepspeed.ops.aio import aio_handle +from multiprocessing import Pool, Barrier +from test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_handle(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + if args.gpu: + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda') + else: + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log( + tid, + f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' + ) + + io_parallel = args.io_parallel if args.io_parallel else 1 + handle = aio_handle(args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + io_parallel) + task_log(tid, f'created deepspeed aio handle') + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['handle'] = handle + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_handle_read(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, True) + return ctxt + + +def pre_handle_write(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, False) + return ctxt + + +def post_handle(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_parallel_read(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + + start_time = time.time() + ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_parallel_write(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + start_time = time.time() + ret = 
handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_read(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + + start_time = time.time() + ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_write(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + start_time = time.time() + ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_handle_read + schedule['post'] = post_handle + schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read + else: + schedule['pre'] = pre_handle_write + schedule['post'] = post_handle + schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, args.threads) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, args.threads) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, 
args.threads) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_takslet(b): + global aio_barrier + aio_barrier = b + + +def aio_handle_multiprocessing(args, read_op): + b = Barrier(args.threads) + pool_params = [(args, p, read_op) for p in range(args.threads)] + with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/parse_aio_stats.py b/csrc/aio/py_test/parse_aio_stats.py new file mode 100755 index 000000000000..82adf85ea8a2 --- /dev/null +++ b/csrc/aio/py_test/parse_aio_stats.py @@ -0,0 +1,169 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +import argparse +import re + +RAW_RATE = 'raw_rate' +E2E_RATE = 'e2e_rate' +SUBMIT_LATENCY = 'submit_latency' +COMPLETE_LATENCY = 'complete_latency' +READ_SPEED = 'read_speed' +WRITE_SPEED = 'write_speed' + +TASK_READ_SPEED = 'task_read_speed' + +PERF_METRICS = [ + RAW_RATE, + E2E_RATE, + SUBMIT_LATENCY, + COMPLETE_LATENCY, + READ_SPEED, + WRITE_SPEED +] +METRIC_SEARCH = { + RAW_RATE: 'ds_raw_time', + E2E_RATE: 'ds_time', + SUBMIT_LATENCY: 'aggr: submit', + COMPLETE_LATENCY: 'aggr: complete', + READ_SPEED: 'E2E Read Speed', + WRITE_SPEED: 'E2E Write Speed' +} + +NUM_BYTES = (400 * 1024 * 1024) +NUM_GIGA_BYTES = (1024 * 1024 * 1024) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--logdir', + type=str, + required=True, + help='Folder of statistics logs') + + parser.add_argument( + '--metric', + type=str, + required=True, + help= + 'Performance metric to report: [raw_rate|e2e_rate|submit_latency|complete_latency]' + ) + + args = parser.parse_args() + print(f'args = {args}') + + return args + + +def extract_value(key, file): + INVALID_PREFIXES = 
["ds"] + for p in INVALID_PREFIXES: + if key.startswith(p): + return key + try: + if key[0] in ['t', 'd', 'p']: + return int(key[1:]) + if key.startswith("bs"): + if key.endswith('K'): + v = key[2:].split('K') + return int(v[0]) * 1024 + elif key.endswith('M'): + v = key[2:].split('M') + return int(v[0]) * 1024 * 1024 + else: + return int(key[2:]) + except: + print(f"{file}: extract_value fails on {key}") + return None + + return key + + +def get_file_key(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + values = [extract_value(k, file) for k in fields] + return tuple(values) + + +def get_thread_count(file): + f, _ = os.path.splitext(file) + fields = f.split('_') + for key in fields: + if key[0] == 't': + return int(key[1:]) + return 1 + + +def get_metric(file, metric): + thread_count = get_thread_count(file) + num_giga_bytes = NUM_BYTES / NUM_GIGA_BYTES + with open(file) as f: + for line in f.readlines(): + if line.startswith(METRIC_SEARCH[metric]): + if metric == RAW_RATE: + fields = line.split() + raw_time_sec = float(fields[2]) / 1e06 + raw_rate = (thread_count * num_giga_bytes * 1.0) / raw_time_sec + return raw_rate + elif metric in [READ_SPEED, WRITE_SPEED]: + fields = line.split() + return float(fields[-2]) + else: + fields = line.split('=') + return float(fields[-1]) + + return None + + +def validate_args(args): + if not args.metric in PERF_METRICS: + print(f'{args.metric} is not a valid performance metrics') + return False + + if not os.path.isdir(args.logdir): + print(f'{args.logdir} folder is not existent') + return False + + return True + + +def get_results(log_files, metric): + results = {} + for f in log_files: + file_key = get_file_key(f) + value = get_metric(f, metric) + results[file_key] = value + + return results + + +def main(): + print("Parsing aio statistics") + args = parse_arguments() + + if not validate_args(args): + quit() + + log_files = [ + f for f in os.listdir(args.logdir) + if 
os.path.isfile(os.path.join(args.logdir, + f)) + ] + + log_files_path = [os.path.join(args.logdir, f) for f in log_files] + results = get_results(log_files_path, args.metric) + result_keys = list(results.keys()) + sorted_keys = sorted(result_keys) + for k in sorted_keys: + print(f'{k} = {results[k]}') + + +if __name__ == "__main__": + main() diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh new file mode 100755 index 000000000000..8ef29d973d58 --- /dev/null +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -0,0 +1,59 @@ +#!/bin/bash +if [[ $# -ne 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +INPUT_FILE=$1 +if [[ ! -f ${INPUT_FILE} ]]; then + echo "Input file not found: ${INPUT_FILE}" + exit 1 +fi + +LOG_DIR=$2 +RUN_SCRIPT=./test_ds_aio.py +READ_OPT="--read_file ${INPUT_FILE}" + +if [[ -d ${LOG_DIR} ]]; then + rm -f ${LOG_DIR}/* +else + mkdir -p ${LOG_DIR} +fi + +DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " +SYNC="sync" + +for sub in single block; do + if [[ $sub == "single" ]]; then + sub_opt="--single_submit" + else + sub_opt="" + fi + for ov in overlap sequential; do + if [[ $ov == "overlap" ]]; then + ov_opt="--overlap_events" + else + ov_opt="" + fi + for t in 1 2 4 8; do + for p in 1 ; do + for d in 1 2 4 8 16 32; do + for bs in 128K 256K 512K 1M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}" + OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" + LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" + cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" + echo ${DISABLE_CACHE} + echo ${cmd} + echo ${SYNC} + + eval ${DISABLE_CACHE} + eval ${cmd} + eval ${SYNC} + sleep 2 + done + done + done + done + done +done diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh new file mode 100755 index 000000000000..0360934b469e --- /dev/null +++ b/csrc/aio/py_test/run_write_sweep.sh @@ -0,0 +1,58 @@ +#!/bin/bash +function 
prep_folder() +{ + folder=$1 + if [[ -d ${folder} ]]; then + rm -f ${folder}/* + else + mkdir -p ${folder} + fi +} + +if [[ $# -ne 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +SIZE="$1M" +WRITE_DIR=$2 +LOG_DIR=$3 + +OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt +WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}" + + +prep_folder ${WRITE_DIR} +prep_folder ${LOG_DIR} + +RUN_SCRIPT=./test_ds_aio.py + +for sub in single block; do + if [[ $sub == "single" ]]; then + sub_opt="--single_submit" + else + sub_opt="" + fi + for ov in overlap sequential; do + if [[ $ov == "overlap" ]]; then + ov_opt="--overlap_events" + else + ov_opt="" + fi + for t in 1 2 4 8; do + for p in 1; do + for d in 1 2 4 8 16 32; do + for bs in 128K 256K 512K 1M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads 1" + OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" + LOG="${LOG_DIR}/write_${SIZE}B_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" + cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" + echo ${cmd} + eval ${cmd} + sleep 2 + done + done + done + done + done +done diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py new file mode 100755 index 000000000000..607a59f58e08 --- /dev/null +++ b/csrc/aio/py_test/test_ds_aio.py @@ -0,0 +1,120 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch +import argparse +import time +import sys +from multiprocessing import Pool +import multiprocessing as mp +from deepspeed.ops.aio import aio_read, aio_write, aio_handle +from ds_aio_basic import aio_basic_multiprocessing +from ds_aio_handle import aio_handle_multiprocessing + +GB_DIVISOR = 1024**3 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--read_file', type=str, default=None, help='Read file.') + + parser.add_argument('--write_file', type=str, default=None, help='Write file.') + + parser.add_argument('--write_size', + type=str, + default=None, + help='Number of bytes to write.') + + parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') + + parser.add_argument('--threads', + type=int, + default=1, + help='Thread parallelism count.') + + parser.add_argument( + '--single_submit', + action='store_true', + help= + 'Submit I/O requests in singles (default is submit queue_depth amount at once.).' 
+ ) + + parser.add_argument('--overlap_events', + action='store_true', + help='Overlap I/O submission and completion requests.') + + parser.add_argument('--validate', + action='store_true', + help='Perform validation in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', + type=int, + default=1, + help='Count of operation repetitions') + + parser.add_argument('--io_parallel', + type=int, + default=None, + help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def refine_integer_value(value): + unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + + if value[-1] in list(unit_dict.keys()): + int_value = int(value[:-1]) * unit_dict[value[-1]] + return int_value + return int(value) + + +def refine_args(args): + if args.write_size and type(args.write_size) == str: + args.write_size = refine_integer_value(args.write_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + +def validate_args(args): + if args.read_file and not os.path.isfile(args.read_file): + print(f'args validation error: {args.read_file} not found') + return False + + return True + + +def main(): + print(f'Testing deepspeed_aio python frontend') + + args = parse_arguments() + refine_args(args) + if not validate_args(args): + quit() + + mp.set_start_method('spawn') + multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing + if args.read_file: + multiprocess_function(args, True) + + if args.write_file: + multiprocess_function(args, False) + + +if __name__ == "__main__": + main() diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py new file mode 100755 index 000000000000..689a0a503247 --- /dev/null +++ b/csrc/aio/py_test/test_ds_aio_utils.py @@ -0,0 +1,42 @@ +""" 
+Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os + +GIGABYTE = 1024**3 +LOG_TIDS = [0] + + +def task_log(tid, msg): + if tid in LOG_TIDS: + print(f'tid {tid}: {msg}') + + +def task_barrier(barrier, num_parties): + assert barrier.parties == num_parties + barrier.wait() + assert barrier.broken == False + + +def report_results(args, read_op, pool_results): + #print(f'pool_results = {pool_results}') + io_string = 'Read' if read_op else 'Write' + if None in pool_results: + print(f'Failure in one of {args.threads} {io_string} processes') + return + + total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) + + task_latency_sec = max([sec for _, sec, _ in pool_results]) + task_speed_GB = total_bytes / task_latency_sec / GIGABYTE + print(f'Task {io_string} Latency = {task_latency_sec} sec') + print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') + + e2e_latency_sec = max([sec for sec, _, _ in pool_results]) + e2e_speed_GB = total_bytes / e2e_latency_sec / GIGABYTE + print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') + print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index 5fae35261f55..ed33b04b3e10 100755 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -20,7 +20,7 @@ } \ } -#define TILE (1024 * 1024 * 1024) +#define TILE (128 * 1024 * 1024) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index a4a49dca6bf3..8b7c52e68d7e 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -304,7 +304,7 @@ def main(args=None): # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) - multi_node_exec = len(active_resources) > 1 + multi_node_exec = True # 
len(active_resources) > 1 if not multi_node_exec: deepspeed_launch = [ diff --git a/deepspeed/ops/aio/__init__.py b/deepspeed/ops/aio/__init__.py new file mode 100755 index 000000000000..50e6c9a3c988 --- /dev/null +++ b/deepspeed/ops/aio/__init__.py @@ -0,0 +1,6 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +''' + +from ..op_builder import AsyncIOBuilder diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index 807802670654..b230eaef859b 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -9,9 +9,6 @@ class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): def __init__(self, param_dict): - """ - docstring - """ super(DeepSpeedFlopsProfilerConfig, self).__init__() self.enabled = None @@ -27,9 +24,6 @@ def __init__(self, param_dict): self._initialize(flops_profiler_dict) def _initialize(self, flops_profiler_dict): - """ - docstring - """ self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 8a9785a9aedb..faa60f20efa3 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -24,7 +24,7 @@ from deepspeed.runtime.config import DeepSpeedConfig from deepspeed.utils import logger -from deepspeed.runtime.utils import move_to_device +from deepspeed.runtime.utils import move_to_device, see_memory_usage from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers # DeepSpeed Checkpointing Enabled or Disabled @@ -55,34 +55,6 @@ SYNCHRONIZE = False PROFILE_TIME = False - -def see_memory_usage(message, force=False): - # return - if not force: - return - # dist.barrier() - if dist.get_rank() == 0: - logger.info(message) - logger.info( - "Memory Allocated %s GigaBytes", - torch.cuda.memory_allocated() 
/ (1024 * 1024 * 1024), - ) - logger.info( - "Max Memory Allocated %s GigaBytes", - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), - ) - logger.info( - "Cache Allocated %s GigaBytes", - torch.cuda.memory_cached() / (1024 * 1024 * 1024), - ) - logger.info( - "Max cache Allocated %s GigaBytes", - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), - ) - logger.info("") - #input("Press Any Key To Continue ..") - - # Default name for the model parallel rng tracker. _MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' transport_stream = None @@ -685,8 +657,7 @@ def backward(ctx, *grads): torch.autograd.backward(output_tensors, grad_tensors) - see_memory_usage("After backward checkpointing code before backward", - force=False) + see_memory_usage("After backward checkpointing code after backward", force=False) if PROFILE_TIME: timers('backward').stop() diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index b9ba6cf37cba..9e33876994f9 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -24,6 +24,8 @@ from ..profiling.config import DeepSpeedFlopsProfilerConfig +from .swap_tensor.aio_config import get_aio_config + TENSOR_CORE_ALIGN_SIZE = 8 ADAM_OPTIMIZER = 'adam' @@ -656,6 +658,8 @@ def _initialize_params(self, param_dict): self.checkpoint_tag_validation_enabled = validation_mode != ValidationMode.IGNORE self.checkpoint_tag_validation_fail = validation_mode == ValidationMode.FAIL + self.aio_config = get_aio_config(param_dict) + def _batch_assertion(self): train_batch = self.train_batch_size diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 12711d56f7f6..7a6f994e806a 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -6,7 +6,7 @@ Collection of DeepSpeed configuration utilities """ import json -from collections import Counter, Mapping, Sequence +import collections # adapted from https://stackoverflow.com/a/50701137/9201239 @@ -31,13 
+31,13 @@ def iterencode(self, o, _one_shot=False, level=0): return f"{o:e}" else: return f"{o}" - elif isinstance(o, Mapping): + elif isinstance(o, collections.abc.Mapping): x = [ f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items() ] return "{" + ', '.join(x) + f"\n{prefix_close}" + "}" - elif isinstance(o, Sequence) and not isinstance(o, str): + elif isinstance(o, collections.abc.Sequence) and not isinstance(o, str): return f"[{ f', '.join(map(self.iterencode, o)) }]" return "\n, ".join(super().iterencode(o, _one_shot)) @@ -70,7 +70,7 @@ def dict_raise_error_on_duplicate_keys(ordered_pairs): """Reject duplicate keys.""" d = dict((k, v) for k, v in ordered_pairs) if len(d) != len(ordered_pairs): - counter = Counter([pair[0] for pair in ordered_pairs]) + counter = collections.Counter([pair[0] for pair in ordered_pairs]) keys = [key for key, value in counter.items() if value > 1] raise ValueError("Duplicate keys in DeepSpeed config: {}".format(keys)) return d diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 5081adb0b021..f71a7324585a 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -340,14 +340,14 @@ def zero_reduce_scatter(self): def zero_overlap_comm(self): return self._config.zero_config.overlap_comm - def zero_cpu_offload(self): - return self._config.zero_config.cpu_offload + def zero_offload_optimizer(self): + return self._config.zero_config.offload_optimizer - def zero_cpu_offload_params(self): - return self._config.zero_config.cpu_offload_params + def zero_offload_param(self): + return self._config.zero_config.offload_param - def zero_cpu_offload_use_pin_memory(self): - return self._config.zero_config.cpu_offload_use_pin_memory + def zero_cpu_offload(self): + return self._config.zero_config.offload_optimizer is not None def zero_sub_group_size(self): return self._config.zero_config.sub_group_size @@ -436,6 +436,12 @@ def initial_dynamic_scale(self): def dynamic_loss_scale_args(self):
return self._config.dynamic_loss_scale_args + def swap_tensor_config(self): + return self._config.swap_tensor_config + + def aio_config(self): + return self._config.aio_config + def _configure_lr_scheduler(self, client_lr_scheduler): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) @@ -594,8 +600,9 @@ def _configure_optimizer(self, client_optimizer, model_parameters): client_optimizer.param_groups[:] = [ pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0 ] - logger.info( - "Removing param_group that has no 'params'in the client Optimizer") + if self.global_rank == 0: + logger.info( + "Removing param_group that has no 'params' in the client Optimizer") basic_optimizer = client_optimizer if self.global_rank == 0: @@ -793,14 +800,14 @@ def _configure_zero_optimizer(self, optimizer): dp_process_group=self.data_parallel_group, reduce_scatter=self.zero_reduce_scatter(), overlap_comm=self.zero_overlap_comm(), - cpu_offload_optimizer_state=self.zero_cpu_offload(), - cpu_offload_params=self.zero_cpu_offload_params(), - cpu_offload_use_pin_memory=self.zero_cpu_offload_use_pin_memory(), + offload_optimizer_config=self.zero_offload_optimizer(), + offload_param_config=self.zero_offload_param(), sub_group_size=self.zero_sub_group_size(), mpu=self.mpu, postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), - gradient_accumulation_steps=self.gradient_accumulation_steps()) + gradient_accumulation_steps=self.gradient_accumulation_steps(), + aio_config=self.aio_config()) else: raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) @@ -905,6 +912,13 @@ def forward(self, *inputs, **kwargs): if self.module.training and self.progressive_layer_drop: kwargs.update(self.progressive_layer_drop.get_state()) + if self.zero_optimization_partition_weights(): + # Enable automated discovery of external parameters by indicating that + # we are in 
a forward pass. + for module in self.module.modules(): + module._parameters._in_forward = True + pass + if self.wall_clock_breakdown(): self.timers('forward_microstep').start() self.timers('forward').start() @@ -913,11 +927,15 @@ def forward(self, *inputs, **kwargs): self.tput_timer.start() loss = self.module(*inputs, **kwargs) - # Reset the ZeRO-3 state if we are only doing forward-passes (ie evaluation). if self.zero_optimization_partition_weights(): + # Reset the ZeRO-3 state if we are only doing forward-passes (ie evaluation). if not torch._C.is_grad_enabled(): self.optimizer.param_coordinator.reset_step() + # Disable automated discovery of external parameters + for module in self.module.modules(): + module._parameters._in_forward = False + if self.wall_clock_breakdown(): self.timers('forward').stop() self.timers('forward_microstep').stop() @@ -1085,8 +1103,9 @@ def _take_model_step(self, lr_kwargs): else: if self.lr_scheduler is not None: self.lr_scheduler.step(**(lr_kwargs or {})) - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) + + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) self.global_steps += 1 self.global_samples += self.train_batch_size() diff --git a/deepspeed/runtime/swap_tensor/__init__.py b/deepspeed/runtime/swap_tensor/__init__.py new file mode 100644 index 000000000000..8c18558e9423 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/__init__.py @@ -0,0 +1,4 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +''' diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py new file mode 100644 index 000000000000..6a7014c1863c --- /dev/null +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -0,0 +1,44 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+''' + +from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.runtime.swap_tensor.constants import * + +AIO_DEFAULT_DICT = { + AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT, + AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT, + AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT, + AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT, + AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT +} + + +def get_aio_config(param_dict): + if AIO in param_dict.keys() and param_dict[AIO] is not None: + aio_dict = param_dict[AIO] + return { + AIO_BLOCK_SIZE: + get_scalar_param(aio_dict, + AIO_BLOCK_SIZE, + AIO_BLOCK_SIZE_DEFAULT), + AIO_QUEUE_DEPTH: + get_scalar_param(aio_dict, + AIO_QUEUE_DEPTH, + AIO_QUEUE_DEPTH_DEFAULT), + AIO_THREAD_COUNT: + get_scalar_param(aio_dict, + AIO_THREAD_COUNT, + AIO_THREAD_COUNT_DEFAULT), + AIO_SINGLE_SUBMIT: + get_scalar_param(aio_dict, + AIO_SINGLE_SUBMIT, + AIO_SINGLE_SUBMIT_DEFAULT), + AIO_OVERLAP_EVENTS: + get_scalar_param(aio_dict, + AIO_OVERLAP_EVENTS, + AIO_OVERLAP_EVENTS_DEFAULT) + } + + return AIO_DEFAULT_DICT diff --git a/deepspeed/runtime/swap_tensor/async_swapper.py b/deepspeed/runtime/swap_tensor/async_swapper.py new file mode 100644 index 000000000000..e6e19a4c67ef --- /dev/null +++ b/deepspeed/runtime/swap_tensor/async_swapper.py @@ -0,0 +1,173 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. 
+""" +import torch + +from deepspeed.utils.logging import logger +from deepspeed.runtime.swap_tensor.utils import swap_out_tensors, SwapBuffer + +INVALID_BUFFER_INDEX = -1 +ASYNC_SWAPPER_WAIT_TIMER = 'async_swap_gradient_wait' + + +class AsyncTensorSwapper(object): + def __init__(self, aio_handle, numel_alignment, timers): + self.free_buffer_index = [] + self.swapping_buffer_index = [] + self.ready_buffer_index = [] + self.current_buffer_index = INVALID_BUFFER_INDEX + self.all_buffers = [] + self.aio_handle = aio_handle + self.numel_alignment = numel_alignment + self.max_numel = 0 + self.num_pending_swaps = 0 + self.timers = timers + self.timer_names = set() + self.num_elements_swapped = 0 + self.dtype = None + + def has_buffers(self): + return len(self.all_buffers) > 0 + + def add_buffers(self, buffer_list): + assert len(self.all_buffers) == 0 + assert all([buffer.is_pinned() for buffer in buffer_list]) + dtype = buffer_list[0].dtype + assert all([buffer.dtype == dtype for buffer in buffer_list]) + + self.dtype = dtype + self.all_buffers = [SwapBuffer(buffer) for buffer in buffer_list] + self.free_buffer_index += [i for i in range(len(self.all_buffers))] + self.max_numel = max([buffer.numel() for buffer in buffer_list]) + self.timer_names = set() + + def get_timer_names(self): + return list(self.timer_names) + + def release_buffers(self): + self._report_statistics('Swapped out[Before flush]') + self._flush_buffers_until_complete() + self._report_statistics('Swapped out[After flush]') + + pinned_buffers = [buf.buffer for buf in self.all_buffers] + self.all_buffers = [] + self.free_buffer_index = [] + self.current_buffer_index = INVALID_BUFFER_INDEX + self.num_elements_swapped = 0 + self.dtype = None + + return pinned_buffers + + def swap_out_tensors(self, tensor_list, path_list): + for tensor, swap_path in zip(tensor_list, path_list): + self._swap_out_tensor(tensor, swap_path) + + def _report_statistics(self, message): + if torch.distributed.get_rank() == 0: + 
element_size = torch.tensor([], dtype=self.dtype).element_size() + swapped_GB = (self.num_elements_swapped * element_size) / (1024**3) + logger.info( + f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB' + ) + + def _swap_out_tensor(self, tensor, swap_path): + assert len(self.all_buffers) > 0 + + aligned_numel = self._io_aligned_numel(tensor.numel()) + assert aligned_numel <= self.max_numel + + self._make_swap_space(aligned_numel) + assert self.current_buffer_index != INVALID_BUFFER_INDEX + + swap_buffer = self._get_current_buffer() + swap_buffer.insert_tensor(tensor, swap_path, aligned_numel) + + def _make_swap_space(self, numel): + if self.current_buffer_index == INVALID_BUFFER_INDEX: + self._allocate_buffer() + return + + if not self._get_current_buffer().has_space(numel): + if len(self.free_buffer_index) > 0: + self._flush_ready_buffers() + else: + self._flush_buffers_until_complete() + self._allocate_buffer() + + def _io_aligned_numel(self, numel): + remainder = numel % self.numel_alignment + return numel if remainder == 0 else (numel + self.numel_alignment - remainder) + + def _allocate_buffer(self): + assert self.current_buffer_index == INVALID_BUFFER_INDEX + assert len(self.all_buffers) > 0 + assert len(self.free_buffer_index) > 0 + self.current_buffer_index = self.free_buffer_index[-1] + self.free_buffer_index = self.free_buffer_index[:-1] + + def _flush_ready_buffers(self): + if self.current_buffer_index != INVALID_BUFFER_INDEX: + self.ready_buffer_index.append(self.current_buffer_index) + self.current_buffer_index = INVALID_BUFFER_INDEX + + self._swap_out_ready_buffers() + + def _flush_buffers_until_complete(self): + self._flush_ready_buffers() + assert len(self.ready_buffer_index) == 0 + + self._wait_for_swap_complete() + assert len(self.swapping_buffer_index) == 0 + assert len(self.free_buffer_index) == len(self.all_buffers) + + def _swap_out_ready_buffers(self): + for buffer_index in self.ready_buffer_index: + buffer = 
self._get_buffer(buffer_index) + swap_tensors = buffer.get_swap_tensors() + swap_paths = buffer.get_swap_paths() + self.num_pending_swaps += len(swap_tensors) + swap_out_tensors(self.aio_handle, swap_tensors, swap_paths) + + self.swapping_buffer_index += self.ready_buffer_index + self.ready_buffer_index = [] + + def _wait_for_swap_complete(self): + assert len(self.swapping_buffer_index) > 0 + + self._start_timer(ASYNC_SWAPPER_WAIT_TIMER) + assert self.aio_handle.wait() == self.num_pending_swaps + self._stop_timer(ASYNC_SWAPPER_WAIT_TIMER) + self.timer_names.add(ASYNC_SWAPPER_WAIT_TIMER) + + self.num_pending_swaps = 0 + + for buffer_index in self.swapping_buffer_index: + buffer = self._get_buffer(buffer_index) + self.num_elements_swapped += buffer.get_num_elem() + buffer.reset() + + self.free_buffer_index += self.swapping_buffer_index + assert len(self.free_buffer_index) <= len(self.all_buffers) + self.swapping_buffer_index = [] + + def _get_buffer(self, index): + assert index != INVALID_BUFFER_INDEX + return self.all_buffers[index] + + def _get_current_buffer(self): + return self._get_buffer(self.current_buffer_index) + + def _start_timer(self, name): + if self.timers: + self.timers(name).start() + + def _stop_timer(self, name): + if self.timers: + self.timers(name).stop() + + def _log_timers(self, name_list, force=False): + if self.timers and force: + self.timers.log(name_list) diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py new file mode 100644 index 000000000000..752ec8dcaacf --- /dev/null +++ b/deepspeed/runtime/swap_tensor/constants.py @@ -0,0 +1,27 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+""" +######################################### +# AIO +######################################### +AIO_FORMAT = ''' +"aio": { + "block_size": 1048576, + "queue_depth": 8, + "thread_count": 1, + "single_submit": false, + "overlap_events": true +} +''' +AIO = "aio" +AIO_BLOCK_SIZE = "block_size" +AIO_BLOCK_SIZE_DEFAULT = 1048576 +AIO_QUEUE_DEPTH = "queue_depth" +AIO_QUEUE_DEPTH_DEFAULT = 8 +AIO_THREAD_COUNT = "thread_count" +AIO_THREAD_COUNT_DEFAULT = 1 +AIO_SINGLE_SUBMIT = "single_submit" +AIO_SINGLE_SUBMIT_DEFAULT = False +AIO_OVERLAP_EVENTS = "overlap_events" +AIO_OVERLAP_EVENTS_DEFAULT = True diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py new file mode 100644 index 000000000000..5785ff7cc36d --- /dev/null +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -0,0 +1,526 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer +from deepspeed.runtime.swap_tensor.utils import SwapBufferManager, SwapBufferPool + + +class FlattenedTensorSwapInfo(object): + def __init__(self, path, length, offset): + self.path = path + self.offset = offset + self.length = length + + +class OptimizerStateSwapInfo(object): + def __init__(self, parameter, numel, base_folder): + self.tensors = [] + self.param_id = id(parameter) + self.swap_folder = base_folder + self.swap_paths = [] + self.swapped_gradients = {} + self.unswapped_gradients = {} + self.tensor_numel = numel + self.tensor_dtype = parameter.dtype + self.tensor_device = parameter.device + self.has_state_tensors = False + self._add_tensors([parameter]) + + def numel(self): + return self.tensor_numel + + def has_gradients(self): + return self.swapped_gradients or self.unswapped_gradients + + def _add_tensors(self, tensor_list): + for t in tensor_list: + self.tensors.append(t) + self.swap_paths.append(os.path.join(self.swap_folder, f'{id(t)}.tensor.swp')) + + def add_state_tensors(self, tensor_list): + self.has_state_tensors = True + self._add_tensors(tensor_list) + + def device(self): + return self.tensor_device + + def dtype(self): + return self.tensor_dtype + + def release_memory(self): + for tensor in self.tensors: + tensor.data = torch.Tensor() + + def get_or_create_gradient_paths(self, offsets, lengths): + gradient_paths = [] + for offset, length in zip(offsets, lengths): + if not offset in self.swapped_gradients.keys(): + path = os.path.join( + self.swap_folder, + f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') + self.swapped_gradients[offset] = FlattenedTensorSwapInfo( + path, + length, + offset) 
+ + gradient_paths.append(self.swapped_gradients[offset].path) + + return gradient_paths + + def set_swap_buffers(self, buffers): + compute_lengths = [self.numel()] * len(self.tensors) + compute_buffers = get_sized_buffers(buffers, compute_lengths) + for t, buffer in zip(self.tensors, compute_buffers): + t.data = buffer.data + + def get_swap_gradient_buffers(self, swap_buffer): + assert self.numel() <= swap_buffer.numel() + return [ + swap_buffer.narrow(0, + grad.offset, + grad.length) for grad in self.swapped_gradients.values() + ] + + def get_swap_gradient_paths(self): + return [grad.path for grad in self.swapped_gradients.values()] + + def get_unpinned_state_tensors(self): + return [t for t in self.tensors if not t.is_pinned()] + + def read_unswapped_gradients(self, dest_buffer): + num_elem_count = 0 + for offset, grad_partition in self.unswapped_gradients.items(): + dst_tensor = dest_buffer.narrow(0, offset, grad_partition.numel()) + dst_tensor.data.copy_(grad_partition.data) + num_elem_count += grad_partition.numel() + + return num_elem_count + + def release_unswapped_gradients(self): + self.unswapped_gradients = {} + + +SWAPPER_DEBUG_MODE = False +SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' + + +class OptimizerSwapper(object): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + self.swap_config = swap_config + self.aio_config = aio_config + + # NVMe swap management + self.swap_params_info = {} + self.swap_element_size = torch.tensor([], dtype=dtype).element_size() + self.swap_folder = os.path.join(base_folder, + 'optimizer', + f'rank{torch.distributed.get_rank()}') + os.makedirs(self.swap_folder, exist_ok=True) + + self.optimizer = optimizer + + # Swap buffer management + self.largest_numel = largest_numel + self.dtype = dtype + self.swap_buffer_manager = SwapBufferManager( + num_elems=largest_numel, + count=swap_config[OFFLOAD_OPTIMIZER_BUFFER_COUNT], + dtype=dtype) + + # Read/Write 
alignment for each thread during Intra-request parallelism + self.min_aio_bytes = max(MIN_AIO_BYTES, aio_config[AIO_BLOCK_SIZE]) + self.aligned_bytes = AIO_ALIGNED_BYTES * aio_config[AIO_THREAD_COUNT] + self.numel_alignment = self.aligned_bytes // self.swap_element_size + + # Timers + self.timers = timers + self.timer_names = set() + + # Print exclusion list + self.print_exclude_list = [ + 'optimizer', + 'swap_buffer_manager', + 'swap_params_info', + 'timers', + 'timer_names', + ] + + def swappable_tensor(self, param=None, numel=None): + assert param is not None or numel is not None, "Either param or numel must be provided" + if param is not None: + return self.min_aio_bytes <= (param.numel() * self.swap_element_size) + return self.min_aio_bytes <= (numel * self.swap_element_size) + + def init_timers(self): + self.timer_names = set() + + def log_timers(self): + if self.timer_names: + self._log_timers(list(self.timer_names), force=True) + + def pre_backward(self): + self.init_timers() + + def post_backward(self): + pass + + def _flush_gradient_swapper(self, gradient_swapper): + if gradient_swapper.has_buffers(): + self._start_timer(SWAP_OUT_GRADIENT_TIMER) + pinned_buffers = gradient_swapper.release_buffers() + self.swap_buffer_manager.free(pinned_buffers) + self._stop_timer(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.update(gradient_swapper.get_timer_names()) + + def _swap_out_gradients(self, + parameter, + gradient_offsets, + gradient_tensors, + gradient_swapper): + if not id(parameter) in self.swap_params_info.keys(): + return + + swap_info = self.swap_params_info[id(parameter)] + + swappable_tensors = [] + swappable_offsets = [] + swappable_lengths = [] + + aligned_gradients, aligned_offsets = self._adjust_for_misaligned_lengths( + tensors=gradient_tensors, + offsets=gradient_offsets + ) + + self._start_timer(SWAP_OUT_GRADIENT_TIMER) + for tensor, offset in zip(aligned_gradients, aligned_offsets): + if not 
self.swappable_tensor(param=tensor): + swap_info.unswapped_gradients[offset] = tensor + continue + + swappable_tensors.append(tensor) + swappable_offsets.append(offset) + swappable_lengths.append(tensor.numel()) + + if len(swappable_tensors) > 0: + if not gradient_swapper.has_buffers(): + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + + gradient_swapper.add_buffers(pinned_buffers) + + swappable_paths = swap_info.get_or_create_gradient_paths( + swappable_offsets, + swappable_lengths) + + gradient_swapper.swap_out_tensors(tensor_list=swappable_tensors, + path_list=swappable_paths) + + self._stop_timer(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) + + def _initialize_from_swapped_fp16_params(self, + aio_handle, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + assert len(fp32_parameters) == len(fp16_partitions_info) + assert len(fp32_parameters) == len(fp16_num_elems) + assert all([buffer.is_pinned() for buffer in fp16_pinned_buffers]) + + fp32_swap_paths = self._get_swap_paths(parameters=fp32_parameters, + num_elems=fp16_num_elems) + + fp32_pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + + fp16_buffer_numel = [buf.numel() for buf in fp16_pinned_buffers] + assert all([numel >= self.largest_numel for numel in fp16_buffer_numel]), \ + f"numel of fp16 buffers {fp16_buffer_numel} is too small for initializing fp32 params {self.largest_numel}" + + fp32_swap_buffers = SwapBufferPool(fp32_pinned_buffers) + fp16_swap_buffers = SwapBufferPool(fp16_pinned_buffers) + + curr_index = 0 + while curr_index < len(fp32_parameters): + fp16_pinned_tensors = self._swap_in_fp16_params( + aio_handle=aio_handle, + fp16_num_elems=fp16_num_elems[curr_index:], + fp16_partitions_info=fp16_partitions_info[curr_index:], + fp16_swap_buffers=fp16_swap_buffers) + + if torch.distributed.get_rank() == 0 and 
SWAPPER_DEBUG_MODE: + for i, tensor in enumerate(fp16_pinned_tensors): + true_index = curr_index + i + logger.info( + f'swap_in_fp16_param: fp32_id = {id(fp32_parameters[true_index])} index = {true_index} orig_num_elem = {fp16_num_elems[true_index]}, swap_num_elem = {fp16_pinned_tensors[i].numel()}' + ) + + swap_out_count = self._swap_out_fp16_params( + aio_handle=aio_handle, + fp32_swap_paths=fp32_swap_paths[curr_index:], + fp32_swap_buffers=fp32_swap_buffers, + fp16_pinned_tensors=fp16_pinned_tensors) + assert swap_out_count == len(fp16_pinned_tensors), \ + f"{swap_out_count} does not match {len(fp16_pinned_tensors)}" + + fp16_swap_buffers.reset() + fp32_swap_buffers.reset() + curr_index += swap_out_count + + self.swap_buffer_manager.free(fp32_pinned_buffers) + + def _swap_in_fp16_params(self, + aio_handle, + fp16_num_elems, + fp16_partitions_info, + fp16_swap_buffers): + assert len(fp16_num_elems) > 0 + + swapped_fp16_tensors = [] + swap_tensors = [] + swap_paths = [] + unswapped_srcs = [] + unswapped_dsts = [] + + for i, numel in enumerate(fp16_num_elems): + pinned_tensor, _ = fp16_swap_buffers.allocate_tensor(numel, None, numel) + if pinned_tensor is None: + break + + swapped_fp16_tensors.append(pinned_tensor) + offset = 0 + for tensor, partition_numel, partition_path in fp16_partitions_info[i]: + dst_tensor = pinned_tensor.narrow(0, offset, partition_numel) + if partition_path is None: + unswapped_srcs.append(tensor) + unswapped_dsts.append(dst_tensor) + else: + swap_paths.append(partition_path) + swap_tensors.append(dst_tensor) + offset += partition_numel + + assert len(swapped_fp16_tensors) + len(unswapped_srcs) > 0 + ret = swap_in_tensors(aio_handle, swap_tensors, swap_paths) + for src, dst in zip(unswapped_srcs, unswapped_dsts): + dst.data.copy_(src.data) + + assert len(swap_tensors) == aio_handle.wait() + + return swapped_fp16_tensors + + def _swap_out_fp16_params(self, + aio_handle, + fp32_swap_paths, + fp32_swap_buffers, + fp16_pinned_tensors): + + 
assert len(fp16_pinned_tensors) <= len(fp32_swap_paths) + swap_out_count = 0 + for i, fp16_tensor in enumerate(fp16_pinned_tensors): + if not fp32_swap_buffers.has_space(fp16_tensor.numel()): + fp32_swap_buffers.swap_out(aio_handle) + fp32_swap_buffers.reset() + + pinned_tensor, _ = fp32_swap_buffers.insert_tensor( + fp16_tensor, + fp32_swap_paths[i], + self._io_aligned_numel(fp16_tensor.numel()) + ) + assert pinned_tensor is not None + swap_out_count += 1 + + if len(fp32_swap_buffers.get_swap_tensors()) > 0: + fp32_swap_buffers.swap_out(aio_handle) + + return swap_out_count + + def _initialize_parameters(self, parameters, src_tensors, aio_handle): + assert len(parameters) == len(src_tensors) + + swap_paths = self._get_swap_paths(parameters=parameters, + num_elems=[src.numel() for src in src_tensors]) + + SWAP_INIT_TIMER = "swap_init_write" + self._start_timer(SWAP_INIT_TIMER) + + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + assert pinned_buffers is not None + + self._swap_out_unpinned_tensors(aio_handle=aio_handle, + unpinned_tensors=src_tensors, + dest_paths=swap_paths, + pinned_buffers=pinned_buffers) + + if torch.distributed.get_rank() == 0 and SWAPPER_DEBUG_MODE: + for i, tensor in enumerate(src_tensors): + logger.info( + f'copy_in_fp16_param: fp32_id = {id(parameters[i])} index = {i}, swap_num_elem = {src_tensors[i].numel()}' + ) + + self.swap_buffer_manager.free(pinned_buffers) + + self._stop_timer(SWAP_INIT_TIMER) + self._log_timers([SWAP_INIT_TIMER]) + + def _get_swap_paths(self, parameters, num_elems): + swap_info_list = [ + self._create_param_swap_info(parameter=p, + numel=numel) \ + for p, numel in zip(parameters, num_elems) + ] + assert len(swap_info_list) == len(num_elems) + + swap_paths = [info.swap_paths[0] for info in swap_info_list] + return swap_paths + + def _swap_out_unpinned_tensors(self, + aio_handle, + unpinned_tensors, + dest_paths, + pinned_buffers): + + swap_buffer_count = 
len(pinned_buffers) + unpinned_tensor_count = len(unpinned_tensors) + + for i in range(0, unpinned_tensor_count, swap_buffer_count): + swap_tensor_count = min((unpinned_tensor_count - i), swap_buffer_count) + + src_tensors = unpinned_tensors[i:(i + swap_tensor_count)] + compute_lengths = [t.numel() for t in src_tensors] + compute_buffers = get_sized_buffers(pinned_buffers, compute_lengths) + + for dst, src in zip(compute_buffers, src_tensors): + dst.data.copy_(src.data) + + swap_lengths = [self._io_aligned_numel(t.numel()) for t in src_tensors] + swap_buffers = get_sized_buffers(pinned_buffers, swap_lengths) + + swap_paths = dest_paths[i:(i + swap_tensor_count)] + swap_out_tensors(aio_handle, swap_buffers, swap_paths) + + assert aio_handle.wait() == swap_tensor_count + + def _adjust_for_misaligned_lengths(self, tensors, offsets): + new_tensors = [] + new_offsets = [] + + for orig_tensor, orig_offset in zip(tensors, offsets): + if not self.swappable_tensor(param=orig_tensor): + new_tensors.append(orig_tensor) + new_offsets.append(orig_offset) + continue + + remainder = orig_tensor.numel() % self.numel_alignment + if remainder == 0: + new_tensors.append(orig_tensor) + new_offsets.append(orig_offset) + continue + + # Split into two by making remainder a tensor + aligned_length = (orig_tensor.numel() // + self.numel_alignment) * self.numel_alignment + new_tensors.append(orig_tensor.narrow(0, 0, aligned_length)) + new_offsets.append(orig_offset) + + # remainder tensor + new_tensors.append(orig_tensor.narrow(0, aligned_length, remainder)) + new_offsets.append(orig_offset + aligned_length) + + return new_tensors, new_offsets + + def _retrieve_unswapped_grad_partitions(self, swap_info, dest_buffer): + UNSWAPPED_READ_GRADIENTS = 'unswapped_read_gradients' + self._start_timer(UNSWAPPED_READ_GRADIENTS) + tensor_count = len(swap_info.unswapped_gradients) + num_elem_count = swap_info.read_unswapped_gradients(dest_buffer) + self._stop_timer(UNSWAPPED_READ_GRADIENTS) + 
self._log_timers([UNSWAPPED_READ_GRADIENTS]) + + # It shoud be safe to discard unswapped gradient partitions + swap_info.release_unswapped_gradients() + + if SWAPPER_DEBUG_MODE: + logger.info( + f'optimizer_retreive_unswapped_radients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}' + ) + + def _get_state_tensors(self, parameter): + if not parameter in self.optimizer.state: + return [] + + tensor_list = [] + for value in self.optimizer.state[parameter].values(): + if torch.is_tensor(value): + tensor_list.append(value) + + return tensor_list + + def _update_param_state_info(self, swap_info, parameter): + if not swap_info.has_state_tensors: + state_tensors = self._get_state_tensors(parameter) + if state_tensors: + swap_info.add_state_tensors(state_tensors) + + def _create_param_swap_info(self, parameter, numel): + param_id = id(parameter) + assert not param_id in self.swap_params_info + + self.swap_params_info[param_id] = OptimizerStateSwapInfo( + parameter=parameter, + numel=numel, + base_folder=self.swap_folder) + swap_info = self.swap_params_info[param_id] + + self._update_param_state_info(swap_info, parameter) + + return swap_info + + def _get_param_swap_info(self, parameter): + param_id = id(parameter) + swap_info = self.swap_params_info.get(param_id, None) + + if swap_info is not None: + self._update_param_state_info(swap_info, parameter) + + return swap_info + + def _start_timer(self, name): + if self.timers: + self.timers(name).start() + + def _stop_timer(self, name): + if self.timers: + self.timers(name).stop() + + def _log_timers(self, name_list, force=False): + if self.timers and (SWAPPER_DEBUG_MODE or force): + self.timers.log(name_list) + + def _io_aligned_numel(self, numel): + remainder = numel % self.numel_alignment + return numel if remainder == 0 else (numel + self.numel_alignment - remainder) diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py 
b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py new file mode 100644 index 000000000000..fb321f3b3937 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -0,0 +1,260 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder + +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer +from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper +from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper + +DEBUG_MODE = False + +SWAP_IN_PARAM_TIMER = 'swap_in_param' +SWAP_OUT_PARAM_TIMER = 'swap_out_param' +SWAP_IN_GRADIENT_TIMER = 'swap_in_gradient' + + +class PartitionedOptimizerSwapper(OptimizerSwapper): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + super(PartitionedOptimizerSwapper, + self).__init__(swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers) + + aio_op = AsyncIOBuilder().load() + self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + # Overlap swapping out + self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.aio_handle, + numel_alignment=self.numel_alignment, + timers=self.timers) + + self.print_exclude_list += [ + 'aio_handle', + 'gradient_swapper', + 'print_exclude_list' + ] + + if torch.distributed.get_rank() == 0: + print_object(obj=self, + name='PartitionedOptimizerSwapper', + 
exclude_list=self.print_exclude_list) + + def initialize_parameters(self, parameters, src_tensors): + self._initialize_parameters(parameters=parameters, + src_tensors=src_tensors, + aio_handle=self.aio_handle) + + def initialize_from_swapped_fp16_params(self, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + self._initialize_from_swapped_fp16_params( + aio_handle=self.aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) + + def flush_gradients(self): + self._flush_gradient_swapper(self.gradient_swapper) + + def swap_in_optimizer_state(self, parameter, async_parameter=None): + swap_info = self._get_param_swap_info(parameter) + if swap_info is None: + return + + self._flush_gradient_swapper(self.gradient_swapper) + + required_buffer_count = len( + swap_info.tensors) + (1 if swap_info.has_gradients() else 0) + aligned_numel = self._io_aligned_numel(swap_info.numel()) + pinned_buffers = self.swap_buffer_manager.allocate(num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) + assert pinned_buffers is not None + self.allocated_swap_buffers = pinned_buffers.copy() + + self._start_timer(SWAP_IN_PARAM_TIMER) + self._swap_in_parameter(aio_handle=self.aio_handle, + parameter=parameter, + dest_buffers=pinned_buffers[:required_buffer_count]) + self._stop_timer(SWAP_IN_PARAM_TIMER) + self.timer_names.add(SWAP_IN_PARAM_TIMER) + + self._start_timer(SWAP_IN_GRADIENT_TIMER) + self._swap_in_gradients(aio_handle=self.aio_handle, + parameter=parameter, + dest_buffer=pinned_buffers[-1]) + self._stop_timer(SWAP_IN_GRADIENT_TIMER) + self.timer_names.add(SWAP_IN_GRADIENT_TIMER) + + def swap_out_optimizer_state(self, parameter, async_swap=False): + swap_info = self._get_param_swap_info(parameter=parameter) + + if swap_info is None: + return + + self._start_timer(SWAP_OUT_PARAM_TIMER) + pinned_tensors, pinned_paths, 
unpinned_tensors, unpinned_paths = self._seperate_pinned_tensors(swap_info) + swap_bytes = sum([ + self._io_aligned_numel(t.numel()) * t.element_size() + for t in swap_info.tensors + ]) + + WRITE_TIMER = 'swap_submit_write' + self._start_timer(WRITE_TIMER) + + swap_out_tensors(self.aio_handle, pinned_tensors, pinned_paths) + assert self.aio_handle.wait() == len(pinned_tensors) + for t in pinned_tensors: + t.data = torch.Tensor() + + if len(unpinned_tensors) > 0: + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + self._swap_out_unpinned_tensors(aio_handle=self.aio_handle, + unpinned_tensors=unpinned_tensors, + dest_paths=unpinned_paths, + pinned_buffers=pinned_buffers) + self.allocated_swap_buffers += pinned_buffers + + for t in unpinned_tensors: + t.data = torch.Tensor() + self._stop_timer(WRITE_TIMER) + + self.swap_buffer_manager.free(self.allocated_swap_buffers) + self.allocated_swap_buffers = [] + + self._stop_timer(SWAP_OUT_PARAM_TIMER) + self.timer_names.add(SWAP_OUT_PARAM_TIMER) + + self._log_timers([WRITE_TIMER]) + + if DEBUG_MODE and torch.distributed.get_rank() == 0: + logger.info(f'optimizer_param_swap_out: {(swap_bytes/(1024**3)):5.2f} GB') + + def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): + self._swap_out_gradients(parameter=parameter, + gradient_offsets=gradient_offsets, + gradient_tensors=gradient_tensors, + gradient_swapper=self.gradient_swapper) + + def _swap_in_parameter(self, aio_handle, parameter, dest_buffers): + swap_info = self._get_param_swap_info(parameter) + if swap_info is None: + return + + assert len(swap_info.tensors) <= len(dest_buffers) + + swap_lengths = [self._io_aligned_numel(swap_info.numel())] * len( + swap_info.tensors) + swap_buffers = get_sized_buffers(dest_buffers, swap_lengths) + + READ_TIMER = 'swap_submit_read_param' + WAIT_TIMER = 'swap_wait_read_param' + + self._start_timer(READ_TIMER) + swap_in_tensors(aio_handle, swap_buffers, 
swap_info.swap_paths) + self._stop_timer(READ_TIMER) + + swap_bytes = sum( + [buffer.numel() * buffer.element_size() for buffer in swap_buffers]) + + self._start_timer(WAIT_TIMER) + aio_handle.wait() + self._stop_timer(WAIT_TIMER) + + compute_lengths = [swap_info.numel()] * len(swap_info.tensors) + compute_buffers = get_sized_buffers(dest_buffers, compute_lengths) + for t, buffer in zip(swap_info.tensors, compute_buffers): + t.data = buffer.data + + self._log_timers([READ_TIMER, WAIT_TIMER]) + if DEBUG_MODE and torch.distributed.get_rank() == 0: + logger.info(f'optimizer_param_swap_in: {(swap_bytes/(1024**3)):5.2f} GB') + + def _seperate_pinned_tensors(self, swap_info): + pinned_tensors = [] + pinned_paths = [] + + unpinned_tensors = [] + unpinned_paths = [] + + for tensor, path in zip(swap_info.tensors, swap_info.swap_paths): + if tensor.is_pinned(): + pinned_tensors.append(tensor) + pinned_paths.append(path) + else: + unpinned_tensors.append(tensor) + unpinned_paths.append(path) + + return pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths + + def _swap_in_pinned_gradients(self, aio_handle, parameter, gradient_tensor): + swap_info = self.swap_params_info[id(parameter)] + param_gradients = swap_info.swapped_gradients.values() + swap_buffers = [ + gradient_tensor.narrow(0, + grad.offset, + grad.length) for grad in param_gradients + ] + swap_paths = [grad.path for grad in param_gradients] + SWAP_READ_GRADIENTS = 'swap_submit_read_gradient' + SWAP_WAIT_GRADIENTS = 'swap_submit_wait_gradient' + + self._start_timer(SWAP_READ_GRADIENTS) + swap_in_tensors(aio_handle, swap_buffers, swap_paths) + self._stop_timer(SWAP_READ_GRADIENTS) + + self._start_timer(SWAP_WAIT_GRADIENTS) + assert len(swap_buffers) == aio_handle.wait() + self._stop_timer(SWAP_WAIT_GRADIENTS) + + self._log_timers([SWAP_READ_GRADIENTS, SWAP_WAIT_GRADIENTS]) + + def _swap_in_gradients(self, aio_handle, parameter, dest_buffer): + swap_info = self.swap_params_info.get(id(parameter), None) + if 
not (swap_info and swap_info.has_gradients()): + return + + assert dest_buffer.is_pinned() + assert parameter.numel() <= dest_buffer.numel() + + parameter.grad = dest_buffer.narrow(0, 0, parameter.numel()) + + if swap_info.swapped_gradients: + self._swap_in_pinned_gradients(aio_handle, parameter, parameter.grad) + + if swap_info.unswapped_gradients: + self._retrieve_unswapped_grad_partitions(swap_info=swap_info, + dest_buffer=parameter.grad) diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py new file mode 100644 index 000000000000..0f8a690b7a91 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -0,0 +1,308 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. +""" + +import os +from enum import Enum +import torch +import torch.distributed as dist + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder +from .constants import * +from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, print_object +from ..zero.offload_constants import * + + +def print_rank_0(message, debug=False, force=False): + if torch.distributed.get_rank() == 0 and (debug or force): + print(message) + + +class PartitionedParamStatus(Enum): + # Partitioned parameters are present and ready for use + AVAILABLE = 1 + + # partitioned params are in some non-memory device + NOT_AVAILABLE = 2 + + # partitioned params are being read from some non-memory device. 
+ INFLIGHT = 3 + + +class AsyncPartitionedParameterSwapper(object): + def __init__(self, ds_config): + + aio_op = AsyncIOBuilder().load(verbose=False) + self.aio_handle = aio_op.aio_handle + + #set swap buffers, create aio handles + self._configure_aio(ds_config) + + #mapping from param id to path + self.id_to_path = {} + + #mapping from pram_id to buffer id + self.param_id_to_buffer_id = {} + + #number of elements in the param + self.param_id_to_numel = {} + + self.pending_writes = 0 + self.pending_reads = 0 + + #keep track of async swap in params and buffers + self.inflight_params = [] + self.inflight_swap_in_buffers = [] + self.inflight_numel = 0 + + #keep track of available params + self.available_params = set() + self.available_numel = 0 + + self.invalid_buffer = torch.tensor(1).half() + + if dist.get_rank() == 0: + exclude_list = ['aio_read_handle', 'aio_write_handle', 'buffers'] + print_object(obj=self, + name='AsyncPartitionedParameterSwapper', + exclude_list=exclude_list) + + def available_swap_in_buffers(self): + return len(self.available_buffer_ids) + + def _configure_aio(self, ds_config): + self.swap_config = ds_config.zero_config.offload_param + self.swap_folder = os.path.join(self.swap_config[OFFLOAD_PARAM_NVME_PATH], + 'zero_stage_3', + 'fp16params', + f'rank{dist.get_rank()}') + os.makedirs(self.swap_folder, exist_ok=True) + + self.elements_per_buffer = self.swap_config[OFFLOAD_PARAM_BUFFER_SIZE] + self.param_buffer_count = self.swap_config[OFFLOAD_PARAM_BUFFER_COUNT] + + self.available_buffer_ids = [i for i in range(self.param_buffer_count)] + self.reserved_buffer_ids = [] + + self.buffers = torch.empty(int(self.elements_per_buffer * + self.param_buffer_count), + dtype=torch.half, + pin_memory=True, + requires_grad=False) + + self.aio_config = ds_config.aio_config + + self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], + self.aio_config[AIO_QUEUE_DEPTH], + self.aio_config[AIO_SINGLE_SUBMIT], + 
self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_config[AIO_THREAD_COUNT]) + + self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], + self.aio_config[AIO_QUEUE_DEPTH], + self.aio_config[AIO_SINGLE_SUBMIT], + self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_config[AIO_THREAD_COUNT]) + + self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE]) + + self.swap_element_size = torch.tensor([], dtype=torch.half).element_size() + self.swap_out_params = [] + + #Check if partiitoned param or numel in a tensor is swappable or not + def swappable_tensor(self, param=None, numel=None): + if param is not None: + assert numel is None, "Both parma and numel cannot be provided" + numel = param.ds_tensor.ds_numel + if numel is not None: + return self.min_aio_bytes <= numel * self.swap_element_size + assert False, "Either param or numel must be provided" + + def get_path(self, param, must_exist=False): + paths, _ = self._get_paths([param], must_exist=must_exist) + return paths[0] + + def _get_paths(self, params, must_exist=False): + paths = [] + tensors = [] + for param in params: + param_id = param.ds_id + + if param_id in self.id_to_path.keys(): + param_path = self.id_to_path[param_id] + else: + assert not must_exist, f"Path for param id {param_id} does not exist" + param_path = os.path.join(self.swap_folder, + f'{param_id}_param.tensor.swp') + + self.id_to_path[param_id] = param_path + paths.append(param_path) + tensors.append(param.ds_tensor) + return paths, tensors + + def _track_numel(self, params): + for param in params: + assert param.ds_tensor is not None, "Partitioned tensor is None" + self.param_id_to_numel[param.ds_id] = param.ds_tensor.ds_numel + + def _allocate_and_return_buffers_for_swap_in(self, params): + buffers = [] + for param in params: + param_id = param.ds_id + assert param_id in self.param_id_to_numel.keys(), f" Number of elements in param {param_id} is unknown" + assert param_id not in self.param_id_to_buffer_id.keys(), f"param 
{param_id} already assigned swap buffer id {self.param_id_to_buffer_id[param_id]}" + + buffer_id = self.available_buffer_ids.pop() + print_rank_0( + f"param {param.ds_id} is assigned swap in buffer id {buffer_id} ") + self.param_id_to_buffer_id[param_id] = buffer_id + buffer = self.buffers.narrow(0, + int(buffer_id * self.elements_per_buffer), + self.param_id_to_numel[param_id]) + buffers.append(buffer) + + return buffers + + #waits for inflight nvme write to complete + def synchronize_writes(self): + if self.pending_writes == 0: + return + assert self.pending_writes == self.aio_write_handle.wait() + self.pending_writes = 0 + self.remove_partition_and_release_buffers(self.swap_out_params) + self.swap_out_params = [] + + #waits for inflight nvme reads to complete + def synchronize_reads(self): + if self.pending_reads == 0: + return + + assert self.pending_reads == self.aio_read_handle.wait() + + self.pending_reads = 0 + + for param, swap_in_buffer in zip(self.inflight_params, self.inflight_swap_in_buffers): + param.ds_tensor.data = swap_in_buffer.data + param.ds_tensor.status = PartitionedParamStatus.AVAILABLE + + self.available_params.update([param.ds_id for param in self.inflight_params]) + self.available_numel += self.inflight_numel + + self.inflight_params = [] + self.inflight_swap_in_buffers = [] + self.inflight_numel = 0 + + #Removes the memory assignment and releases the buffers + #Should only be executed after swapping out the tensors + def remove_partition_and_release_buffers(self, params): + for param in params: + param_id = param.ds_id + + if param_id in self.param_id_to_buffer_id.keys(): + + buffer_id = self.param_id_to_buffer_id[param_id] + + assert buffer_id is not None, "Missing buffer id for releasing" + + self.available_buffer_ids.append(buffer_id) + del self.param_id_to_buffer_id[param_id] + print_rank_0(f"param {param.ds_id} releases buffer id {buffer_id} ") + + if param_id in self.available_params: + self.available_params.remove(param_id) + 
self.available_numel -= self.param_id_to_numel[param_id] + + param.ds_tensor.data = self.invalid_buffer.data + param.ds_tensor.status = PartitionedParamStatus.NOT_AVAILABLE + + #writes from in memory to nvme. Does not release the buffers + def _swap_out(self, params, async_op=True): + + swap_out_paths, swap_out_params = self._get_paths(params) + + self._track_numel(params) + + swap_out_tensors(self.aio_write_handle, swap_out_params, swap_out_paths) + + self.pending_writes += len(swap_out_params) + self.swap_out_params += params + + if not async_op: + self.synchronize_writes() + + #blocking swap out followed by releasing the memory buffers + def swap_out_and_release(self, params, async_op=False, force_buffer_release=False): + if async_op: + assert force_buffer_release, "Should not release preallocated buffers without completing the swap out. Set force_buffer_release to True to do it anyways" + self._swap_out(params, async_op=async_op) + + #assigns an in memory buffer and swaps in from nvme + def swap_in(self, params, async_op=True, swap_in_buffers=None): + + assert all([param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE for param in params]), "Some params are already available or in flight" + swap_in_paths, _ = self._get_paths(params) + + if swap_in_buffers is None: + if len(self.available_buffer_ids) < len(swap_in_paths): + print_rank_0( + f'Not enough swap in buffers {len(self.available_buffer_ids)} for params {len(swap_in_paths)}', + force=True) + print_rank_0( + f'Num inflight: params {len(self.inflight_params)}, buffers {len(self.inflight_swap_in_buffers)}, numel = {self.inflight_numel}', + force=True) + print_rank_0( + f'Num available: param {len(self.available_params)}, numel = {self.available_numel}', + force=True) + + assert len(swap_in_paths) <= len(self.available_buffer_ids), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" + swap_in_buffers = self._allocate_and_return_buffers_for_swap_in(params) + + 
swap_in_tensors(self.aio_read_handle, swap_in_buffers, swap_in_paths) + + self.inflight_params.extend(params) + self.inflight_swap_in_buffers.extend(swap_in_buffers) + self.inflight_numel += sum([t.numel() for t in swap_in_buffers]) + + for param in params: + param.ds_tensor.status = PartitionedParamStatus.INFLIGHT + + self.pending_reads += len(params) + + if not async_op: + self.synchronize_reads() + + #assign a buffer to a param and return the buffer + def get_buffer(self, param, numel): + assert numel < self.elements_per_buffer, f"More elements {numel} than buffer size {self.elements_per_buffer}" + param_id = param.ds_id + self.param_id_to_numel[param_id] = numel + buffer_id = self.available_buffer_ids.pop() + self.param_id_to_buffer_id[param_id] = buffer_id + + buffer = self.buffers.narrow(0, + int(buffer_id * self.elements_per_buffer), + self.param_id_to_numel[param_id]) + print_rank_0(f"param {param.ds_id} is assigned swap in buffer id {buffer_id}") + return buffer + + def reserve_available_buffers(self): + buffers = [] + for id in self.available_buffer_ids: + buffers.append( + self.buffers.narrow(0, + int(id * self.elements_per_buffer), + int(self.elements_per_buffer))) + self.reserved_buffer_ids.append(id) + + self.available_buffer_ids = [] + return buffers + + def release_reserved_buffers(self): + for id in self.reserved_buffer_ids: + self.available_buffer_ids.append(id) + + self.reserved_buffer_ids = [] diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py new file mode 100644 index 000000000000..7d0116faab5b --- /dev/null +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -0,0 +1,284 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder + +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES +from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper +from deepspeed.runtime.swap_tensor.optimizer_utils import SwapBufferManager, get_sized_buffer +from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper + + +class OptimizerSwapOp(object): + def __init__(self, + aio_handle, + read_op, + param_info, + allocated_buffers, + state_buffers, + num_ops): + self.aio_handle = aio_handle + self.read_op = read_op + self.param_info = param_info + self.allocated_buffers = allocated_buffers + self.state_buffers = state_buffers + self.wait_required = True + self.num_ops = num_ops + + def is_parameter(self, parameter): + return id(parameter) == self.param_info.param_id + + def wait(self): + assert self.wait_required + assert self.aio_handle.wait() == self.num_ops + self.wait_required = False + + +SYNC_SWAP_IN = 'sync_swap_in' +ASYNC_SWAP_IN = 'async_swap_in' +SYNC_SWAP_OUT = 'sync_swap_out' +ASYNC_SWAP_OUT = 'async_swap_out' + +SWAP_IN_STATE_TIMER = 'swap_in_state' +SWAP_OUT_STATE_TIMER = 'swap_out_state' +SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' +ASYNC_SWAP_IN_STATE_TIMER = "async_swap_in_state" +ASYNC_SWAP_OUT_STATE_TIMER = 'async_swap_out_state' + + +class PipelinedOptimizerSwapper(OptimizerSwapper): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + super(PipelinedOptimizerSwapper, + self).__init__(swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers) + + aio_op = AsyncIOBuilder().load() + self.write_aio_handle = 
aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + # Overlap gradient swap out + self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.write_aio_handle, + numel_alignment=self.numel_alignment, + timers=self.timers) + + self.async_swap_in = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_READ] + self.async_swap_out = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_WRITE] + + self.swap_ops = { + SYNC_SWAP_IN: None, + ASYNC_SWAP_IN: None, + SYNC_SWAP_OUT: None, + ASYNC_SWAP_OUT: None + } + + self.print_exclude_list += [ + 'gradient_swapper', + 'read_aio_handle', + 'write_aio_handle', + 'swap_ops', + 'print_exclude_list' + ] + + if torch.distributed.get_rank() == 0: + print_object(obj=self, + name='PipelinedOptimizerSwapper', + exclude_list=self.print_exclude_list) + + def initialize_parameters(self, parameters, src_tensors): + self._initialize_parameters(parameters=parameters, + src_tensors=src_tensors, + aio_handle=self.write_aio_handle) + + def initialize_from_swapped_fp16_params(self, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + self._initialize_from_swapped_fp16_params( + aio_handle=self.write_aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) + + def flush_gradients(self): + self._flush_gradient_swapper(self.gradient_swapper) + + def swap_in_optimizer_state(self, parameter, async_parameter): + assert parameter is not None + assert self.swap_ops[SYNC_SWAP_IN] is None + + self._flush_gradient_swapper(self.gradient_swapper) + + self._start_timer(SWAP_IN_STATE_TIMER) + + if 
self.swap_ops[ASYNC_SWAP_IN]: + assert self.swap_ops[ASYNC_SWAP_IN].is_parameter(parameter) + self.swap_ops[SYNC_SWAP_IN] = self.swap_ops[ASYNC_SWAP_IN] + self.swap_ops[ASYNC_SWAP_IN] = None + else: + self.swap_ops[SYNC_SWAP_IN] = self._swap_in_optimizer_state( + aio_handle=self.read_aio_handle, + parameter=parameter) + + if self.swap_ops[SYNC_SWAP_IN]: + self.swap_ops[SYNC_SWAP_IN].wait() + + if self.async_swap_in and async_parameter is not None: + assert self.swap_ops[ASYNC_SWAP_IN] is None + self.swap_ops[ASYNC_SWAP_IN] = self._swap_in_optimizer_state( + aio_handle=self.read_aio_handle, + parameter=async_parameter) + + self._stop_timer(SWAP_IN_STATE_TIMER) + self.timer_names.add(SWAP_IN_STATE_TIMER) + + def swap_out_optimizer_state(self, parameter, async_swap): + self._start_timer(SWAP_OUT_STATE_TIMER) + + if self.swap_ops[ASYNC_SWAP_OUT]: + self._start_timer(ASYNC_SWAP_OUT_STATE_TIMER) + self._complete_swap_out(ASYNC_SWAP_OUT) + self._stop_timer(ASYNC_SWAP_OUT_STATE_TIMER) + self.timer_names.add(ASYNC_SWAP_OUT_STATE_TIMER) + + assert self.swap_ops[SYNC_SWAP_IN] is not None + assert not self.swap_ops[SYNC_SWAP_IN].wait_required + swap_op = self._swap_out_optimizer_state(aio_handle=self.write_aio_handle, + parameter=parameter, + swap_in_op=self.swap_ops[SYNC_SWAP_IN]) + self.swap_ops[SYNC_SWAP_IN] = None + + if self.async_swap_out and async_swap: + self.swap_ops[ASYNC_SWAP_OUT] = swap_op + else: + self.swap_ops[SYNC_SWAP_OUT] = swap_op + self._complete_swap_out(SYNC_SWAP_OUT) + + self._stop_timer(SWAP_OUT_STATE_TIMER) + self.timer_names.add(SWAP_OUT_STATE_TIMER) + + def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): + self._swap_out_gradients(parameter=parameter, + gradient_offsets=gradient_offsets, + gradient_tensors=gradient_tensors, + gradient_swapper=self.gradient_swapper) + + def _complete_swap_out(self, swap_out_type): + self.swap_ops[swap_out_type].wait() + 
self.swap_buffer_manager.free(self.swap_ops[swap_out_type].allocated_buffers) + self.swap_ops[swap_out_type] = None + + def _swap_out_optimizer_state(self, aio_handle, parameter, swap_in_op): + assert swap_in_op.is_parameter(parameter) + + allocated_buffers = swap_in_op.allocated_buffers.copy() + swap_buffers = swap_in_op.state_buffers.copy() + + param_info = swap_in_op.param_info + self._update_param_state_info(param_info, parameter) + unpinned_tensors = param_info.get_unpinned_state_tensors() + + if len(unpinned_tensors) > 0: + new_alloc_buffers = self.swap_buffer_manager.allocate( + num_elems=self._io_aligned_numel(param_info.numel()), + count=len(unpinned_tensors), + dtype=param_info.dtype()) + assert new_alloc_buffers is not None + + allocated_buffers += new_alloc_buffers + swap_buffers += new_alloc_buffers + + for pinned_dst, unpinned_src in zip(new_alloc_buffers, unpinned_tensors): + dst = get_sized_buffer(pinned_dst, unpinned_src.numel()) + dst.data.copy_(unpinned_src.data) + + swap_paths = param_info.swap_paths.copy() + assert len(swap_paths) == len(swap_buffers) + + swap_out_tensors(aio_handle, swap_buffers, swap_paths) + + swap_out_op = OptimizerSwapOp(aio_handle=aio_handle, + param_info=param_info, + read_op=False, + allocated_buffers=allocated_buffers, + state_buffers=swap_buffers, + num_ops=len(swap_buffers)) + + return swap_out_op + + def _swap_in_optimizer_state(self, aio_handle, parameter): + param_info = self._get_param_swap_info(parameter) + if param_info is None: + return None + + required_buffer_count = len( + param_info.tensors) + (1 if param_info.has_gradients() else 0) + aligned_numel = self._io_aligned_numel(param_info.numel()) + allocated_buffers = self.swap_buffer_manager.allocate( + num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) + assert allocated_buffers is not None, \ + f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing {OFFLOAD_OPTIMIZER_BUFFER_COUNT}" + + state_buffers = 
allocated_buffers[:len(param_info.tensors)] + param_info.set_swap_buffers(state_buffers) + + swap_buffers = state_buffers.copy() + swap_paths = param_info.swap_paths.copy() + + if param_info.has_gradients(): + parameter.grad = allocated_buffers[-1].narrow(0, 0, param_info.numel()) + if param_info.swapped_gradients: + swap_buffers += param_info.get_swap_gradient_buffers(parameter.grad) + swap_paths += param_info.get_swap_gradient_paths() + + swap_in_tensors(aio_handle, swap_buffers, swap_paths) + + if param_info.unswapped_gradients: + self._retrieve_unswapped_grad_partitions(swap_info=param_info, + dest_buffer=parameter.grad) + + swap_in_op = OptimizerSwapOp(aio_handle=aio_handle, + param_info=param_info, + read_op=True, + allocated_buffers=allocated_buffers, + state_buffers=state_buffers, + num_ops=len(swap_buffers)) + + return swap_in_op diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py new file mode 100644 index 000000000000..a88746fdc739 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -0,0 +1,241 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch +from deepspeed.utils.logging import logger + +from deepspeed.runtime.swap_tensor.constants import AIO_BLOCK_SIZE, AIO_QUEUE_DEPTH, \ + AIO_THREAD_COUNT, AIO_SINGLE_SUBMIT, AIO_OVERLAP_EVENTS + +MIN_AIO_BYTES = 1024**2 +AIO_ALIGNED_BYTES = 1024 + + +def swap_in_tensors(swap_handle, tensor_buffers, swap_paths): + for buffer, path in zip(tensor_buffers, swap_paths): + assert (swap_handle.async_pread(buffer, path) == 0) + + +def swap_out_tensors(swap_handle, tensor_buffers, swap_paths): + for buffer, path in zip(tensor_buffers, swap_paths): + assert (swap_handle.async_pwrite(buffer, path) == 0) + + +def print_object(obj, name, exclude_list=[]): + logger.info('{}:'.format(name)) + for arg in sorted(vars(obj)): + if not arg in exclude_list: + dots = '.' * (29 - len(arg)) + logger.info(' {} {} {}'.format(arg, dots, getattr(obj, arg))) + + +class SwapBuffer(object): + def __init__(self, buffer): + self.buffer = buffer + self.reset() + + def reset(self): + self.offset = 0 + self.swap_tensors = {} + self.compute_tensors = {} + self.swap_paths = {} + self.num_elem = 0 + + def insert_tensor(self, tensor, swap_path, aligned_numel): + swap_tensor, compute_tensor = self.allocate_tensor(swap_path, tensor.numel(), aligned_numel) + compute_tensor.data.copy_(tensor.data) + return swap_tensor, compute_tensor + + def allocate_tensor(self, swap_path, numel, aligned_numel): + assert self.has_space(aligned_numel) + assert not self.offset in self.swap_tensors + + allocate_offset = self.offset + swap_tensor = self.buffer.narrow(0, allocate_offset, aligned_numel) + dest_tensor = swap_tensor.narrow(0, 0, numel) + + self.swap_tensors[allocate_offset] = swap_tensor + self.compute_tensors[allocate_offset] = dest_tensor + self.swap_paths[allocate_offset] = swap_path + self.offset += aligned_numel + self.num_elem += numel + + return self.swap_tensors[allocate_offset], self.compute_tensors[allocate_offset] + + def has_space(self, numel): + return (self.offset + 
numel) <= self.buffer.numel() + + def get_swap_tensors(self): + return [tensor for tensor in self.swap_tensors.values()] + + def get_swap_paths(self): + return [path for path in self.swap_paths.values()] + + def get_compute_tensors(self): + return [tensor for tensor in self.compute_tensors.values()] + + def get_num_elem(self): + return self.num_elem + + def get_swap_tensor(self, offset): + return self.swap_tensors.get(offset, None) + + def get_compute_tensor(self, offset): + return self.compute_tensors.get(offset, None) + + def get_swap_path(self, offset): + return self.swap_paths(offset, None) + + +class SwapBufferPool(object): + def __init__(self, buffers): + assert all([buf.is_pinned() for buf in buffers]) + self.buffers = [SwapBuffer(buf) for buf in buffers] + self.current_index = 0 + + def reset(self): + self.current_index = 0 + for buffer in self.buffers: + buffer.reset() + + def allocate_tensor(self, numel, swap_path, aligned_numel): + if self.has_space(aligned_numel): + swap_tensor, compute_tensor = self._get_current_buffer().allocate_tensor(swap_path, numel, aligned_numel) + return swap_tensor, compute_tensor + + return None, None + + def insert_tensor(self, tensor, swap_path, aligned_numel): + if self.has_space(aligned_numel): + swap_tensor, compute_tensor = self._get_current_buffer().insert_tensor(tensor, swap_path, aligned_numel) + return swap_tensor, compute_tensor + + return None, None + + def get_swap_tensors(self): + swap_tensors = [] + for buffer in self._get_used_buffers(): + swap_tensors += buffer.get_swap_tensors() + + return swap_tensors + + def get_swap_paths(self): + swap_paths = [] + for buffer in self._get_used_buffers(): + swap_paths += buffer.get_swap_paths() + + return swap_paths + + def get_compute_tensors(self): + compute_tensors = [] + for buffer in self._get_used_buffers(): + compute_tensors += buffer.get_compute_tensors() + + return compute_tensors + + def has_space(self, numel): + if self._get_current_buffer().has_space(numel): + 
return True + + if self.current_index == len(self.buffers) - 1: + return False + + self.current_index += 1 + return self._get_current_buffer().has_space(numel) + + def swap_out(self, aio_handle, async_op=False): + swap_tensors = self.get_swap_tensors() + swap_paths = self.get_swap_paths() + assert all([p is not None for p in swap_paths]) + + swap_out_tensors(aio_handle, swap_tensors, swap_paths) + + if not async_op: + assert len(swap_tensors) == aio_handle.wait() + + def swap_in(self, aio_handle, async_op=False): + swap_tensors = self.get_swap_tensors() + swap_paths = self.get_swap_paths() + assert all([p is not None for p in swap_paths]) + + swap_in_tensors(aio_handle, swap_tensors, swap_paths) + + if not async_op: + assert len(swap_tensors) == aio_handle.wait() + + def _get_current_buffer(self): + return self.buffers[self.current_index] + + def _get_used_buffers(self): + return self.buffers[:self.current_index + 1] + + +class SwapBufferManager(object): + def __init__(self, num_elems, count, dtype): + self.num_elems = num_elems + self.count = count + self.dtype = dtype + self.all_buffers = [ + torch.zeros(num_elems, + device='cpu', + dtype=dtype).pin_memory() for _ in range(count) + ] + self.free_buffer_index = [i for i in range(count)] + self.used_buffer_index = {} + self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024 + **3) + + if torch.distributed.get_rank() == 0: + exclude_list = ['all_buffers'] + print_object(obj=self, name='SwapBufferManager', exclude_list=exclude_list) + + def allocate(self, num_elems, count, dtype): + assert dtype == self.dtype + assert num_elems <= self.num_elems + if count > len(self.free_buffer_index): + return None + + used_indices = self.free_buffer_index[-count:] + self.free_buffer_index = self.free_buffer_index[:-count] + + buffers = [] + for i in used_indices: + tmp_buffer = self.all_buffers[i].narrow(0, 0, num_elems) + buffers.append(tmp_buffer) + self.used_buffer_index[id(tmp_buffer)] = i + return 
buffers + + def allocate_all(self, num_elems, dtype): + return self.allocate(num_elems=num_elems, + count=len(self.free_buffer_index), + dtype=dtype) + + def free(self, buffers): + buffer_ids = [] + for buf in buffers: + buffer_ids.append(id(buf)) + + assert all([b_id in self.used_buffer_index for b_id in buffer_ids]) + + for b_id in buffer_ids: + self.free_buffer_index.append(self.used_buffer_index[b_id]) + del (self.used_buffer_index[b_id]) + + +def get_sized_buffer(buffer, num_elems): + assert num_elems <= buffer.numel(), \ + f'num_elems {num_elems}> buffer {buffer.numel()}' + return buffer.narrow(0, 0, num_elems) if num_elems < buffer.numel() else buffer + + +def get_sized_buffers(buffer_list, num_elems_list): + swap_buffers = [ + get_sized_buffer(buffer, num_elems) \ + for buffer, num_elems in zip(buffer_list, num_elems_list) + ] + return swap_buffers diff --git a/deepspeed/runtime/zero/__init__.py b/deepspeed/runtime/zero/__init__.py index d521573e1a77..bd881c8ec69f 100644 --- a/deepspeed/runtime/zero/__init__.py +++ b/deepspeed/runtime/zero/__init__.py @@ -1,5 +1,13 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+""" + from .partition_parameters import ZeroParamType from .partition_parameters import ZeroParamStatus from .partition_parameters import Init from .partition_parameters import GatheredParameters from .partition_parameters import register_external_parameter + +from .tiling import TiledLinear +from .tiling import TiledLinearReturnBias diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index c179d01f2988..6bae0f1fbcd0 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -5,7 +5,10 @@ from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject from deepspeed.utils import logger -from deepspeed.runtime.zero.constants import * +from .constants import * +from .offload_constants import * +from .offload_config import get_offload_param_config, get_default_offload_param_config, \ + get_offload_optimizer_config, get_default_offload_optimizer_config class DeepSpeedZeroConfig(DeepSpeedConfigObject): @@ -24,9 +27,8 @@ def __init__(self, param_dict): self.elastic_checkpoint = None #Offload Specific Parameters - self.cpu_offload = None - self.cpu_offload_params = None - self.cpu_offload_use_pin_memory = None + self.offload_param = None + self.offload_optimizer = None self.sub_group_size = None #Stage3 Specific Parameters @@ -60,7 +62,24 @@ def read_zero_config_deprecated(self, param_dict): .format(ZERO_FORMAT)) return zero_config_dict + def _sanity_check(self, zero_config_dict): + deprecated_dict = { + ZERO_OPTIMIZATION_CPU_OFFLOAD: + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS: + ZERO_OPTIMIZATION_OFFLOAD_PARAM, + ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY: + f'{ZERO_OPTIMIZATION_OFFLOAD_PARAM} or {ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER}' + } + + for old_key, new_key in deprecated_dict.items(): + if old_key in zero_config_dict: + logger.warning( + f'DeepSpeedConfig: {old_key} is deprecated. 
Please use {new_key}.') + def _initialize(self, zero_config_dict): + self._sanity_check(zero_config_dict) + self.stage = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT) @@ -103,24 +122,30 @@ def _initialize(self, zero_config_dict): ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT) - self.cpu_offload = get_scalar_param(zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD, - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) - self.elastic_checkpoint = get_scalar_param( zero_config_dict, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT) - self.cpu_offload_params = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT) - - self.cpu_offload_use_pin_memory = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY_DEFAULT) + if ZERO_OPTIMIZATION_CPU_OFFLOAD in zero_config_dict: + cpu_offload_optimizer = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD, + ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) + if cpu_offload_optimizer: + self.offload_optimizer = get_default_offload_optimizer_config() + else: + self.offload_optimizer = get_offload_optimizer_config(zero_config_dict) + + if ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS in zero_config_dict: + cpu_offload_params = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT) + if cpu_offload_params: + self.offload_param = get_default_offload_param_config() + else: + self.offload_param = get_offload_param_config(zero_config_dict) self.sub_group_size = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_SUB_GROUP_SIZE, diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index e5812980a337..40b450649850 100755 --- a/deepspeed/runtime/zero/constants.py +++ 
b/deepspeed/runtime/zero/constants.py @@ -3,6 +3,8 @@ Licensed under the MIT license. """ +from .offload_constants import * + ######################################### # ZeRO optimization ######################################### @@ -22,10 +24,12 @@ "overlap_comm": [true|false], "reduce_bucket_size": 500000000, "load_from_fp32_weights": [true|false], - "cpu_offload": [true|false], - "cpu_offload_params" : [true|false], - "cpu_offload_use_pin_memory": [true|false], - "sub_group_size" : 1000000000000 + "cpu_offload": [true|false] (deprecated), + "cpu_offload_params" : [true|false] (deprecated), + "cpu_offload_use_pin_memory": [true|false] (deprecated), + "sub_group_size" : 1000000000000, + "offload_param": {...}, + "offload_optimizer": {...} } } ''' @@ -67,18 +71,24 @@ ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True -ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' -ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False - ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT = 'elastic_checkpoint' ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT = True +ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' +ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS = 'cpu_offload_params' ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT = False ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY = 'cpu_offload_use_pin_memory' ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY_DEFAULT = False +ZERO_OPTIMIZATION_OFFLOAD_PARAM = OFFLOAD_PARAM +ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT = None + +ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER = OFFLOAD_OPTIMIZER +ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT = None + ZERO_OPTIMIZATION_SUB_GROUP_SIZE = 'sub_group_size' ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT = 1000000000000 @@ -118,16 +128,12 @@ ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD: - 
ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT: ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD: - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS: - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY: - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY, + ZERO_OPTIMIZATION_OFFLOAD_PARAM: + ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT, + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER: + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT, ZERO_OPTIMIZATION_SUB_GROUP_SIZE: ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT, ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS: diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index 23f97d5a542a..fb65673bd9b4 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -21,6 +21,11 @@ tensor_map = {} +def print_rank_0(message, debug=False, force=False): + if torch.distributed.get_rank() == 0 and (debug or force): + print(message) + + class LinearFunctionForZeroStage3(torch.autograd.Function): # Note that both forward and backward are @staticmethods @@ -46,6 +51,7 @@ def forward(ctx, input, weight, bias=None): if bias is not None: output += bias ret = output + return ret # This function has only a single output, so it gets only one gradient diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py new file mode 100644 index 000000000000..eaf3f13e2819 --- /dev/null +++ b/deepspeed/runtime/zero/offload_config.py @@ -0,0 +1,63 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+''' + +from deepspeed.runtime.config_utils import get_scalar_param +from .offload_constants import * + +OFFLOAD_PARAM_KEY_DEFAULT_DICT = { + OFFLOAD_PARAM_DEVICE: OFFLOAD_PARAM_DEVICE_DEFAULT, + OFFLOAD_PARAM_NVME_PATH: OFFLOAD_PARAM_NVME_PATH_DEFAULT, + OFFLOAD_PARAM_BUFFER_COUNT: OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT, + OFFLOAD_PARAM_BUFFER_SIZE: OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT, + OFFLOAD_PARAM_MAX_IN_CPU: OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT, + OFFLOAD_PARAM_PIN_MEMORY: OFFLOAD_PARAM_PIN_MEMORY_DEFAULT +} + +OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT = { + OFFLOAD_OPTIMIZER_DEVICE: OFFLOAD_OPTIMIZER_DEVICE_DEFAULT, + OFFLOAD_OPTIMIZER_NVME_PATH: OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT, + OFFLOAD_OPTIMIZER_BUFFER_COUNT: OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT, + OFFLOAD_OPTIMIZER_PIN_MEMORY: OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT, + OFFLOAD_OPTIMIZER_PIPELINE_READ: OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT, + OFFLOAD_OPTIMIZER_PIPELINE_WRITE: OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT, + OFFLOAD_OPTIMIZER_FAST_INIT: OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT +} + + +def _get_offload_config(param_dict, key_default_dict): + offload_config = {} + for key, default_value in key_default_dict.items(): + offload_config[key] = get_scalar_param(param_dict, key, default_value) + + return offload_config + + +def get_offload_param_config(param_dict): + if OFFLOAD_PARAM in param_dict and param_dict[OFFLOAD_PARAM] is not None: + return _get_offload_config(param_dict=param_dict[OFFLOAD_PARAM], + key_default_dict=OFFLOAD_PARAM_KEY_DEFAULT_DICT) + + return None + + +def get_default_offload_param_config(): + return OFFLOAD_PARAM_KEY_DEFAULT_DICT + + +def get_offload_optimizer_config(param_dict): + if OFFLOAD_OPTIMIZER in param_dict and param_dict[OFFLOAD_OPTIMIZER] is not None: + offload_config = _get_offload_config( + param_dict=param_dict[OFFLOAD_OPTIMIZER], + key_default_dict=OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT) + offload_config[OFFLOAD_OPTIMIZER_PIPELINE] = offload_config[ + OFFLOAD_OPTIMIZER_PIPELINE_READ] 
or offload_config[ + OFFLOAD_OPTIMIZER_PIPELINE_WRITE] + return offload_config + + return None + + +def get_default_offload_optimizer_config(): + return OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT diff --git a/deepspeed/runtime/zero/offload_constants.py b/deepspeed/runtime/zero/offload_constants.py new file mode 100644 index 000000000000..3ba71df4a807 --- /dev/null +++ b/deepspeed/runtime/zero/offload_constants.py @@ -0,0 +1,67 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +""" +######################################### +# TENSOR OFFLOADING +######################################### +OFFLOAD_CPU_DEVICE = "cpu" +OFFLOAD_NVME_DEVICE = "nvme" + +######################################### +# PARAM TENSOR OFFLOADING +######################################### +OFFLOAD_PARAM_FORMAT = ''' +"offload_param": { + "device": [cpu|nvme], + "nvme_path": "/local_nvme", + "buffer_count": 5, + "buffer_size": 1e8, + "max_in_cpu": 1e9, + "pin_memory": [true|false] +} +''' +OFFLOAD_PARAM = "offload_param" +OFFLOAD_PARAM_DEVICE = "device" +OFFLOAD_PARAM_DEVICE_DEFAULT = OFFLOAD_CPU_DEVICE +OFFLOAD_PARAM_NVME_PATH = "nvme_path" +OFFLOAD_PARAM_NVME_PATH_DEFAULT = None +OFFLOAD_PARAM_BUFFER_COUNT = "buffer_count" +OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT = 5 +OFFLOAD_PARAM_BUFFER_SIZE = "buffer_size" +OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT = 1e8 +OFFLOAD_PARAM_MAX_IN_CPU = "max_in_cpu" +OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT = 1e9 +OFFLOAD_PARAM_PIN_MEMORY = "pin_memory" +OFFLOAD_PARAM_PIN_MEMORY_DEFAULT = False + +######################################### +# OPTIMIZER TENSOR OFFLOADING +######################################### +OFFLOAD_OPTIMIZER_FORMAT = ''' +"offload_optimizer": { + "device": [cpu|nvme], + "nvme_path": "/local_nvme", + "buffer_count": 4, + "pin_memory": [true|false], + "pipeline_read": false, + "pipeline_write": false, + "fast_init": false +} +''' +OFFLOAD_OPTIMIZER = "offload_optimizer" +OFFLOAD_OPTIMIZER_DEVICE = "device" 
+OFFLOAD_OPTIMIZER_DEVICE_DEFAULT = OFFLOAD_CPU_DEVICE +OFFLOAD_OPTIMIZER_NVME_PATH = "nvme_path" +OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT = None +OFFLOAD_OPTIMIZER_BUFFER_COUNT = "buffer_count" +OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT = 4 +OFFLOAD_OPTIMIZER_PIN_MEMORY = "pin_memory" +OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE_READ = "pipeline_read" +OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE_WRITE = "pipeline_write" +OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE = "pipeline" +OFFLOAD_OPTIMIZER_FAST_INIT = "fast_init" +OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT = False diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 42cdcd645f4d..c8bde6390b3c 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1,3 +1,8 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+""" + import os import time import types @@ -8,10 +13,15 @@ import torch from torch.distributed.distributed_c10d import _get_global_rank -from deepspeed.runtime.zero.linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3 -from deepspeed.runtime.utils import see_memory_usage +from .linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3 +from .offload_constants import * + +from ..utils import see_memory_usage from deepspeed.utils import log_dist, init_distributed +from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus +from ..config import DeepSpeedConfig + param_count = 0 @@ -21,6 +31,8 @@ def print_rank_0(message, debug=False, force=False): def is_zero_param(parameter): + if not torch.is_tensor(parameter): + return False return hasattr(parameter, 'ds_id') @@ -29,8 +41,6 @@ def _init_external_params(module): module._external_params = {} def external_parameters(self): - if not hasattr(self, '_external_params'): - self._external_params = {} return self._external_params.items() def all_parameters(self): @@ -94,6 +104,28 @@ def forward(self, input): module._external_params[key] = parameter +def unregister_external_parameter(module, parameter): + """Reverses the effects of :meth:`register_external_parameter`. + + Args: + module (``torch.nn.Module``): The module to affect. + parameter (``torch.nn.Parameter``): The parameter to unregister. + + Raises: + RuntimeError: If ``parameter`` is not of type ``torch.nn.Parameter``. + RuntimeError: If ``parameter`` is not a registered external parameter of ``module``. 
+ """ + if not isinstance(parameter, torch.nn.Parameter): + raise RuntimeError('Parameter is not a torch.nn.Parameter') + + if not hasattr(module, + '_external_params') or id(parameter) not in module._external_params: + raise RuntimeError('Parameter is not a registered external parameter of module.') + + key = id(parameter) + del module._external_params[key] + + class ZeroParamType(Enum): # same as regular pytorch parameters @@ -216,6 +248,8 @@ def _disable_class(cls): #un doing it here will undo it during training #if self.mem_efficient_linear: # torch.nn.functional.linear = self.linear_bk + # if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk # Now that we cleaned up the metaclass injection, raise the exception. if exc_type is not None: @@ -236,6 +270,7 @@ def __init__(self, mem_efficient_linear=True, remote_device=None, pin_memory=False, + deepspeed_config=None, enabled=True): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the @@ -249,14 +284,15 @@ def __init__(self, mem_efficient_linear (bool, optional): Replace torch.nn.functional.linear with an implementation that allows DeepSpeed to partition parameters. Defaults to ``True``. - remote_device (string, optional): The device to store model - weights. Passing ``"cpu"`` will create the model in CPU - memory. The model may still be moved to GPU if - ``cpu_offload_param`` is ``False`` in the config provided to - :meth:`deepspeed.initialize`. Defaults to the local GPU. + remote_device (string, optional): The initial device to store model + weights e.g., ``cpu``, ``nvme``. Passing ``"cpu"`` will create the model in CPU + memory. The model may still be moved to GPU based on the + offload settings for training. Defaults to the local GPU. pin_memory (bool, optional): Potentially increase performance by using pinned memory for model weights. ``remote_device`` must be ``"cpu"``. Defaults to ``False``. 
+ deepspeed_config (``json file``, optional): If provided, provides configuration + for swapping fp16 params to NVMe. enabled (bool, optional): If ``False``, this context has no effect. Defaults to ``True``. @@ -264,15 +300,15 @@ def __init__(self, are too large to allocate in their entirety in CPU memory. It has the following effects: - #. allocates tensors to either GPU or CPU memory + #. allocates tensors to either GPU or CPU memory or NVMe #. converts floating point tensors to half precision #. immediately partitions tensors among the group of data-parallel devices #. (*optional*) replaces ``torch.nn.functional.linear`` with a more memory-efficient implementation These modifications allow for models that exceed the size of local CPU/GPU - memory, but fit within the total system memory (*i.e.*, aggregate CPU - or GPU memory) across all nodes. Consider initializing a model with one + memory/NVMe, but fit within the total NVMe capacity (*i.e.*, aggregate CPU + or GPU memory or NVMe) across all nodes. Consider initializing a model with one trillion parameters, whose weights occupy two terabytes (TB) in half precision. The initial CPU allocation in full precision requires 4TB of memory *per process*, and so a system with 8 GPUs per node would need 32TB of @@ -302,7 +338,6 @@ def get_model(): .. note:: Only applicable to training with ZeRO-3. - Examples -------- @@ -347,10 +382,20 @@ def get_model(): #It is the device where parameters are fully instantiated using allgather self.local_device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + self._validate_remote_device(remote_device, deepspeed_config) + #Remote device is the device where parameter partiitons are stored - #It can be same as local_device or it could be CPU. + #It can be same as local_device or it could be CPU or NVMe. 
self.remote_device = self.local_device if remote_device is None else remote_device - self.pin_memory = pin_memory if (self.remote_device == 'cpu') else False + self.pin_memory = pin_memory if ( + self.remote_device == OFFLOAD_CPU_DEVICE) else False + + # Enable fp16 param swapping to NVMe + if self.remote_device == OFFLOAD_NVME_DEVICE: + _ds_config = DeepSpeedConfig(deepspeed_config) + self.param_swapper = AsyncPartitionedParameterSwapper(_ds_config) + else: + self.param_swapper = None # If we are provided an already-allocated module to prepare. if module is not None: @@ -361,6 +406,23 @@ def get_model(): self._convert_to_deepspeed_param(param) param.partition() + def _validate_remote_device(self, remote_device, ds_config): + if ds_config is not None: + _ds_config = DeepSpeedConfig(ds_config) + if remote_device in [None, OFFLOAD_CPU_DEVICE]: + if _ds_config.zero_config.offload_param is not None: + offload_param_device = _ds_config.zero_config.offload_param[ + OFFLOAD_PARAM_DEVICE] + assert offload_param_device != OFFLOAD_NVME_DEVICE, \ + f"{OFFLOAD_PARAM_DEVICE} in DeepSpeed Config cannot be {offload_param_device} if remote device is {remote_device}." + + if remote_device == OFFLOAD_NVME_DEVICE: + assert _ds_config.zero_config.offload_param is not None, \ + f'{OFFLOAD_PARAM} must be defined in DeepSpeed Config if remote device is {OFFLOAD_NVME_DEVICE}.' + + assert _ds_config.zero_config.offload_param[OFFLOAD_PARAM_NVME_PATH] is not None, \ + f'{OFFLOAD_PARAM_NVME_PATH} in DeepSpeed Config cannot be None if remote device is {OFFLOAD_NVME_DEVICE}' + def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False) @@ -408,6 +470,10 @@ def _convert_to_deepspeed_param(self, param): # The group that the parameter is scattered across. 
param.ds_process_group = self.ds_process_group + # This is set to the Async Param swapper if remote device is nvme + # else this is set to None + param.nvme_swapper = self.param_swapper + # DeepSped Param ID param.ds_id = Init.param_id Init.param_id += 1 @@ -458,6 +524,9 @@ def aligned_size(): def padding_size(): return self._padding_size(param) + def partitioned_size(): + return self._partitioned_size(param) + # Collectives for gathering and partitioning parameters param.all_gather = all_gather param.partition = partition @@ -469,6 +538,7 @@ def padding_size(): # Partitioning size utilities param.aligned_size = aligned_size param.padding_size = padding_size + param.partitioned_size = partitioned_size def _aligned_size(self, param): return param.ds_numel + self._padding_size(param) @@ -477,7 +547,29 @@ def _padding_size(self, param): remainder = param.ds_numel % self.world_size return (self.world_size - remainder) if remainder else 0 + def _partitioned_size(self, param): + return param.ds_tensor.ds_numel + + def _ensure_availability_of_partitioned_params(self, params): + swap_in_list = [] + swap_in_flight = [] + for param in params: + if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: + assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + swap_in_list.append(param) + if param.ds_tensor.status == PartitionedParamStatus.INFLIGHT: + assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + swap_in_flight.append(param) + if len(swap_in_list) > 0: + swap_in_list[0].nvme_swapper.swap_in(swap_in_list, async_op=False) + elif len(swap_in_flight) > 0: + swap_in_flight[0].nvme_swapper.synchronize_reads() + def _all_gather(self, param_list, async_op=False, hierarchy=None): + + #fetches from nvme if the partition is not available and in nvme + self._ensure_availability_of_partitioned_params(param_list) + handles = [] all_gather_list = [] for 
param in param_list: @@ -511,8 +603,9 @@ def _partition(self, param_list, force=False, has_been_updated=False): #print_rank_0(f"After Partitioning Param {param.ds_id}") # self._param_status(param) - def _partition_param(self, param, has_been_updated=False): + def _partition_param(self, param, buffer=None, has_been_updated=False): assert param.ds_status is not ZeroParamStatus.INFLIGHT, f" {param} Cannot parititon a param in flight" + global reuse_buffers #print_rank_0(f"Param id {param.ds_id} status is {param.ds_status}") if param.ds_status is ZeroParamStatus.AVAILABLE: @@ -534,22 +627,54 @@ def _partition_param(self, param, has_been_updated=False): #param.data = param.ds_tensor.data + see_memory_usage( + f'Before partitioning param {param.ds_id} {param.shape}', + force=False) #param.data does not store anything meaningful in partitioned state param.data = torch.ones(1).half().to(param.device) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', + force=False) + + if param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + print_rank_0( + f"Param {param.ds_id} partition released since it exists in nvme", + force=False) + param.nvme_swapper.remove_partition_and_release_buffers([param]) + return tensor_size = self._aligned_size(param) partition_size = tensor_size // self.world_size if param.ds_tensor is None: - partitioned_tensor = torch.zeros(partition_size, - dtype=param.dtype, - device=self.remote_device) - partitioned_tensor.requires_grad = False - if self.pin_memory: - partitioned_tensor = partitioned_tensor.pin_memory() + final_location = None + if self.remote_device == OFFLOAD_NVME_DEVICE and self.param_swapper.swappable_tensor( + numel=partition_size): + final_location = OFFLOAD_NVME_DEVICE + buffer = self.param_swapper.get_buffer(param, partition_size) + partitioned_tensor = torch.zeros(1, + dtype=param.dtype, + device=buffer.device) + partitioned_tensor.data = buffer.data + print_rank_0( + f"ID {param.ds_id} Initializing partition for 
the first time for nvme offload." + ) + else: + partitioned_tensor = torch.zeros( + partition_size, + dtype=param.dtype, + device=OFFLOAD_CPU_DEVICE + if self.remote_device == OFFLOAD_NVME_DEVICE else + self.remote_device) + if self.pin_memory: + partitioned_tensor = partitioned_tensor.pin_memory() + + partitioned_tensor.requires_grad = False param.ds_tensor = partitioned_tensor + param.ds_tensor.ds_numel = partition_size + param.ds_tensor.status = PartitionedParamStatus.AVAILABLE + param.ds_tensor.final_location = final_location start = partition_size * self.rank end = start + partition_size @@ -584,7 +709,20 @@ def _partition_param(self, param, has_been_updated=False): #param.data = param.ds_tensor.data #param.data does not store anything meaningful in partitioned state + + see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', + force=False) param.data = torch.ones(1).half().to(param.device) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', + force=False) + + if param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + self.param_swapper.swap_out_and_release([param]) + print_rank_0( + f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") + see_memory_usage( + f"ID {param.ds_id} Offloaded to nvme offload and buffers released.", + force=False) print_rank_0( f"ID {param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}" @@ -602,7 +740,7 @@ def _param_status(self, param): def _allgather_param(self, param, async_op=False, hierarchy=0): - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel tensor_size = partition_size * self.world_size aligned_param_size = self._aligned_size(param) @@ -611,9 +749,16 @@ def _allgather_param(self, param, async_op=False, hierarchy=0): print_rank_0( f"{'--'* hierarchy}---- Before allocating Allgather param with id {param.ds_id} and status {param.ds_status} Partition Size {partition_size} and data shape {param.ds_shape}" 
) + + see_memory_usage( + f'Before allocate allgather param {param.ds_id} {param.ds_status} {aligned_param_size} {partition_size} {param.ds_shape}', + force=False) flat_tensor = torch.zeros(aligned_param_size, dtype=param.dtype, device=param.device).view(-1) + see_memory_usage( + f'After allocate allgather param {param.ds_id} {param.ds_status} {aligned_param_size} {partition_size} {param.ds_shape}', + force=False) torch.cuda.synchronize() @@ -646,7 +791,7 @@ def _allgather_params(self, param_list, hierarchy=0): if len(param_list) == 0: return - partition_size = sum([param.ds_tensor.numel() for param in param_list]) + partition_size = sum([param.ds_tensor.ds_numel for param in param_list]) tensor_size = partition_size * self.world_size flat_tensor = torch.empty(tensor_size, @@ -662,7 +807,7 @@ def _allgather_params(self, param_list, hierarchy=0): if i == self.rank: offset = 0 for param in param_list: - param_numel = param.ds_tensor.numel() + param_numel = param.ds_tensor.ds_numel partitions[i].narrow(0, offset, @@ -677,9 +822,7 @@ def _allgather_params(self, param_list, hierarchy=0): param_offset = 0 for param in param_list: - - param_partition_size = param.ds_tensor.numel() - + param_partition_size = param.ds_tensor.ds_numel param_size = param.ds_numel replicated_tensor = torch.empty(param.ds_shape, dtype=param.dtype, @@ -700,7 +843,7 @@ def _allgather_params(self, param_list, hierarchy=0): param_start, numel_to_copy).copy_(part_to_copy) #param_offset += param.data.numel() - param_offset += param.ds_tensor.numel() + param_offset += param.ds_tensor.ds_numel param.data = replicated_tensor.data @@ -724,7 +867,7 @@ def _reduce_scatter_gradients(self, param_list): # some ranks may have partitions that are padded to go beyond the grad size. 
# For these ranks the output of reduce scatter is a separate buffer and needs # to be copied in - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel start = self.rank * partition_size end = start + partition_size #print_rank_0("REduce scatter was executed for praam {param.ds_id}") @@ -739,7 +882,7 @@ def _reduce_scatter_gradients(self, param_list): def _reduce_scatter_gradient(self, param): - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel #output = torch.empty(partition_size, dtype=param.dtype, device=param.device) total_size = partition_size * self.world_size @@ -791,10 +934,10 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): # param.grad=None # param.grad.test() print_rank_0( - f"Partitioning param {id(param)} gradient of size {param.grad.numel()} type {param.grad.dtype} part_size {param.ds_tensor.numel()}" + f"Partitioning param {param.ds_id} gradient of size {param.grad.numel()} type {param.grad.dtype} part_size {param.ds_tensor.ds_numel}" ) see_memory_usage("Before partitioning gradients", force=False) - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel if partition_buffer is None: assert not accumulate, "No buffer to accumulate to" @@ -884,6 +1027,10 @@ def __init__(self, params, modifier_rank=None, fwd_module=None, enabled=True): if torch.distributed.get_rank() == 0: linear.weight.zero_() + with deepspeed.zero.GatheredParameters(linear.weight, + modifier_rank=0): + if torch.distributed.get_rank() == 0: + linear.weight.zero_() #. Collect a partitioned weight to pass to another module during training. 
The parameter will be registered as an external parameter diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index c7eb4b5cfc7b..f8b526952de8 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1,10 +1,12 @@ -from deepspeed.utils.logging import logger -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +""" +import sys import os - +from collections import defaultdict, OrderedDict +import itertools import torch from torch.distributed.distributed_c10d import _get_global_rank import torch.distributed as dist @@ -12,18 +14,25 @@ from torch._six import inf from torch.autograd import Variable +from deepspeed.utils.logging import logger from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, ZeroParamType, _init_external_params, Init, is_zero_param +from deepspeed.runtime.zero.partition_parameters import * +from deepspeed.runtime.zero.partition_parameters import _init_external_params from deepspeed.runtime.zero.constants import ZERO_OPTIMIZATION_WEIGHTS from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.ops.op_builder import UtilsBuilder +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus +from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper +from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper -import itertools # Toggle this to true to enable correctness test # with gradient partitioning and without pg_correctness_test = False +FWD_MODULE_STACK = list() + def print_rank_0(message, debug=False, force=False): if torch.distributed.get_rank() == 0 and 
(debug or force): @@ -108,6 +117,47 @@ def _apply_forward_and_backward_to_tensors_only(module, return outputs +class ZeROOrderedDict(OrderedDict): + def __init__(self, parent_module, *args, **kwargs): + """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. + + Args: + parent_module (``collections.OrderedDict``): the collection to replace + """ + + super().__init__(*args, **kwargs) + self._parent_module = parent_module + self._in_forward = False + + def __getitem__(self, key): + param = super().__getitem__(key) + + # Params can be registered as None (e.g., bias) + if param is None: + return param + + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if self._parent_module._parameters._in_forward: + print_rank_0(f'Registering external parameter from getter {key}', + force=False) + register_external_parameter(FWD_MODULE_STACK[-1], param) + param.all_gather() + + return param + + +def _inject_parameters(module, cls): + for module in module.modules(): + if cls == ZeROOrderedDict: + new_param = cls(parent_module=module) + else: + new_param = cls() + + for key, param in module._parameters.items(): + new_param[key] = param + module._parameters = new_param + + # TODO Needs to be implemented class PrefetchCoordinator(object): def __init__(self): @@ -217,7 +267,6 @@ def get_reuse_distance_in_numel(self, sub_module, sub_module_step_id=None): start_step, end_step, trace) - break self.reuse_numel_for_step_id[sub_module_step_id] = reuse_distance_in_numel @@ -271,9 +320,26 @@ def finish_tracing(self, print_trace=False): if print_trace: self.prefetch_coordinator.print_trace() + #swap in parameter partitions from nvme for those parameters that will be used + # after the ones that are already being prefetched into full parameters + def _prefetch_nvme_param_partitions(self, sub_module, params_in_flight): + numel_in_flight = sum([param.ds_tensor.ds_numel for param in params_in_flight]) + upcoming_param_list = 
self.prefetch_coordinator.get_params_to_prefetch( + sub_module, + numel=2 * numel_in_flight) + swap_in_params = [] + for param in upcoming_param_list: + if len(swap_in_params) >= param.nvme_swapper.available_swap_in_buffers(): + break + if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: + swap_in_params.append(param) + + if len(swap_in_params) > 0: + swap_in_params[0].nvme_swapper.swap_in(swap_in_params, async_op=True) + # Pre fetches the parameters for sub_modules that comes after # the current sub_module. This call is asynchronous - def prefetch_next_sub_modules(self, sub_module, numel=5000000): + def prefetch_next_sub_modules(self, sub_module, numel=5000000, nvme=False): params_to_prefetch = [] if not self.prefetch_coordinator.trace_completed: @@ -292,6 +358,9 @@ def prefetch_next_sub_modules(self, sub_module, numel=5000000): # keeping track of number of elements consumed by available parmaeters self._increment_available_parameter_numel(param.ds_numel) + if nvme: + self._prefetch_nvme_param_partitions(sub_module, params_to_prefetch) + self._print_prefetch_elements_info(sub_module, params_to_prefetch) print_rank_0( f"{'--' * self.hierarchy}--PreFetching parameters {[param.ds_id for param in params_to_prefetch]} and available {self.total_available_parameter_numel}, max limit {self.max_available_parameters_in_numel}", @@ -371,7 +440,9 @@ def fetch_sub_module(self, sub_module): for _, param in sub_module.named_parameters(recurse=False): param.ds_status = ZeroParamStatus.AVAILABLE - #print(f"Param id {param.ds_id}, Shape {param.shape}, device {param.device} ") + print_rank_0( + f"Param id {param.ds_id}, Shape {param.shape}, device {param.device} norm {param.norm()}", + force=False) #print_rank_0(f"After fetching (id, shape, device): {[(param.ds_id, param.shape, param.device) for param in sub_module.named_parameters(recurse=False)]}") def release_sub_module(self, sub_module): @@ -383,6 +454,7 @@ def release_sub_module(self, sub_module): param for _, 
param in sub_module.named_parameters(recurse=False) ] + if hasattr(sub_module, 'ds_external_parameters'): #print_rank_0(f"Releasing external parameters {sub_module.ds_external_parameters()}") params_to_release += [ @@ -396,25 +468,25 @@ def release_sub_module(self, sub_module): if not param.ds_active_sub_modules and not self._keep_for_later( sub_module) and not param.ds_persist: print_rank_0( - f"{'--' * self.hierarchy}--Releasing parameters {param.ds_id} with numel {param.numel()} active sub modules {param.ds_active_sub_modules} and keep for later {self._keep_for_later(sub_module)}" - ) + f"{'--' * self.hierarchy}--Releasing parameters {param.ds_id} with numel {param.numel()} active sub modules {param.ds_active_sub_modules} and keep for later {self._keep_for_later(sub_module)}", + force=False) # Keeping track of number of elements that are consumed by available parameters self._decrement_available_parameter_numel(param.ds_numel) see_memory_usage( - f"Before releasing param {param.ds_id} with numel{param.numel()}", + f"Before releasing param {param.ds_id} with numel {param.numel()}", force=False) param.partition(hierarchy=self.hierarchy) see_memory_usage( - f"After releasing param {param.ds_id} has numel{param.numel()} ", + f"After releasing param {param.ds_id} has numel {param.numel()} ", force=False) param.ds_status = ZeroParamStatus.NOT_AVAILABLE else: print_rank_0( - f"{'--' * self.hierarchy}--Did not release parameters {param.ds_id} with numel {param.numel()} with active sub modules {param.ds_active_sub_modules}, keep for later {self._keep_for_later(sub_module)} and persistence {param.ds_persist}" - ) + f"{'--' * self.hierarchy}--Did not release parameters {param.ds_id} with numel {param.numel()} with active sub modules {param.ds_active_sub_modules}, keep for later {self._keep_for_later(sub_module)} and persistence {param.ds_persist}", + force=False) def release_and_reset_parameter(self, param): param.ds_active_sub_modules = 0 @@ -428,6 +500,8 @@ def 
release_and_reset_parameter(self, param): def _keep_for_later(self, sub_module): if not self.prefetch_coordinator.trace_completed: return False + if self.max_reuse_distance_in_numel == 0: + return False reuse_distance_in_numel = self.prefetch_coordinator.get_reuse_distance_in_numel( sub_module) #print_rank_0(f"Reuse distance and numel for sub_module id {sub_module.id} is {reuse_distance_in_numel}") @@ -532,9 +606,8 @@ def __init__(self, dp_process_group=None, reduce_scatter=True, overlap_comm=False, - cpu_offload_optimizer_state=False, - cpu_offload_params=False, - cpu_offload_use_pin_memory=False, + offload_optimizer_config=None, + offload_param_config=None, sub_group_size=1000000000000, mpu=None, clip_grad=0.0, @@ -542,7 +615,8 @@ def __init__(self, postscale_gradients=True, gradient_predivide_factor=1.0, gradient_accumulation_steps=1, - elastic_checkpoint=False): + elastic_checkpoint=False, + aio_config=None): see_memory_usage("Stage 3 initialize beginning", force=True) @@ -580,21 +654,51 @@ def __init__(self, self.elastic_checkpoint = elastic_checkpoint self.overlap_comm = overlap_comm + # Replace ._parameters with a new class to enable auto-registration of + # external parameters + _inject_parameters(module, ZeROOrderedDict) + if self.overlap_comm: self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - ######################cpu offload setup################################## - self.cpu_offload = cpu_offload_optimizer_state - self.cpu_offload_use_pin_memory = cpu_offload_use_pin_memory - - if cpu_offload_params: - assert cpu_offload_optimizer_state, "parameter offload is only available with optimizer state offload" - self.cpu_offload_params = cpu_offload_optimizer_state and cpu_offload_params + ###################### offload optimizer setup ################################## + self.optimizer_swapper = None + self.swap_optimizer = False + + self.offload_optimizer = False + self.offload_optimizer_pin_memory = False + self.offload_optimizer_fast_init = False + 
if offload_optimizer_config is not None: + self.offload_optimizer = True + self.offload_optimizer_pin_memory = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_PIN_MEMORY] + self.swap_optimizer = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_DEVICE] == OFFLOAD_NVME_DEVICE + self.offload_optimizer_fast_init = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_FAST_INIT] + + ###################### offload param setup ################################## + self.offload_param = False + self.offload_param_pin_memory = False + self.params_in_nvme_and_cpu = False + self.max_params_in_cpu = 0 + if offload_param_config is not None: + assert self.offload_optimizer, "parameter offload is only available with optimizer state offload" + self.offload_param = True + self.offload_param_pin_memory = offload_param_config[ + OFFLOAD_PARAM_PIN_MEMORY] + self.params_in_nvme_and_cpu = offload_param_config[ + OFFLOAD_PARAM_DEVICE] == OFFLOAD_NVME_DEVICE + self.max_params_in_cpu = offload_param_config[OFFLOAD_PARAM_MAX_IN_CPU] + print_rank_0( + f"FP16 params swapping is {self.params_in_nvme_and_cpu}, Max params in CPU is {self.max_params_in_cpu}", + force=True) - self.deepspeed_adam_offload = (self.cpu_offload + self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam) - self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' + self.device = torch.cuda.current_device( + ) if not self.offload_optimizer else OFFLOAD_CPU_DEVICE ############################################################################ see_memory_usage("Before Partitioned Parameter Coordinator", force=False) @@ -661,10 +765,18 @@ def __init__(self, # Holds a fused and flattened copy of the parameters self.fp16_partitioned_groups_flat = [] + self.fp16_partitioned_groups_flat_numel = [] + + #defragmented pinned memory + self.param_groups_fp16_flat_cpu_memory = [] + + #fp16 buffer for swapping out nvme params + self.param_group_fp16_flat_reuse_buffer = None #a single 32-bit 
partition of the parallel partitioned parameters #that this process will update self.fp32_partitioned_groups_flat = [] + self.next_swappable_fp32_partitioned_groups = [] # number of elements per partition in each group self.partition_size = [] @@ -680,20 +792,28 @@ def __init__(self, self.sub_group_to_group_id = {} - see_memory_usage("Before creating fp16 partitions", force=False) - #self._create_fp16_partitions() + see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation() num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", force=False) + # Optimizer ensor swapping + if self.swap_optimizer: + self._configure_tensor_swapping(offload_optimizer_config, aio_config) + see_memory_usage("Before creating fp32 partitions", force=False) self._create_fp32_partitions() see_memory_usage("After creating fp32 partitions", force=False) + dist.barrier() + + # To support pipelined optimizer swapping + self._create_next_swappable_fp32_groups() see_memory_usage("Before initializing optimizer states", force=False) self.initialize_optimizer_states() see_memory_usage("After initializing optimizer states", force=False) + dist.barrier() if dist.get_rank() == 0: logger.info(f"optimizer state initialized") @@ -718,6 +838,7 @@ def __init__(self, self.params_in_ipg_bucket = [] self.elements_in_ipg_bucket = 0 self.params_already_reduced = [] + self.is_gradient_accumulation_boundary = True self._release_ipg_buffers() self.previous_reduced_grads = None @@ -734,7 +855,10 @@ def __init__(self, count = count + 1 #Largest partitioned param - largest_partitioned_param_numel = self._get_largest_partitioned_numel() + largest_partitioned_param_numel = max(self.fp16_partitioned_groups_flat_numel) + print_rank_0( + f'Largest partitioned param numel = {largest_partitioned_param_numel}', + force=True) see_memory_usage(f"Before Set Grad positions", force=False) @@ -744,7 
+868,7 @@ def __init__(self, self.grads_in_partition = None - if self.cpu_offload: + if self.offload_optimizer: self.accumulated_grads_in_cpu = {} self.norm_for_param_grads = {} self.local_overflow = False @@ -789,14 +913,26 @@ def __init__(self, if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) - def _get_largest_partitioned_numel(self): - largest_partitioned_param_numel = 0 - for partitioned_params_group in self.fp16_partitioned_groups: - for partitioned_param in partitioned_params_group: - if partitioned_param.numel() > largest_partitioned_param_numel: - largest_partitioned_param_numel = partitioned_param.numel() + def _configure_tensor_swapping(self, offload_optimizer_config, aio_config): + nvme_swap_folder = os.path.join( + offload_optimizer_config[OFFLOAD_OPTIMIZER_NVME_PATH], + 'zero_stage_3') + os.makedirs(nvme_swap_folder, exist_ok=True) + if torch.distributed.get_rank() == 0: + logger.info(f'Tensor Swapping: Adding optimizer tensors') + + swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config[ + OFFLOAD_OPTIMIZER_PIPELINE] else PartitionedOptimizerSwapper - return largest_partitioned_param_numel + self.optimizer_swapper = swapper_type( + swap_config=offload_optimizer_config, + aio_config=aio_config, + base_folder=nvme_swap_folder, + optimizer=self.optimizer, + largest_numel=max(self.fp16_partitioned_groups_flat_numel), + device=self.device, + dtype=torch.float32, + timers=self.timers) def _create_fp16_partitions(self): dist.barrier() @@ -832,7 +968,7 @@ def _create_fp16_partitions(self): #removing cloning here see_memory_usage(f"Before Flattening param group {i}", force=False) - if not self.cpu_offload_params: + if not self.offload_param: see_memory_usage(f"Before moving param group {i} to CPU", force=False) #move all the parameters to cpu to free up GPU space for creating flat buffer @@ -868,31 +1004,86 @@ def _create_fp16_partitions(self): for partitioned_param, q in 
zip(self.fp16_partitioned_groups[i], updated_params): partitioned_param.data = q.data - def _move_to_flat_buffer(self, src_list, flat_buffer): + def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False): + '''If flat buffer is None then the parameters in the param_list are + not copied to the flat buffer. This is because they excede the number of max_params_in_cpu + Some of these parameters may aready be in CPU in unflattened buffers + or they maybe in GPU, or they maybe in NVME. If they are in NVME, then + they will be marked as NOT_AVAILABLE, and will be moved to CPU when they are + needed during training.''' + if flat_buffer is None: + # this dst buffer is on NVMe, so skip this + return + start = 0 - for src in src_list: - dest = flat_buffer.narrow(0, start, src.numel()) - start = start + src.numel() - dest.data.copy_(src.data) - src.data = dest.data + for param in param_list: + src = param.ds_tensor + dest = flat_buffer.narrow(0, start, src.ds_numel) + start = start + src.ds_numel + '''if the parameter was initialized in nvme then bring it to the destination buffer directly''' + if src.status == PartitionedParamStatus.NOT_AVAILABLE: + print_rank_0( + f"Swapping in {param.ds_id} with partition size {param.ds_tensor.ds_numel} permanently to CPU" + ) + param.nvme_swapper.swap_in([param], + swap_in_buffers=[dest], + async_op=False) + else: + assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Parm must be avialable here" + if not avoid_copy: + dest.data.copy_(src.data) + src.data = dest.data - def _create_fp16_partitions_with_defragmentation(self): - dist.barrier() - partition_id = dist.get_rank(group=self.dp_process_group) + # Final location must be gpu/cpu in this case + param.ds_tensor.final_location = 'not-nvme' + + def _create_param_groups_fp16_flat_cpu_memory(self): - if self.cpu_offload_params: - self.param_groups_fp16_flat_cpu_memory = [] - for j, param_group in enumerate(self.optimizer.param_groups): - total_params = 
sum([p.ds_tensor.numel() for p in param_group['params']]) + aggregate_params_count = 0 + + for j, param_group in enumerate(self.optimizer.param_groups): + params_in_group = sum([p.ds_tensor.ds_numel for p in param_group['params']]) + + flat_buffer_size = params_in_group + + if self.params_in_nvme_and_cpu and \ + aggregate_params_count + params_in_group > self.max_params_in_cpu: + + flat_buffer_size = max(0, + self.max_params_in_cpu - aggregate_params_count) + + aggregate_params_count += params_in_group + + if flat_buffer_size > 0: + print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", + force=False) self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(total_params, + torch.empty(int(flat_buffer_size), dtype=torch.half, pin_memory=True)) + else: + print_rank_0( + f"No flat buffer size. Param group size was {params_in_group}", + force=False) + + self.param_groups_fp16_flat_cpu_memory.append( + torch.empty(1, + dtype=torch.half)) + + def _create_fp16_partitions_with_defragmentation(self): + dist.barrier() + partition_id = dist.get_rank(group=self.dp_process_group) + + #create a flat CPU memory allocation for each param group + if self.offload_param: + self._create_param_groups_fp16_flat_cpu_memory() # loop to deal with groups for j, param_group in enumerate(self.optimizer.param_groups): sub_groups = self._create_fp16_sub_groups(param_group['params']) + print_rank_0(f'fp16 group {j} has {len(sub_groups)} subgroups', force=True) + flat_offset = 0 for sub_group in sub_groups: i = len(self.fp16_groups) @@ -905,6 +1096,10 @@ def _create_fp16_partitions_with_defragmentation(self): self.fp16_partitioned_groups.append( [param.ds_tensor for param in self.fp16_groups[i]]) + total_elements = sum( + [t.ds_numel for t in self.fp16_partitioned_groups[i]]) + self.fp16_partitioned_groups_flat_numel.append(total_elements) + print_rank_0( f"fp16 group {i} partitioned_param norms : {[param.ds_tensor.norm().item() for param in self.fp16_groups[i]]}" ) @@ -918,14 
+1113,16 @@ def _create_fp16_partitions_with_defragmentation(self): #not sure why apex was cloning the weights before flattening #removing cloning here - see_memory_usage(f"Before Flattening param group {i}", force=False) + see_memory_usage(f"Before Flattening param subgroup {i}", force=False) - if not self.cpu_offload_params: - see_memory_usage(f"Before moving param group {i} to CPU", + #all partitioned parameters remain in GPU during training + if not self.offload_param: + see_memory_usage(f"Before moving param subgroup group {i} to CPU", force=False) #move all the parameters to cpu to free up GPU space for creating flat buffer move_to_cpu(self.fp16_partitioned_groups[i]) - see_memory_usage(f"After moving param group {i} to CPU", force=False) + see_memory_usage(f"After moving param subgroup {i} to CPU", + force=False) #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( @@ -933,45 +1130,208 @@ def _create_fp16_partitions_with_defragmentation(self): self.fp16_partitioned_groups[i], 1).cuda(torch.cuda.current_device())) see_memory_usage( - f"After flattening and moving param group {i} to GPU", + f"After flattening and moving param subgroup {i} to GPU", force=False) + + #all partitioned parameters are in CPU during training else: - total_elements = sum( - [t.numel() for t in self.fp16_partitioned_groups[i]]) - fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ - j].narrow(0, - flat_offset, - total_elements) + print_rank_0(f"Params in nvme and cpu {self.params_in_nvme_and_cpu}") + #Flat buffer may not be available for parameters that reside in NVME + if not self.params_in_nvme_and_cpu or flat_offset + total_elements <= self.param_groups_fp16_flat_cpu_memory[ + j].numel(): + fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ + j].narrow(0, + flat_offset, + total_elements) + print_rank_0( + f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elemets 
{flat_offset + total_elements}", + force=False) + #these parameters reside in NVME and + elif self.params_in_nvme_and_cpu: + fp16_partitioned_group_flat = None + print_rank_0( + f"No flat buffer for sub group {i} of {total_elements} elements", + force=False) + else: + assert False, "Either params are in nvme, or they are in CPU memory. This code path should not be triggered. Please see you max_params_in_cpu and params_in_nvme configs" + self.fp16_partitioned_groups_flat.append(fp16_partitioned_group_flat) flat_offset += total_elements # move param to flat buffer for both param offload on/off - self._move_to_flat_buffer(self.fp16_partitioned_groups[i], - self.fp16_partitioned_groups_flat[i]) + self._move_to_flat_buffer(self.fp16_groups[i], + self.fp16_partitioned_groups_flat[i], + avoid_copy=not self.offload_param) see_memory_usage(f"After Flattening param group {i}", force=False) + #create a pinned memory to be used for swapping out params to NVME after optimizer step + if self.fp16_partitioned_groups_flat[ + -1] is None and self.param_group_fp16_flat_reuse_buffer is None: + self.param_group_fp16_flat_reuse_buffer = torch.empty( + max(self.fp16_partitioned_groups_flat_numel), + dtype=torch.half, + device='cpu', + pin_memory=True) + + see_memory_usage(f"After Flattening param subgroup {i}", force=False) + + def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): + offset = 0 + elements_in_sub_group = sum( + [t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) + assert (flat_buffer.numel() == elements_in_sub_group) + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + dest = flat_buffer.narrow(0, offset, partitioned_param.ds_numel) + if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: + print_rank_0( + f"Swapping in {param.ds_id} with elements {param.ds_numel} and partition {param.ds_tensor.ds_numel}" + ) + param.nvme_swapper.swap_in([param], async_op=False) + 
dest.data.copy_(partitioned_param.data) + param.nvme_swapper.remove_partition_and_release_buffers([param]) + print_rank_0(f"Swapping in {param.ds_id} done") + else: + dest.data.copy_(partitioned_param.data) + offset += partitioned_param.ds_numel + + def _create_next_swappable_fp32_groups(self): + reverse_order_indices = [ + i for i in range(len(self.fp32_partitioned_groups_flat)) + ] + reverse_order_indices.reverse() + + next_group = None + for i in reverse_order_indices: + self.next_swappable_fp32_partitioned_groups.append(next_group) + if self._swappable_optimizer_subgroup(i): + next_group = self.fp32_partitioned_groups_flat[i] + + self.next_swappable_fp32_partitioned_groups.reverse() + + def _get_sub_group_partitions(self, sub_group_id): + sub_group_partitions = [] + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: + swap_path = param.nvme_swapper.get_path(param, True) + sub_group_partitions.append((partitioned_param, + param.ds_tensor.ds_numel, + swap_path)) + else: + sub_group_partitions.append((partitioned_param, + partitioned_param.ds_numel, + None)) + + return sub_group_partitions + def _create_fp32_partitions(self): + cpu_memory_usage = 0 + cpu_memory_sub_groups = 0 + nvme_memory_usage = 0 + num_swappable_partitions = 0 + num_swap_from_nvme_partitions = 0 + num_swap_from_cpu_partitions = 0 + swap_from_nvme_memory_usage = 0 + swap_from_cpu_memory_usage = 0 + GIGA_BYTES = (1024**3) + + swappable_fp32_tensors = [] + swappable_fp16_src_tensors = [] + nvme_fp16_partitions_info = [] + nvme_fp16_num_elems = [] + nvme_fp32_dest_tensors = [] + fp32_element_size = torch.tensor([], dtype=torch.float32).element_size() + for i, tensor in enumerate(self.fp16_partitioned_groups_flat): - # a partition of the fp32 master weights that will be updated by this process + num_elements = self.fp16_partitioned_groups_flat_numel[i] - 
self.fp32_partitioned_groups_flat.append( - self.fp16_partitioned_groups_flat[i].to( - self.device).clone().float().detach()) - element_size = self.fp32_partitioned_groups_flat[i].element_size() - num_elements = self.fp32_partitioned_groups_flat[i].numel() + # a partition of the fp32 master weights that will be updated by this process + if self._swappable_optimizer_subgroup(i): + self.fp32_partitioned_groups_flat.append(torch.Tensor()) + nvme_memory_usage += (fp32_element_size * num_elements) + num_swappable_partitions += 1 + + if self.params_in_nvme_and_cpu and tensor is None: + num_swap_from_nvme_partitions += 1 + swap_from_nvme_memory_usage += (fp32_element_size * num_elements) + if self.offload_optimizer_fast_init: + sub_group_partitions = self._get_sub_group_partitions(i) + nvme_fp16_partitions_info.append(sub_group_partitions) + nvme_fp16_num_elems.append(num_elements) + nvme_fp32_dest_tensors.append( + self.fp32_partitioned_groups_flat[i]) + else: + unpinned_fp32_buffer = torch.empty(num_elements, + device=self.device, + dtype=torch.float) + self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) + self.optimizer_swapper.initialize_parameters( + parameters=[self.fp32_partitioned_groups_flat[i]], + src_tensors=[unpinned_fp32_buffer]) + else: + num_swap_from_cpu_partitions += 1 + swap_from_cpu_memory_usage += (fp32_element_size * num_elements) + swappable_fp32_tensors.append(self.fp32_partitioned_groups_flat[i]) + swappable_fp16_src_tensors.append( + self.fp16_partitioned_groups_flat[i]) + else: + cpu_memory_usage += (fp32_element_size * num_elements) + cpu_memory_sub_groups += 1 + + if self.params_in_nvme_and_cpu and tensor is None: + unpinned_fp32_buffer = torch.empty(num_elements, + device=self.device, + dtype=torch.float) + self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) + self.fp32_partitioned_groups_flat.append(unpinned_fp32_buffer) + else: + self.fp32_partitioned_groups_flat.append( + self.fp16_partitioned_groups_flat[i].to( + 
self.device).clone().float().detach()) self.fp32_partitioned_groups_flat[ i].requires_grad = True # keep this in case internal optimizer uses it + if len(swappable_fp32_tensors) > 0: + self.optimizer_swapper.initialize_parameters( + parameters=swappable_fp32_tensors, + src_tensors=swappable_fp16_src_tensors) + + if len(nvme_fp32_dest_tensors) > 0: + fp16_pinned_buffers = self.fp16_groups[0][ + 0].nvme_swapper.reserve_available_buffers() + assert len(fp16_pinned_buffers) > 0 + self.optimizer_swapper.initialize_from_swapped_fp16_params( + fp16_partitions_info=nvme_fp16_partitions_info, + fp16_num_elems=nvme_fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=nvme_fp32_dest_tensors) + self.fp16_groups[0][0].nvme_swapper.release_reserved_buffers() + + nvme_gigabytes = nvme_memory_usage / GIGA_BYTES + print_rank_0( + f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', + force=True) + if self.params_in_nvme_and_cpu: + print_rank_0( + f'Swap from NVMe Partitions: count = {num_swap_from_nvme_partitions}, size = {swap_from_nvme_memory_usage/GIGA_BYTES:5.2f}GB', + force=True) + print_rank_0( + f'Swap from CPU Partitions: count = {num_swap_from_cpu_partitions}, size = {swap_from_cpu_memory_usage/GIGA_BYTES:5.2f}GB', + force=True) + + cpu_memory_gigabytes = cpu_memory_usage / GIGA_BYTES + print_rank_0( + f'In-Memory FP32 Partitions: count={cpu_memory_sub_groups} size={cpu_memory_gigabytes:5.2f} GB', + force=True) + # Clear for on-the-fly population before the optimizer step for param_group in self.optimizer.param_groups: param_group['params'] = [] def _create_fp16_sub_groups(self, params_group): - params_group_numel = sum([param.ds_tensor.numel() for param in params_group]) - + params_group_numel = sum([param.partitioned_size() for param in params_group]) sub_group_size = self.sub_group_size if sub_group_size is None or sub_group_size >= params_group_numel: @@ -983,7 +1343,7 @@ def _create_fp16_sub_groups(self, 
params_group): for param in params_group: sub_group.append(param) - local_sub_group_size += param.ds_tensor.numel() + local_sub_group_size += param.partitioned_size() if local_sub_group_size >= sub_group_size or id(param) == id( params_group[-1]): @@ -1019,6 +1379,10 @@ def _end_of_forward_hook(module, *args): self.module.register_forward_hook(_end_of_forward_hook) self.module.register_forward_pre_hook(_pre_forward_hook) + # Add top todule to stack trace + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(self.module) + def persistent_parameters(self): persistent_params = [] total_persistent_parameters = 0 @@ -1046,7 +1410,41 @@ def _register_hooks_recursively(self, module, count=[0]): def _pre_forward_module_hook(module, *args): self.pre_sub_module_forward_function(module) - def _post_forward_module_hook(module, *args): + def _post_forward_module_hook(module, input, output): + global FWD_MODULE_STACK + FWD_MODULE_STACK.pop() + + if not isinstance(output, (list, tuple)): + if torch.is_tensor(output): + output = [output] + else: + print(f'got UNKNOWN type {type(output)}') + outputs = [] + for name, val in vars(output).items(): + if not name.startswith('__') and torch.is_tensor(val): + outputs.append(val) + output = outputs + print(f'convert output to {output}') + + for item in filter(lambda item: is_zero_param(item), output): + if not any(id(item) in m._external_params for m in FWD_MODULE_STACK): + item.ds_active_sub_modules += 1 + module_to_register = FWD_MODULE_STACK[-1] + print_rank_0( + f'Registering dangling parameter for module {module_to_register.__class__.__name__}.', + force=False) + register_external_parameter(module_to_register, item) + + # It's possible that the parameter was already external to the completed module. If so, remove it the + # registration as it will be covered by the outer module instead. 
+ if id(item) in module._external_params: + print_rank_0( + f' Unregistering nested dangling parameter from module {module.__class__.__name__}', + force=False) + unregister_external_parameter(module, item) + + item.all_gather() + self.post_sub_module_forward_function(module) def _pre_backward_module_hook(module, inputs, output): @@ -1110,6 +1508,9 @@ def pre_sub_module_forward_function(self, sub_module): see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", force=False) + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(sub_module) + self.param_coordinator.record_trace(sub_module) self.param_coordinator.fetch_sub_module(sub_module) @@ -1117,8 +1518,10 @@ def pre_sub_module_forward_function(self, sub_module): f"Before sub module function {sub_module.__class__.__name__} after fetch", force=False) - self.param_coordinator.prefetch_next_sub_modules(sub_module, - numel=self.prefetch_elements) + self.param_coordinator.prefetch_next_sub_modules( + sub_module, + numel=self.prefetch_elements, + nvme=self.params_in_nvme_and_cpu) see_memory_usage( f"Before sub module function {sub_module.__class__.__name__} after prefetch", force=False) @@ -1127,11 +1530,13 @@ def pre_sub_module_forward_function(self, sub_module): def post_sub_module_forward_function(self, sub_module): see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} before release", + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) + self.param_coordinator.release_sub_module(sub_module) + see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} after release", + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", force=False) def pre_sub_module_backward_function(self, sub_module): @@ -1146,17 +1551,17 @@ def pre_sub_module_backward_function(self, sub_module): def post_sub_module_backward_function(self, sub_module): see_memory_usage( - f"After sub 
module backward function {sub_module.__class__.__name__} before release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) self.param_coordinator.release_sub_module(sub_module) see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} after release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", force=False) def _release_ipg_buffers(self): if self.contiguous_gradients: self.ipg_buffer = None - if not self.cpu_offload: + if not self.offload_optimizer and self.is_gradient_accumulation_boundary: self.grads_in_partition = None self.grads_in_partition_offset = 0 @@ -1166,35 +1571,93 @@ def _optimizer_step(self, sub_group_id): fp32_param = self.fp32_partitioned_groups_flat[sub_group_id] fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] self.optimizer.param_groups[param_group_id]['params'] = [fp32_param] + self.optimizer.step() self.optimizer.param_groups[param_group_id]['params'] = [] - fp16_param.data.copy_(fp32_param.data) + + if fp16_param is not None: + fp16_param.data.copy_(fp32_param.data) + else: + #synchronize incase there is a previous write going on the reuse buffer + self.fp16_groups[sub_group_id][0].nvme_swapper.synchronize_writes() + self.param_group_fp16_flat_reuse_buffer.narrow( + 0, + 0, + fp32_param.numel()).data.copy_(fp32_param.data) + + def _swappable_optimizer_subgroup(self, sub_group_id): + if not self.swap_optimizer: + return False + + return self.optimizer_swapper.swappable_tensor( + None, + numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) + + def _partitioned_params_swap_out(self, i): + swap_out_params = [] + offset = 0 + for param, partitioned_param in zip(self.fp16_groups[i], self.fp16_partitioned_groups[i]): + src = self.param_group_fp16_flat_reuse_buffer.narrow( + 0, + offset, + partitioned_param.ds_numel) + if partitioned_param.status == 
PartitionedParamStatus.AVAILABLE: + partitioned_param.data.copy_(src.data) + else: + partitioned_param.data = src.data + #Setting it to available just for good practice. It will be released at the end of the call + #by swap out and release + partitioned_param.status = PartitionedParamStatus.AVAILABLE + swap_out_params.append(param) + offset += partitioned_param.ds_numel + + if len(swap_out_params) > 0: + #The write synchronize will happen before the buffer is reused in _optimizer_step so the buffer can be released + swap_out_params[0].nvme_swapper.swap_out_and_release( + swap_out_params, + async_op=True, + force_buffer_release=True) def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) - largest_numel = max([t.numel() for t in self.fp16_partitioned_groups_flat]) + largest_numel = max( + [sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) gradient_dtype = self.fp32_partitioned_groups_flat[0].dtype gradient_buffer = torch.zeros(int(largest_numel), dtype=gradient_dtype, device=self.device) + timers = self.timers + timer_names = set() + + if self.swap_optimizer: + self.optimizer_swapper.init_timers() + + INIT_OPTIMIZER_TIMER = 'init_optimizer_state' + timer_names.add(INIT_OPTIMIZER_TIMER) + self.start_timers([INIT_OPTIMIZER_TIMER]) + for i, group in enumerate(self.fp16_groups): + swappable_optimizer_subgroup = self._swappable_optimizer_subgroup(i) + swappable_param_subgroup = self.fp16_partitioned_groups_flat[i] is None + + num_elements = int(self.fp16_partitioned_groups_flat_numel[i]) + see_memory_usage( - f'[Begin] Initialize optimizer states {i} / {num_subgroups} subgroups', + f'[Begin] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}', force=False) - num_elements = int(self.fp16_partitioned_groups_flat[i].numel()) - if self.cpu_offload and not self.cpu_offload_use_pin_memory: - 
self.fp32_partitioned_groups_flat[i].grad = torch.zeros( - num_elements, - dtype=gradient_dtype, - device=self.device) - elif self.cpu_offload_use_pin_memory: - self.fp32_partitioned_groups_flat[i].grad = torch.zeros( - num_elements, - dtype=gradient_dtype, - device=self.device).pin_memory() + if swappable_optimizer_subgroup: + self._optimizer_states_and_gradient_swap_in(i, timer_names) + + if self.offload_optimizer and not swappable_optimizer_subgroup: + subgroup_gradient_buffer = torch.zeros(num_elements, + dtype=gradient_dtype, + device=self.device) + if self.offload_optimizer_pin_memory: + subgroup_gradient_buffer = subgroup_gradient_buffer.pin_memory() + self.fp32_partitioned_groups_flat[i].grad = subgroup_gradient_buffer else: self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow( 0, @@ -1203,14 +1666,27 @@ def initialize_optimizer_states(self): self._optimizer_step(i) + if swappable_optimizer_subgroup: + self._optimizer_states_and_gradient_swap_out(i, timer_names) + + if swappable_param_subgroup: + self._partitioned_params_swap_out(i) + see_memory_usage( - f'[End] Initialize optimizer states {i} / {num_subgroups} subgroups', + f'[End] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}', force=False) - if not self.cpu_offload: + self.stop_timers([INIT_OPTIMIZER_TIMER]) + self.log_timers(timer_names) + + if self.swap_optimizer: + self.optimizer_swapper.log_timers() + + if not self.offload_optimizer: for group in self.fp32_partitioned_groups_flat: group.grad = None + # Reset steps return ######################################################################### @@ -1269,7 +1745,7 @@ def independent_gradient_partition_epilogue(self): #in case of cpu offload, averaged gradients are already in fp32_partitioned_groups_flat.grad #TODO: use a similar code path for both cpu_offload and non-cpu offload - if not self.cpu_offload: + if not 
self.offload_optimizer: for i, sub_group in enumerate(self.fp16_groups): self.averaged_gradients[i] = [ torch.zeros_like(param.ds_tensor) if param.grad is None else @@ -1491,7 +1967,7 @@ def set_grad_positions(self): current_offset = 0 for param in group: param_id = self.get_param_id(param) - num_elements = param.ds_tensor.numel() + num_elements = param.ds_tensor.ds_numel self.grad_position[param_id] = [ int(i), @@ -1507,7 +1983,7 @@ def async_accumulate_grad_in_cpu_via_gpu(self, param, acc_grad_cpu_partition): dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( 0, 0, - param.ds_tensor.numel()) + param.ds_tensor.ds_numel) if self.micro_step_id > 0: dest_buffer.copy_(acc_grad_cpu_partition.view(-1), non_blocking=True) @@ -1581,7 +2057,7 @@ def partition_previous_reduced_grads(self): if not self.previous_reduced_grads: return - if self.cpu_offload: + if self.offload_optimizer: allocate_grads_in_partition = self.grads_in_partition is None\ and self.gradient_accumulation_steps > 1 else: @@ -1593,12 +2069,12 @@ def partition_previous_reduced_grads(self): for i, group in enumerate(self.fp16_groups): total_size = 0 for param_in_partition in group: - total_size += param_in_partition.ds_tensor.numel() + total_size += param_in_partition.ds_tensor.ds_numel see_memory_usage( f"group {i} before creating {total_size} reduced gradients into partition", force=False) - if self.cpu_offload_use_pin_memory: + if self.offload_param_pin_memory: self.grads_in_partition.append( torch.zeros(int(total_size), dtype=torch.half, @@ -1612,51 +2088,73 @@ def partition_previous_reduced_grads(self): f"group {i} after creating {total_size} reduced gradients into partition", force=False) - for param in self.previous_reduced_grads: - - [i, dest_offset, num_elements] = self.grad_position[self.get_param_id(param)] - - # self.debug_fp16_grads[i][self.get_param_id(param)] = ( - # float(param.data.float().norm(2)), - # float(param.grad.data.float().norm(2))) - - if self.cpu_offload: + if 
self.offload_optimizer: + offload_fp32_gradients = {} + offload_fp32_offsets = {} - param.partition_gradients(partition_buffers=self.temp_grad_gpu_buffer) - with torch.cuda.stream(self.copy_grad_stream): - self.reduction_stream.synchronize() - - if self.gradient_accumulation_steps > 1: + with torch.cuda.stream(self.copy_grad_stream): + self.reduction_stream.synchronize() + for param in self.previous_reduced_grads: + + [i, + dest_offset, + num_elements] = self.grad_position[self.get_param_id(param)] + + if self.offload_optimizer: + param.partition_gradients( + partition_buffers=self.temp_grad_gpu_buffer) + #with torch.cuda.stream(self.copy_grad_stream): + # self.reduction_stream.synchronize() + + if self.gradient_accumulation_steps > 1: + # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer + fp16_grad_tensor = self.grads_in_partition[i].narrow( + 0, + dest_offset, + num_elements) + self.async_accumulate_grad_in_cpu_via_gpu( + param, + fp16_grad_tensor) + + if self.is_gradient_accumulation_boundary: + + self.set_norm_for_param_grad_in_gpu(param) + + self.update_overflow_tracker_for_param_grad(param) + + if self._swappable_optimizer_subgroup(i): + if not i in offload_fp32_gradients.keys(): + offload_fp32_gradients[i] = [] + offload_fp32_offsets[i] = [] + + offload_fp32_gradients[i].append(param.grad.view(-1).float()) + param.grad = None + offload_fp32_offsets[i].append(dest_offset) + else: + fp32_grad_tensor = self.fp32_partitioned_groups_flat[ + i].grad.narrow(0, + dest_offset, + num_elements) + + self.async_inplace_copy_grad_to_fp32_buffer_from_gpu( + param, + fp32_grad_tensor) + else: # The allreduce buffer will be rewritted. 
Copy the gradients in partition to a new buffer fp16_grad_tensor = self.grads_in_partition[i].narrow( 0, dest_offset, num_elements) - self.async_accumulate_grad_in_cpu_via_gpu(param, fp16_grad_tensor) - - if self.is_gradient_accumulation_boundary: - - self.set_norm_for_param_grad_in_gpu(param) - - self.update_overflow_tracker_for_param_grad(param) - - fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow( - 0, - dest_offset, - num_elements) + param.partition_gradients( + partition_buffers=fp16_grad_tensor, + accumulate=True if self.micro_step_id > 0 else False) - self.async_inplace_copy_grad_to_fp32_buffer_from_gpu( - param, - fp32_grad_tensor) - else: - # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - fp16_grad_tensor = self.grads_in_partition[i].narrow( - 0, - dest_offset, - num_elements) - param.partition_gradients( - partition_buffers=fp16_grad_tensor, - accumulate=True if self.micro_step_id > 0 else False) + if self.offload_optimizer and self.swap_optimizer: + for i in offload_fp32_gradients.keys(): + self.optimizer_swapper.swap_out_gradients( + parameter=self.fp32_partitioned_groups_flat[i], + gradient_offsets=offload_fp32_offsets[i], + gradient_tensors=offload_fp32_gradients[i]) self.previous_reduced_grads = [] @@ -2046,164 +2544,7 @@ def stop_timers(self, timer_names): for name in timer_names: self.timers(name).stop() - def old_step(self, closure=None): - """ - Not supporting closure. 
- """ - - self.micro_step_id = INITIAL_MICRO_STEP_ID - - # if self.cpu_offload: - # torch.cuda.current_stream().wait_stream(self.migration_stream) - - print_rank_0(f"Inside Step function") - see_memory_usage(f"In step before checking overflow", force=False) - - print_rank_0("Finished Tracing at Beginning of Step") - self.param_coordinator.hierarchy = 0 - self.param_coordinator.finish_tracing(print_trace=True) - - self.param_coordinator.reset_step() - - print_rank_0("Finished Tracing at Beginning of Step") - - # First compute norm for all group so we know if there is overflow - self.check_overflow() - - timers = self.timers - - OPTIMIZER_STEP = 'optimizer_step' - OPTIMIZER_FP16_UPDATE = 'optimizer_fp16_update' - OPTIMIZER_FP32_GRADIENT = 'optimizer_fp32_gradient' - timer_names = [OPTIMIZER_STEP, OPTIMIZER_FP16_UPDATE, OPTIMIZER_FP32_GRADIENT] - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - see_memory_usage('After overflow before clearing gradients', force=False) - self.zero_grad() - - if self.cpu_offload: - self.reset_cpu_buffers() - else: - self.averaged_gradients = {} - - see_memory_usage('After overflow after clearing gradients', force=False) - - logger.info( - "[deepscale] OVERFLOW! Rank {} Skipping step. 
Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - self.start_timers(timer_names) - self.stop_timers(timer_names) - return - - norm_groups = [] - single_partition_grad_groups = [] - skip = False - partition_id = dist.get_rank(group=self.dp_process_group) - - debug_fp32_grads = [{} for _ in self.fp16_groups] - - self.start_timers([OPTIMIZER_FP32_GRADIENT]) - for i, group in enumerate(self.fp16_groups): - - if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.fp16_groups[i])) - - single_grad_partition = self.fp32_partitioned_groups_flat[i].grad - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.fp16_groups[i])) - - # free gradients for all the prameters that are not updated by this process - # self.free_grad_in_param_list(self.params_not_in_partition[i]) - - # create a flat gradients for parameters updated by this process - - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - single_grad_partition = self.flatten(self.averaged_gradients[i]).to( - self.fp32_partitioned_groups_flat[i].dtype) - - assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[i].numel(), \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format( - single_grad_partition.numel(), self.partition_size[i], i, partition_id) - - self.fp32_partitioned_groups_flat[i].grad = single_grad_partition - - # release all the gradient since we have already created a necessary copy in dp_grad_partition - self.zero_grad() - - self.averaged_gradients[i] = None - - single_partition_grad_groups.append(single_grad_partition) - debug_fp32_grads[i] = [(t.clone().detach(), - t) - for t in self.unflatten(single_grad_partition, - group)] - - self.stop_timers([OPTIMIZER_FP32_GRADIENT]) - - print(f"Norm groups: {norm_groups}") - - 
self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - - #self.dump_pre_step_gradients(debug_fp32_grads) - - self.start_timers([OPTIMIZER_STEP]) - self.optimizer.step() - self.stop_timers([OPTIMIZER_STEP]) - - # get rid of the fp32 gradients. Not needed anymore - if not self.cpu_offload: - for group in self.fp32_partitioned_groups_flat: - group.grad = None - - self.start_timers([OPTIMIZER_FP16_UPDATE]) - for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, self.fp32_partitioned_groups_flat): - fp16_partitions.data.copy_(fp32_partition.data) - self.stop_timers([OPTIMIZER_FP16_UPDATE]) - - print( - f"fp16 groups norm : {[group_flat.norm() for group_flat in self.fp16_partitioned_groups_flat]}" - ) - if self.cpu_offload: - self.reset_cpu_buffers() - - # TODO: we probably don't need this? just to be safe - for i in range(len(norm_groups)): - #for p in self.fp16_groups[i]: - # p.data=p.ds_tensor - - updated_params = self.unflatten(self.fp16_partitioned_groups_flat[i], - self.fp16_partitioned_groups[i]) - for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): - # print(f"Grad fn: {p.grad_fn}") - # p.data = torch.ones(1).half().cuda() - partitioned_param.data = q.data - - #Gathering persisting parameters - self.persistent_parameters[0].all_gather(self.persistent_parameters) - - #self.dump_post_step_gradients() - self.debug_fp16_grads = [{} for _ in self.fp16_groups] - - if self.cpu_offload: - self.reset_cpu_buffers() - - self.log_timers(timer_names) - - see_memory_usage('After zero_optimizer step', force=False) - print_rank_0(f"------------------Finishing Step-----------------------", - force=False) - return - def _pre_step(self): - self.micro_step_id = INITIAL_MICRO_STEP_ID print_rank_0(f"Inside Step function") @@ -2220,7 +2561,7 @@ def _pre_step(self): def _get_norm_groups(self): norm_groups = [] for i, group in enumerate(self.fp16_groups): - if self.cpu_offload: + if self.offload_optimizer: 
norm_groups.append( self.complete_grad_norm_calculation_for_cpu_offload( self.fp16_groups[i])) @@ -2231,7 +2572,6 @@ def _get_norm_groups(self): return norm_groups def _prepare_fp32_grad_for_sub_group(self, sub_group_id): - partition_id = dist.get_rank(group=self.dp_process_group) single_grad_partition = self.flatten(self.averaged_gradients[sub_group_id]).to( @@ -2251,18 +2591,42 @@ def _prepare_fp32_grad_for_sub_group(self, sub_group_id): def _prepare_sub_group(self, sub_group_id, timer_names=set()): see_memory_usage(f'Before prepare optimizer sub group {sub_group_id}', force=False) - if not self.cpu_offload: + if self._swappable_optimizer_subgroup(sub_group_id): + self._optimizer_states_and_gradient_swap_in(sub_group_id, timer_names) + elif not self.offload_optimizer: self._prepare_fp32_grad_for_sub_group(sub_group_id) see_memory_usage(f'After prepare optimizer sub group {sub_group_id}', force=False) + def _optimizer_states_and_gradient_swap_in(self, sub_group_id, timer_names=set()): + param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] + fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + assert self._swappable_optimizer_subgroup(sub_group_id), \ + f'Parameter {fp32_param_id} of numel={param_length} is not swappable' + + OPTIMIZER_SWAP_IN_STATE = 'optimizer_swap_in_state' + see_memory_usage(f'pre-step Before swapping in optimizer tensors {sub_group_id}', + force=False) + self.start_timers([OPTIMIZER_SWAP_IN_STATE]) + + self.optimizer_swapper.swap_in_optimizer_state( + parameter=self.fp32_partitioned_groups_flat[sub_group_id], + async_parameter=self.next_swappable_fp32_partitioned_groups[sub_group_id]) + + self.stop_timers([OPTIMIZER_SWAP_IN_STATE]) + timer_names.add(OPTIMIZER_SWAP_IN_STATE) + see_memory_usage(f'pre-step After swapping in optimizer tensors {sub_group_id}', + force=False) + def _release_sub_group(self, sub_group_id, timer_names=set()): see_memory_usage(f'Before release optimizer sub group {sub_group_id}', 
force=False) # get rid of the fp32 gradients. Not needed anymore - if not self.cpu_offload: + if not self.offload_optimizer: self.fp32_partitioned_groups_flat[sub_group_id].grad = None + if self._swappable_optimizer_subgroup(sub_group_id): + self._optimizer_states_and_gradient_swap_out(sub_group_id, timer_names) see_memory_usage(f'After release optimizer sub group {sub_group_id}', force=False) @@ -2287,6 +2651,32 @@ def flatten_dense_tensors_aligned(self, tensor_list, alignment): return self.flatten(padded_tensor_list) + def _optimizer_states_and_gradient_swap_out(self, sub_group_id, timer_names=set()): + param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] + fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + assert self._swappable_optimizer_subgroup(sub_group_id), \ + f'Parameter {fp32_param_id} of numel={param_length} is not swappable' + + OPTIMIZER_SWAP_OUT_STATE = 'optimizer_swap_out_state' + see_memory_usage( + f'post-step Before swapping out optimizer tensors {sub_group_id}', + force=False) + self.start_timers([OPTIMIZER_SWAP_OUT_STATE]) + + self.optimizer_swapper.swap_out_optimizer_state( + parameter=self.fp32_partitioned_groups_flat[sub_group_id], + async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is + not None) + + self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) + see_memory_usage( + f'post-step After swapping out optimizer tensors {sub_group_id}', + force=False) + timer_names.add(OPTIMIZER_SWAP_OUT_STATE) + + # get rid of the fp32 gradients. 
Not needed anymore + self.fp32_partitioned_groups_flat[sub_group_id].grad = None + def _unflatten_partitioned_parameters(self, sub_group_id): updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], self.fp16_partitioned_groups[sub_group_id]) @@ -2298,7 +2688,7 @@ def _overflow_clean_up(self, prev_scale): see_memory_usage('After overflow before clearing gradients', force=False) self.zero_grad() - if self.cpu_offload: + if self.offload_optimizer: self.reset_cpu_buffers() else: self.averaged_gradients = {} @@ -2327,17 +2717,28 @@ def _overflow_check_and_loss_scale_update(self): return self.overflow def _post_step(self, timer_names=set()): - if self.cpu_offload: + if self.offload_optimizer: self.reset_cpu_buffers() #Gathering persisting parameters - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].all_gather(self.persistent_parameters) + + if self.swap_optimizer: + self.optimizer_swapper.log_timers() self.log_timers(timer_names) see_memory_usage('After zero_optimizer step', force=False) print_rank_0(f"------------------Finishing Step-----------------------") + def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id): + if self.fp16_partitioned_groups_flat[sub_group_id] is not None: + #unflatten fp16 parameter subgroup + self._unflatten_partitioned_parameters(sub_group_id) + else: + self._partitioned_params_swap_out(sub_group_id) + def step(self, closure=None): """ Not supporting closure. 
@@ -2346,6 +2747,8 @@ def step(self, closure=None): #checks for overflow, adjust the loss scale accordingly if self._overflow_check_and_loss_scale_update(): + if self.swap_optimizer: + self.optimizer_swapper.log_timers() return norm_groups = self._get_norm_groups() @@ -2370,8 +2773,8 @@ def step(self, closure=None): #release memory or swap out optimizer states of fp32 parameters self._release_sub_group(sub_group_id, timer_names) - #unflatten fp16 parameter subgroup - self._unflatten_partitioned_parameters(sub_group_id) + #put fp16 parameters in appropriate location + self._reassign_or_swap_out_partitioned_parameters(sub_group_id) self.stop_timers(['optimizer_step']) @@ -2460,7 +2863,7 @@ def has_overflow(self, partition_gradients=True): self.local_overflow = self._has_inf_or_nan(self.gpu_sum) self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( + overflow = self.local_overflow if self.offload_optimizer else self.has_overflow_partitioned_grads_serial( ) #overflow = self.has_overflow_partitioned_grads_serial() overflow_gpu = torch.cuda.ByteTensor([overflow]) @@ -2519,6 +2922,10 @@ def backward(self, loss, retain_graph=False): print_rank_0( f"Total fully available parameters {self.param_coordinator.total_available_parameter_numel}" ) + + if self.swap_optimizer: + self.optimizer_swapper.pre_backward() + see_memory_usage(f"Before backward", force=False) if self.contiguous_gradients: self.ipg_buffer = [] @@ -2541,6 +2948,9 @@ def backward(self, loss, retain_graph=False): grad computation do not trigger post call and will therefore will remain unpartitioned ''' self._partition_all_parameters() + if self.swap_optimizer: + self.optimizer_swapper.post_backward() + def _partition_all_parameters(self): for name, param in self.module.named_parameters(recurse=True): self.param_coordinator.release_and_reset_parameter(param) @@ -2664,6 +3074,11 @@ def state_dict(self): "ZeRO-3 
does not yet support elastic checkpointing, please disable for now." ) + if self.swap_optimizer or self.params_in_nvme_and_cpu: + raise NotImplementedError( + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." + ) + return self._rigid_state_dict() @@ -2806,19 +3221,26 @@ def load_state_dict(self, raise NotImplementedError( "ZeRO-3 does not yet support elastic checkpointing, please disable for now." ) - else: - self._rigid_load_state_dict( - state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states=load_optimizer_states) - self.persistent_parameters[0].partition(self.persistent_parameters) - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if self.swap_optimizer or self.params_in_nvme_and_cpu: + raise NotImplementedError( + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." + ) + + self._rigid_load_state_dict( + state_dict_list[dist.get_rank(group=self.dp_process_group)], + load_optimizer_states=load_optimizer_states) + + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].partition(self.persistent_parameters) + self.persistent_parameters[0].all_gather(self.persistent_parameters) def save_checkpoint_prologue(self): self._partition_all_parameters() def save_checkpoint_epilogue(self): - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].all_gather(self.persistent_parameters) def _handle_overflow(cpu_sum, x, i): diff --git a/deepspeed/runtime/zero/tiling.py b/deepspeed/runtime/zero/tiling.py new file mode 100644 index 000000000000..c6f912500741 --- /dev/null +++ b/deepspeed/runtime/zero/tiling.py @@ -0,0 +1,293 @@ +import torch +import deepspeed +from deepspeed.runtime.utils import partition_uniform as partition + + +def split_tensor_along_last_dim(tensor, partitions, contiguous_split_chunks=False): + """Split a tensor along its last 
dimension. Adapted from Megatron-LM. + + Arguments: + tensor: input tensor. + partitions: list of partition sizes to supply to torch.split + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + # Split. + tensor_list = torch.split(tensor, partitions, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class TiledLinear(torch.nn.Module): + def __init__(self, + in_features, + out_features, + bias=True, + in_splits=1, + out_splits=1, + input_is_already_split=False, + combine_out_splits=True, + linear_cls=torch.nn.Linear, + init_linear=None, + **kwargs): + """A replacement for ``torch.nn.Linear`` that works with ZeRO-3 to reduce + memory requirements via tiling. + + TiledLinear breaks the input and output dimensions of a linear layer + into tiles that are processed in sequence. This class enables huge + linear layers when combined with ZeRO-3 because inactive tiles can be + partitioned and offloaded. + + .. note:: + We recommend using as few tiles as necessary. Tiling + significantly reduces memory usage, but can reduce throughput + for inexpensive layers. This due to the smaller kernels having + less parallelism and lower arithmetic intensity, while + introducing more frequent synchronization and communication. + + Args: + in_features (int): See ``torch.nn.Linear`` + out_features (int): See ``torch.nn.Linear`` + bias (bool, optional): See ``torch.nn.Linear`` + in_splits (int, optional): The number of tiles along the input dimension. Defaults to 1. + out_splits (int, optional): The number of tiles along the output dimension. Defaults to 1. + input_is_already_split (bool, optional): If set to ``True``, assume that the ``input_`` in + to ``forward()`` is already split into ``in_splits`` chunks. Defaults to ``False``. 
+ combine_out_splits (bool, optional): If set to ``False``, do not combine the ``out_splits`` outputs + into a single tensor. Defaults to ``True``. + linear_cls (class, optional): The underlying class to build individual tiles. + Defaults to ``torch.nn.Linear``. + init_linear (``torch.nn.Linear``, optional): If set, copy the parameters of + ``init_linear``. Useful for debugging. Defaults to ``None``. + kwargs (dict, optional): additional keyword arguments to provide to ``linear_cls()``. + + Raises: + RuntimeError: ``in_splits`` must be within the range [1, in_features). + RuntimeError: ``out_splits`` must be within the range of [1, out_features). + """ + + super().__init__() + + if (in_splits < 1) or (in_splits > in_features): + raise RuntimeError('in splits must be in range [1, in_features].') + if (out_splits < 1) or (out_splits > out_features): + raise RuntimeError('out splits must be in range [1, out_features].') + + # global, not necessarily local + self.in_features = in_features + self.out_features = out_features + self.use_bias = bias + + self.out_splits = out_splits + self.in_splits = in_splits + self.input_is_already_split = input_is_already_split + self.combine_out_splits = combine_out_splits + + # Build partition-lists. These are CSR-style splits [0, part0, part1, ..., features] + # For example, row_parts[p] gives the start of partition p and row_parts[p+1] + # is the exclusive end. 
+ self.in_parts = partition(num_items=in_features, num_parts=in_splits) + self.out_parts = partition(num_items=out_features, num_parts=out_splits) + + assert len(self.out_parts) == out_splits + 1 + assert len(self.in_parts) == in_splits + 1 + assert self.out_parts[0] == 0 + assert self.out_parts[out_splits] == out_features + assert self.in_parts[in_splits] == in_features + + self.linears = torch.nn.ModuleList() + for out_id in range(out_splits): + self.linears.append(torch.nn.ModuleList()) + + local_out_dim = self.out_parts[out_id + 1] - self.out_parts[out_id] + + for in_id in range(in_splits): + #if input_size is split, we only need one bias + local_bias = bias if in_id == (in_splits - 1) else False + + local_in_dim = self.in_parts[in_id + 1] - self.in_parts[in_id] + local = linear_cls(local_in_dim, + local_out_dim, + bias=local_bias, + **kwargs) + self.linears[out_id].append(local) + + # Optionally initialize with a known tensor + if init_linear is not None: + self.copy_params_from(init_linear) + + def forward(self, input_): + if self.in_splits > 1 and not self.input_is_already_split: + split_sizes = [ + self.in_parts[p + 1] - self.in_parts[p] for p in range(self.in_splits) + ] + inputs = self._split_global_input(input_, split_sizes) + elif self.in_splits > 1: + inputs = input_ + assert len(inputs) == self.in_splits, f"Col splits {self.in_splits} does not match input splits {len(inputs)}" + else: + # no splits + inputs = [input_] + + outputs = [None] * self.out_splits + for out_id in range(self.out_splits): + for in_id in range(self.in_splits): + local_output = self.linears[out_id][in_id](inputs[in_id]) + + outputs[out_id] = self._reduce_local_output(in_id=in_id, + out_id=out_id, + current_out=outputs[out_id], + new_out=local_output) + + if self.combine_out_splits: + return self._combine_output_splits(outputs) + + return outputs + + def _split_global_input(self, input, split_sizes): + """Partition an input tensor along the last dimension, aligned with given 
splits.

+        Subclasses should override this method to account for new input types.
+
+        Args:
+            input (List[Tensor]): The tensor to partition along the last dimension.
+            split_sizes (List[int]): The size of each partition.
+
+        Returns:
+            List[Any]: A list of the chunks of ``input``.
+        """
+        return split_tensor_along_last_dim(input, split_sizes)
+
+    def _reduce_local_output(self, in_id, out_id, current_out, new_out):
+        """Reduce (sum) a new local result into the existing local results.
+
+        Subclasses should override this method.
+
+        For a given ``out_id``, this method is called ``in_id-1`` times. The first input
+        split is a simple assignment.
+
+        Args:
+            in_id (int): The input split that produced ``new_out``.
+            out_id (int): The output split that produced ``new_out``.
+            current_out (Any): The reduced form of all previous ``out_id`` results.
+            new_out (Any): The local result from forward (``in_id``, ``out_id``).
+
+        Returns:
+            Any: The combined result of ``current_out`` and ``new_out``.
+        """
+
+        if current_out is None:
+            #this clone is necessary to preserve auto grad
+            #there is some issue with inplace update for outputs that are views
+            return new_out.clone()
+        else:
+            return current_out + new_out
+
+    def _combine_output_splits(self, outputs):
+        """Join the splits of the output into a single result.
+
+        Args:
+            outputs (List[Any]): The reduced outputs for each output split.
+
+        Returns:
+            Any: The combined outputs.
+        """
+        assert len(outputs) == self.out_splits
+        return torch.cat(outputs, dim=-1)
+
+    @torch.no_grad()
+    def copy_params_from(self, other):
+        """Copy the weight and bias data from ``other``.
+
+        This is especially useful for reproducible initialization and testing.
+
+        Equivalent to:
+
+        .. code-block:: python
+
+            with torch.no_grad():
+                self.weight.copy_(other.weight)
+                if self.bias is not None:
+                    self.bias.copy_(other.bias)
+
+        ..
note::
+            If ZeRO-3 is enabled, this is a collective operation and the updated parameters of
+            data-parallel rank 0 will be visible on all ranks. See
+            :class:`deepspeed.zero.GatheredParameters` for more information.
+
+
+        Args:
+            other (``torch.nn.Linear``): the linear layer to copy from.
+        """
+        assert hasattr(other, 'weight')
+        assert other.weight.size() == (self.out_features, self.in_features)
+        if self.use_bias:
+            assert hasattr(other, 'bias')
+            assert other.bias is not None
+            assert other.bias.size() == (self.out_features, )
+        else:
+            assert other.bias is None
+
+        for row in range(self.out_splits):
+            rstart = self.out_parts[row]
+            rstop = self.out_parts[row + 1]
+
+            for col in range(self.in_splits):
+                cstart = self.in_parts[col]
+                cstop = self.in_parts[col + 1]
+
+                local = self.linears[row][col]
+                global_weight = other.weight[rstart:rstop, cstart:cstop]
+                with deepspeed.zero.GatheredParameters(local.weight, modifier_rank=0):
+                    local.weight.copy_(global_weight)
+
+                if local.bias is not None:
+                    with deepspeed.zero.GatheredParameters(local.bias, modifier_rank=0):
+                        local.bias.data.copy_(other.bias[rstart:rstop].data)
+
+
+class TiledLinearReturnBias(TiledLinear):
+    """Wrapper for a Linear class that returns its own bias parameter, such as
+    used by Megatron-LM.
+    """
+    def _reduce_local_output(self, in_id, out_id, current_out, new_out):
+        """Reduces output tensors, but not the returned bias.
""" + if current_out is not None: + old_tensor, old_bias = current_out + else: + old_tensor, old_bias = None, None + + assert isinstance(new_out, tuple) + assert len(new_out) == 2 + + tensor, bias = new_out + assert tensor is not None + + tensor = super()._reduce_local_output(in_id=in_id, + out_id=out_id, + current_out=old_tensor, + new_out=tensor) + + if bias is None: + bias = old_bias + + return tensor, bias + + def _combine_output_splits(self, outputs): + # stack output tensors + tensors = [o[0] for o in outputs] + tensor = super()._combine_output_splits(tensors) + + # stack biases if applicable + biases = [o[1] for o in outputs if o[1] is not None] + if len(biases) > 0: + bias = super()._combine_output_splits(biases) + else: + bias = None + + return tensor, bias diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 318cb2213404..8b41df6a79f6 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -43,6 +43,10 @@ lnav: url: /docs/config-json/#gradient-clipping - title: "ZeRO optimizations" url: /docs/config-json/#zero-optimizations-for-fp16-training + - title: "Parameter Offloading" + url: /docs/config-json/#parameter-offloading + - title: "Optimizer Offloading" + url: /docs/config-json/#optimizer-offloading - title: "Logging" url: /docs/config-json/#logging - title: "Flops Profiler" diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 4ec491e1de3f..b7df0c47a464 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -250,9 +250,12 @@ Enabling and configuring ZeRO memory optimizations "reduce_scatter": [true|false], "reduce_bucket_size": 5e8, "contiguous_gradients" : [true|false], - "cpu_offload": [true|false], - "cpu_offload_params" : [true|false], - "cpu_offload_use_pin_memory" : [true|false], + "offload_param": { + ... + }, + "offload_optimizer": { + ... 
+ }, "stage3_max_live_parameters" : 1e9, "stage3_max_reuse_distance" : 1e9, "stage3_prefetch_bucket_size" : 5e8, @@ -281,7 +284,7 @@ Enabling and configuring ZeRO memory optimizations | ------------------------------------------------------------------------------------------------------------------------------------------------ | ------- | | Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step | `true` | -***allgather_bucket_size***: [boolean] +***allgather_bucket_size***: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------ | ------- | @@ -299,7 +302,7 @@ Enabling and configuring ZeRO memory optimizations | ----------------------------------------------------------------------- | ------- | | Uses reduce or reduce scatter instead of allreduce to average gradients | `true` | -***reduce_bucket_size***: [boolean] +***reduce_bucket_size***: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------------- | ------- | @@ -311,23 +314,18 @@ Enabling and configuring ZeRO memory optimizations | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. | `False` | -***cpu_offload***: [boolean] - -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------ | ------- | -| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. 
| `False` | -***cpu_offload_params***: [boolean] +***offload_param***: [dictionary] | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Enable offloading of model parameters to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. | `False` | +| Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. See [here](#parameter-offloading) for more details. | `False` | -***cpu_offload_use_pin_memory***: [boolean] +***offload_optimizer***: [dictionary] -| Description | Default | -| ---------------------------------------------------------------------------------------- | ------- | -| Use pinned CPU memory when offloading. Can improve performance. Valid only with stage 3. | `False` | +| Description | Default | +| ----------------------------------------------------------------------------------------- | ------- | +| Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. See [here](#optimizer-offloading) for more details. | `False` | ***stage3_max_live_parameters***: [integer] @@ -349,16 +347,112 @@ Enabling and configuring ZeRO memory optimizations ***stage3_param_persistence_threshold***: [integer] + | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). 
| `1e6` | ***stage3_gather_fp16_weights_on_model_save***: [boolean] + | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Consolidate the weights before saving the model by `save_fp16_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gather the weights when this option is enabled and then saves the fp16 model weights. | `False` | +***cpu_offload***: [boolean] + +**Deprecated:** **cpu_offload** is disabled and will be removed in future, please use `offload_optimizer` instead. +{: .notice--warning} + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------ | ------- | +| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 2.| `False` | + + +### Parameter offloading +Enabling and configuring ZeRO optimization of parameter offloading to CPU/NVMe. Available only with ZeRO stage 3. +```json + "offload_param": { + "device": "[none|cpu|nvme]", + "nvme_path": "/local_nvme", + "buffer_count": 5, + "buffer_size": 1e8, + "max_in_cpu": 1e9 + } +``` +***device***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Device memory to offload model parameters. Supported options are `cpu` and `nvme`. | `cpu` | + +***nvme_path***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Filesystem path for NVMe device for parameter offloading. 
| `/local_nvme` | + +***buffer_count***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Number of buffers in buffer pool for parameter offloading to NVMe. | 5 | + + +***buffer_size***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Size of buffers in buffer pool for parameter offloading to NVMe. | 1e8 | + +***max_in_cpu***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. | 1e9 | + +### Optimizer offloading +Enabling and configuring ZeRO optimization of offloading optimizer computation to CPU and state to CPU/NVMe. CPU offloading is available with ZeRO stage 2 or 3. NVMe offloading is available only with ZeRO stage 3. +```json + "offload_optimizer": { + "device": "[none|cpu|nvme]", + "nvme_path": "/local_nvme", + "buffer_count": 4, + "pin_memory": [true|false], + "fast_init": false + } +``` +***device***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Device memory to offload optimizer state. Supported options are `cpu` and `nvme`. Optimizer computation is offload to CPU regardless of device option. | `cpu` | + +***nvme_path***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Filesystem path for NVMe device for optimizer state offloading. 
| `/local_nvme` | + +***buffer_count***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). | 4 | + + +***pin_memory***: [boolean] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | `false` | + +***fast_init***: [boolean] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Enable fast optimizer initialization when offloading to NVMe. | `false` | + ### Logging ***steps\_per\_print***: [integer] diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 1751846830ef..529da7880f94 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -276,9 +276,15 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim DeepSpeed provides a `LayerSpec` class that delays the construction of modules until the model layers have been partitioned across workers. +<<<<<<< HEAD Then each worker will allocate only the layers it's assigned to. So, comparing to the example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to allocate a total of 1x model size on its CPU memory and not 16x. +======= +Then each worker will allocate only the layers it's assigned to. 
So, continuing the +example from the previous paragraph, a machine with 16 GPUs will need to allocate a +total of 1x model size on its CPU, compared to 16x in the LayerSpec example. +>>>>>>> [squash] Staging zero infinity v1 (#168) Here is an example of the abbreviated AlexNet model, but expressed only with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index 8f506d25babe..82c3414ff44f 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -106,121 +106,36 @@ Here is a screenshot of nvidia-smi showing GPU activity during training: -### Training trillion-scale models with ZeRO-3 Offload +### Training trillion-scale models with ZeRO-Infinity Stage 3 can be enabled in the JSON configuration. A full description of these configurations is available [here](/docs/config-json/#zero-optimizations-for-fp16-training). ```json -{ "zero_optimization": { "stage": 3, "cpu_offload": true, "cpu_offload_params": true, - "overlap_comm": true, "contiguous_gradients": true, - "stage3_max_live_parameters": 6000000, - "stage3_max_reuse_distance": 100000000, - "stage3_prefetch_bucket_size": 200000, - "stage3_param_persistence_threshold": 100000, - "reduce_bucket_size": 3000000, - "sub_group_size": 1e6 + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_prefetch_bucket_size": 1e7, + "stage3_param_persistence_threshold": 1e5, + "reduce_bucket_size": 1e7, + "sub_group_size": 1e9 } } ``` -ZeRO-3 will automatically collect and partition the parameters as they are -needed during the forward and backward passes. However, in some cases a -parameter may be used outside of its module's forward pass. We call these -*external parameters*. ZeRO-3 can coordinate these parameters if they are -registered. Please see our [ZeRO-3 docs](https://deepspeed.readthedocs.io/en/latest/zero3.html) for more -information and examples of external parameters. 
- -The Megatron-LM model has three external parameters that must be registered -with ZeRO-3. External parameters are those that are accessed outside of the -owning module's forward pass. - -1. `megatron/model/gpt2_model.py:GPT2Model`: register the word embedding for both uses in forward. - -```python - class GPT2Model(MegatronModule): - def __init__(self, num_tokentypes=0, parallel_output=True): - ... - deepspeed.zero.register_external_parameter(self, - self.language_model.embedding.word_embeddings.weight) - - - def forward(self, input_ids, position_ids, attention_mask, labels=None, - tokentype_ids=None, layer_past=None, get_key_value=False, - forward_method_parallel_output=None): - # self.embeddings will compute its forward pass here - lm_output = self.language_model(input_ids, - position_ids, - attention_mask, - tokentype_ids=tokentype_ids, - layer_past=layer_past, - get_key_value=get_key_value) - ... - - # Accesses word_embeddings.weight outside of the embedding's forward pass. - output = parallel_lm_logits( - lm_output, - self.language_model.embedding.word_embeddings.weight, - parallel_output) -``` - -2. `megatron/model/transformer.py:ParallelMLP`: register a bias that is -returned from a submodule forward and used in this forward. - -```python -class ParallelMLP(MegatronModule): - def __init__(self, init_method, output_layer_init_method): - ... - if self.dense_h_to_4h.bias is not None: - deepspeed.zero.register_external_parameter(self, self.dense_h_to_4h.bias) - def forward(self, hidden_states): - # bias_parallel is a parameter of dense_h_to_4h +#### Registering external parameters with ZeRO-3 - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - ... -``` - -3. `megatron/model/transformer.py:ParallelTransformerLayer`: register two biases that -are returned from submodules and used in forward. - -```python -class ParallelTransformerLayer(MegatronModule): - ... 
- def __init__(self, attention_mask_func, init_method, - output_layer_init_method, layer_number): - ... - if self.attention.dense.bias is not None: - deepspeed.zero.register_external_parameter(self, self.attention.dense.bias) - if self.mlp.dense_4h_to_h.bias is not None: - deepspeed.zero.register_external_parameter(self, self.mlp.dense_4h_to_h.bias) - - def forward(self, hidden_states, attention_mask, layer_past=None, - get_key_value=False): - ... - # attention_bias is a parameter returned from attention - - # Self attention. - attention_output, attention_bias = \ - self.attention(layernorm_output, - attention_mask, - layer_past=layer_past, - get_key_value=get_key_value) - - ... - - # mlp_bias is a parameter returned from mlp - mlp_output, mlp_bias = self.mlp(layernorm_output) - ... -``` +**Deprecated:** +DeepSpeed version `0.3.15` introduced automatic external parameter +registration and this step is no longer needed. +{: .notice--info} @@ -231,7 +146,7 @@ that exceed *local* system memory, but not *total* system memory. 1. Allocate the model in a memory-scalable fashion. The model parameters will be allocated and immediately partitioned across the data parallel group. If -`remote_device="cpu"`, the model will also be allocated in CPU memory +`remote_device` is `"cpu"` or `"nvme"`, the model will also be allocated in CPU/NVMe memory instead of GPU memory. Please see the full [ZeRO-3 Init docs](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init) for more details. diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index d7b338561b96..53024d161b3e 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -17,4 +17,8 @@ FusedLamb (GPU) OneBitAdam (GPU) ---------------------------- +<<<<<<< HEAD .. autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam +======= +.. 
autoclass:: deepspeed.runtime.fp16.OneBitAdam +>>>>>>> [squash] Staging zero infinity v1 (#168) diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index c986990444f3..0192a69b5bb3 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -16,12 +16,13 @@ For more information on our algorithms, please see our papers on `ZeRO `_ and `ZeRO-Offload `_. + Getting Started --------------- If you are new to DeepSpeed, check out our `Getting Started `_ page. -Once you are training with DeepSpeed, enabling ZeRO-3 Offload is as simple as enabling it +Once you are training with DeepSpeed, enabling ZeRO-3 offload is as simple as enabling it in your DeepSpeed configuration! Below are a few examples of ZeRO-3 configurations. Please see our `config guide `_ for a complete list of options for configuration and performance tuning. @@ -46,6 +47,7 @@ Example ZeRO-3 Offload Configurations "zero_optimization": { "stage": 3, "overlap_comm": true + }, "fp16": { "enabled": true @@ -69,13 +71,14 @@ Example ZeRO-3 Offload Configurations #. Additionally offload the optimizer states and computations to the CPU. .. code-block:: python - :emphasize-lines: 4 { "zero_optimization": { "stage": 3, - "cpu_offload": true, "overlap_comm": true + "offload_optimizer": { + "device": "cpu" + } }, ... } @@ -84,14 +87,38 @@ Example ZeRO-3 Offload Configurations #. Save even more memory by offloading parameters to the CPU memory. .. code-block:: python - :emphasize-lines: 5 { "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, "overlap_comm": true + "offload_optimizer": { + "device": "cpu" + } + "offload_param": { + "device": "cpu" + } + }, + ... + } + + +#. Save even MORE memory by offloading to NVMe (if available): + + .. 
code-block:: python + + { + "zero_optimization": { + "stage": 3, + "overlap_comm": true + "offload_optimizer": { + "device": "nvme", + "nvme_path": "/nvme_data" + } + "offload_param": { + "device": "nvme", + "nvme_path": "/nvme_data" + } }, ... } @@ -126,8 +153,6 @@ you can simply allocate your model in our context: model = MyLargeModel() - -.. autoclass:: deepspeed.zero.Init :members: @@ -179,6 +204,35 @@ because it is used in the training loop outside of its owning module's forward pass. DeepSpeed will coordinate external parameters if they are registered prior to the first forward pass. +Consider the following pattern common in language models such as GPT: + +.. code-block:: python + + class LanguageModel(torch.nn.Module): + ... + def forward(self, inputs): + embeds = self.embeddings(inputs) + ... + logits = compute_logits(output, self.embeddings.weight) + ... + + +The tensor ``embeddings.weight`` is used in both ``embeddings.forward()`` and +``compute_logits()``. We call ``embeddings.weight`` an *external* parameter +because it is used in the training loop outside of its owning module's +forward pass. DeepSpeed will coordinate external parameters if they are +registered prior to the first forward pass. + +.. note:: + Most models should not need to manually register parameters. + .. autofunction:: deepspeed.zero.register_external_parameter .. autofunction:: deepspeed.zero.unregister_external_parameter + + +Memory-Centric Tiling +--------------------- + +.. autoclass:: deepspeed.zero.TiledLinear + :members: diff --git a/docs/index.md b/docs/index.md index 497f88bab5c3..2642285a3f45 100755 --- a/docs/index.md +++ b/docs/index.md @@ -236,6 +236,7 @@ comments. 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. 
Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/op_builder/__init__.py b/op_builder/__init__.py index 38f27a9897ce..72e09b293811 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -8,6 +8,7 @@ from .transformer import TransformerBuilder from .stochastic_transformer import StochasticTransformerBuilder from .utils import UtilsBuilder +from .async_io import AsyncIOBuilder from .builder import get_default_compute_capatabilities # TODO: infer this list instead of hard coded @@ -19,6 +20,7 @@ SparseAttnBuilder(), TransformerBuilder(), StochasticTransformerBuilder(), - UtilsBuilder() + UtilsBuilder(), + AsyncIOBuilder() ] ALL_OPS = {op.name: op for op in __op_builders__} diff --git a/op_builder/async_io.py b/op_builder/async_io.py new file mode 100644 index 000000000000..749ac8725906 --- /dev/null +++ b/op_builder/async_io.py @@ -0,0 +1,56 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" +from .builder import OpBuilder + + +class AsyncIOBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def sources(self): + return [ + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 
'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp' + ] + + def include_paths(self): + return ['csrc/aio/py_lib', 'csrc/aio/common'] + + def cxx_args(self): + return [ + '-g', + '-Wall', + '-O0', + '-std=c++14', + '-shared', + '-fPIC', + '-Wno-reorder', + '-march=native', + '-fopenmp', + '-laio', + self.simd_width() + ] + + def extra_ldflags(self): + return ['-laio'] + + def is_compatible(self): + aio_libraries = ['libaio-dev'] + aio_compatible = self.libraries_installed(aio_libraries) + if not aio_compatible: + self.warning( + f"{self.NAME} requires the libraries: {aio_libraries} but are missing.") + return super().is_compatible() and aio_compatible diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index 8fcae8bcca18..015e8c508cee 100755 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -132,18 +132,15 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): return model -@torch.jit.script def f_gelu(x): return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) -@torch.jit.script def bias_gelu(bias, y): x = bias + y return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) -@torch.jit.script def bias_tanh(bias, y): x = bias + y return torch.tanh(x) diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py index 61f07a196971..a29d22a2a954 100644 --- a/tests/unit/test_pipe_module.py +++ b/tests/unit/test_pipe_module.py @@ -96,6 +96,6 @@ def _helper(): base_output = base_output.to('cpu') pipe_output = pipe_output.to('cpu') - assert torch.allclose(base_output, pipe_output) + assert torch.allclose(base_output, pipe_output, atol=1e-4) _helper() diff --git a/tests/unit/test_zero_context.py b/tests/unit/test_zero_context.py index 0e5b2e0696e6..9c45b58abf66 100644 --- 
a/tests/unit/test_zero_context.py +++ b/tests/unit/test_zero_context.py @@ -1,4 +1,7 @@ import os +import sys +from types import SimpleNamespace + import torch import pytest @@ -62,55 +65,59 @@ def test_gather_update(): assert torch.equal(l.weight, torch.zeros_like(l.weight)) -@pytest.mark.skip('WIP') -def test_external_param(): +config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. + }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": 1, + } +} + + +def test_ext_param_getattr(): setup_serial_env() - print() - class ExtLinear(torch.nn.Module): - def __init__(self, dim=10, copycat=None): + def __init__(self, dim=16): super().__init__() self.dim = dim - self.linear = torch.nn.Linear(dim, dim) - if copycat is not None: - with deepspeed.zero.GatheredParameters(self.linear.weight, - modifier_rank=0), \ - torch.no_grad(): - self.linear.weight.copy_(copycat.linear.weight) - - if hasattr(self.linear.weight, 'ds_id'): - print('registering') - super().ds_register_external_parameter('samyam', self.linear.weight) + self.linear1 = torch.nn.Linear(dim, dim) + self.linear2 = torch.nn.Linear(dim, dim) def forward(self, input): - yamsam = self.linear(input) - if hasattr(self.linear.weight, 'ds_status'): - assert self.linear.weight.ds_status == ZeroParamStatus.AVAILABLE - jeff = torch.nn.functional.linear(yamsam, self.linear.weight) - return jeff + A = self.linear1(input) + B = self.linear2(A) - l1_base = ExtLinear().half().cuda() - l2_base = ExtLinear().half().cuda() + # external use of self.linear1.weight + C = torch.nn.functional.linear(B, self.linear1.weight) + return C.sum() - input = torch.rand(10).half().cuda() + net = ExtLinear() - l1_base_out = l1_base(input.clone().detach()) - l2_base_out = l2_base(input.clone().detach()) + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = 
deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) - with deepspeed.zero.Init(): - l1_test = ExtLinear(copycat=l1_base).cuda() - #l2_test = ExtLinear(copycat=l2_base).cuda() - assert l1_test.linear.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - - # XXX l1 and l2 share their external parameter (l2.linear.weight) - - assert l1_test.linear.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - l1_test_out = l1_test(input.clone().detach()) - #assert torch.allclose(l1_base_out, l1_test_out) + with deepspeed.zero.GatheredParameters(net.linear1.weight): + assert net.linear1.weight.numel() == net.dim**2 - #l2_test_out = l2_test(input.clone().detach()) - #assert torch.allclose(l2_base_out, l2_test_out) + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() def test_scatter_halftype(): @@ -122,3 +129,117 @@ def test_scatter_halftype(): y = torch.LongTensor([3, 3]) assert y.dtype == torch.long + + +class DanglingBias(torch.nn.Linear): + def forward(self, *inputs): + out = super().forward(*inputs) + # return the bias to trigger a dangling external param + return out, self.bias + + +class DataClass: + """Just wraps data in an object. 
""" + def __init__(self, out=None, bias=None): + self.out = out + self.bias = bias + + +class DanglingBiasClass(DanglingBias): + def forward(self, *inputs): + out, bias = super().forward(*inputs) + return DataClass(out=out, bias=bias) + + +class DanglingAttention(torch.nn.Linear): + def __init__(self, dim=16, return_obj=False): + super().__init__(dim, dim) + self.dim = dim + self.return_obj = return_obj + if return_obj: + self.d_linear = DanglingBiasClass(dim, dim) + else: + self.d_linear = DanglingBias(dim, dim) + + def forward(self, input): + out = super().forward(input) + if self.return_obj: + out_obj = self.d_linear(out) + assert out_obj.bias.ds_status == ZeroParamStatus.AVAILABLE + # forward the external param + return out_obj.out, out_obj.bias + else: + out, bias = self.d_linear(out) + assert bias.ds_status == ZeroParamStatus.AVAILABLE + return out, bias + + +class ModelContainer(torch.nn.Module): + def __init__(self, dim=16, return_obj=False): + super().__init__() + self.dim = dim + self.linear1 = torch.nn.Linear(dim, dim) + self.dangler = DanglingAttention(dim, return_obj=return_obj) + + def forward(self, input): + act1 = self.linear1(input) + # bias is actually dangler.d_linear1.bias + act2, bias = self.dangler(act1) + assert bias.ds_status == ZeroParamStatus.AVAILABLE + return (act2 + bias).sum() + + +class DanglingExt(torch.nn.Module): + def __init__(self, dim=16): + super().__init__() + self.dim = dim + self.container = ModelContainer(dim) + + def forward(self, input): + out = self.container(input) + + # Make sure it's at the right level of the stack + assert len(self._external_params) == 0 + assert len(self.container._external_params) == 1 + assert len(self.container.dangler._external_params) == 0 + return out + + +def test_ext_param_return(): + setup_serial_env() + + net = DanglingExt() + + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + 
config_params=config_dict) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() + + +@pytest.mark.skip('WIP') +def test_ext_param_returnobj(): + setup_serial_env() + print() + + net = ModelContainer(return_obj=True) + + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + assert len(net._external_params) == 1 + assert len(net.dangler._external_params) == 0 + engine.backward(loss) + engine.step() diff --git a/tests/unit/test_zero_tiled.py b/tests/unit/test_zero_tiled.py new file mode 100644 index 000000000000..a8b63b11d32a --- /dev/null +++ b/tests/unit/test_zero_tiled.py @@ -0,0 +1,169 @@ +import copy + +import torch +import deepspeed +from deepspeed.runtime.zero.tiling import TiledLinear, TiledLinearReturnBias + +import pytest + + +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2), (5, 5), (32, 32)]) +def test_tiled_init(in_splits, out_splits): + in_f = 32 + out_f = 40 + base = torch.nn.Linear(in_f, out_f, bias=True) + l = TiledLinear(in_f, + out_f, + bias=True, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + for out_id in range(out_splits): + for in_id in range(in_splits): + local_l = l.linears[out_id][in_id] + assert isinstance(local_l, torch.nn.Linear) + + rstart = l.out_parts[out_id] + rstop = l.out_parts[out_id + 1] + cstart = l.in_parts[in_id] + cstop = l.in_parts[in_id + 1] + + local_out = rstop - rstart + local_in = cstop - cstart + assert local_l.weight.size()[1] == local_in, f'local[{out_id}][{in_id}].size {local_l.weight.size()}' + assert local_l.weight.size()[0] == local_out + + test = base.weight[rstart:rstop, cstart:cstop] + + assert local_l.weight.size() == test.size() + assert 
torch.equal(local_l.weight.data, test.data) + + if in_id == in_splits - 1: + assert local_l.bias is not None + assert local_l.bias.size()[0] == local_out + else: + assert local_l.bias is None + + +@pytest.mark.parametrize('in_splits,out_splits', [(0, 0), (33, 33)]) +def test_tiled_baddim(in_splits, out_splits): + dim = 32 + with pytest.raises(RuntimeError): + l = TiledLinear(dim, dim, out_splits=out_splits, in_splits=in_splits) + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_forward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + + assert torch.allclose(base_out, test_out, rtol=1e-4) + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_backward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + assert torch.allclose(base_out, test_out, rtol=1e-4) + + base_out.sum().backward() + test_out.sum().backward() + + # compare grads + for row in range(out_splits): + rstart = test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, 
cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) + + +class LinearWrapper(torch.nn.Linear): + """Returns its own bias to simulate Megatron-LM's behavior. + + Megatron-LM optionally delays the bias addition to fuse with a proceeding kernel. + """ + def forward(self, input): + out = super().forward(input) + return out, self.bias + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_returnbias_backward(in_splits, out_splits, bias, in_f, out_f): + base = LinearWrapper(in_f, out_f, bias=bias) + test = TiledLinearReturnBias(in_f, + out_f, + bias=bias, + linear_cls=LinearWrapper, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out_t, base_out_b = base(copy.deepcopy(inp)) + test_out_t, test_out_b = test(copy.deepcopy(inp)) + assert torch.allclose(base_out_t, test_out_t, rtol=1e-4) + if base_out_b is None: + assert test_out_b is None + base_out_b = torch.zeros_like(base_out_t) + test_out_b = torch.zeros_like(test_out_t) + else: + assert test_out_b is not None + assert torch.allclose(base_out_b, test_out_b, rtol=1e-4) + + (base_out_t + base_out_b).sum().backward() + (test_out_t + test_out_b).sum().backward() + + # compare grads + for row in range(out_splits): + rstart = test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, 
local.bias.grad, rtol=1e-4) From 72a30c1eaba0383eba83f2aed006899316ec2b0a Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Sun, 18 Apr 2021 23:03:49 -0700 Subject: [PATCH 61/78] revert zero-inf change to launcher --- deepspeed/launcher/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 8b7c52e68d7e..a4a49dca6bf3 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -304,7 +304,7 @@ def main(args=None): # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) - multi_node_exec = True # len(active_resources) > 1 + multi_node_exec = len(active_resources) > 1 if not multi_node_exec: deepspeed_launch = [ From 3c47d09caeac874725868da5a1280a902c72575d Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 19 Apr 2021 08:47:01 -0700 Subject: [PATCH 62/78] ZeRO-Infinity tutorial additions (#978) * zinf tutorial * more megatron integration docs --- docs/_tutorials/zero.md | 114 ++++++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 23 deletions(-) diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index 82c3414ff44f..217160400e29 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -12,7 +12,9 @@ ZeRO leverages the aggregate computation and memory resources of data parallelis * **Stage 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states. -* **Stage 3**: The 16-bit model parameters are partitioned across the processes. ZeRO will automatically collect and partition them during the forward and backward passes. +* **Stage 3**: The 16-bit model parameters are partitioned across the processes. ZeRO-3 will automatically collect and partition them during the forward and backward passes. 
+ +In addition, ZeRO-3 includes the *infinity offload engine* to form ZeRO-Infinity ([paper](https://arxiv.org/abs/2104.07857)), which can offload to both CPU and NVMe memory for huge memory savings. ## Training environment We use the DeepSpeed [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM) GPT-2 code for this exercise. You can step through the Megatron-LM [tutorial](/tutorials/megatron/) to familiarize yourself with the code. We will train the models in this tutorial on [NVIDIA Tesla V100-SXM3 Tensor Core GPUs](https://www.nvidia.com/en-us/data-center/v100/) with 32GB RAM. @@ -108,37 +110,52 @@ Here is a screenshot of nvidia-smi showing GPU activity during training: ### Training trillion-scale models with ZeRO-Infinity -Stage 3 can be enabled in the JSON configuration. A full description of these -configurations is available [here](/docs/config-json/#zero-optimizations-for-fp16-training). +ZeRO-3, the third stage of ZeRO, partitions the full model state (i.e., +weights, gradients, and optimizer states) to scale memory savings linearly +with the degree of data parallelism. ZeRO-3 can be enabled in the JSON +configuration. A full description of these configurations is available +[here](/docs/config-json/#zero-optimizations-for-fp16-training). -```json + +#### Offloading to CPU and NVMe with ZeRO-Infinity + +ZeRO-Infinity uses DeepSpeed's infinity offload engine to offload the full +model state to CPU or NVMe memory, allowing for even larger model sizes. 
Offloading +can be enabled inside the DeepSpeed configuration: + +```diff +@@ -6,5 +6,11 @@ "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, "contiguous_gradients": true, "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, - "stage3_prefetch_bucket_size": 1e7, - "stage3_param_persistence_threshold": 1e5, - "reduce_bucket_size": 1e7, - "sub_group_size": 1e9 + "stage3_prefetch_bucket_size": 1e7, + "stage3_param_persistence_threshold": 1e5, + "reduce_bucket_size": 1e7, +- "sub_group_size": 1e9 ++ "sub_group_size": 1e9, ++ "offload_optimizer": { ++ "device": "cpu" ++ }, ++ "offload_param": { ++ "device": "cpu" ++ } } -} ``` - - - -#### Registering external parameters with ZeRO-3 - -**Deprecated:** -DeepSpeed version `0.3.15` introduced automatic external parameter -registration and this step is no longer needed. +**ZeRO-Infinity vs ZeRO-Offload:** +DeepSpeed first included offloading capabilities with ZeRO-Offload, +a system for offloading optimizer and gradient states to CPU memory +within ZeRO-2. ZeRO-Infinity is the next generation of offloading +capabilities accessible to ZeRO-3. ZeRO-Infinity is able to offload +more data than ZeRO-Offload and has more effective bandwidth utilization +and overlapping of computation and communication. {: .notice--info} + #### Allocating Massive Megatron-LM Models We make two further changes to model initialization in order to support models @@ -158,7 +175,7 @@ for more details. model = GPT2Model(num_tokentypes=0, parallel_output=True) ``` -2. Gather the position embeddings weight for initialization. DeepSpeed will automatically +2. Gather the embeddings weight for initialization. DeepSpeed will automatically gather a module's parameters during its constructor and for its forward and backward pass. However, additional accesses must coordinate with DeepSpeed to ensure that parameter data is gathered and subsequently partitioned. 
If the tensor is modified, the `modifier_rank` @@ -173,8 +190,59 @@ for more details. modifier_rank=0): # Initialize the position embeddings. self.init_method(self.position_embeddings.weight) + + ... + + self.tokentype_embeddings = torch.nn.Embedding(...) + with deepspeed.zero.GatheredParameters(self.tokentype_embeddings.weight, + modifier_rank=0): + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) ``` +#### Memory-centric tiling +ZeRO-Infinity includes a replacement for `Linear` layers that further reduces memory. +We optionally tile the model parallel linear layers found in each Transformer layer. Note +that model parallelism and tiling can be combined by specifying the corresponding +base class when building the layer. +The `deepspeed.zero.TiledLinear` module exploits the data fetch and release +pattern of ZeRO-3 to reduce the working memory requirements by breaking down +a large operator into smaller tiles that can be executed sequentially. + +We include the changes for one example from Megatron-LM's [ParallelMLP](https://github.com/microsoft/DeepSpeedExamples/blob/bdf8e59aede8c8e0577e8d4d557298ca8515268f/Megatron-LM-v1.1.5-ZeRO3/megatron/model/transformer.py#L82). Three more +model-parallel layers in `transformer.py` proceed similarly. + +The model parallel layers of Megatron-LM have a special form in which the +additive `bias` of the layer is delayed and instead returned from `forward()` +to be fused with a later operator. DeepSpeed's +`deepspeed.zero.TiledLinearReturnBias` subclass of `TiledLinear` simply also +forwards the returned `bias` parameter without accumulating. 
+ +```diff +@@ -1,6 +1,9 @@ +-self.dense_h_to_4h = mpu.ColumnParallelLinear( ++self.dense_h_to_4h = deepspeed.zero.TiledLinearReturnBias( + args.hidden_size, + 4 * args.hidden_size, ++ in_splits=args.tile_factor, ++ out_splits=4*args.tile_factor, ++ linear_cls=mpu.ColumnParallelLinear, + gather_output=False, + init_method=init_method, + skip_bias_add=True) +``` + +Note that we scale `in_splits` and `out_splits` proportionally with `input_size` and `output_size`. This +results in tiles of fixed size `[hidden/tile_factor, hidden/tile_factor]`. + +#### Registering external parameters + +**Deprecated:** +DeepSpeed version `0.3.15` introduced automatic external parameter +registration and this step is no longer needed. +{: .notice--info} + + ## Extracting weights If you need to take the pretrained weights out of Deepspeed here is what you can do for getting fp16 weights: @@ -182,14 +250,14 @@ If you need to take the pretrained weights out of Deepspeed here is what you can - under ZeRO-2 `state_dict` contains the fp16 model weights and these can be saved normally with `torch.save`. - under ZeRO-3 `state_dict` contains just the placeholders since the model weights are partitioned across multiple GPUs. If you want to get to these weights enable: -``` +```json "zero_optimization": { "stage3_gather_fp16_weights_on_model_save": true }, ``` And then save the model using: -``` +```python if self.deepspeed: self.deepspeed.save_fp16_model(output_dir, output_file) ``` @@ -201,7 +269,7 @@ You can use this method to save ZeRO-2 weights as well. If you'd like to get the fp32 weights, we supply a special script that can do offline consolidation. It requires no configuration files or GPUs. 
Here is an example of its usage: -``` +``` bash $ cd /path/to/checkpoints_dir $ ./zero_to_fp32.py global_step1 pytorch_model.bin Processing zero checkpoint at global_step1 From 1a7419578cd7c980a1809876f6fb3a49c83f9b37 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 19 Apr 2021 08:10:20 -0700 Subject: [PATCH 63/78] [docs] add ZeRO-Inf news items --- README.md | 2 ++ docs/index.md | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2700b7175fe0..ddbc94467198 100755 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News +* [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) + * [Tutorial](https://www.deepspeed.ai/tutorials/zero/) on how to use different stages of ZeRO * [2021/04/01] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) diff --git a/docs/index.md b/docs/index.md index 2642285a3f45..a350866c3bf5 100755 --- a/docs/index.md +++ b/docs/index.md @@ -30,6 +30,8 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? 
+* [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) + * [Tutorial](https://www.deepspeed.ai/tutorials/zero/) on how to use different stages of ZeRO * [2021/04/02] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) @@ -38,10 +40,6 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }}) - * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html) - * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html) - * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html) - * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html) # Why DeepSpeed? 
Training advanced deep learning models is challenging. Beyond model design, From 598e50f9add7d79842f826bd4f69bc0657ecc4e6 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 19 Apr 2021 08:14:58 -0700 Subject: [PATCH 64/78] [docs] zero-inf updates --- README.md | 6 +----- docs/_tutorials/zero.md | 2 ++ docs/index.md | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ddbc94467198..61520664b42e 100755 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News * [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) - * [Tutorial](https://www.deepspeed.ai/tutorials/zero/) on how to use different stages of ZeRO + * [Tutorial on how to use different stages of ZeRO](https://www.deepspeed.ai/tutorials/zero/) * [2021/04/01] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) @@ -43,10 +43,6 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: 
Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) - * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html) - * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html) - * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html) - * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html) # Table of Contents diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index 217160400e29..768502588ee9 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -1,6 +1,8 @@ --- title: "Zero Redundancy Optimizer (ZeRO)" --- +ZeRO stage 3 consists of a subset of feature in our newly released ZeRO-Infinity. Read our [ZeRO-Infinity blog](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) to learn more! + If you have not done so already, we advise that you read the DeepSpeed tutorials on [Getting Started](/getting-started/) and [Megatron-LM GPT-2](/tutorials/megatron/) before stepping through this tutorial. In this tutorial, we will apply the ZeRO optimizer to the [Megatron-LM GPT-2](https://github.com/NVIDIA/Megatron-LM) model. ZeRO is a powerful set of memory optimization techniques that enable effective FP16 training of large models with trillions of parameters, such as [GPT-2](https://openai.com/blog/better-language-models/) and [Turing-NLG 17B](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/). 
Compared to the alternative model parallelism approaches for training large models, a key appeal of ZeRO is that no model code modifications are required. As this tutorial will demonstrate, *using ZeRO in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration JSON*. No code changes are needed. diff --git a/docs/index.md b/docs/index.md index a350866c3bf5..ab6b1a0445d8 100755 --- a/docs/index.md +++ b/docs/index.md @@ -31,7 +31,7 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # What's New? * [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) - * [Tutorial](https://www.deepspeed.ai/tutorials/zero/) on how to use different stages of ZeRO + * [Tutorial on how to use different stages of ZeRO](https://www.deepspeed.ai/tutorials/zero/) * [2021/04/02] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) From 11279ae4d5aa805569a4888683ebd5caeb1ef14f Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 19 Apr 2021 09:19:12 -0700 Subject: [PATCH 65/78] ZeRO-Infinity docs (#979) * zinf tutorial * more megatron integration docs * ZInf + tiling docs --- deepspeed/runtime/zero/tiling.py | 7 +- docs/code-docs/source/zero3.rst | 135 ++++++++++++++++++++----------- 2 files changed, 91 insertions(+), 51 deletions(-) 
diff --git a/deepspeed/runtime/zero/tiling.py b/deepspeed/runtime/zero/tiling.py index c6f912500741..d78fc81515e4 100644 --- a/deepspeed/runtime/zero/tiling.py +++ b/deepspeed/runtime/zero/tiling.py @@ -216,9 +216,10 @@ def copy_params_from(self, other): self.bias.copy_(other.bias) .. note:: - If ZeRO-3 is enabled, this is a collective operation and the updated parameters of - data-parallel rank 0 will be visibly on all ranks. See - :class:`deepspeed.zero.GatheredParameters` for more information. + If ZeRO-3 is enabled, this is a collective operation and the + updated parameters of data-parallel rank 0 will be visible on all + ranks. See :class:`deepspeed.zero.GatheredParameters` for more + information. Args: diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index 0192a69b5bb3..daced77d9093 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -1,5 +1,5 @@ -ZeRO-3 Offload -############## +ZeRO +#### The Zero Redundancy Optimizer (ZeRO) removes the memory redundancies across data-parallel processes by partitioning the three model states (optimizer @@ -8,13 +8,31 @@ replicating them. By doing this, it boosts memory efficiency compared to classic data-parallelism while retaining its computational granularity and communication efficiency. -ZeRO-Offload further increases memory efficiency by offloading the -optimizer's states and computations to the CPU. The model parameters can also -be offloaded for even more memory savings! +#. **ZeRO Stage 1**: The optimizer states (e.g., for `Adam optimizer `_, 32-bit weights, and the first, and second moment estimates) are partitioned across the processes, so that each process updates only its partition. + +#. **ZeRO Stage 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states. + +#. 
**ZeRO Stage 3**: The 16-bit model parameters are partitioned across the processes. ZeRO-3 will automatically collect and partition them during the forward and backward passes. + +In addition, ZeRO-3 includes the *infinity offload engine* to form +ZeRO-Infinity ([paper](https://arxiv.org/abs/2104.07857)), which can offload +all model states to both CPU and NVMe memory for huge memory savings. + + +For a deep dive of our algorithms, please see our `papers `_ on `ZeRO +`_, `ZeRO-Offload +`_, +and `ZeRO-Infinity `_. + +.. note:: + DeepSpeed first included offloading capabilities with **ZeRO-Offload**, a + system for offloading optimizer and gradient states to CPU memory within + ZeRO-2. **ZeRO-Infinity** is the next generation of offloading + capabilities, accessible to ZeRO-3. ZeRO-Infinity has all of the savings + of ZeRO-Offload, plus is able to offload more the model weights and has + more effective bandwidth utilization and overlapping of computation and + communication. -For more information on our algorithms, please see our papers on `ZeRO -`_ and `ZeRO-Offload -`_. Getting Started @@ -28,14 +46,15 @@ our `config guide `_ to instruct :meth:`deepspeed.initialize` to build the optimizer for you. -Example ZeRO-3 Offload Configurations -===================================== + +Example ZeRO-3 Configurations +============================= #. Use ZeRO to partition the optimizer states (stage 1), gradients (stage 2), and parameters (stage 3). @@ -46,8 +65,6 @@ Example ZeRO-3 Offload Configurations { "zero_optimization": { "stage": 3, - "overlap_comm": true - }, "fp16": { "enabled": true @@ -68,14 +85,13 @@ Example ZeRO-3 Offload Configurations } -#. Additionally offload the optimizer states and computations to the CPU. +#. Additionally offload the optimizer states and computations to the CPU with ZeRO-Infinity. .. 
code-block:: python { "zero_optimization": { "stage": 3, - "overlap_comm": true "offload_optimizer": { "device": "cpu" } @@ -91,7 +107,6 @@ Example ZeRO-3 Offload Configurations { "zero_optimization": { "stage": 3, - "overlap_comm": true "offload_optimizer": { "device": "cpu" } @@ -103,14 +118,13 @@ Example ZeRO-3 Offload Configurations } -#. Save even MORE memory by offloading to NVMe (if available): +#. Save even MORE memory by offloading to NVMe (if available on your system): .. code-block:: python { "zero_optimization": { "stage": 3, - "overlap_comm": true "offload_optimizer": { "device": "nvme", "nvme_path": "/nvme_data" @@ -134,6 +148,9 @@ granularity of (sub)module ``forward()`` methods. The backward pass is handled similarly. This strategy has two underlying assumptions: #. The forward and backward passes of submodules must individually fit in device memory. + If this not the case, :class:`deepspeed.zero.TiledLinear` implements + **memory-centric tiling** and works with ZeRO-3 to break linear layers + into a sequence of smaller submodules that can fit in memory. #. A module's parameters are only accessed within its own ``__init__`` and ``forward()`` methods. Otherwise, DeepSpeed must be instructed to collect and re-partition the parameter. @@ -153,6 +170,7 @@ you can simply allocate your model in our context: model = MyLargeModel() +.. autoclass:: deepspeed.zero.Init :members: @@ -185,46 +203,56 @@ parameters are accessed outside of the module that created them. To do so, use Registering External Parameters =============================== -Consider the following pattern common in language models such as GPT: +ZeRO-3 will automatically collect and partition the model parameters as they +are needed during the forward and backward passes. However, in some cases a +parameter may be used outside of its module's forward pass. We call these +*external* parameters. ZeRO-3 can coordinate these parameters if they are +registered either automatically or manually. -.. 
code-block:: python - class LanguageModel(torch.nn.Module): - ... - def forward(self, inputs): - embeds = self.embeddings(inputs) - ... - logits = compute_logits(output, self.embeddings.weight) - ... +.. note:: + DeepSpeed version ``0.3.15`` includes automatic external parameter + discovery and registration to support the most common cases. Parameters + can still be manually registered if they cannot be automatically + detected. -The tensor ``embeddings.weight`` is used in both ``embeddings.forward()`` and -``compute_logits()``. We call ``embeddings.weight`` an *external* parameter -because it is used in the training loop outside of its owning module's -forward pass. DeepSpeed will coordinate external parameters if they are -registered prior to the first forward pass. +DeepSpeed can automatically detect the following external parameter scenarios: -Consider the following pattern common in language models such as GPT: -.. code-block:: python +#. Parameter access: consider the following pattern common in language models such as GPT: - class LanguageModel(torch.nn.Module): - ... - def forward(self, inputs): - embeds = self.embeddings(inputs) - ... - logits = compute_logits(output, self.embeddings.weight) - ... + The tensor ``embeddings.weight`` is used in both ``embeddings.forward()`` and + ``compute_logits()``. We call ``embeddings.weight`` an *external* parameter + because it is used in the training loop outside of its owning module's + forward pass. -The tensor ``embeddings.weight`` is used in both ``embeddings.forward()`` and -``compute_logits()``. We call ``embeddings.weight`` an *external* parameter -because it is used in the training loop outside of its owning module's -forward pass. DeepSpeed will coordinate external parameters if they are -registered prior to the first forward pass. + .. code-block:: python + + class LanguageModel(torch.nn.Module): + ... + def forward(self, inputs): + embeds = self.embeddings(inputs) + ... 
+ logits = compute_logits(output, self.embeddings.weight) + ... + + +#. Returning a parameter: + + ``CustomLinear`` returns both an output and its own ``bias`` parameter. DeepSpeed + will detect the external ``bias`` parameter and register it with submodules that + use ``CustomLinear``. + + .. code-block:: python + + class CustomLinear(torch.nn.Linear): + def forward(self, *input): + output = super().forward(*input) + return output, self.bias + -.. note:: - Most models should not need to manually register parameters. .. autofunction:: deepspeed.zero.register_external_parameter @@ -234,5 +262,16 @@ registered prior to the first forward pass. Memory-Centric Tiling --------------------- +To reduce the working memory requirements of DL training for large models, +ZeRO-Infinity includes technique called *memory-centric tiling* that exploits +the data fetch and release pattern of ZeRO-3 to reduce the working memory +requirements by breaking down a large operator into smaller tiles that can be +executed sequentially. When combined with ZeRO-3, the parameter and gradients +of each tile can be fetched and released one at a time, reducing the working +memory proportional to the number of tiles. Therefore, ZeRO-Infinity can +support operators of arbitrary sizes, without refactoring for model +parallelism to fit them in limited GPU memory. + + .. 
autoclass:: deepspeed.zero.TiledLinear :members: From 5f570bbce4c80a1295c4b91ada2df3cdd3062dc5 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 19 Apr 2021 08:23:24 -0700 Subject: [PATCH 66/78] [docs] zero-inf updates --- docs/_tutorials/zero-offload.md | 2 ++ docs/_tutorials/zero.md | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_tutorials/zero-offload.md b/docs/_tutorials/zero-offload.md index 31c89bd5934e..a70a3674e90b 100644 --- a/docs/_tutorials/zero-offload.md +++ b/docs/_tutorials/zero-offload.md @@ -1,6 +1,8 @@ --- title: "ZeRO-Offload" --- +ZeRO-3 Offload consists of a subset of features in our newly released ZeRO-Infinity. Read our [ZeRO-Infinity blog](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) to learn more! + We recommend that you read the tutorials on [Getting Started](/getting-started/) and [ZeRO](/tutorials/zero/) before stepping through this tutorial. ZeRO-Offload is a ZeRO optimization that offloads the optimizer memory and computation from the GPU to the host CPU. ZeRO-Offload enables large models with up to 13 billion parameters to be efficiently trained on a single GPU. In this tutorial we will use ZeRO-Offload to train a 10-billion parameter GPT-2 model in DeepSpeed. Furthermore, *using ZeRO-Offload in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration json*. No code changes are needed. diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index 768502588ee9..217160400e29 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -1,8 +1,6 @@ --- title: "Zero Redundancy Optimizer (ZeRO)" --- -ZeRO stage 3 consists of a subset of feature in our newly released ZeRO-Infinity. 
Read our [ZeRO-Infinity blog](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) to learn more! - If you have not done so already, we advise that you read the DeepSpeed tutorials on [Getting Started](/getting-started/) and [Megatron-LM GPT-2](/tutorials/megatron/) before stepping through this tutorial. In this tutorial, we will apply the ZeRO optimizer to the [Megatron-LM GPT-2](https://github.com/NVIDIA/Megatron-LM) model. ZeRO is a powerful set of memory optimization techniques that enable effective FP16 training of large models with trillions of parameters, such as [GPT-2](https://openai.com/blog/better-language-models/) and [Turing-NLG 17B](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/). Compared to the alternative model parallelism approaches for training large models, a key appeal of ZeRO is that no model code modifications are required. As this tutorial will demonstrate, *using ZeRO in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration JSON*. No code changes are needed. 
From 2c2a7f31bcc20ae12ce8d2b8af14448939ebdf12 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 19 Apr 2021 08:24:10 -0700 Subject: [PATCH 67/78] bump to 0.3.15 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 0b69c00c5f5a..9e29e106195a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.14 +0.3.15 From 5016e93e635eec89f108e6328405795d7cab6947 Mon Sep 17 00:00:00 2001 From: sid Date: Mon, 19 Apr 2021 17:51:13 +0200 Subject: [PATCH 68/78] refactor --- deepspeed/runtime/bs_schedules.py | 83 +++++++++++++++++++++++ deepspeed/runtime/engine.py | 3 +- deepspeed/runtime/pipe/pipe_visualizer.py | 70 +++++++++++++++++++ deepspeed/runtime/pipe/topology.py | 21 ++++-- 4 files changed, 171 insertions(+), 6 deletions(-) create mode 100644 deepspeed/runtime/bs_schedules.py create mode 100644 deepspeed/runtime/pipe/pipe_visualizer.py diff --git a/deepspeed/runtime/bs_schedules.py b/deepspeed/runtime/bs_schedules.py new file mode 100644 index 000000000000..e74009b742cd --- /dev/null +++ b/deepspeed/runtime/bs_schedules.py @@ -0,0 +1,83 @@ +import math +import numpy as np + + +class BatchSizeScheduler(object): + """Increase the batch size linearly from int(mb_size_per_gpu * min_batch_size_multiplier) to mb_size_per_gpu + over warmup_num_steps steps, and then fix at mb_size_per_gpu. 
+ + TODO: documentation + """ + + def __init__(self, + final_batch_size, + min_batch_size_multiplier: float = 0.01, + warmup_num_steps: int = 1000, + num_intervals=4, + last_batch_iteration: int = -1, + deepspeed=None): + + self.warmup_num_steps = warmup_num_steps + self.last_batch_iteration = last_batch_iteration + self.final_batch_size = final_batch_size + self.num_intervals = num_intervals + self.min_batch_size_multiplier = min_batch_size_multiplier + self.schedule = self._build_schedule() + self.current_batch_size = None + self.deepspeed = deepspeed + + def _build_schedule(self): + start = math.ceil(self.min_batch_size_multiplier * self.final_batch_size) + batch_sizes = np.linspace(start, self.final_batch_size, num=self.num_intervals, endpoint=True, retstep=False, + dtype=int, axis=0) + steps = np.linspace(0, self.warmup_num_steps, num=self.num_intervals, endpoint=True, retstep=False, dtype=int, + axis=0) + schedule = {step: batch_size for step, batch_size in zip(steps, batch_sizes)} + # deduplicate intervals with same batch size + prev_v = None + to_pop = [] + for k, v in schedule.items(): + if v == prev_v: + to_pop.append(k) + prev_v = v + for k in to_pop: + schedule.pop(k) + return schedule + + def get_current_batch_size(self): + i = None + iterator = sorted(self.schedule.keys(), reverse=True) + for i, v in enumerate(iterator): + if self.last_batch_iteration >= v: + break + else: + pass + current_batch_size = self.schedule[iterator[i]] + return current_batch_size + + def step(self, last_batch_iteration=None): + if last_batch_iteration is None: + last_batch_iteration = self.last_batch_iteration + 1 + self.last_batch_iteration = last_batch_iteration + self.current_batch_size = self.get_current_batch_size() + + def state_dict(self): + return {'last_batch_iteration': self.last_batch_iteration} + + def load_state_dict(self, sd): + self.last_batch_iteration = sd['last_batch_iteration'] + + +if __name__ == "__main__": + sched = BatchSizeScheduler( + 
final_batch_size=16, + num_intervals=8, + warmup_num_steps=10000 + ) + print(f'SCHEDULE: {sched.schedule}') + prev_bs = None + for i in range(sched.warmup_num_steps + 1): + sched.step() + if sched.current_batch_size != prev_bs: + print(i, sched.current_batch_size) + prev_bs = sched.current_batch_size diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 22a213454091..020dc32466d6 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -109,7 +109,8 @@ def __init__(self, dist_init_required=None, collate_fn=None, config_params=None, - dont_change_device=False): + dont_change_device=False, + bs_schedule=None): super(DeepSpeedEngine, self).__init__() self.dont_change_device = dont_change_device self.client_optimizer = optimizer diff --git a/deepspeed/runtime/pipe/pipe_visualizer.py b/deepspeed/runtime/pipe/pipe_visualizer.py new file mode 100644 index 000000000000..abfeb16fa656 --- /dev/null +++ b/deepspeed/runtime/pipe/pipe_visualizer.py @@ -0,0 +1,70 @@ +from deepspeed.runtime.pipe.schedule import (TrainSchedule, ForwardPass, BackwardPass, OptimizerStep, RecvGrad, + RecvActivation, + SendGrad, SendActivation, LoadMicroBatch, ReduceGrads, ReduceTiedGrads) +from pprint import pprint +from pytablewriter import MarkdownTableWriter + +flatten = lambda t: [item for sublist in t for item in sublist] + + +def expand(steps, include_all=False): + for c, i in enumerate(steps): + string = '' + for j in range(len(i)): + if not include_all: + cond = lambda x: (isinstance(x, ForwardPass) or isinstance(x, BackwardPass)) + else: + cond = lambda x: x + if not i[j]: i[j] = [None] + if i[j] is not None: + if cond(i[j]): + if string != '': + string += ' / ' + string += f'{reprs[type(i[j])]}' + if hasattr(i[j], 'buffer_id'): + string += f'_{i[j].buffer_id + 1}' + steps[c] = string if string != '' else None + return steps + + +reprs = { + ForwardPass: 'fwd', + BackwardPass: 'bwd', + RecvActivation: 'recv_act', + SendActivation: 'send_act', + 
RecvGrad: 'recv_grad', + SendGrad: 'send_grad', + LoadMicroBatch: 'load_batch', + ReduceGrads: 'reduce_grads', + ReduceTiedGrads: 'reduce_tied_grads', + OptimizerStep: 'step', +} + + +def pipeline_visualizer(num_stages, num_microbatches, include_all=False): + stages = {} + for stage_id in range(num_stages): + steps = [i for i in TrainSchedule(micro_batches=num_microbatches, stages=num_stages - 1 , + stage_id=stage_id).steps()] + steps = expand(steps, include_all=include_all) + stages[stage_id] = steps + value_matrix = [v for k, v in stages.items()] + headers = ['GPU ID'] + [str(i) for i in range(len(stages[0]))] + value_matrix = [[f'GPU {i}'] + value_matrix[i] for i in range(len(value_matrix))] + writer = MarkdownTableWriter( + table_name=f"Pipe Schedule\n", + headers=headers, + value_matrix=value_matrix + ) + string = writer.dumps() + all_steps = flatten(value_matrix) + idle_time = len([i for i in all_steps if i is None]) + print(all_steps) + non_idle_time = len([i for i in all_steps if (i is not None and 'GPU' not in i)]) + string += f'\nNum Devices: {num_stages}\nNum Microbatches: {num_microbatches} \n' \ + f'Idle Time: {idle_time}\nNon Idle Time: {non_idle_time}' + return string + + +if __name__ == "__main__": + print(pipeline_visualizer(num_stages=8, num_microbatches=16, include_all=False)) diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index a4a3541c8767..dda7aa9f4cb6 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -7,6 +7,7 @@ from collections import namedtuple from itertools import product as cartesian_product +from pprint import pprint class ProcessTopology: @@ -22,6 +23,7 @@ class ProcessTopology: Some methods return ProcessCoord namedtuples. """ + def __init__(self, axes, dims): """Create a mapping of n-dimensional tensor coordinates to linear indices. @@ -58,7 +60,7 @@ def get_rank(self, **coord_kwargs): raise ValueError('get_rank() does not support slices. 
Use filter_match())') key = self.ProcessCoord(**coord_kwargs) - assert key in self.mapping, f'key {kwargs} invalid' + assert key in self.mapping, f'key {coord_kwargs} invalid' return self.mapping[key] def get_axis_names(self): @@ -184,6 +186,7 @@ def filter_match(self, **filter_kwargs): Returns: The list of ranks whose coordinates match filter_kwargs. """ + def _filter_helper(x): for key, val in filter_kwargs.items(): if getattr(x, key) != val: @@ -239,12 +242,14 @@ class PipeDataParallelTopology(ProcessTopology): reductions to use high-bandwidth intra-node links and lower-volume pipeline communications to use low-bandwidth inter-node links. """ + def __init__(self, num_pp, num_dp): super().__init__(axes=['pipe', 'data'], dims=[num_pp, num_dp]) class PipeModelDataParallelTopology(ProcessTopology): """ A topology for hybrid pipeline, model, and data parallelism. """ + def __init__(self, num_pp, num_mp, num_dp): super().__init__(axes=['pipe', 'data', 'model'], dims=[num_pp, num_dp, num_mp]) @@ -271,6 +276,7 @@ class PipelineParallelGrid: data_parallel_id = 0, or similarly [9,5] represents wrapped around stages [4,0] for data_parallel_id = 1. 
""" + def __init__(self, topology=None, process_group=None): # TODO use process_group if provided self.global_rank = dist.get_rank() @@ -303,7 +309,7 @@ def __init__(self, topology=None, process_group=None): for dp in range(self.data_parallel_size): ranks = sorted(self._topo.get_axis_list(axis='data', idx=dp)) if self.global_rank == 0: - #print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}') + # print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}') pass proc_group = dist.new_group(ranks=ranks) if self.global_rank in ranks: @@ -333,7 +339,7 @@ def __init__(self, topology=None, process_group=None): self.pipe_groups = self._topo.get_axis_comm_lists('pipe') for ranks in self.pipe_groups: if self.global_rank == 0: - #print(f'RANK={self.global_rank} building pipeline group: {ranks}') + # print(f'RANK={self.global_rank} building pipeline group: {ranks}') pass proc_group = dist.new_group(ranks=ranks) if self.global_rank in ranks: @@ -392,8 +398,8 @@ def _is_grid_valid(self): ranks *= self._topo.get_dim(ax) return ranks == dist.get_world_size() - #returns the global rank of the process with the provided stage id - #which has the same data_parallel_id as caller process + # returns the global rank of the process with the provided stage id + # which has the same data_parallel_id as caller process def stage_to_global(self, stage_id, **kwargs): me = self._topo.get_coord(self.global_rank) transform = me._replace(pipe=stage_id, **kwargs)._asdict() @@ -459,3 +465,8 @@ def get_slice_parallel_world_size(self): def get_slice_parallel_group(self): return self.slice_proc_group + + +if __name__ == "__main__": + topo = PipeModelDataParallelTopology(num_pp=6, num_dp=2, num_mp=4) + pprint(str(topo)) From fbece50b21ffeb4fe05e498f930aded71de12aed Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 19 Apr 2021 11:26:17 -0700 Subject: [PATCH 69/78] assert no Z2/Z3 with pipeline and fix some docs links (#980) --- deepspeed/runtime/pipe/engine.py | 2 ++ 
deepspeed/runtime/pipe/module.py | 3 +++ docs/_tutorials/pipeline.md | 6 ------ docs/code-docs/source/optimizers.rst | 23 +++++++++++++---------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 573dccce78a5..1a401a27e36f 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -52,6 +52,8 @@ def __init__(self, *super_args, **super_kwargs): super().__init__(*super_args, **super_kwargs) assert isinstance(self.module, PipelineModule), "model must base PipelineModule" + assert self.zero_optimization_stage() < 2, "ZeRO-2 and ZeRO-3 are incompatible with pipeline parallelism" + # We schedule the all-reduces, so disable it in super().backward() self.enable_backward_allreduce = False assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index 6d24ed469f3a..dcd4be0ea342 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -112,6 +112,9 @@ def forward(self, inputs): x = layer(x) return x + .. note:: + Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3. + Args: layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module. num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided. diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 529da7880f94..1751846830ef 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -276,15 +276,9 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim DeepSpeed provides a `LayerSpec` class that delays the construction of modules until the model layers have been partitioned across workers. -<<<<<<< HEAD Then each worker will allocate only the layers it's assigned to. 
So, comparing to the example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to allocate a total of 1x model size on its CPU memory and not 16x. -======= -Then each worker will allocate only the layers it's assigned to. So, continuing the -example from the previous paragraph, a machine with 16 GPUs will need to allocate a -total of 1x model size on its CPU, compared to 16x in the LayerSpec example. ->>>>>>> [squash] Staging zero infinity v1 (#168) Here is an example of the abbreviated AlexNet model, but expressed only with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index 53024d161b3e..04416486d954 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -1,24 +1,27 @@ Optimizers -=================== +========== DeepSpeed offers high-performance implementations of ``Adam`` optimizer on CPU; ``FusedAdam``, ``FusedAdam``, ``OneBitAdam`` optimizers on GPU. Adam (CPU) ----------------------------- +---------- + .. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam + FusedAdam (GPU) ----------------------------- +--------------- + .. autoclass:: deepspeed.ops.adam.FusedAdam + FusedLamb (GPU) ----------------------------- +--------------- + .. autoclass:: deepspeed.ops.lamb.FusedLamb + OneBitAdam (GPU) ----------------------------- -<<<<<<< HEAD -.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam -======= -.. autoclass:: deepspeed.runtime.fp16.OneBitAdam ->>>>>>> [squash] Staging zero infinity v1 (#168) +---------------- + +.. 
autoclass:: deepspeed.runtime.fp16.onebit.adam.OnebitAdam From 9e0dab402d166a13286033afdb65652e0186515c Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 20 Apr 2021 10:00:05 -0700 Subject: [PATCH 70/78] add option to force multi-node launcher mode (#977) --- deepspeed/launcher/runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index a4a49dca6bf3..0bb0b57e9e58 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -104,6 +104,11 @@ def parse_args(args=None): help="(optional) pass launcher specific arguments as a " "single quoted argument.") + parser.add_argument("--force_multi", + action="store_true", + help="Force multi-node launcher mode, helps in cases where user " + "wants to launch on single remote node.") + parser.add_argument("user_script", type=str, help="User script to launch, followed by any required " @@ -304,7 +309,7 @@ def main(args=None): # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) - multi_node_exec = len(active_resources) > 1 + multi_node_exec = args.force_multi or len(active_resources) > 1 if not multi_node_exec: deepspeed_launch = [ From 3525102353605770b320c75ecb67509ccf9bac8f Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 20 Apr 2021 22:21:29 +0100 Subject: [PATCH 71/78] [ZeRO Infinity] Allow Init to take a dict for the deepspeed config (#983) * Add check to see if json file is already loaded * Update doc * Address review * Remove doc comment Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/partition_parameters.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index c8bde6390b3c..9ff5a7232a73 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py 
@@ -271,6 +271,7 @@ def __init__(self, remote_device=None, pin_memory=False, deepspeed_config=None, + param_dict=None, enabled=True): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the @@ -293,6 +294,8 @@ def __init__(self, ``"cpu"``. Defaults to ``False``. deepspeed_config (``json file``, optional): If provided, provides configuration for swapping fp16 params to NVMe. + param_dict (dict, optional): Instead of requiring a deepspeed_config you can pass your deepspeed config + as a dictionary instead for swapping fp16 params to NVMe. enabled (bool, optional): If ``False``, this context has no effect. Defaults to ``True``. @@ -382,7 +385,7 @@ def get_model(): #It is the device where parameters are fully instantiated using allgather self.local_device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) - self._validate_remote_device(remote_device, deepspeed_config) + self._validate_remote_device(remote_device, deepspeed_config, param_dict) #Remote device is the device where parameter partiitons are stored #It can be same as local_device or it could be CPU or NVMe. 
@@ -392,7 +395,7 @@ def get_model(): # Enable fp16 param swapping to NVMe if self.remote_device == OFFLOAD_NVME_DEVICE: - _ds_config = DeepSpeedConfig(deepspeed_config) + _ds_config = DeepSpeedConfig(deepspeed_config, param_dict=param_dict) self.param_swapper = AsyncPartitionedParameterSwapper(_ds_config) else: self.param_swapper = None @@ -406,9 +409,9 @@ def get_model(): self._convert_to_deepspeed_param(param) param.partition() - def _validate_remote_device(self, remote_device, ds_config): + def _validate_remote_device(self, remote_device, ds_config, param_dict): if ds_config is not None: - _ds_config = DeepSpeedConfig(ds_config) + _ds_config = DeepSpeedConfig(ds_config, param_dict=param_dict) if remote_device in [None, OFFLOAD_CPU_DEVICE]: if _ds_config.zero_config.offload_param is not None: offload_param_device = _ds_config.zero_config.offload_param[ From 835b4c87339fef4083035873df66449e91f5fd36 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 20 Apr 2021 14:58:05 -0700 Subject: [PATCH 72/78] make bold+italic work without escaping _ (#775) Co-authored-by: Olatunji Ruwase --- docs/_pages/config-json.md | 113 +++++++++++++++++---------------- docs/_tutorials/onebit-adam.md | 2 +- 2 files changed, 58 insertions(+), 57 deletions(-) diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index b7df0c47a464..1f59c29d2202 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -4,40 +4,40 @@ title: "DeepSpeed Configuration JSON" ### Batch Size Related Parameters -**Note:** configuring ***train\_batch\_size*** is required. +**Note:** configuring **train_batch_size** is required. 
{: .notice--warning} -***train\_batch\_size***: [integer] +**train_batch_size**: [integer] | Value | Example | | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| The effective training batch size. This is the amount of data samples that leads to one step of model update. ***train\_batch\_size*** is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., ***train\_step\_batch\_size***), the gradient accumulation steps (a.k.a., ***gradient\_accumulation\_steps***), and the number of GPUs. | `32` | +| The effective training batch size. This is the amount of data samples that leads to one step of model update. **train_batch_size** is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., **train_step_batch_size**), the gradient accumulation steps (a.k.a., **gradient_accumulation_steps**), and the number of GPUs. | `32` | -***train\_micro\_batch\_size\_per\_gpu***: [integer] +**train_micro_batch_size_per_gpu**: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------ | -| Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, ***gradient\_accumulation\_steps*** is automatically calculated using ***train\_batch\_size*** and number of GPUs. 
Should not be concurrently specified with ***gradient\_accumulation\_steps*** in the configuration JSON. | ***train\_batch\_size*** value | +| Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, **gradient_accumulation_steps** is automatically calculated using **train_batch_size** and number of GPUs. Should not be concurrently specified with **gradient_accumulation_steps** in the configuration JSON. | **train_batch_size** value | -***gradient\_accumulation\_steps***: [integer] +**gradient_accumulation_steps**: [integer] | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, ***train\_step\_batch\_size*** is automatically calculated using ***train\_batch\_size*** and number of GPUs. Should not be concurrently specified with ***train\_step\_batch\_size*** in the configuration JSON. | `1` | +| Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. 
When specified, **train_step_batch_size** is automatically calculated using **train_batch_size** and number of GPUs. Should not be concurrently specified with **train_step_batch_size** in the configuration JSON. | `1` | ### Optimizer Parameters -***optimizer***: [dictionary] +**optimizer**: [dictionary] | Fields | Value | Example | | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | | type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | | params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | - Example of ***optimizer*** with Adam + Example of **optimizer** with Adam ```json "optimizer": { @@ -60,7 +60,7 @@ The Adam optimizer also supports the following two params keys/values in additio | torch\_adam | Use torch's implementation of adam instead of our fused adam implementation | false | | adam\_w\_mode | Apply L2 regularization (also known as AdamW) | true | - Another example of ***optimizer*** with 1-bit Adam +Another example of **optimizer** with 1-bit Adam specific parameters is as follows. ```json "optimizer": { @@ -90,6 +90,7 @@ The 1-bit Adam optimizer supports the following three params keys/values in addi ### Scheduler Parameters + DeepSpeed calls the `step()` method of the scheduler at every training step when `model_engine.step()` is executed. 
***scheduler***: [dictionary] @@ -99,7 +100,7 @@ DeepSpeed calls the `step()` method of the scheduler at every training step when | type | The scheduler name. See [here](https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. | `"WarmupLR"` | | params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. | `{"warmup_min_lr": 0, "warmup_max_lr": 0.001}` | -Example of ***scheduler*** +Example of **scheduler** ```json "scheduler": { @@ -114,25 +115,25 @@ Example of ***scheduler*** ### Communication options -***fp32\_allreduce***: [boolean] +**fp32_allreduce**: [boolean] | Description | Default | | -------------------------------------------------------------- | ------- | | During gradient averaging perform allreduce with 32 bit values | `false` | -***prescale\_gradients***: [boolean] +**prescale_gradients**: [boolean] | Description | Default | | -------------------------------------- | ------- | | Scale gradients before doing allreduce | `false` | -***gradient_predivide_factor***: [float] +**gradient_predivide_factor**: [float] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs | `1.0` | -***sparse\_gradients***: [boolean] +**sparse_gradients**: [boolean] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------ | ------- | @@ -143,7 +144,7 @@ Example of ***scheduler*** **Note:** this mode cannot be combined with the `amp` mode described below. 
{: .notice--warning} -***fp16***: [dictionary] +**fp16**: [dictionary] | Description | Default | | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | @@ -160,48 +161,48 @@ Example of ***scheduler*** } ``` -***fp16:enabled***: [boolean] +**fp16:enabled**: [boolean] | Description | Default | | -------------------------------------------------------------------------------------- | ------- | -| ***enabled*** is a **fp16** parameter indicating whether or not FP16 training enabled. | `false` | +| **enabled** is a **fp16** parameter indicating whether or not FP16 training enabled. | `false` | -***fp16:loss\_scale***: [float] +**fp16:loss_scale**: [float] | Description | Default | | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| ***loss\_scale*** is a ***fp16*** parameter representing the loss scaling value for FP16 training. The default value of 0.0 results in dynamic loss scaling, otherwise the value will be used for static fixed loss scaling. | `0.0` | +| **loss_scale** is a **fp16** parameter representing the loss scaling value for FP16 training. The default value of 0.0 results in dynamic loss scaling, otherwise the value will be used for static fixed loss scaling. 
| `0.0` | -***fp16:initial\_scale\_power***: [integer] +**fp16:initial_scale_power**: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| ***initial\_scale\_power*** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2***initial\_scale\_power***. | `32` | +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| **initial_scale_power** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2**initial_scale_power**. | `32` | -***fp16:loss\_scale\_window***: [integer] +**fp16:loss_scale_window**: [integer] | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------- | ------- | -| ***loss\_scale\_window*** is a **fp16** parameter representing the window over which to raise/lower the dynamic loss scale value. | `1000` | +| **loss_scale_window** is a **fp16** parameter representing the window over which to raise/lower the dynamic loss scale value. | `1000` | -***fp16:hysteresis***: [integer] +**fp16:hysteresis**: [integer] | Description | Default | | ---------------------------------------------------------------------------------------------- | ------- | -| ***hysteresis*** is a **fp16** parameter representing the delay shift in dynamic loss scaling. | `2` | +| **hysteresis** is a **fp16** parameter representing the delay shift in dynamic loss scaling. 
| `2` | -***fp16:min\_loss\_scale***: [integer] +**fp16:min_loss_scale**: [integer] | Description | Default | | -------------------------------------------------------------------------------------------------- | ------- | -| ***min\_loss\_scale*** is a **fp16** parameter representing the minimum dynamic loss scale value. | `1000` | +| **min_loss_scale** is a **fp16** parameter representing the minimum dynamic loss scale value. | `1000` | ### Automatic mixed precision (AMP) training options **Note:** this mode cannot be combined with the `fp16` mode described above. In addition this mode is not currently compatible with ZeRO. {: .notice--warning} -***amp***: [dictionary] +**amp**: [dictionary] | Description | Default | | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | @@ -216,11 +217,11 @@ Example of ***scheduler*** } ``` -***amp:enabled***: [boolean] +**amp:enabled**: [boolean] | Description | Default | | ---------------------------------------------------------------------------------------- | ------- | -| ***enabled*** is an **amp** parameter indicating whether or not AMP training is enabled. | `false` | +| **enabled** is an **amp** parameter indicating whether or not AMP training is enabled. 
| `false` | ***amp params***: [various] @@ -230,7 +231,7 @@ Example of ***scheduler*** ### Gradient Clipping -***gradient\_clipping***: [float] +**gradient_clipping**: [float] | Description | Default | | ----------------------------------- | ------- | @@ -266,19 +267,19 @@ Enabling and configuring ZeRO memory optimizations } ``` -***zero\_optimization***: [dictionary] +**zero_optimization**: [dictionary] | Description | Default | | --------------------------------------------------------------------------------------------------------- | ------- | | Enable ZeRO memory optimization wrapper for FP16 Training. Currently compatible only with Adam optimizer. | `false` | -***stage***: [integer] +**stage**: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Chooses different stages of ZeRO Optimizer. Stage 0, 1, 2, and 3 refer to disabled, optimizer state partitioning, and optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning, respectively. | `0` | -***allgather_partitions***: [boolean] +**allgather_partitions**: [boolean] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------ | ------- | @@ -290,13 +291,13 @@ Enabling and configuring ZeRO memory optimizations | ------------------------------------------------------------------------------------------------------------ | ------- | | Number of elements allgathered at a time. 
Limits the memory required for the allgather for large model sizes | `5e8` | -***overlap_comm***: [boolean] +**overlap_comm**: [boolean] | Description | Default | | ---------------------------------------------------------------------------- | ------- | | Attempts to overlap the reduction of the gradients with backward computation | `false` | -***reduce_scatter***: [boolean] +**reduce_scatter**: [boolean] | Description | Default | | ----------------------------------------------------------------------- | ------- | @@ -308,7 +309,7 @@ Enabling and configuring ZeRO memory optimizations | ------------------------------------------------------------------------------------------------------------------- | ------- | | Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes | `5e8` | -***contiguous_gradients***: [boolean] +**contiguous_gradients**: [boolean] | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | @@ -455,19 +456,19 @@ Enabling and configuring ZeRO optimization of offloading optimizer computation t ### Logging -***steps\_per\_print***: [integer] +**steps_per_print**: [integer] | Description | Default | | ------------------------------ | ------- | | Print train loss every N steps | `10` | -***wall\_clock\_breakdown***: [boolean] +**wall_clock_breakdown**: [boolean] | Description | Default | | ----------------------------------------------------------------------- | ------- | | Enable timing of the latency of forward/backward/update training phases | `false` | -***dump_state***: [boolean] +**dump_state**: [boolean] | Description | Default | | -------------------------------------------------------------------- | ------- | @@ -485,31 +486,31 @@ Enabling and configuring ZeRO optimization of offloading optimizer computation t } } ``` -***enabled***: [boolean] 
+**enabled**: [boolean] | Description | Default | | --------------------------- | ------- | | Enables the flops profiler. | `false` | -***profile\_step***: [integer] +**profile_step**: [integer] | Description | Default | | --------------------------------------------------------------------------------------------------------------- | ------- | | The global training step at which to profile. Note that warm up steps are needed for accurate time measurement. | `1` | -***module\_depth***: [integer] +**module_depth**: [integer] | Description | Default | | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | The depth of the model at which to print the aggregated module information. When set to `-1`, it prints information on the innermost modules (with the maximum depth). | `-1` | -***top\_modules***: [integer] +**top_modules**: [integer] | Description | Default | | ---------------------------------------------------------------------------- | ------- | | Limits the aggregated profile output to the number of top modules specified. 
| `3` | -***detailed***: [boolean] +**detailed**: [boolean] | Description | Default | | -------------------------------------------- | ------- | @@ -526,39 +527,39 @@ Enabling and configuring ZeRO optimization of offloading optimizer computation t "profile": false } ``` -***partition\_activations***: [boolean] +**partition_activations**: [boolean] | Description | Default | | ------------------------------------------------------------- | ------- | | Enables partition activation when used with model parallelism | `false` | -***cpu\_checkpointing***: [boolean] +**cpu_checkpointing**: [boolean] | Description | Default | | --------------------------------------------------------------------------- | ------- | | Offloads partitioned activations to CPU if partition_activations is enabled | `false` | -***contiguous\_memory\_optimization***: [boolean] +**contiguous_memory_optimization**: [boolean] | Description | Default | | -------------------------------------------------------------------- | ------- | | Copies partitioned activations so that they are contiguous in memory | `false` | -***number_checkpoints***: [integer] +**number_checkpoints**: [integer] | Description | Default | | -------------------------------------------------------------------------------------------------------- | ------- | | Total number of activation checkpoints used to allocate memory buffer for contiguous_memoty_optimization | `None` | -***synchronize\_checkpoint\_boundary***: [boolean] +**synchronize_checkpoint_boundary**: [boolean] | Description | Default | | ------------------------------------------------------------- | ------- | | Inserts torch.cuda.synchronize() at each checkpoint boundary. 
| `false` | -***profile***: [boolean] +**profile**: [boolean] | Description | Default | | --------------------------------------------------------------- | ------- | @@ -566,7 +567,7 @@ Enabling and configuring ZeRO optimization of offloading optimizer computation t ### Sparse Attention -***sparse\_attention***: [dictionary] +**sparse_attention**: [dictionary] | Fields | Value | Example | | -------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------- | @@ -584,7 +585,7 @@ Enabling and configuring ZeRO optimization of offloading optimizer computation t | global\_block\_end\_indices | A list of integers determining end indices of global window blocks. By default this is not used. But if it is set, it must have the same size of global\_block\_indices parameter, and combining this two parameters, for each index i, blocks from global\_block\_indices[i] to global\_block\_end\_indices[i], exclusive, are considered as global attention; used in `"variable"` and `"bslongformer"` modes. | None | | num\_sliding\_window\_blocks | An integer determining the number of blocks in sliding local attention window; used in `"bigbird"` and `"bslongformer"` modes. | 3 | - Example of ***sparse\_attention*** + Example of **sparse_attention** ```json "sparse_attention": { diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 1a15000135c9..8fba712937f8 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -191,7 +191,7 @@ Table 1. 
Fine-tuning configuration ### 2.3 Performance Results for BingBertSQuAD Fine-tuning -***Accuracy:*** +**Accuracy:** The results are summarized in the table below. The total batch size is set to 96 and training is conducted on 32 GPUs for 2 epochs. A set of parameters (seeds and learning rates) were tried and the best ones were selected. We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scores we achieved that are on-par or better than the [HuggingFace results](https://github.com/huggingface/transformers/tree/master/examples/question-answering). From eecef309cb12528cfa78d932a6f073afb43847e5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 20 Apr 2021 17:55:58 -0700 Subject: [PATCH 73/78] remove debug prints: (#986) --- deepspeed/runtime/zero/stage3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index f8b526952de8..2b6e12abd84b 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1418,13 +1418,13 @@ def _post_forward_module_hook(module, input, output): if torch.is_tensor(output): output = [output] else: - print(f'got UNKNOWN type {type(output)}') + #print(f'got UNKNOWN type {type(output)}') outputs = [] for name, val in vars(output).items(): if not name.startswith('__') and torch.is_tensor(val): outputs.append(val) output = outputs - print(f'convert output to {output}') + #print(f'convert output to {output}') for item in filter(lambda item: is_zero_param(item), output): if not any(id(item) in m._external_params for m in FWD_MODULE_STACK): From 67a48aaa8906878b2ce244319e219155c85de46c Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 20 Apr 2021 18:28:22 -0700 Subject: [PATCH 74/78] 1-bit LAMB optimizer (#970) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. 
Author: @conglongli, @awan-10, @samyam, Hanlin Tang, Yuxiong He Paper: https://arxiv.org/abs/2104.06069 Co-authored-by: sdtblck <46172032+sdtblck@users.noreply.github.com> Co-authored-by: Jeff Rasley --- README.md | 7 +- deepspeed/runtime/comm/nccl.py | 20 +- deepspeed/runtime/config.py | 2 + deepspeed/runtime/engine.py | 15 +- deepspeed/runtime/fp16/fused_optimizer.py | 5 +- deepspeed/runtime/fp16/onebit/adam.py | 25 +- deepspeed/runtime/fp16/onebit/lamb.py | 471 +++++++++++++++++ deepspeed/runtime/fp16/unfused_optimizer.py | 5 +- deepspeed/runtime/pipe/engine.py | 6 +- deepspeed/runtime/utils.py | 22 +- docs/_config.yml | 17 +- docs/_data/navigation.yml | 2 + docs/_pages/config-json.md | 38 +- docs/_pages/features.md | 14 +- docs/_tutorials/onebit-adam.md | 6 +- docs/_tutorials/onebit-lamb.md | 130 +++++ docs/code-docs/source/optimizers.rst | 51 +- docs/index.md | 8 +- tests/unit/test_onebit.py | 556 +++++++++++++++++++- 19 files changed, 1332 insertions(+), 68 deletions(-) create mode 100644 deepspeed/runtime/fp16/onebit/lamb.py create mode 100644 docs/_tutorials/onebit-lamb.md diff --git a/README.md b/README.md index 61520664b42e..eed70fa5e04f 100755 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ DeepSpeed delivers extreme-scale model training for everyone, from data scientis * Extreme scale: Using current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters. * Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of arts, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models. * Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution comparing with dense transformers. 
-* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. +* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam/1-bit LAMB reduce communication volume by up to 5x while achieving similar convergence efficiency to Adam/LAMB, allowing for scaling to different types of GPU clusters and networks. Early adopters of DeepSpeed have already produced a language model (LM) with over 17B parameters called @@ -33,6 +33,7 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News +* [2021/04/20] [1-bit LAMB: up to 4.6x less communication and 2.8x faster training, together with LAMB's convergence speed at large batch sizes](https://www.deepspeed.ai/tutorials/onebit-lamb/) * [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) * [Tutorial on how to use different stages of ZeRO](https://www.deepspeed.ai/tutorials/zero/) * [2021/04/01] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) @@ -119,7 +120,7 @@ overview](https://www.deepspeed.ai/features/) for descriptions and usage. 
* Memory- and compute-efficient sparse kernels * Support 10x longer sequences than dense * Flexible support to different sparse structures -* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) +* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) and [1-bit LAMB](https://www.deepspeed.ai/tutorials/onebit-lamb/) * Custom communication collective * Up to 5x communication volume saving * [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) @@ -192,7 +193,7 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). 6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). - +7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069). # Videos 1. 
DeepSpeed KDD 2020 Tutorial diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 0ac2646bd0d7..e8bd03514a1b 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -12,8 +12,12 @@ class NcclBackend(object): - def __init__(self): - self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + def __init__(self, mpu=None): + if mpu is None: + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + else: + self.mpu = mpu + self.world_group = self.mpu.get_data_parallel_group() self.rank = dist.get_rank(group=self.world_group) self.size = dist.get_world_size(group=self.world_group) self.compression_backend = CupyBackend() @@ -92,9 +96,11 @@ def compressed_allreduce(self, # communication phase 1 # gather_start = time.time() # Alltoall for sign - dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed)) + dist.all_to_all_single(recvbuf_sign, + torch.stack(sign_list_packed), + group=self.world_group) # Allgather for scale - dist.all_gather(recvbuf_scale, worker_scale) + dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group) # gather_end = time.time() @@ -151,8 +157,10 @@ def compressed_allreduce(self, ] # Communication Phase 2 - dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) - dist.all_gather(recvbuf_scale_server, server_scale) + dist.all_gather(recvbuf_sign_server, + server_sign_packed[0], + group=self.world_group) + dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group) cupy_server_sign_packed = None diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 9e33876994f9..3fa0b32a6032 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -32,11 +32,13 @@ ADAMW_OPTIMIZER = 'adamw' LAMB_OPTIMIZER = 'lamb' ONEBIT_ADAM_OPTIMIZER = 'onebitadam' +ONEBIT_LAMB_OPTIMIZER = 'onebitlamb' DEEPSPEED_OPTIMIZERS = [ ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, + 
ONEBIT_LAMB_OPTIMIZER, ] # extra optimizer parameters for adam/adamw diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index f71a7324585a..6a857bca378c 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -24,7 +24,7 @@ from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ - ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \ + ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \ TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT from deepspeed.runtime.dataloader import DeepSpeedDataLoader @@ -553,7 +553,8 @@ def _do_sanity_check(self): assert self._is_supported_optimizer(self.optimizer_name()), \ '{} is not a supported DeepSpeed Optimizer'.format(self.optimizer_name()) - if self.optimizer_name() == LAMB_OPTIMIZER: + if self.optimizer_name() == LAMB_OPTIMIZER or self.optimizer_name( + ) == ONEBIT_LAMB_OPTIMIZER: assert self.dynamic_loss_scale(), \ 'DeepSpeed {} optimizer requires dynamic loss scaling'.format(self.optimizer_name()) @@ -694,6 +695,13 @@ def _configure_basic_optimizer(self, model_parameters): logger.warning( f'Currently the convergence of 1-bit Adam is only verified under FP16' ) + elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: + from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb + optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) + if not self.fp16_enabled(): + logger.warning( + f'Currently the convergence of 1-bit Lamb is only verified under FP16' + ) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) @@ -710,6 +718,7 @@ def _configure_fp16_optimizer(self, optimizer): timers = self.timers if self.wall_clock_breakdown() else None optimizer = FP16_Optimizer( 
optimizer, + deepspeed=self, dynamic_loss_scale=True, initial_dynamic_scale=initial_dynamic_scale, dynamic_loss_args=dynamic_loss_args, @@ -723,6 +732,7 @@ def _configure_fp16_optimizer(self, optimizer): ranks=[0]) optimizer = FP16_Optimizer( optimizer, + deepspeed=self, static_loss_scale=self.loss_scale(), mpu=self.mpu, clip_grad=clip_grad, @@ -732,6 +742,7 @@ def _configure_fp16_optimizer(self, optimizer): ranks=[0]) optimizer = FP16_UnfusedOptimizer( optimizer, + deepspeed=self, static_loss_scale=self.loss_scale(), dynamic_loss_scale=self.dynamic_loss_scale(), dynamic_loss_args=dynamic_loss_args, diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index 5f35c1884a41..98275e5bb832 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -22,6 +22,7 @@ class FP16_Optimizer(object): """ def __init__(self, init_optimizer, + deepspeed=None, static_loss_scale=1.0, dynamic_loss_scale=False, initial_dynamic_scale=2**32, @@ -100,7 +101,9 @@ def __init__(self, self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu) + self.overflow_checker = CheckOverflow(self.fp16_groups, + mpu=self.mpu, + deepspeed=deepspeed) self.initialize_optimizer_states() def initialize_optimizer_states(self): diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index e3417fea9d6f..35e35411cfde 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -82,6 +82,7 @@ def __init__(self, self.initialize = False self.freeze_step = freeze_step self.cuda_aware = cuda_aware + self.using_pipeline = False self.comm_backend_name = comm_backend_name @@ -94,7 +95,9 @@ def __init__(self, assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. 
Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." from deepspeed.runtime.comm.nccl import NcclBackend - self.comm_backend_handle = NcclBackend() + self.using_pipeline = hasattr(self.deepspeed, + 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) elif self.comm_backend_name == 'mpi': from deepspeed.runtime.comm.mpi import MpiBackend @@ -254,8 +257,12 @@ def step(self, closure=None, grads=None): if self.adam_freeze_key is False: if state['step'] >= self.freeze_step: + print('OnebitAdam - starting compressed communication') self.adam_freeze_key = True - self.deepspeed.enable_backward_allreduce = False + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False return loss @@ -277,18 +284,24 @@ def load_state_dict(self, state_dict): super().load_state_dict(state_dict) if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: if torch.distributed.get_rank() == 0: - print("Checkpoint loaded and 1-bit Adam warmup stage starts/continues.") + print("Checkpoint loaded and OnebitAdam warmup stage starts/continues.") if self.adam_freeze_key is True: self.adam_freeze_key = False - self.deepspeed.enable_backward_allreduce = True + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = True + else: + self.deepspeed.enable_backward_allreduce = True else: if torch.distributed.get_rank() == 0: print( - "Checkpoint loaded and 1-bit Adam compression stage starts/continues." + "Checkpoint loaded and OnebitAdam compression stage starts/continues." 
) if self.adam_freeze_key is False: self.adam_freeze_key = True - self.deepspeed.enable_backward_allreduce = False + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False # We reset the compression errors when loading checkpoints for 3 reasons: # 1) The worker and server error at each GPU are distinct, so in current implementation # only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py new file mode 100644 index 000000000000..01c6cd878488 --- /dev/null +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -0,0 +1,471 @@ +''' +Copyright 2021 The Microsoft DeepSpeed Team +''' +import types +import torch +import numpy as np +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + + +class OnebitLamb(torch.optim.Optimizer): + """Implements the 1-bit Lamb algorithm. Currently GPU-only. + For usage example please see https://www.deepspeed.ai/tutorials/onebit-lamb/ + For technical details please see our paper https://arxiv.org/abs/2104.06069. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + freeze_step (int, optional): Number of steps for warmup (uncompressed) + stage before we start using compressed communication. (default 100000) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. 
(default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) + min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in 1-bit Lamb! + eps_inside_sqrt (boolean, optional): in the 'update parameters' step, + adds eps to the bias-corrected second moment estimate before + evaluating square root instead of adding it to the square root of + second moment estimate as in the original paper. (default: False) + cuda_aware (boolean, required): Set True if the underlying MPI implementation + supports CUDA-Aware communication. (default: False) + comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') + coeff_beta (float, optional): coefficient used for computing + running averages of lamb coefficient (default: 0.9) note that you may want to + increase or decrease this beta depending on the freeze_step you choose, as + 1/(1 - coeff_beta) should be smaller than or equal to freeze_step + factor_max (float, optional): maximum value of scaling factor to the frozen lamb + coefficient during compression stage (default: 4.0) + factor_min (float, optional): minimum value of scaling factor to the frozen lamb + coefficient during compression stage (default: 0.5) + factor_threshold (float, optional): threshold of how much the scaling factor can + fluctuate between steps (default: 0.1) + .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + def __init__(self, + params, + deepspeed=None, + lr=1e-3, + freeze_step=100000, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + eps_inside_sqrt=False, + weight_decay=0., + max_grad_norm=0., + max_coeff=10.0, + min_coeff=0.01, + amsgrad=False, + cuda_aware=False, + comm_backend_name='nccl', + coeff_beta=0.9, + factor_max=4.0, + factor_min=0.5, + factor_threshold=0.1): + + if amsgrad: + raise RuntimeError('1-bit Lamb does not support the AMSGrad variant.') + + defaults = dict(lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, + max_coeff=max_coeff, + min_coeff=min_coeff) + + super(OnebitLamb, self).__init__(params, defaults) + self.eps_mode = 0 if eps_inside_sqrt else 1 + assert (dist.is_initialized()) + + self.deepspeed = deepspeed + self.lamb_freeze_key = False + self.initialize = False + self.freeze_step = freeze_step + self.cuda_aware = cuda_aware + self.coeff_beta = coeff_beta + self.factor_max = factor_max + self.factor_min = factor_min + self.factor_threshold = factor_threshold + self.using_pipeline = False + + self.comm_backend_name = comm_backend_name + + # Empty initializer. Set handle based on the comm backend as follows. + self.comm_backend_handle = None + + if self.comm_backend_name == 'nccl': + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Lamb. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
+ from deepspeed.runtime.comm.nccl import NcclBackend + self.using_pipeline = hasattr(self.deepspeed, + 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) + + elif self.comm_backend_name == 'mpi': + from deepspeed.runtime.comm.mpi import MpiBackend + self.comm_backend_handle = MpiBackend(cuda_aware) + + self.size = self.comm_backend_handle.size + + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + + self.exp_avg_flat = [] + self.dummy_exp_avg = {} + self.corrected_tensor_sizes = [] + self.server_chunk_sizes = [] + self.worker_errors = [] + self.server_errors = [] + + self.lamb_coeffs = [] + + def step(self, closure=None, grads=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + grads (list of tensors, optional): weight gradient to use for the + optimizer update. If gradients have type torch.half, parameters + are expected to be in type torch.float. (default: None) + """ + loss = None + if closure is not None: + loss = closure() + + if grads is None: + grads_group = [None] * len(self.param_groups) + # backward compatibility + # assuming a list/generator of parameter means single group + elif isinstance(grads, types.GeneratorType): + grads_group = [grads] + elif type(grads[0]) != list: + grads_group = [grads] + else: + grads_group = grads + + #remove the previous stats + del self.lamb_coeffs[:] + + if self.lamb_freeze_key: + exp_avg_last_step = [] + for group in self.param_groups: + exp_avg_last_step.append( + [self.state[p]['exp_avg'].detach().clone() for p in group['params']]) + if 'scaling_coeff' not in self.state[self.param_groups[0]['params'][0]]: + # Compute the scaling_coeff for each momentum at the end of warmup stage. + # This is used to reduce compression error during compression stage. 
+ momentum_scales = [] + for group in self.param_groups: + momentum_scales.append([ + (torch.norm(self.state[p]['exp_avg']) / + np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() + for p in group['params'] + ]) + united_scale = sum([sum(x) for x in momentum_scales]) / sum( + [len(x) for x in momentum_scales]) + for i, group in enumerate(self.param_groups): + for j, p in enumerate(group['params']): + self.state[p][ + 'scaling_coeff'] = united_scale / momentum_scales[i][j] + + for group, grads_this_group in zip(self.param_groups, grads_group): + if grads_this_group is None: + grads_this_group = [None] * len(group['params']) + + bias_correction = 1 if group['bias_correction'] else 0 + + for p, grad in zip(group['params'], grads_this_group): + if p.grad is None and grad is None: + continue + if grad is None: + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('1-bit Lamb does not support sparse gradients') + + state = self.state[p] + + # State initialization + if len(state) == 0 or (len(state) == 1 + and 'scaling_coeff' in state.keys()): + state['step'] = 0 + state['lamb_coeff_freeze'] = 0.0 + state['last_factor'] = 1.0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + state['exp_avg_sq_fresh'] = torch.zeros_like(p.data) + + if not self.initialize: + self.lamb_freeze_key = True + + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + beta1, beta2 = group['betas'] + max_coeff = group['max_coeff'] + min_coeff = group['min_coeff'] + + state['step'] += 1 + + if self.lamb_freeze_key is False: + # warmup stage, baseline Lamb optimization + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if state['step'] == self.freeze_step: + exp_avg_sq_fresh.data = exp_avg_sq.detach().clone() + grad = None + if 
self.initialize: + weight_norm = p.data.pow(2).sum().sqrt() + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + update_norm = update.pow(2).sum().sqrt() + lamb_coeff = 1.0 + if weight_norm != 0 and update_norm != 0: + lamb_coeff = (weight_norm / update_norm).item() + if lamb_coeff > max_coeff: + lamb_coeff = max_coeff + if lamb_coeff < min_coeff: + lamb_coeff = min_coeff + if lamb_coeff != 1.0: + state['lamb_coeff_freeze'] = self.coeff_beta * state[ + 'lamb_coeff_freeze'] + (1 - self.coeff_beta) * lamb_coeff + self.lamb_coeffs.append(lamb_coeff) + with torch.no_grad(): + p.add_(-group['lr'] * lamb_coeff * update) + else: + # compression stage, update each momentum locally, then + # communicate based on the compressed_allreduce below + if self.initialize: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg.mul_(self.state[p]['scaling_coeff']) + grad = None + + # init fused momentum + if len(self.exp_avg_flat) == 0: + momentum_groups = [] + tensor_size = 0 + for group in self.param_groups: + for p in group['params']: + momentum_groups.append(self.state[p]['exp_avg']) + tensor_size += torch.numel(p.data) + corrected_tensor_size = tensor_size + if tensor_size % (self.size * self.divider) != 0: + difference = ((self.size * self.divider) - (tensor_size % + (self.size * self.divider))) + corrected_tensor_size += difference + self.dummy_exp_avg[0] = torch.zeros( + difference, + device=momentum_groups[0].data.device) + momentum_groups.append(self.dummy_exp_avg[0]) + self.corrected_tensor_sizes.append(corrected_tensor_size) + self.server_chunk_sizes.append(corrected_tensor_size // self.size) + + self.exp_avg_flat.append( + _flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) + updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], + momentum_groups) + for p, q in zip(momentum_groups, updated_params): + p.data = q.data + + if self.initialize and 
len(self.worker_errors) == 0: + torch.cuda.empty_cache() + for i in range(len(self.exp_avg_flat)): + self.worker_errors.append( + torch.zeros(self.corrected_tensor_sizes[i], + device=self.exp_avg_flat[i].device)) + self.server_errors.append( + torch.zeros(self.server_chunk_sizes[i], + device=self.exp_avg_flat[i].device)) + torch.cuda.empty_cache() + + if self.lamb_freeze_key: + if self.size > 1: + for i in range(len(self.exp_avg_flat)): + if not self.initialize: + torch.cuda.empty_cache() + self.worker_errors.append( + torch.zeros(self.corrected_tensor_sizes[i], + device=self.exp_avg_flat[i].device)) + self.server_errors.append( + torch.zeros(self.server_chunk_sizes[i], + device=self.exp_avg_flat[i].device)) + torch.cuda.empty_cache() + if torch.distributed.get_rank() == 0: + print("Cupy Buffers Initialized Successfully.") + + self.comm_backend_handle.compressed_allreduce( + self.exp_avg_flat[i], + self.worker_errors[0], + self.server_errors[0], + self.deepspeed.local_rank) + + if torch.distributed.get_rank() == 0: + print('Pop out errors', flush=True) + del self.worker_errors[:] + del self.server_errors[:] + else: + self.comm_backend_handle.compressed_allreduce( + self.exp_avg_flat[i], + self.worker_errors[i], + self.server_errors[i], + self.deepspeed.local_rank) + + if self.lamb_freeze_key and self.initialize: + for i, group in enumerate(self.param_groups): + bias_correction = 1 if group['bias_correction'] else 0 + + for j, p in enumerate(group['params']): + state = self.state[p] + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + beta1, beta2 = group['betas'] + exp_avg.div_(self.state[p]['scaling_coeff']) + # Because 1-bit compression cannot represent exact zero, it is required to + # provide a momentum mask for those params that have constant exact zeros in their + # momentums, otherwise the compression error would keep accumulating. 
+ # For example, for BERT pre-training seq 128, bert.embeddings.position_embeddings.weight + # always have exact zeros in its momentum for row 129 to 512, because it only + # learns up to seq length 128 while the model supports up to 512 seq length. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py about how + # to add this exp_avg_mask for BERT pre-training.) + if 'exp_avg_mask' in group: + if exp_avg.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to( + device=exp_avg.device) + exp_avg.mul_(group['exp_avg_mask']) + + grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / + (1 - beta1)) + exp_avg_sq_fresh.mul_(beta2).addcmul_(1 - beta2, + grad_reconstruct, + grad_reconstruct) + denom = exp_avg_sq.sqrt() + group['eps'] + update_prelim = exp_avg / denom + + if group['weight_decay'] > 0.0: + update = update_prelim + group['weight_decay'] * p.data + else: + update = update_prelim + + lamb_coeff = 1.0 + update_norm = update.pow(2).sum().sqrt() + denom_real = exp_avg_sq_fresh.sqrt() + group['eps'] + factor = (denom / denom_real).max().item() + if group['weight_decay'] > 0.0: + update_ratio = min(1.0, + (update_prelim.pow(2).sum().sqrt() / + update_norm).item()) + factor = factor * update_ratio + (1.0 - update_ratio) + if factor > self.factor_max: + factor = self.factor_max + if factor < self.factor_min: + factor = self.factor_min + if factor > state['last_factor'] * (1.0 + self.factor_threshold): + factor = state['last_factor'] * (1.0 + self.factor_threshold) + if factor < state['last_factor'] * (1.0 - self.factor_threshold): + factor = state['last_factor'] * (1.0 - self.factor_threshold) + state['last_factor'] = factor + lamb_coeff = state['lamb_coeff_freeze'] * factor + self.lamb_coeffs.append(lamb_coeff) + with torch.no_grad(): + p.add_(-group['lr'] * lamb_coeff * update) + del exp_avg_last_step[:] + exp_avg_last_step = None + + if not self.initialize: + self.lamb_freeze_key = False + self.initialize 
= True + print( + f"Finished the initialization step at rank {torch.distributed.get_rank()}" + ) + return loss + + if self.lamb_freeze_key is False: + if state['step'] >= self.freeze_step: + print('OnebitLamb - starting compressed communication') + self.lamb_freeze_key = True + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False + + return loss + + def load_state_dict(self, state_dict): + """ + Overrides load_state_dict() to add special handling when loading checkpoints + """ + # Because at different stage exp_avg_mask may change (e.g., + # BERT pre-training seqlen 128 and 512 ), we don't use the exp_avg_mask + # in checkpoints but always use the one user provided in training script. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) + # Thus here we keep the exp_avg_mask unchanged when loading checkpoint + for i, group in enumerate(self.param_groups): + if 'exp_avg_mask' in group: + state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ + 'param_groups'][i]: + state_dict['param_groups'][i].pop('exp_avg_mask') + super().load_state_dict(state_dict) + # need to reset the fused momentum since loading states will break the linking + del self.exp_avg_flat[:] + self.dummy_exp_avg.clear() + del self.corrected_tensor_sizes[:] + del self.server_chunk_sizes[:] + if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: + if torch.distributed.get_rank() == 0: + print("Checkpoint loaded and OnebitLamb warmup stage starts/continues.") + if self.lamb_freeze_key is True: + self.lamb_freeze_key = False + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = True + else: + self.deepspeed.enable_backward_allreduce = True + for group in self.param_groups: + for p in group['params']: + self.state[p]['lamb_coeff_freeze'] = 0.0 + 
self.state[p]['last_factor'] = 1.0 + if 'scaling_coeff' in self.state[p]: + self.state[p].pop('scaling_coeff') + else: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and OnebitLamb compression stage starts/continues." + ) + if self.lamb_freeze_key is False: + self.lamb_freeze_key = True + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False + # We reset the compression errors when loading checkpoints for 3 reasons: + # 1) The worker and server error at each GPU are distinct, so in current implementation + # only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. + # If we want to save them correctly we need O(num_gpu*model_size) memory in order to + # gather all the error, which is a very large memory requirement. It's possible to save + # them in a distributed way, but it will make the checkpoint saving/loading much more complicated. + # 2) Even if we are able to save the compression errors correctly, you need to have the + # exact same number of GPUs in order to load them correctly. + # 3) We verified on BERT pre-training that occasionally resetting the compression error + # at checkpoint loading does not affect the convergence. + # However, please avoid frequent checkpoint loading which could break the error + # compensation mechanism thus affect the convergence. 
+ del self.worker_errors[:] + del self.server_errors[:] + + def get_lamb_coeffs(self): + return self.lamb_coeffs diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index c0cef6a56ba7..441dbd61ccb9 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -22,6 +22,7 @@ class FP16_UnfusedOptimizer(object): """ def __init__(self, init_optimizer, + deepspeed=None, static_loss_scale=1.0, dynamic_loss_scale=False, dynamic_loss_args=None, @@ -96,7 +97,9 @@ def __init__(self, self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu) + self.overflow_checker = CheckOverflow(self.fp16_groups, + mpu=self.mpu, + deepspeed=deepspeed) self.initialize_optimizer_states() diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 1a401a27e36f..d4e5e5edfe71 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -56,6 +56,10 @@ def __init__(self, *super_args, **super_kwargs): # We schedule the all-reduces, so disable it in super().backward() self.enable_backward_allreduce = False + + # used to disable the pipeline all-reduce when used with 1-bit Adam/1-bit LAMB + self.pipeline_enable_backward_allreduce = True + assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ " with pipeline parallelism." 
@@ -222,7 +226,7 @@ def _exec_reduce_tied_grads(self): def _exec_reduce_grads(self): self._force_grad_boundary = True - if self.is_data_parallel: + if self.is_data_parallel and self.pipeline_enable_backward_allreduce: self.buffered_allreduce_fallback( elements_per_buffer=MEMORY_OPT_ALLREDUCE_SIZE) self._force_grad_boundary = False diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index b1a7a4b0aae1..d54613e196f5 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -64,10 +64,15 @@ def move_to_device(item, device): class CheckOverflow(object): '''Checks for overflow in gradient across parallel process''' - def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False): + def __init__(self, + param_groups=None, + mpu=None, + zero_reduce_scatter=False, + deepspeed=None): self.mpu = mpu self.params = [] if param_groups else None self.zero_reduce_scatter = zero_reduce_scatter + self.deepspeed = deepspeed if param_groups: for group in param_groups: for param in group: @@ -125,9 +130,24 @@ def has_overflow(self, params): op=torch.distributed.ReduceOp.MAX, group=torch.distributed.group.WORLD) elif self.mpu is not None: + if self.deepspeed is not None: + using_pipeline = hasattr(self.deepspeed, + 'pipeline_enable_backward_allreduce') + if (using_pipeline + and self.deepspeed.pipeline_enable_backward_allreduce is False + ) or (not using_pipeline + and self.deepspeed.enable_backward_allreduce is False): + torch.distributed.all_reduce( + overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=self.mpu.get_data_parallel_group()) torch.distributed.all_reduce(overflow_gpu, op=torch.distributed.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) + elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False: + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=torch.distributed.group.WORLD) overflow = overflow_gpu[0].item() return bool(overflow) diff --git 
a/docs/_config.yml b/docs/_config.yml index 19d679042b90..a39298be04f9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -33,15 +33,22 @@ collections: - advanced-install.md - getting-started.md - azure.md - - cifar-10.md - - bert-pretraining.md - bert-finetuning.md - - transformer_kernel.md + - bert-pretraining.md + - cifar-10.md + - flops-profiler.md + - gan.md + - lrrt.md - megatron.md - one-cycle.md - - lrrt.md + - onebit-adam.md + - onebit-lamb.md + - pipeline.md + - progressive_layer_dropping.md + - sparse-attention.md + - transformer_kernel.md + - zero-offload.md - zero.md - - flops-profiler.md defaults: - scope: diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 8b41df6a79f6..6ab28bb84fd4 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -80,6 +80,8 @@ lnav: url: /tutorials/one-cycle/ - title: "One-Bit Adam" url: /tutorials/onebit-adam/ + - title: "One-Bit LAMB" + url: /tutorials/onebit-lamb/ - title: "Pipeline Parallelism" url: /tutorials/pipeline/ - title: "Progressive Layer Dropping" diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 1f59c29d2202..8d33179862ef 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -34,7 +34,7 @@ title: "DeepSpeed Configuration JSON" | Fields | Value | Example | | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | -| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | +| type | The optimizer name. 
DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, **Lamb**, and **OneBitLamb** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | | params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | Example of **optimizer** with Adam @@ -88,6 +88,42 @@ The 1-bit Adam optimizer supports the following three params keys/values in addi | cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | | comm\_backend\_name | To indicate which backend implementation to use | "nccl" | +Another example of ***optimizer*** with 1-bit LAMB + +```json +"optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 11e-3, + "weight_decay": 0.01, + "bias_correction": false, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 1000, + "cuda_aware": false, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 4.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + } +``` + +The 1-bit LAMB optimizer supports the following params keys/values in addition to the standard LAMB (learn more in our [tutorial](/tutorials/onebit-lamb/)): + +| "params" key | Description | Default | +| ------------- | --------------------------------------------------------------------------- | ------- | +| max\_coeff | Scaling coefficient upper bound for original LAMB algorithm and 1-bit LAMB's warmup stage | 10.0 | +| min\_coeff | Scaling coefficient lower bound for original LAMB algorithm and 1-bit LAMB's warmup stage | 0.01 | +| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | +| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware 
communication | false | +| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | +| coeff\_beta | Coefficient used for computing running averages of lamb coefficient | 0.9 | +| factor\_max | Maximum value of scaling factor to the frozen lamb coefficient during compression stage | 4.0 | +| factor\_min | Minimum value of scaling factor to the frozen lamb coefficient during compression stage | 0.5 | +| factor\_threshold | Threshold of how much the scaling factor can fluctuate between steps | 0.1 | + ### Scheduler Parameters diff --git a/docs/_pages/features.md b/docs/_pages/features.md index ba955fd574db..9b0b89d0a64b 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -172,15 +172,17 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail ## Training Optimizers -### 1-bit Adam optimizer with up to 5x less communication +### 1-bit Adam and 1-bit LAMB optimizers with up to 5x less communication -DeepSpeed has an efficient implementation of a novel algorithm called 1-bit Adam. -It offers the same convergence as Adam, incurs up to 5x less communication that enables +DeepSpeed has two communication-efficient optimizers called 1-bit Adam and 1-bit LAMB. +They offer the same convergence as Adam/LAMB, incur up to 5x less communication that enables up to 3.5x higher throughput for BERT-Large pretraining and up to 2.7x higher throughput for SQuAD fine-tuning on bandwidth-limited clusters. For more details on usage and performance, -please refer to the detailed [tutorial](https://www.deepspeed.ai/tutorials/onebit-adam) and -[blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), respectively. - +please refer to the [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam), +[1-bit Adam blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), +and [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). 
For technical details, +please refer to the [1-bit Adam paper](https://arxiv.org/abs/2102.02888) and +[1-bit LAMB paper](https://arxiv.org/abs/2104.06069). ### Fused Adam optimizer and arbitrary torch.optim.Optimizer With DeepSpeed, the user can choose to use a high performance implementation of ADAM from diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 8fba712937f8..feef71682513 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -7,7 +7,7 @@ This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes inc {: .notice--info} **Watch out!** -1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. +1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently the MPI-based implementation is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. {: .notice--warning} In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. 
Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. @@ -23,7 +23,7 @@ For more details on these tasks, please refer to the tutorial posts on [BingBert ### 1.1 Pre-requisites for installing DeepSpeed -If you don't already have a copy of the DeepSpeed repository, please clone in +If you don't already have a copy of the DeepSpeed repository, please clone it now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples. ```shell @@ -106,7 +106,7 @@ Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_ Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. 
**Watch out!** -1-bit Adam replies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. +1-bit Adam relies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. 
However, please avoid frequent checkpoint loading, which could break the error compensation mechanism and thus affect the convergence.
+ +```shell +git clone https://github.com/microsoft/DeepSpeed +cd DeepSpeed +git submodule update --init --recursive +cd DeepSpeedExamples/ +``` + +### 1.2 Pre-requisites for 1-bit LAMB + +#### 1.2.1 NCCL-based implementation + +In DeepSpeed, we introduce a system implementation for compressed communication using the NCCL backend of PyTorch distributed. This implementation provides better performance and usability than the MPI-based implementation below. Thus we highly recommend users to choose this implementation. + +**Watch out!** +This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. +{: .notice--warning} + +#### 1.2.2 MPI-based implementation + +For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives. + +We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run: + +```shell +pip install deepspeed[1bit_adam] +``` + +We have tested CUDA-Aware MPI communication using the [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) library. However, any CUDA-Aware communication library including [OpenMPI](https://www.open-mpi.org/) should work fine with these examples. 
+ +An example launch command for 1-bit LAMB using the `deepspeed` launcher is as follows: + +```shell +deepspeed --launcher=[mvapich|openmpi] script.py +``` + +Please note that for MPI-based implementation of 1-bit LAMB, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher. + +Alternatively, the standard mpirun launcher can also be used as follows: + +```shell +mpirun -np [num processes] -ppn [num GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py] +``` + +### 1.3 1-bit LAMB Algorithm + +The detailed description of the 1-bit LAMB algorithm can be seen from our [paper](https://arxiv.org/abs/2104.06069). + +### 1.4 Configuration of 1-bit LAMB +The 1-bit LAMB feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below. + +```json +{ + "train_batch_size": 65536, + "train_micro_batch_size_per_gpu": 64, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 11e-3, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 1000, + "cuda_aware": false, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 4.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "initial_scale_power": 16 + } +} +``` +Please note the new parameters `freeze_step`, `cuda_aware`, `comm_backend_name`, `coeff_beta`, `factor_max`, `factor_min`, and `factor_threshold` that have been added to support the 1-bit LAMB feature: + +`freeze_step` is the number of warm up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm up steps, one strategy is to set 15-25% of the total training steps for a given model (This is related to LAMB's variance/second moment term and scaling coefficient. See detailed analysis in our [paper](https://arxiv.org/abs/2104.06069)). 
If it provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In future, we plan to introduce a threshold that can automatically search and decide for the number of warm up steps for different models. The examples below have been tuned for the number of warm up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts. + +`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. + +`comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" or "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`. + +`coeff_beta` is used when calculating a moving average of the LAMB scaling coefficient during the warmup stage. This moving average is then used as the frozen base scaling coefficient during the compression stage. + +`factor_max`, `factor_min`, and `factor_threshold` are used to regularize the adaptive scaling of the frozen base scaling coefficient during the compression stage. `factor_max` and `factor_min` are the scaling factor upper/lower bound. `factor_threshold` defines the threshold of how much the scaling factor can fluctuate between steps. 
+ +#### 1.4.1 Momentum masks for parameters with constant zero gradients +Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit LAMB we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. + +**Watch out!** +1-bit LAMB relies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. 
However, please avoid frequent checkpoint loading, which could break the error compensation mechanism and thus affect the convergence.
autoclass:: deepspeed.ops.adam.FusedAdam - - -FusedLamb (GPU) ---------------- - -.. autoclass:: deepspeed.ops.lamb.FusedLamb - - -OneBitAdam (GPU) ----------------- - -.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OnebitAdam +Optimizers +=================== + +DeepSpeed offers high-performance implementations of ``Adam`` optimizer on CPU; ``FusedAdam``, ``FusedLamb``, ``OnebitAdam``, ``OnebitLamb`` optimizers on GPU. + +Adam (CPU) +---------------------------- +.. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam + +FusedAdam (GPU) +---------------------------- +.. autoclass:: deepspeed.ops.adam.FusedAdam + +FusedLamb (GPU) +---------------------------- +.. autoclass:: deepspeed.ops.lamb.FusedLamb + +OneBitAdam (GPU) +---------------------------- +.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OnebitAdam + +OnebitLamb (GPU) +---------------------------- +.. autoclass:: deepspeed.runtime.fp16.onebit.lamb.OnebitLamb diff --git a/docs/index.md b/docs/index.md index ab6b1a0445d8..9d60ed6e1298 100755 --- a/docs/index.md +++ b/docs/index.md @@ -17,7 +17,7 @@ DeepSpeed delivers extreme-scale model training for everyone, from data scientis * Extreme scale: Using current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters. * Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of arts, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models. * Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution comparing with dense transformers. 
-* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. +* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam/1-bit LAMB reduce communication volume by up to 5x while achieving similar convergence efficiency to Adam/LAMB, allowing for scaling to different types of GPU clusters and networks. Early adopters of DeepSpeed have already produced a language model (LM) with over 17B parameters called @@ -30,6 +30,7 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? +* [2021/04/20] [1-bit LAMB: up to 4.6x less communication and 2.8x faster training, together with LAMB's convergence speed at large batch sizes](https://www.deepspeed.ai/tutorials/onebit-lamb/) * [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) * [Tutorial on how to use different stages of ZeRO](https://www.deepspeed.ai/tutorials/zero/) * [2021/04/02] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) @@ -134,7 +135,7 @@ combinations, which we call 3D parallelism. 
Pipeline parallelism of DeepSpeed reduce communication volume during distributed training, which allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. ![Low-bandwidth GPT-2 Performance](/assets/images/pp-lowbw-gpt2.png) -1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. [Read more here](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). +1-bit Adam and 1-bit LAMB reduce communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. [1-bit Adam blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html), [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam/), [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). ## Supporting long sequence length DeepSpeed offers sparse attention kernels—an instrumental technology to support long sequences of model inputs, whether for text, image, or sound. Compared with the classic dense Transformers, it powers **an order-of-magnitude longer input sequence** and obtains up to 6x faster execution with comparable accuracy. It also outperforms state-of-the-art sparse implementations with 1.5–3x faster execution. Furthermore, our sparse kernels support efficient execution of flexible sparse format and empower users to innovate on their custom sparse structures. [Read more here](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html). 
@@ -178,7 +179,7 @@ Below we provide a brief feature list, see our detailed [feature overview](https * Memory- and compute-efficient sparse kernels * Support 10x long sequences than dense * Flexible support to different sparse structures -* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) +* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) and [1-bit LAMB](https://www.deepspeed.ai/tutorials/onebit-lamb/) * Custom communication collective * Up to 5x communication volume saving * [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) @@ -235,6 +236,7 @@ comments. 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). 6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). +7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069). # Videos 1. 
DeepSpeed KDD 2020 Tutorial diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 8e0056be0cff..9796a70953f8 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -1,14 +1,22 @@ import torch +import torch.nn as nn +import torch.nn.functional as F import torch.distributed as dist import deepspeed import argparse import pytest +import copy import json import os import numpy as np import time + +from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology +PipeTopo = PipeDataParallelTopology +from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec from common import distributed_test from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args +from test_pipe import AlexNetPipe, train_cifar TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) @@ -241,9 +249,7 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" save_folder = os.path.join(tmpdir, 'saved_checkpoint') - # optimizer_1.optimizer.gather_compression_errors() model_1.save_checkpoint(save_folder, tag=None) - time.sleep(5) assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" @@ -297,6 +303,552 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): hidden_dim=hidden_dim) +def test_onebitadam_checkpointing_overflow(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "fp16": { 
+ "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[2]) + def _test_onebitadam_checkpointing_overflow(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=100, + hidden_dim=hidden_dim, + device=model.device) + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0 and n >= 10: + loss = loss * 1000000.0 + model.backward(loss) + dist.barrier() + model.step() + dist.barrier() + model.save_checkpoint(save_folder, tag=None) + + _test_onebitadam_checkpointing_overflow(args=args, + model=model, + hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('topo', + [ + PipeTopo(num_pp=1, + num_dp=4), + PipeTopo(num_pp=2, + num_dp=2), + PipeTopo(num_pp=4, + num_dp=1), + ]) +def test_onebitadam_fp16_pipeline(topo, tmpdir): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + "freeze_step": 200, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + } + } + args = args_from_dict(tmpdir, config_dict) + + # Allocate model for consistent initial weights. 
+ init_net = AlexNetPipe() + + @distributed_test(world_size=4) + def _helper(topo, tmpdir, steps=500): + assert steps >= 100 + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar(test_model, + args, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + _helper(topo, tmpdir) + + +def test_onebitlamb_fp16_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitlamb_fp16_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitlamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitlamb_fp32_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 
1.0, + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitlamb_fp32_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitlamb_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitlamb_exp_avg_mask(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + optimizer_grouped_parameters = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitlamb_exp_avg_mask(args, model, hidden_dim): + model, optimizer, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in 
enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + # Test whether the momentum mask works + for v in optimizer.state.values(): + if v['exp_avg'].size() == mask1.size(): + assert torch.allclose(v['exp_avg'], v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)), atol=1e-07), f"Momentum mask is not working properly" + + _test_onebitlamb_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitlamb_checkpointing(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + mask2 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask2[1][col] += 1 + + optimizer_grouped_parameters_1 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_2 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask2 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_3 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitlamb_checkpointing(mask1, mask2, args, model, 
hidden_dim): + model_1, optimizer_1, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_1) + data_loader = random_dataloader(model=model_1, + total_samples=10, + hidden_dim=hidden_dim, + device=model_1.device) + for n, batch in enumerate(data_loader): + loss = model_1(batch[0], batch[1]) + model_1.backward(loss) + model_1.step() + # Test whether momentum mask still exist after saving checkpoint + assert optimizer_1.optimizer.lamb_freeze_key is True + mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" + scaling_coeff_1 = [] + for v in optimizer_1.state.values(): + assert 'scaling_coeff' in v, f"Incorrect scaling_coeff" + scaling_coeff_1.append(v['scaling_coeff']) + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + model_1.save_checkpoint(save_folder, tag=None) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" + + + model_2, optimizer_2, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_2) + # Test whether momentum mask stays the same after loading checkpoint + mask2 = mask2.to(device=optimizer_2.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Incorrect momentum mask" + model_2.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" + assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" + # Test 
whether scaling_coeffs is loaded correctly + scaling_coeff_2 = [] + for v in optimizer_2.state.values(): + assert 'scaling_coeff' in v, f"Incorrect scaling_coeff" + scaling_coeff_2.append(v['scaling_coeff']) + assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs" + assert optimizer_2.optimizer.lamb_freeze_key is True + + model_3, optimizer_3, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_3) + optimizer_3.optimizer.freeze_step = 20 + data_loader = random_dataloader(model=model_3, + total_samples=50, + hidden_dim=hidden_dim, + device=model_3.device) + for n, batch in enumerate(data_loader): + loss = model_3(batch[0], batch[1]) + model_3.backward(loss) + model_3.step() + assert optimizer_3.optimizer.lamb_freeze_key is True + # Test whether momentum mask stays the same after loading checkpoint + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Incorrect momentum mask" + model_3.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" + assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" + # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are resetted + for v in optimizer_3.state.values(): + assert v['lamb_coeff_freeze'] == 0.0, f"Incorrect lamb_coeff_freeze" + assert v['last_factor'] == 1.0, f"Incorrect last_factor" + assert 'scaling_coeff' not in v, f"Incorrect scaling_coeff" + assert optimizer_3.optimizer.lamb_freeze_key is False + + _test_onebitlamb_checkpointing(mask1, + mask2, + args=args, + model=model, + hidden_dim=hidden_dim) + + +def test_onebitlamb_checkpointing_overflow(tmpdir): + config_dict = { + 
"train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[2]) + def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=100, + hidden_dim=hidden_dim, + device=model.device) + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0 and n >= 10: + loss = loss * 1000000.0 + model.backward(loss) + dist.barrier() + model.step() + dist.barrier() + model.save_checkpoint(save_folder, tag=None) + + _test_onebitlamb_checkpointing_overflow(args=args, + model=model, + hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('topo', + [ + PipeTopo(num_pp=1, + num_dp=4), + PipeTopo(num_pp=2, + num_dp=2), + PipeTopo(num_pp=4, + num_dp=1), + ]) +def test_onebitlamb_fp16_pipeline(topo, tmpdir): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + "freeze_step": 200, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + 
"pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + } + } + args = args_from_dict(tmpdir, config_dict) + + # Allocate model for consistent initial weights. + init_net = AlexNetPipe() + + @distributed_test(world_size=4) + def _helper(topo, tmpdir, steps=500): + assert steps >= 100 + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar(test_model, + args, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + _helper(topo, tmpdir) + + def test_compressed_allreduce_basic(tmpdir): @distributed_test(world_size=[1, 2]) def _test_compressed_allreduce_basic(): From 894f21daaab90a083d6f193426e551333d20a6bf Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Wed, 21 Apr 2021 14:02:39 -0700 Subject: [PATCH 75/78] Use odd shape tensor to represent parameter data in partitioned state (#981) * use wierd shaped tensor to avoid silent failures when not registering externel params * fix typo Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/partition_parameters.py | 6 ++++-- tests/unit/test_zero_context.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 9ff5a7232a73..e831911efd62 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -23,6 +23,7 @@ from ..config import DeepSpeedConfig param_count = 0 +partitioned_param_data_shape = [1] def print_rank_0(message, debug=False, force=False): @@ -634,7 +635,8 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): f'Before partitioning param {param.ds_id} {param.shape}', force=False) #param.data does not store anything meaningful in partitioned state - param.data = torch.ones(1).half().to(param.device) + param.data = torch.ones(partitioned_param_data_shape).half().to( + 
param.device) see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) @@ -715,7 +717,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) - param.data = torch.ones(1).half().to(param.device) + param.data = torch.ones(partitioned_param_data_shape).half().to(param.device) see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) diff --git a/tests/unit/test_zero_context.py b/tests/unit/test_zero_context.py index 9c45b58abf66..5ccccb5c18a0 100644 --- a/tests/unit/test_zero_context.py +++ b/tests/unit/test_zero_context.py @@ -6,7 +6,7 @@ import pytest import deepspeed -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape from common import distributed_test @@ -32,7 +32,7 @@ def test_scatter_gather(): with deepspeed.zero.Init(): l = torch.nn.Linear(6, 3) assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - assert l.weight.numel() == 1 + assert l.weight.shape == torch.Size(partitioned_param_data_shape) # Ensure there is no impact outside the context l2 = torch.nn.Linear(6, 3) From 0b80ad06b694c82ad4558fd3a23812ebbd17247d Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 21 Apr 2021 14:41:59 -0700 Subject: [PATCH 76/78] Make reduce scatter optional for ZeRO-1 as workaround (#971) * Make reduce scatter optional for ZeRO-1 as workaround * Make allreduce default for ZeRO 1 Co-authored-by: Jeff Rasley --- deepspeed/runtime/engine.py | 5 ++--- deepspeed/runtime/zero/constants.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 6a857bca378c..646e492cc3dd 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -759,7 +759,6 @@ def _configure_zero_optimizer(self, 
optimizer): timers = self.timers if self.wall_clock_breakdown() else None if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter(), 'Stage 1 only supports reduce scatter mode' optimizer = FP16_DeepSpeedZeroOptimizer_Stage1( optimizer, static_loss_scale=self.loss_scale(), @@ -970,8 +969,8 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): #Communicate only at gradient accumulation boundaries elif self.is_gradient_accumulation_boundary(): - if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter() + if self.zero_optimization_stage( + ) == ZERO_OPTIMIZATION_OPTIMIZER_STATES and self.zero_reduce_scatter(): self.optimizer.reduce_scatter_gradients( postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index 40b450649850..9cfe313c75b0 100755 --- a/deepspeed/runtime/zero/constants.py +++ b/deepspeed/runtime/zero/constants.py @@ -52,7 +52,7 @@ ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' -ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True +ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = False ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False From 669028f0fd5067c9247120cb21fd6e9bea4820a9 Mon Sep 17 00:00:00 2001 From: sdtblck <46172032+sdtblck@users.noreply.github.com> Date: Wed, 21 Apr 2021 23:44:50 +0200 Subject: [PATCH 77/78] Fix all Pipeline Module Parameters being sent to cuda:0 (#687) --- deepspeed/runtime/pipe/module.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index dcd4be0ea342..abf90b03e911 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -151,6 +151,8 @@ def forward(self, inputs): 
self.world_group = dist.new_group(ranks=range(dist.get_world_size())) self.global_rank = dist.get_rank(group=self.world_group) self.world_size = dist.get_world_size(group=self.world_group) + self.local_rank = int(os.environ.get("LOCAL_RANK", None)) + assert self.local_rank != None if topology: self._topo = topology @@ -189,7 +191,7 @@ def forward(self, inputs): #with torch.random.fork_rng(devices=[torch.cuda.current_device()]): self._build() - self.to('cuda') + self.to(f'cuda:{self.local_rank}') self.tied_comms = self._index_tied_modules() self._synchronize_tied_weights() From 7c023f27b402fa0a6d0cc620eb8c61d28efa9e2c Mon Sep 17 00:00:00 2001 From: sid Date: Thu, 22 Apr 2021 19:02:10 +0200 Subject: [PATCH 78/78] remove communicate overflow (already in utils.CheckOverflow) --- deepspeed/runtime/fp16/fused_optimizer.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index d7a42d5d47eb..2a03760860cc 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -219,18 +219,6 @@ def step(self, closure=None): self.overflow = self.overflow_checker.has_overflow(fp16_params) self.stop_timers([OVERFLOW_CHECK]) prev_scale = self.cur_scale - if isinstance(self.optimizer, OnebitAdam) or isinstance(self.optimizer, OnebitLamb): - # if optimizer has mpu (i.e, is pipeline parallel), communicate the skipped step to all optimizers in group - if hasattr(self.optimizer.comm_backend_handle, - "mpu") and self.optimizer.comm_backend_handle.mpu is not None: - if self.overflow: - bool_tensor = torch.zeros(1).cuda() - else: - bool_tensor = torch.ones(1).cuda() - dist.all_reduce(bool_tensor, op=dist.ReduceOp.PRODUCT) - if not any(bool_tensor): - self.overflow = True - self._update_scale(self.overflow) if self.overflow: