From a6dba72aeafad63661dfe566d3accd03d00be78c Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 14 Dec 2020 11:18:19 -0800 Subject: [PATCH 01/41] NCCL-based 1-bit Adam + Code Refactor for Comm. Backends (#594) * NCCL based 1-bit Implementation + Refactor to add communication backends (#593) * add nccl 1-bit optim. * temporary commit to save stuff. * Use dist collectives instead of mpi routines. * remove old code for comm. * Fix bugs. still does not work. * modify to test the nccl side code path * Initial gather impl. Works intra-node. * Updates to comm. phase 2. nccl comm. passed the tests. * refactor code to introduce nccl/mpi as backends for onebit adam. * Refactor updates to test/engine. * Fix compile/runtime errors. * simplify support for nccl/mpi backends. * Add missing file * Add compression backend in constructor. Revert later. * modify test with some perf counting. * Implement a true non-blocking gather for nccl side. * Revert "Add compression backend in constructor. Revert later." This reverts commit df8c40d3105e9f2542a8aa6619e80d675a09753f. * improve the 1-bit adam test. * Refactor comm. and compression backend in 1-bit adam. * Fix the test. * Fix runtime errors and typos in nccl backend * fix mpi backend. modify tests. * modify nccl perf test. * fix mpi side errors. * Add an mpi perf test * Sync DSE. * Remove old collectives file. * Undo a typo. * Graceful failure for torch versions that don't support nccl pt2pt. 
--- deepspeed/runtime/comm/__init__.py | 0 deepspeed/runtime/comm/mpi.py | 299 ++++++++++++++++++ deepspeed/runtime/comm/nccl.py | 194 ++++++++++++ deepspeed/runtime/compression/__init__.py | 0 deepspeed/runtime/compression/cupy.py | 24 ++ deepspeed/runtime/custom_collectives.py | 154 --------- deepspeed/runtime/engine.py | 4 +- deepspeed/runtime/fp16/onebit/__init__.py | 0 .../fp16/{onebit_adam.py => onebit/adam.py} | 189 ++--------- ...com_reduce_cuda.py => test_mpi_backend.py} | 42 +-- tests/onebitadam/test_mpi_perf.py | 78 +++++ tests/onebitadam/test_nccl_backend.py | 88 ++++++ tests/onebitadam/test_nccl_perf.py | 78 +++++ 13 files changed, 818 insertions(+), 332 deletions(-) create mode 100644 deepspeed/runtime/comm/__init__.py create mode 100644 deepspeed/runtime/comm/mpi.py create mode 100644 deepspeed/runtime/comm/nccl.py create mode 100644 deepspeed/runtime/compression/__init__.py create mode 100644 deepspeed/runtime/compression/cupy.py delete mode 100644 deepspeed/runtime/custom_collectives.py create mode 100644 deepspeed/runtime/fp16/onebit/__init__.py rename deepspeed/runtime/fp16/{onebit_adam.py => onebit/adam.py} (56%) rename tests/onebitadam/{test_com_reduce_cuda.py => test_mpi_backend.py} (71%) create mode 100644 tests/onebitadam/test_mpi_perf.py create mode 100644 tests/onebitadam/test_nccl_backend.py create mode 100644 tests/onebitadam/test_nccl_perf.py diff --git a/deepspeed/runtime/comm/__init__.py b/deepspeed/runtime/comm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py new file mode 100644 index 000000000000..532f03d940cd --- /dev/null +++ b/deepspeed/runtime/comm/mpi.py @@ -0,0 +1,299 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import torch +import cupy +import time +import numpy as np +from mpi4py import MPI + +from deepspeed.runtime.compression.cupy import CupyBackend + + +class MpiBackend(object): + def __init__(self, cuda_aware): + 
self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + self.cuda_aware = cuda_aware + self.compression_backend = CupyBackend() + + def my_igather(self, rank, size, comm, sendbuf, recbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(comm.Irecv(recbuf[idx], source=idx)) + else: + recbuf[rank] = sendbuf + else: + req.append(comm.Isend(sendbuf, dest=root)) + return req + + def gather_cuda(self, + rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + # We do in-place operations on cupy buffers so we do not return any buffers + requests = [] + for idx in range(world_size): + req_sign = self.my_igather(rank, + world_size, + comm, + cupy_sign_list_packed[idx], + cupy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = self.my_igather(rank, + world_size, + comm, + cupy_worker_scale, + cupy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + def gather_host(self, + rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + + # In-place operations are not possible for newly created cupy arrays + # so we need to return the new buffers + numpy_recvbuf_sign = np.zeros([world_size, + cupy_sign_list_packed[rank].size], + dtype=cupy_sign_list_packed[0].dtype) + numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + + # 1. convert from cupy to numpy + numpy_sign_list_packed = cupy_sign_list_packed + + for idx in range(world_size): + numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx]) + + numpy_worker_scale = cupy.asnumpy(cupy_worker_scale) + numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale) + + cupy.cuda.get_current_stream().synchronize() + + # 2. 
use numpy buffers for communication + requests = [] + + for idx in range(world_size): + req_sign = self.my_igather(rank, + world_size, + comm, + numpy_sign_list_packed[idx], + numpy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = self.my_igather(rank, + world_size, + comm, + numpy_worker_scale, + numpy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + # 3. Convert back from numpy to cupy + cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign) + for idx in range(world_size): + cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx]) + + cupy_worker_scale = cupy.asarray(numpy_worker_scale) + cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale) + cupy.cuda.get_current_stream().synchronize() + + return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale + + def allgather_cuda(self, + comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) + comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) + + def allgather_host(self, + comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + + # 1. Convert cupy to numpy + numpy_recvbuf_sign_server = np.zeros( + [comm.Get_size(), + cupy_server_sign_packed.size], + dtype=cupy_server_sign_packed.dtype) + numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), + 1], + dtype=cupy_server_scale.dtype) + + numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) + numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) + numpy_server_scale = cupy.asnumpy(cupy_server_scale) + numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + # 2. 
Communicate numpy buffers + comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server) + comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server) + comm.Barrier() + + # 3. Convert numpy back to cupy + cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed) + cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server) + cupy_server_scale = cupy.asarray(numpy_server_scale) + cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server + + def compressed_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + all_start_time = time.time() + original_size = buffer_m.numel() + cupy.cuda.Device(local_rank).use() + + if torch.numel(buffer_m) != torch.numel(worker_error): + empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + sign_buffer_m = buffer_m.sign().add_(1).bool() + sign_buffer_m = sign_buffer_m.float() + sign_buffer_m.add_(-0.5).mul_(2.0) + worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) + sign_buffer_m = None + + compensated_buffer_m = buffer_m + compensated_buffer_m.sign_() + compensated_buffer_m = compensated_buffer_m.add_(1).bool() + cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.compression_backend.torch2cupy( + compensated_buffer_m) + compensated_buffer_m = None + + cupy_sign_list_packed = self.compression_backend.compress_by_chunk( + cupy_compensated_buffer_m, + self.size) + cupy_compensated_buffer_m = None + + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = 
cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + + # Communication Phase 1 + gather_start = time.time() + if self.cuda_aware: + self.gather_cuda(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + else: + cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = self.gather_host(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + gather_end = time.time() + + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1) + cupy_recvbuf_sign = None + unpacked_sign = self.compression_backend.cupy2torch(cupy_unpacked_sign).float() + cupy_unpacked_sign = None + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) + worker_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( + 1 / self.size) + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) + unpacked_sign = None + + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + sign_server_m = compensated_server_m.sign().add_(1).bool() + sign_server_m = sign_server_m.float() + sign_server_m.add_(-0.5).mul_(2.0) + server_error.set_(compensated_server_m - server_scale * sign_server_m) + sign_server_m = None + + compensated_server_m.sign_() + compensated_server_m = compensated_server_m.add_(1).bool() + cupy_server_scale = self.compression_backend.torch2cupy(server_scale) + cupy_compensated_server_m = self.compression_backend.torch2cupy( + compensated_server_m) + compensated_server_m = None + + cupy_server_sign_packed = self.compression_backend.compress_by_chunk( + cupy_compensated_server_m, + 1) + + cupy_recvbuf_sign_server = cupy.zeros( + [self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + 
dtype=cupy_worker_scale.dtype) + + # Communication Phase 2 + if self.cuda_aware: + self.allgather_cuda(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + else: + cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = self.allgather_host(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + + cupy_server_unpacked_sign = (cupy.unpackbits( + cupy_recvbuf_sign_server.flatten())).reshape(self.size, + -1) + cupy_recvbuf_sign_server = None + + server_unpacked_sign = self.compression_backend.cupy2torch( + cupy_server_unpacked_sign) + cupy_server_unpacked_sign = None + + server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) + server_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server) + buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + + return buffer_m diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py new file mode 100644 index 000000000000..49f57f57df36 --- /dev/null +++ b/deepspeed/runtime/comm/nccl.py @@ -0,0 +1,194 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import torch +import torch.distributed as dist +import time +import cupy +import numpy as np + +from deepspeed.runtime.compression.cupy import CupyBackend + +class NcclBackend(object): + + def __init__(self): + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + self.rank = dist.get_rank(group=self.world_group) + self.size = dist.get_world_size(group=self.world_group) + self.compression_backend = CupyBackend() + + def my_igather(self, rank, size, group, sendbuf, recvbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(dist.irecv(recvbuf[idx], src=idx, group=group)) + else: + recvbuf[rank] = sendbuf + else: + req.append(dist.isend(sendbuf, group=group, dst=root)) 
+ return req + + def my_gather(self, rank, size, group, sendbuf, recvbuf, root): + if rank == root: + for idx in range(size): + if idx != rank: + dist.recv(recvbuf[idx], src=idx, group=group) + else: + recvbuf[rank] = sendbuf + else: + dist.send(sendbuf, group=group, dst=root) + + def compressed_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + all_start_time = time.time() + original_size = buffer_m.numel() + cupy.cuda.Device(local_rank).use() + + if torch.numel(buffer_m) != torch.numel(worker_error): + empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + sign_buffer_m = buffer_m.sign().add_(1).bool() + sign_buffer_m = sign_buffer_m.float() + sign_buffer_m.add_(-0.5).mul_(2.0) + worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) + sign_buffer_m = None + + compensated_buffer_m = buffer_m + compensated_buffer_m.sign_() + compensated_buffer_m = compensated_buffer_m.add_(1).bool() + + cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.compression_backend.torch2cupy(compensated_buffer_m) + compensated_buffer_m = None + + cupy_sign_list_packed = self.compression_backend.compress_by_chunk(cupy_compensated_buffer_m, + self.size) + cupy_compensated_buffer_m = None + + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + + sign_list_packed = [None] * self.size + + for idx in range(self.size): + sign_list_packed[idx] = self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) + + recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) + + worker_scale = 
self.compression_backend.cupy2torch(cupy_worker_scale) + recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) + + # communication phase 1 + gather_start = time.time() + requests = [] + for idx in range(self.size): + requests += self.my_igather(self.rank, + self.size, + self.world_group, + sign_list_packed[idx], + recvbuf_sign, + root=idx) + requests += self.my_igather(self.rank, + self.size, + self.world_group, + worker_scale, + recvbuf_scale, + root=idx) + + for i in range(len(requests)): + requests[i].wait() + + gather_end = time.time() + + cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign) + cupy_recvbuf_scale = self.compression_backend.torch2cupy(recvbuf_scale) + + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1) + cupy_recvbuf_sign = None + + unpacked_sign = self.compression_backend.cupy2torch(cupy_unpacked_sign).float() + cupy_unpacked_sign = None + + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) + worker_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) + + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) + unpacked_sign = None + + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + sign_server_m = compensated_server_m.sign().add_(1).bool() + sign_server_m = sign_server_m.float() + sign_server_m.add_(-0.5).mul_(2.0) + server_error.set_(compensated_server_m - server_scale * sign_server_m) + sign_server_m = None + + compensated_server_m.sign_() + compensated_server_m = compensated_server_m.add_(1).bool() + cupy_server_scale = self.compression_backend.torch2cupy(server_scale) + cupy_compensated_server_m = self.compression_backend.torch2cupy(compensated_server_m) + compensated_server_m = None + + cupy_server_sign_packed = self.compression_backend.compress_by_chunk(cupy_compensated_server_m, 1) + + cupy_recvbuf_sign_server = cupy.zeros( + 
[self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) + + server_sign_packed = [None] * 1 + recvbuf_sign_server = [None] * self.size + + for idx in range(self.size): + recvbuf_sign_server[idx] = self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) + + server_sign_packed[0] = self.compression_backend.cupy2torch(cupy_server_sign_packed[0]) + + server_scale = self.compression_backend.cupy2torch(cupy_server_scale) + cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + dtype=cupy_worker_scale.dtype) + + recvbuf_scale_server = [None] * self.size + for idx in range(self.size): + recvbuf_scale_server[idx] = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) + + # Communication Phase 2 + dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) + dist.all_gather(recvbuf_scale_server, server_scale) + + # need to convert from a tensor list to a single tensor + # dist.all_gather only provides a tensor list as the recv/output buffer + recvbuf_sign_server = torch.stack(recvbuf_sign_server) + + cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(recvbuf_sign_server) + + cupy_server_unpacked_sign = (cupy.unpackbits( + cupy_recvbuf_sign_server.flatten())).reshape(self.size, + -1) + cupy_recvbuf_sign_server = None + + server_unpacked_sign = self.compression_backend.cupy2torch(cupy_server_unpacked_sign) + cupy_server_unpacked_sign = None + + server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) + server_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server) + buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + + return buffer_m diff --git a/deepspeed/runtime/compression/__init__.py b/deepspeed/runtime/compression/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/compression/cupy.py b/deepspeed/runtime/compression/cupy.py new file mode 100644 index 000000000000..68e56c68e9d0 --- /dev/null +++ 
b/deepspeed/runtime/compression/cupy.py @@ -0,0 +1,24 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import cupy +from torch.utils.dlpack import to_dlpack +from torch.utils.dlpack import from_dlpack + + +class CupyBackend(object): + def __init__(self): + pass + + def torch2cupy(self, tensor): + return cupy.fromDlpack(to_dlpack(tensor)) + + def cupy2torch(self, cupy_tensor): + return from_dlpack(cupy_tensor.toDlpack()) + + def compress_by_chunk(self, cupy_bool_tensor, num_chunks): + packed_sign = cupy.packbits(cupy_bool_tensor) + sign_list_packed = cupy.split(packed_sign, num_chunks) + cupy.cuda.get_current_stream().synchronize() + return sign_list_packed diff --git a/deepspeed/runtime/custom_collectives.py b/deepspeed/runtime/custom_collectives.py deleted file mode 100644 index cb77edcaf60d..000000000000 --- a/deepspeed/runtime/custom_collectives.py +++ /dev/null @@ -1,154 +0,0 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' - -from mpi4py import MPI -import numpy as np -import cupy - - -def my_igather(rank, size, comm, sendbuf, recbuf, root): - req = [] - if rank == root: - for idx in range(size): - if idx != rank: - req.append(comm.Irecv(recbuf[idx], source=idx)) - else: - recbuf[rank] = sendbuf - else: - req.append(comm.Isend(sendbuf, dest=root)) - return req - - -def gather_cuda(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale): - # We do in-place operations on cupy buffers so we do not return any buffers - requests = [] - for idx in range(world_size): - req_sign = my_igather(rank, - world_size, - comm, - cupy_sign_list_packed[idx], - cupy_recvbuf_sign, - root=idx) - requests += req_sign - - for idx in range(world_size): - req_scale = my_igather(rank, - world_size, - comm, - cupy_worker_scale, - cupy_recvbuf_scale, - root=idx) - requests += req_scale - - MPI.Request.Waitall(requests) - - -def gather_host(rank, - world_size, - comm, - cupy_sign_list_packed, - 
cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale): - # In-place operations are not possible for newly created cupy arrays - # so we need to return the new buffers - numpy_recvbuf_sign = np.zeros([world_size, - cupy_sign_list_packed[rank].size], - dtype=cupy_sign_list_packed[0].dtype) - numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) - - # 1. convert from cupy to numpy - numpy_sign_list_packed = cupy_sign_list_packed - - for idx in range(world_size): - numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx]) - - numpy_worker_scale = cupy.asnumpy(cupy_worker_scale) - numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale) - - cupy.cuda.get_current_stream().synchronize() - - # 2. use numpy buffers for communication - requests = [] - - for idx in range(world_size): - req_sign = my_igather(rank, - world_size, - comm, - numpy_sign_list_packed[idx], - numpy_recvbuf_sign, - root=idx) - requests += req_sign - - for idx in range(world_size): - req_scale = my_igather(rank, - world_size, - comm, - numpy_worker_scale, - numpy_recvbuf_scale, - root=idx) - requests += req_scale - - MPI.Request.Waitall(requests) - - # 3. 
Convert back from numpy to cupy - cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign) - for idx in range(world_size): - cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx]) - - cupy_worker_scale = cupy.asarray(numpy_worker_scale) - cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale) - cupy.cuda.get_current_stream().synchronize() - - return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale - - -def allgather_cuda(comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server): - comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) - comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) - - -def allgather_host(comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server): - - # 1. Convert cupy to numpy - numpy_recvbuf_sign_server = np.zeros([comm.Get_size(), - cupy_server_sign_packed.size], - dtype=cupy_server_sign_packed.dtype) - numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), - 1], - dtype=cupy_server_scale.dtype) - - numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) - numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) - numpy_server_scale = cupy.asnumpy(cupy_server_scale) - numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server) - cupy.cuda.get_current_stream().synchronize() - - # 2. Communicate numpy buffers - comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server) - comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server) - comm.Barrier() - - # 3. 
Convert numpy back to cupy - cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed) - cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server) - cupy_server_scale = cupy.asarray(numpy_server_scale) - cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server) - cupy.cuda.get_current_stream().synchronize() - - return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 7431b2c892c4..7c9b920d8bb6 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -664,8 +664,8 @@ def _configure_basic_optimizer(self, model_parameters): from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit_adam import OnebitAdam - optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) + from deepspeed.runtime.fp16.onebit.adam import Adam + optimizer = Adam(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) diff --git a/deepspeed/runtime/fp16/onebit/__init__.py b/deepspeed/runtime/fp16/onebit/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/fp16/onebit_adam.py b/deepspeed/runtime/fp16/onebit/adam.py similarity index 56% rename from deepspeed/runtime/fp16/onebit_adam.py rename to deepspeed/runtime/fp16/onebit/adam.py index c6566c28777b..5cb0be7546e5 100644 --- a/deepspeed/runtime/fp16/onebit_adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -6,18 +6,14 @@ import importlib import numpy as np import time -import cupy -from torch.utils.dlpack import to_dlpack -from torch.utils.dlpack import from_dlpack -from deepspeed.utils.logging import logger +import torch.distributed as dist -from mpi4py 
import MPI -from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host +from deepspeed.utils.logging import logger -class OnebitAdam(torch.optim.Optimizer): +class Adam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. - For usage example please see, TODO DeepSpeed Tutorial + For usage example please see, https://www.deepspeed.ai/tutorials/onebit-adam/ It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) Arguments: @@ -42,6 +38,8 @@ class OnebitAdam(torch.optim.Optimizer): second moment estimate as in the original paper. (default: False) cuda_aware (boolean, required): Set True if the underlying MPI implementation supports CUDA-Aware communication. (default: False) + comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') + from cupy. (default: 'deepspeed') .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. _On the Convergence of Adam and Beyond: @@ -60,10 +58,12 @@ def __init__(self, weight_decay=0., max_grad_norm=0., amsgrad=False, - cuda_aware=False): + cuda_aware=False, + comm_backend_name='nccl'): if amsgrad: raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') + defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, @@ -71,161 +71,39 @@ def __init__(self, weight_decay=weight_decay, max_grad_norm=max_grad_norm) - super(OnebitAdam, self).__init__(params, defaults) - from mpi4py import MPI + super(Adam, self).__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 + assert (dist.is_initialized()) - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() self.comm_time = 0.0 self.step_time = 0.0 self.ave_step = 1 self.bk_time = 0.0 - self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + self.deepspeed = deepspeed self.adam_freeze_key = False self.initialize = False self.freeze_step = freeze_step self.cuda_aware = 
cuda_aware - def torch2cupy(self, tensor): - return cupy.fromDlpack(to_dlpack(tensor)) - - def cupy2torch(self, cupy_tensor): - return from_dlpack(cupy_tensor.toDlpack()) - - def compress_by_chunk(self, cupy_bool_tensor, num_chunks): - packed_sign = cupy.packbits(cupy_bool_tensor) - sign_list_packed = cupy.split(packed_sign, num_chunks) - cupy.cuda.get_current_stream().synchronize() - return sign_list_packed - - def Compressed_Allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - rank, - world_size, - comm, - local_rank): - - all_start_time = time.time() - original_size = buffer_m.numel() - cupy.cuda.Device(local_rank).use() - - if torch.numel(buffer_m) != torch.numel(worker_error): - empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), - device=buffer_m.device) - buffer_m = torch.cat([buffer_m, empty_tensor]) - - buffer_m.add_(worker_error) - worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - sign_buffer_m = buffer_m.sign().add_(1).bool() - sign_buffer_m = sign_buffer_m.float() - sign_buffer_m.add_(-0.5).mul_(2.0) - worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) - sign_buffer_m = None - - compensated_buffer_m = buffer_m - compensated_buffer_m.sign_() - compensated_buffer_m = compensated_buffer_m.add_(1).bool() - cupy_worker_scale = self.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) - compensated_buffer_m = None - - cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, - world_size) - cupy_compensated_buffer_m = None - - cupy_recvbuf_sign = cupy.zeros([world_size, - cupy_sign_list_packed[rank].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) - - # Communication Phase 1 - gather_start = time.time() - if self.cuda_aware: - gather_cuda(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - 
cupy_recvbuf_scale) - else: - cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - gather_end = time.time() - - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - world_size, - -1) - cupy_recvbuf_sign = None - unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() - cupy_unpacked_sign = None - unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size) - compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) - unpacked_sign = None - - compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - sign_server_m = compensated_server_m.sign().add_(1).bool() - sign_server_m = sign_server_m.float() - sign_server_m.add_(-0.5).mul_(2.0) - server_error.set_(compensated_server_m - server_scale * sign_server_m) - sign_server_m = None - - compensated_server_m.sign_() - compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.torch2cupy(server_scale) - cupy_compensated_server_m = self.torch2cupy(compensated_server_m) - compensated_server_m = None - - cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) - - cupy_recvbuf_sign_server = cupy.zeros( - [world_size, - cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale_server = cupy.zeros([world_size, - 1], - dtype=cupy_worker_scale.dtype) - - # Communication Phase 2 - if self.cuda_aware: - allgather_cuda(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) - else: - cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - 
cupy_server_scale, - cupy_recvbuf_scale_server) + self.comm_backend_name = comm_backend_name - cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(world_size, - -1) - cupy_recvbuf_sign_server = None + # Empty initializer. Set handle based on the comm backend as follows. + self.comm_backend_handle = None - server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) - cupy_server_unpacked_sign = None + if self.comm_backend_name == 'nccl': + assert torch.__version__.startswith("1.8."), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert dist.is_initialized() == True, "Please initialize the torch distributed backend." + from deepspeed.runtime.comm.nccl import NcclBackend + self.comm_backend_handle = NcclBackend() - server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.cupy2torch(cupy_recvbuf_scale_server) - buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + elif self.comm_backend_name == 'mpi': + from deepspeed.runtime.comm.mpi import MpiBackend + self.comm_backend_handle = MpiBackend(cuda_aware) - return buffer_m + self.size = self.comm_backend_handle.size + + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) def step(self, closure=None, grads=None): """Performs a single optimization step. 
@@ -337,13 +215,12 @@ def step(self, closure=None, grads=None): if self.size > 1: exp_avg.set_( - self.Compressed_Allreduce(exp_avg, - state['worker_error'], - state['server_error'], - self.rank, - self.size, - self.comm, - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce( + exp_avg, + state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) + if self.initialize: update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) @@ -362,7 +239,7 @@ def step(self, closure=None, grads=None): self.adam_freeze_key = False self.initialize = True print( - f"Finished the initialization step at rant {torch.distributed.get_rank()}" + f"Finished the initialization step at rank {torch.distributed.get_rank()}" ) return loss diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_mpi_backend.py similarity index 71% rename from tests/onebitadam/test_com_reduce_cuda.py rename to tests/onebitadam/test_mpi_backend.py index a5a87ce67232..7c1b59737532 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_mpi_backend.py @@ -4,7 +4,8 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +from deepspeed.runtime.comm.mpi import MpiBackend comm = MPI.COMM_WORLD size = comm.Get_size() @@ -12,18 +13,17 @@ #TODO: Detect the hostname we are running on automatically torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-1:2245', + init_method='tcp://worker-0:2245', world_size=size, rank=rank) -dummy_model = [torch.nn.Parameter(torch.ones(10))] - -# Set cuda_aware to True to use CUDA buffers for communication -dummy_optim = OnebitAdam(dummy_model, cuda_aware=True) +# Change cuda_aware to True to test out CUDA-Aware MPI communication +backend = MpiBackend(cuda_aware=False) device = torch.device('cuda', rank % torch.cuda.device_count()) +# A simulated compression function using torch.distributed def 
torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) scale = a.norm() / np.sqrt(a.numel()) @@ -52,21 +52,20 @@ def torch_sim(a): else: right_tensor_size = tensor_size right_server_size = right_tensor_size // size + # Adding bias to the initialization of the gradient we are communicating # In order to get rid of the case where some elements in the gradient are too small a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) + a_torch, worker_error_torch, server_error_torch = torch_sim(a) torch.cuda.empty_cache() local_rank = rank % torch.cuda.device_count() -a_after = dummy_optim.Compressed_Allreduce(a, - worker_error, - server_error, - rank, - size, - comm, - local_rank) + +a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + threshold = 1e-6 magnitude_threshold = 1e-6 diff_mask = (a_after - a_torch) > threshold @@ -74,13 +73,16 @@ def torch_sim(a): mpi_server = torch.chunk(a_after, size)[rank] + server_error torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch +test_correctness = True + # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic # The test would skip those numbers that are too small in compensated_server_m -if torch.sum(diff_server_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) -else: - check_mag_mask = mpi_server[diff_mask] > magnitude_threshold - if torch.sum(check_mag_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) +if test_correctness: + if torch.sum(diff_server_mask) == 0: + print('Successfully passed the test for MPI Backend at Rank {}'.format(rank)) else: - print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) + check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + if 
torch.sum(check_mag_mask) == 0: + print('Successfully passed the test for MPI Backend at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebitadam/test_mpi_perf.py b/tests/onebitadam/test_mpi_perf.py new file mode 100644 index 000000000000..63e445e89c50 --- /dev/null +++ b/tests/onebitadam/test_mpi_perf.py @@ -0,0 +1,78 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed + +from deepspeed.runtime.comm.mpi import MpiBackend + +# Configure wall clock timer +from deepspeed.utils.timer import SynchronizedWallClockTimer + +from statistics import mean + +timers = SynchronizedWallClockTimer() + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-0:2245', + world_size=size, + rank=rank) + +backend = MpiBackend(cuda_aware=False) + +device = torch.device('cuda', rank % torch.cuda.device_count()) + +tensor_size = 300 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +warmup = 10 +iters = 100 + +local_rank = rank % torch.cuda.device_count() + +# Warmup +for i in range(warmup): + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +time_list = [] + +for i in range(iters): + timers('compressed_allreduce').start() 
+ backend.compressed_allreduce(a, worker_error, server_error, local_rank) + timers('compressed_allreduce').stop() + time_list.append(timers('compressed_allreduce').elapsed()) + +timer_names = ['compressed_allreduce'] +timers.log(names=timer_names, normalizer=1, memory_breakdown=None) + +places = 2 +convert = 1e3 +float_size = 4 + +if rank == 0: + for i in range(iters): + lat = time_list[i] + print("latency = ", lat * convert) + +minlat = round(min(time_list) * convert) +maxlat = round(max(time_list) * convert) +meanlat = round(mean(time_list) * convert, places) +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) diff --git a/tests/onebitadam/test_nccl_backend.py b/tests/onebitadam/test_nccl_backend.py new file mode 100644 index 000000000000..be4acc8a31d8 --- /dev/null +++ b/tests/onebitadam/test_nccl_backend.py @@ -0,0 +1,88 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed + +from deepspeed.runtime.comm.nccl import NcclBackend + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-0:2245', + world_size=size, + rank=rank) + +backend = NcclBackend() + +device = torch.device('cuda', rank % torch.cuda.device_count()) + + +# A simulated compression function using torch.distributed +def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] + a_sign_list = 
torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + torch.cuda.synchronize() + torch.distributed.barrier() + return a_server_compressed, worker_error, server_error + + +tensor_size = 100 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +a_torch, worker_error_torch, server_error_torch = torch_sim(a) +torch.cuda.empty_cache() +local_rank = rank % torch.cuda.device_count() + +a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +threshold = 1e-6 +magnitude_threshold = 1e-6 +diff_mask = (a_after - a_torch) > threshold +diff_server_mask = torch.chunk(diff_mask, size)[rank] +mpi_server = torch.chunk(a_after, size)[rank] + server_error +torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + +test_correctness = True + +# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic +# The test would skip those numbers that are too small in compensated_server_m +if test_correctness: + if torch.sum(diff_server_mask) == 0: + print('Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) + else: + check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print( + 'Successfully passed 
the test for NCCL Backend at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebitadam/test_nccl_perf.py b/tests/onebitadam/test_nccl_perf.py new file mode 100644 index 000000000000..e079838288a0 --- /dev/null +++ b/tests/onebitadam/test_nccl_perf.py @@ -0,0 +1,78 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed + +from deepspeed.runtime.comm.nccl import NcclBackend + +# Configure wall clock timer +from deepspeed.utils.timer import SynchronizedWallClockTimer + +from statistics import mean + +timers = SynchronizedWallClockTimer() + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-0:2245', + world_size=size, + rank=rank) + +backend = NcclBackend() + +device = torch.device('cuda', rank % torch.cuda.device_count()) + +tensor_size = 300 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +warmup = 10 +iters = 100 + +local_rank = rank % torch.cuda.device_count() + +# Warmup +for i in range(warmup): + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +time_list = [] + +for i in range(iters): + timers('compressed_allreduce').start() + backend.compressed_allreduce(a, worker_error, server_error, 
local_rank) + timers('compressed_allreduce').stop() + time_list.append(timers('compressed_allreduce').elapsed()) + +timer_names = ['compressed_allreduce'] +timers.log(names=timer_names, normalizer=1, memory_breakdown=None) + +places = 2 +convert = 1e3 +float_size = 4 + +if rank == 0: + for i in range(iters): + lat = time_list[i] + print("latency = ", lat * convert) + +minlat = round(min(time_list) * convert) +maxlat = round(max(time_list) * convert) +meanlat = round(mean(time_list) * convert, places) +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) From 6dbdd9858bafef4d340c089fdc0e3ddde3706f47 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 23 Dec 2020 07:57:48 +0000 Subject: [PATCH 02/41] Revert "Merge branch 'master' into staging-1bit-nccl-v2" This reverts commit 78400850703b4b2d84f11b73c109f56919e748ea, reversing changes made to a6dba72aeafad63661dfe566d3accd03d00be78c. --- .github/workflows/pre-compile-ops.yml | 47 -------- DeepSpeedExamples | 2 +- csrc/transformer/ds_transformer_cuda.cpp | 13 +-- csrc/transformer/softmax_kernels.cu | 20 +--- deepspeed/__init__.py | 1 - deepspeed/constants.py | 8 -- deepspeed/git_version_info.py | 8 +- deepspeed/launcher/constants.py | 5 + deepspeed/launcher/launch.py | 3 +- deepspeed/launcher/runner.py | 4 +- deepspeed/ops/sparse_attention/softmax.py | 4 +- deepspeed/ops/transformer/transformer.py | 28 +---- deepspeed/runtime/constants.py | 5 + deepspeed/runtime/engine.py | 116 +++++++++++++++++-- deepspeed/runtime/lr_schedules.py | 21 ---- deepspeed/runtime/pipe/engine.py | 4 +- deepspeed/runtime/zero/config.py | 4 - deepspeed/utils/__init__.py | 3 +- deepspeed/utils/distributed.py | 129 ---------------------- docs/_pages/features.md | 3 +- docs/_tutorials/getting-started.md | 30 +++-- docs/code-docs/source/conf.py | 2 +- install.sh | 2 +- op_builder/builder.py | 9 +- requirements/requirements-readthedocs.txt | 1 + tests/unit/common.py | 12 +- tests/unit/test_cuda_backward.py | 32 
++++-- tests/unit/test_cuda_forward.py | 34 +++--- 28 files changed, 208 insertions(+), 342 deletions(-) delete mode 100644 .github/workflows/pre-compile-ops.yml mode change 100755 => 100644 csrc/transformer/ds_transformer_cuda.cpp delete mode 100644 deepspeed/constants.py delete mode 100644 deepspeed/utils/distributed.py diff --git a/.github/workflows/pre-compile-ops.yml b/.github/workflows/pre-compile-ops.yml deleted file mode 100644 index 4005d4baf2fc..000000000000 --- a/.github/workflows/pre-compile-ops.yml +++ /dev/null @@ -1,47 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: Tests-w-precompiled-ops - -# Controls when the action will run. -on: - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - # This workflow contains a single job called "build" - build: - # The type of runner that the job will run on - runs-on: self-hosted - - # Steps represent a sequence of tasks that will be executed as part of the job - steps: - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v2 - - # Runs a single command using the runners shell - - name: environment - run: | - nvidia-smi - which python - python --version - which nvcc - nvcc --version - python -c "import torch; print('torch:', torch.__version__, torch)" - python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - # Runs a set of commands using the runners shell - - name: Install deepspeed - run: | - DS_BUILD_OPS=1 pip install .[dev] - ds_report - - - name: Formatting checks - run: | - pre-commit run --all-files - - # Runs a set of commands using the runners shell - - name: Unit tests - run: | - if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/ diff --git 
a/DeepSpeedExamples b/DeepSpeedExamples index 78d69cb2f89a..fa1d1a71c486 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 78d69cb2f89a27b1e9b072df8c3e47d00c024fdc +Subproject commit fa1d1a71c48623db8a091d9cf636a5fe3b8f43c7 diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp old mode 100755 new mode 100644 index ebd534d04ab3..85ec0418971c --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -14,8 +14,6 @@ static std::unordered_map> s_transformer_layers; -const int init_seq_length = 128; - // C++ interface template @@ -593,6 +591,7 @@ int create_transformer_layer(int layer_id, int hidden_dim, int num_heads, int intermediate_size, + int seq_length, float attn_dropout_ratio, float hidden_dropout_ratio, int seed, @@ -605,14 +604,14 @@ int create_transformer_layer(int layer_id, { Context::Instance().SetSeed(seed); Context::Instance().TestGemmFP16( - test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); + test_gemm, batch_size, seq_length, num_heads, hidden_dim / num_heads); auto layer = std::make_shared>(layer_id, batch_size, hidden_dim, num_heads, intermediate_size, - init_seq_length, + seq_length, attn_dropout_ratio, hidden_dropout_ratio, pre_or_postLayerNorm, @@ -874,12 +873,6 @@ std::vector ds_transformer_backward(int layer_id, std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); - int seq_len = layer->GetSeqLength(); - if (g_output.size(1) != seq_len) { - seq_len = g_output.size(1); - layer->SetSeqLength(seq_len, bsz); - } - auto grad_input = torch::empty_like(input); auto grad_attn_qkvw = torch::empty_like(attn_qkvw); auto grad_attn_qkvb = torch::empty_like(attn_qkvb); diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu index be776b0c074d..582da4829f47 100644 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -80,8 
+80,7 @@ __global__ void attn_softmax(float* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); + if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); for (int i = 1; i < iters; i *= 2) { auto temp = g.shfl_xor(max_val, i); @@ -114,8 +113,7 @@ __global__ void attn_softmax(float* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); + if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } @@ -218,8 +216,7 @@ __global__ void attn_softmax(__half* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); + if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); for (int i = 1; i < iters; i *= 2) { auto temp = g.shfl_xor(max_val, i); @@ -255,8 +252,7 @@ __global__ void attn_softmax(__half* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) - iters = warp_num / (iteration_stride / max_threads_in_sequence); + if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } @@ -343,9 +339,7 @@ void launch_attn_softmax(float* vals, dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / subblock_max_workload * threads) : threads); - iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); + if (sequence_length <= 512) attn_softmax<32, (threads / 128), 128><<>>( vals, attn_mask, heads, seq_length4, iterations); @@ -414,9 +408,7 @@ void launch_attn_softmax<__half>(__half* vals, dim3 block_dim(seq_length4 > threads ? 
((sequence_length + subblock_max_workload - 1) / subblock_max_workload * threads) : threads); - iterations = - (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads - : MAX_THREAD_ITERATIONS); + if (sequence_length <= 512) attn_softmax<32, (threads / 128), 128><<>>( vals, attn_mask, heads, seq_length4, iterations); diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index ba6f9b5bb6bf..8ac0aad05562 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -14,7 +14,6 @@ from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .utils import log_dist -from .utils.distributed import init_distributed from .pipe import PipelineModule diff --git a/deepspeed/constants.py b/deepspeed/constants.py deleted file mode 100644 index 467e85aefcb6..000000000000 --- a/deepspeed/constants.py +++ /dev/null @@ -1,8 +0,0 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' - -############################################# -# Torch distributed constants -############################################# -TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index f04982c74f0d..d17948ae41a7 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -2,12 +2,8 @@ # This is populated by setup.py from .git_version_info_installed import * except ModuleNotFoundError: - import os - if os.path.isfile('version.txt'): - # Will be missing from checkouts that haven't been installed (e.g., readthedocs) - version = open('version.txt', 'r').read().strip() - else: - version = "0.0.0" + # Will be missing from checkouts that haven't been installed (e.g., readthedocs) + version = open('version.txt', 'r').read().strip() git_hash = '[none]' git_branch = '[none]' diff --git a/deepspeed/launcher/constants.py b/deepspeed/launcher/constants.py index fd56facc4343..f384d58b2c52 100644 --- 
a/deepspeed/launcher/constants.py +++ b/deepspeed/launcher/constants.py @@ -1,5 +1,10 @@ # Copyright 2020 The Microsoft DeepSpeed Team +############################################# +# Torch distributed constants +############################################# +TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 + PDSH_LAUNCHER = 'pdsh' PDSH_MAX_FAN_OUT = 1024 diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index 0958295efe06..205aee2d6ac4 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -16,7 +16,7 @@ from collections import defaultdict from argparse import ArgumentParser, REMAINDER -from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT +from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..utils import logger @@ -113,7 +113,6 @@ def main(): # each process's rank dist_rank = global_rank_mapping[local_node][local_rank] current_env["RANK"] = str(dist_rank) - current_env["LOCAL_RANK"] = str(local_rank) # spawn the processes cmd = [ diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index eb03502cc3f2..9479bb63758c 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -19,8 +19,8 @@ import torch.cuda from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner -from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER -from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT +from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT, \ + PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER from ..utils import logger DLTS_HOSTFILE = "/job/hostfile" diff --git a/deepspeed/ops/sparse_attention/softmax.py b/deepspeed/ops/sparse_attention/softmax.py index a0805ada4bc0..cd18fbcae71f 100644 --- a/deepspeed/ops/sparse_attention/softmax.py +++ b/deepspeed/ops/sparse_attention/softmax.py @@ -224,8 +224,8 @@ class Softmax: For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 """ - def 
sparse_softmax(*args, **kwargs): - return _sparse_softmax.apply(*args, **kwargs) + + sparse_softmax = _sparse_softmax.apply def make_lut(self, device): """Generates the sparsity layout used in block-sparse softmax diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index ea4b98848d3c..a91e5ce6f08b 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -18,6 +18,7 @@ class TransformerConfig(): def __init__(self, batch_size, + max_seq_length, hidden_size, intermediate_size, heads, @@ -29,6 +30,7 @@ def __init__(self, self.batch_size = batch_size self.hidden_size = hidden_size self.intermediate_size = intermediate_size + self.max_seq_length = max_seq_length self.heads = heads self.attn_dropout_ratio = attn_dropout_ratio self.hidden_dropout_ratio = hidden_dropout_ratio @@ -90,6 +92,7 @@ class DeepSpeedTransformerConfig(TransformerConfig): """ def __init__(self, batch_size=-1, + max_seq_length=-1, hidden_size=-1, intermediate_size=-1, heads=-1, @@ -109,6 +112,7 @@ def __init__(self, super(DeepSpeedTransformerConfig, self).__init__( batch_size, + max_seq_length, hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, @@ -138,7 +142,7 @@ def from_dict(cls, json_object): @classmethod def from_json_file(cls, json_file): - with open(json_file, "r", encoding='utf-16') as reader: + with open(json_file, "r", encoding='utf-8') as reader: text = reader.read() return cls.from_dict(json.loads(text)) @@ -173,18 +177,6 @@ def forward(ctx, cuda_module = stochastic_transformer_cuda_module if config.stochastic_mode else transformer_cuda_module forward_func = cuda_module.forward_fp16 if config.fp16 else cuda_module.forward_fp32 - inp_size = input.size() - if inp_size[1] % 16 != 0: - input = torch.cat((input, - torch.randn((inp_size[0], - (16 - (inp_size[1] % 16)), - inp_size[2]), - device=input.device, - dtype=input.dtype)), - 1) - input_mask = 
torch.cat((input_mask, torch.ones((inp_size[0], input_mask.shape[1], input_mask.shape[2], \ - (16 - (inp_size[1] % 16))), device=input_mask.device, dtype=input_mask.dtype) * -10000), 3) - (output, inp_norm, qkv_tf, @@ -311,17 +303,11 @@ def forward(ctx, ctx.attn_layer_norm_var = attn_layer_norm_var ctx.layer_norm_var = layer_norm_var - if inp_size[1] % 16 != 0: - output = torch.narrow(output, 1, 0, inp_size[1]) return output @staticmethod def backward(ctx, grad_output): bsz = grad_output.shape[0] - grad_output_shape = grad_output.size() - if grad_output_shape[1] % 16 != 0: - grad_output = torch.cat((grad_output, torch.zeros((bsz, (16 - (grad_output_shape[1] % 16)), \ - grad_output_shape[2]), device=grad_output.device, dtype=grad_output.dtype)), 1) if bsz > ctx.config.batch_size: raise ValueError('grad_output batch size exceeds the limit.') @@ -412,9 +398,6 @@ def backward(ctx, grad_output): norm_w, norm_b) - if grad_output_shape[1] % 16 != 0: - grad_input = torch.narrow(grad_input, 1, 0, grad_output_shape[1]) - return (grad_input, None, None, @@ -518,6 +501,7 @@ def __init__(self, layer_id, config, initial_weights=None, initial_biases=None): self.config.hidden_size, self.config.heads, self.config.intermediate_size, + self.config.max_seq_length, self.config.attn_dropout_ratio, self.config.hidden_dropout_ratio, self.config.seed, diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index c56c3898f60f..a731865714fe 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -73,6 +73,11 @@ ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer" ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False +############################################# +# Torch distributed constants +############################################# +TORCH_DISTRIBUTED_DEFAULT_PORT = "29500" + # Steps STEPS_PER_PRINT = "steps_per_print" STEPS_PER_PRINT_DEFAULT = 10 diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 
8b2901f8452e..7c9b920d8bb6 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -24,12 +24,12 @@ from deepspeed.runtime.dataloader import DeepSpeedDataLoader from deepspeed.runtime.constants import \ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ - PLD_THETA, PLD_GAMMA + TORCH_DISTRIBUTED_DEFAULT_PORT, PLD_THETA, PLD_GAMMA from deepspeed.runtime.zero.constants import \ ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS from deepspeed.runtime.csr_tensor import CSRTensor import deepspeed.runtime.lr_schedules as lr_schedules -from deepspeed.utils import logger, log_dist, init_distributed +from deepspeed.utils import logger, log_dist from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop @@ -130,14 +130,29 @@ def __init__(self, if dist_init_required is False: assert (dist.is_initialized()==True), "Torch distributed not initialized. Please set dist_init_required to True or initialize before calling deepspeed.initialize()" - # Initialize torch distributed if needed - init_distributed(dist_backend=self.dist_backend) + # DeepSpeed will initialize torch distributed only if the user has not already intialized it. + if dist_init_required and not dist.is_initialized(): + # discover using mpi4py if user specifies the flag + if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi: + # if in Azure ML environment and user specified this flag, notify the user to remove the flag. 
+ if self._in_aml(): + logger.warning( + "Please remove the --deepspeed_mpi flag if running on AzureML.") + self._mpi_check(args, dist_init_required) + else: + # detect if we are in Azure ML environment + if self._in_aml(): + self._set_environment_variables_for_nccl_backend(args) + + logger.info("Initializing torch distributed with backend: {}".format( + self.dist_backend)) + dist.init_process_group(backend=self.dist_backend) self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() - self._set_distributed_vars() + self._init_distributed(dist_init_required) if self.tensorboard_enabled() and self.global_rank == 0: self.summary_writer = self.get_summary_writer() @@ -194,6 +209,87 @@ def __init__(self, self.flatten = util_ops.flatten self.unflatten = util_ops.unflatten + def _in_aml(self): + # read AzureML environment variable to detect if we are using an Azure ML environment + if 'AZUREML_EXPERIMENT_ID' in os.environ: + return True + else: + return False + + def _set_environment_variables_for_nccl_backend(self, + args, + master_port=6105, + verbose=True): + """Helper routine to get and set environment variables. 
+ This is adapted from Azure ML's documentation available from: + https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi + """ + os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] + os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] + single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( + os.environ["WORLD_SIZE"]) + if not single_node: + master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") + os.environ["MASTER_ADDR"] = master_node_params[0] + # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE + if "MASTER_PORT" not in os.environ: + os.environ["MASTER_PORT"] = str(master_port) + else: + os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] + os.environ["MASTER_PORT"] = "54965" + print("NCCL_SOCKET_IFNAME original value = {}".format( + os.environ["NCCL_SOCKET_IFNAME"])) + + os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" + args.local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) + + if verbose: + logger.info( + "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" + .format(os.environ['RANK'], + args.local_rank, + os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) + + def _mpi_check(self, args, dist_init_required): + from mpi4py import MPI + import subprocess + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + world_size = comm.Get_size() + + master_addr = None + if rank == 0: + hostname_cmd = ["hostname -I"] + result = subprocess.check_output(hostname_cmd, shell=True) + master_addr = result.decode('utf-8').split()[0] + master_addr = comm.bcast(master_addr, root=0) + + # Determine local rank by assuming hostnames are unique + proc_name = MPI.Get_processor_name() + all_procs = comm.allgather(proc_name) + local_rank = sum([i == proc_name for i in all_procs[:rank]]) + + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = 
str(world_size) + args.local_rank = local_rank + os.environ['MASTER_ADDR'] = master_addr + os.environ['MASTER_PORT'] = TORCH_DISTRIBUTED_DEFAULT_PORT + + logger.info( + "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" + .format(os.environ['RANK'], + args.local_rank, + os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) + + if not dist_init_required and dist.is_initialized(): + assert dist.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, dist.get_rank()) + assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( + world_size, dist.get_world_size()) + def pld_enabled(self): return self._config.pld_enabled @@ -401,7 +497,7 @@ def _scheduler_from_config(self, optimizer): else: return None - def _set_distributed_vars(self): + def _init_distributed(self, dist_init_required): if self.local_rank >= 0: torch.cuda.set_device(self.local_rank) self.device = torch.device("cuda", self.local_rank) @@ -883,7 +979,7 @@ def clip_fp32_gradients(self): torch.nn.utils.clip_grad_norm_(parameters=self.module.parameters(), max_norm=self.gradient_clipping()) - def _take_model_step(self, lr_kwargs): + def _take_model_step(self): if self.gradient_clipping() > 0.0: if not self.fp16_enabled() and not self.amp_enabled(): self.clip_fp32_gradients() @@ -914,14 +1010,14 @@ def _take_model_step(self, lr_kwargs): self.skipped_steps += 1 else: if self.lr_scheduler is not None: - self.lr_scheduler.step(**(lr_kwargs or {})) + self.lr_scheduler.step() if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: self._report_progress(self.global_steps + 1) self.global_steps += 1 self.global_samples += self.train_batch_size() - def step(self, lr_kwargs=None): + def step(self): r"""Execute the weight update step after forward and backward propagation on effective_train_batch. 
""" @@ -938,7 +1034,7 @@ def step(self, lr_kwargs=None): if self.progressive_layer_drop: self.progressive_layer_drop.update_state(self.global_steps) - self._take_model_step(lr_kwargs) + self._take_model_step() self.tput_timer.stop(report_progress) diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index e7e3be1e786b..5ec106c28d67 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -381,12 +381,6 @@ def get_lr(self): lr_range_test_min_lr * lr_increase for lr_range_test_min_lr in self.min_lr ] - def get_last_lr(self): - """ Return last computed learning rate by current scheduler. - """ - assert getattr(self, '_last_lr', None) is not None, "need to call step() first" - return self._last_lr - def _update_optimizer(self, group_lrs): for param_group, lr in zip(self.optimizer.param_groups, group_lrs): param_group['lr'] = lr @@ -396,7 +390,6 @@ def step(self, batch_iteration=None): batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration self._update_optimizer(self.get_lr()) - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -635,19 +628,12 @@ def get_lr(self): return self._get_cycle_lr() return self._get_decay_lr(self.last_batch_iteration - self.total_size) - def get_last_lr(self): - """ Return last computed learning rate by current scheduler. 
- """ - assert getattr(self, '_last_lr', None) is not None, "need to call step() first" - return self._last_lr - def step(self, batch_iteration=None): if batch_iteration is None: batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -704,19 +690,12 @@ def get_lr(self): self.delta_lrs) ] - def get_last_lr(self): - """ Return last computed learning rate by current scheduler. - """ - assert getattr(self, '_last_lr', None) is not None, "need to call step() first" - return self._last_lr - def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 5c5d896dfc0d..954774e58912 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -940,14 +940,14 @@ def _exec_recv_grads(self, buffer_id): if self.wall_clock_breakdown(): self.timers('pipe_recv_grad').stop() - def _exec_optimizer_step(self, lr_kwargs=None): + def _exec_optimizer_step(self): if self.wall_clock_breakdown(): self.timers('step_microstep').start() self.timers('step').start() self.mem_status('BEFORE STEP', reset_max=True) self._force_grad_boundary = True - self._take_model_step(lr_kwargs) + self._take_model_step() self._force_grad_boundary = False self.mem_status('AFTER STEP') diff --git a/deepspeed/runtime/zero/config.py 
b/deepspeed/runtime/zero/config.py index b784f3ffdd6c..14bfc937705c 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -6,7 +6,6 @@ from deepspeed.runtime.config_utils import get_scalar_param from deepspeed.utils import logger from deepspeed.runtime.zero.constants import * -import json class DeepSpeedZeroConfig(object): @@ -55,9 +54,6 @@ def read_zero_config_deprecated(self, param_dict): def repr(self): return self.__dict__ - def __repr__(self): - return json.dumps(self.__dict__, sort_keys=True, indent=4) - def _initialize(self, zero_config_dict): self.stage = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_STAGE, diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index c231edca4919..37517764b375 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -1,3 +1,2 @@ -from .logging import logger, log_dist -from .distributed import init_distributed +from deepspeed.utils.logging import logger, log_dist from deepspeed.runtime.dataloader import RepeatingLoader diff --git a/deepspeed/utils/distributed.py b/deepspeed/utils/distributed.py deleted file mode 100644 index e70f00b440bb..000000000000 --- a/deepspeed/utils/distributed.py +++ /dev/null @@ -1,129 +0,0 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' -import os -import torch - -from .logging import logger -from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT - - -def init_distributed(dist_backend="nccl", - auto_mpi_discovery=True, - distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, - verbose=True): - """ - Initialize torch.distributed backend, potentially performing MPI discovery if needed - Arguments: - dist_backend (str): torch distributed backend, e.g., nccl, mpi, gloo - auto_mpi_discovery (bool): if distributed environment variables are not set, attempt to discover them from MPI - distributed_port (int, optional): torch distributed backend port - verbose (bool, optional): verbose logging - """ - - required_env = ["RANK", 
"WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] - if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)): - if verbose: - logger.info( - "Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment..." - ) - if in_aml() and not in_dlts(): - patch_aml_env_for_torch_nccl_backend(verbose=verbose) - else: - mpi_discovery(distributed_port=distributed_port, verbose=verbose) - - if not torch.distributed.is_initialized(): - if verbose: - logger.info( - "Initializing torch distributed with backend: {}".format(dist_backend)) - torch.distributed.init_process_group(backend=dist_backend) - - -def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True): - """ - Discovery MPI environment via mpi4py and map to relevant torch.distributed state - """ - from mpi4py import MPI - import subprocess - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - world_size = comm.Get_size() - - master_addr = None - if rank == 0: - hostname_cmd = ["hostname -I"] - result = subprocess.check_output(hostname_cmd, shell=True) - master_addr = result.decode('utf-8').split()[0] - master_addr = comm.bcast(master_addr, root=0) - - # Determine local rank by assuming hostnames are unique - proc_name = MPI.Get_processor_name() - all_procs = comm.allgather(proc_name) - local_rank = sum([i == proc_name for i in all_procs[:rank]]) - - os.environ['RANK'] = str(rank) - os.environ['WORLD_SIZE'] = str(world_size) - os.environ['LOCAL_RANK'] = str(local_rank) - os.environ['MASTER_ADDR'] = master_addr - os.environ['MASTER_PORT'] = str(distributed_port) - - if verbose: - logger.info( - "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) - - if torch.distributed.is_initialized(): - assert dist.get_rank() == rank, "MPI rank {} does not match 
torch rank {}".format(rank, dist.get_rank()) - assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( - world_size, dist.get_world_size()) - - -def in_aml(): - # Are we running inside an Azure Machine Learning (AML) environment? - return 'AZUREML_EXPERIMENT_ID' in os.environ - - -def in_dlts(): - # Are we running on a DLTS cluster? - return 'DLTS_JOB_ID' in os.environ - - -def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): - """Helper routine to get and set environment variables. - This is adapted from Azure ML's documentation available from: - https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi - """ - os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] - os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] - single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( - os.environ["WORLD_SIZE"]) - - if not single_node: - master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") - os.environ["MASTER_ADDR"] = master_node_params[0] - # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE - if "MASTER_PORT" not in os.environ: - os.environ["MASTER_PORT"] = str(master_port) - else: - os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] - os.environ["MASTER_PORT"] = "54965" - - if verbose: - logger.info("NCCL_SOCKET_IFNAME original value = {}".format( - os.environ["NCCL_SOCKET_IFNAME"])) - - os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" - os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] - - if verbose: - logger.info( - "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) diff --git a/docs/_pages/features.md b/docs/_pages/features.md index 
2074bb3e3b0f..ec0724e11aa4 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -28,8 +28,7 @@ deepspeed --hostfile= \ \ --deepspeed --deepspeed_config ds_config.json ``` -The script `` will execute on the resources specified in -[``](/getting-started/#resource-configuration-multi-node). +The script `` will execute on the resources specified in ``. ## Pipeline Parallelism DeepSpeed provides [pipeline parallelism](/tutorials/pipeline/) for memory- diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 21268802d6c8..1f23c64d4085 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -216,28 +216,26 @@ DeepSpeed will then make sure that these environment variables are set when launching each process on every node across their training job. -### MPI and AzureML Compatibility +### MPI Compatibility As described above, DeepSpeed provides its own parallel launcher to help launch multi-node/multi-gpu training jobs. If you prefer to launch your training job using MPI (e.g., mpirun), we provide support for this. It should be noted that DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI -backend. - -To launch your training job with mpirun + DeepSpeed or with AzureML (which uses -mpirun as a launcher backend) you simply need to install the -[mpi4py](https://pypi.org/project/mpi4py/) python package. DeepSpeed will use -this to discover the MPI environment and pass the necessary state (e.g., world -size, rank) to the torch distributed backend. - -If you are using model parallelism, pipeline parallelism, or otherwise require -torch.distributed calls before calling `deepspeed.initialize(..)` we provide -the same MPI support with an additional DeepSpeed API call. Replace your initial -`torch.distributed.init_process_group(..)` call with: - -```python -deepspeed.init_distributed() +backend. 
To launch your training job with mpirun + DeepSpeed you simply pass us +an additional flag `--deepspeed_mpi`. DeepSpeed will then use +[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g., +rank, world size) and properly initialize torch distributed for training. In this +case you will explicitly invoke `python` to launch your model script instead of using +the `deepspeed` launcher, here is an example: +```bash +mpirun python \ + \ + --deepspeed_mpi --deepspeed --deepspeed_config ds_config.json ``` +If you want to use this feature of DeepSpeed, please ensure that mpi4py is +installed via `pip install mpi4py`. + ## Resource Configuration (single-node) In the case that we are only running on a single node (with one or more GPUs) DeepSpeed *does not* require a hostfile as described above. If a hostfile is diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index eb9a412d8a4a..167f6427d7b4 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -79,4 +79,4 @@ autoclass_content = 'both' -autodoc_mock_imports = ["torch", "apex", "mpi4py", "tensorboardX", "numpy", "cupy"] +autodoc_mock_imports = ["torch", "apex", "mpi4py", "tensorboardX", "numpy"] diff --git a/install.sh b/install.sh index b9f1501d9cad..b027d319cdd6 100755 --- a/install.sh +++ b/install.sh @@ -171,5 +171,5 @@ else pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/ pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl" pdsh -w $hosts "ds_report" - pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rm $tmp_wheel_path/*.txt; rmdir $tmp_wheel_path; fi" + pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rmdir $tmp_wheel_path; fi" fi diff --git a/op_builder/builder.py b/op_builder/builder.py index 1f350065b4f6..f44aee79637a 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -221,7 +221,7 @@ def compute_capability_args(self, cross_compile_archs=None): 1. 
`TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`. 2. If neither is set default compute capabilities will be used - 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX + 3. Under `jit_mode` compute capabilities of all visible cards will be used. Format: @@ -243,7 +243,6 @@ def compute_capability_args(self, cross_compile_archs=None): if cc not in ccs: ccs.append(cc) ccs = sorted(ccs) - ccs[-1] += '+PTX' else: # Cross-compile mode, compile for various architectures # env override takes priority @@ -261,10 +260,8 @@ def compute_capability_args(self, cross_compile_archs=None): args = [] for cc in ccs: - num = cc[0] + cc[2] - args.append(f'-gencode=arch=compute_{num},code=sm_{num}') - if cc.endswith('+PTX'): - args.append(f'-gencode=arch=compute_{num},code=compute_{num}') + cc = cc.replace('.', '') + args.append(f'-gencode=arch=compute_{cc},code=compute_{cc}') return args diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index 78620c472c9d..c032a8c9fdad 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1 +1,2 @@ tqdm +psutil diff --git a/tests/unit/common.py b/tests/unit/common.py index 62b7495a025c..73d7957e29f9 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -5,8 +5,6 @@ import torch.distributed as dist from torch.multiprocessing import Process -import deepspeed - import pytest # Worker timeout *after* the first worker has completed. @@ -35,12 +33,10 @@ def dist_init(local_rank, num_procs, *func_args, **func_kwargs): """Initialize torch.distributed and execute the user function. 
""" os.environ['MASTER_ADDR'] = '127.0.0.1' os.environ['MASTER_PORT'] = '29503' - os.environ['LOCAL_RANK'] = str(local_rank) - # NOTE: unit tests don't support multi-node so local_rank == global rank - os.environ['RANK'] = str(local_rank) - os.environ['WORLD_SIZE'] = str(num_procs) - - deepspeed.init_distributed(dist_backend=backend) + dist.init_process_group(backend=backend, + init_method='env://', + rank=local_rank, + world_size=num_procs) if torch.cuda.is_available(): torch.cuda.set_device(local_rank) diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index fd3f9887ad42..317cd7aa33c0 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -150,7 +150,7 @@ def create_models(ds_config): hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=512, + max_position_embeddings=ds_config.max_seq_length, type_vocab_size=2, initializer_range=ds_config.initializer_range) @@ -210,18 +210,25 @@ def set_seed(seed): torch.manual_seed(seed) -def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): +def run_backward(ds_config, atol=1e-2, verbose=False): set_seed(123) bert_encoder, ds_encoder = create_models(ds_config) # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 hidden_states = torch.randn(ds_config.batch_size, - seq_len, + ds_config.max_seq_length, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(ds_config.batch_size, 1, 1, seq_len, **kwargs) - Y = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) + input_mask = torch.randn(ds_config.batch_size, + 1, + 1, + ds_config.max_seq_length, + **kwargs) + Y = torch.randn(ds_config.batch_size, + ds_config.max_seq_length, + ds_config.hidden_size, + **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -250,12 +257,12 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): 
#test_backward[3-1024-120-16-24-True-True-0.05] @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', [ - (3,1024,119,16,24,True,False, 0.05), - (3,1024,115,16,24,True,True, 0.05), - (1024,128,10,2,2,False,False, 0.1), - (3,1024,52,16,24,False,True, 0.2), - (3,128,51,2,24,False,False, 0.1), - (3,128,54,2,24,False,True, 0.2), + (3,1024,120,16,24,True,False, 0.05), + (3,1024,120,16,24,True,True, 0.05), + (3,1024,56,16,24,False,False, 0.1), + (3,1024,56,16,24,False,True, 0.2), + (3,128,56,2,24,False,False, 0.1), + (3,128,56,2,24,False,True, 0.2), ]) # yapf: disable def test_backward(batch_size, hidden_size, @@ -275,6 +282,7 @@ def test_backward(batch_size, ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size ds_config.intermediate_size = hidden_size + ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -283,7 +291,7 @@ def test_backward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_backward(ds_config, seq_len, atol=atol) + run_backward(ds_config, atol=atol) #@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 88cb90848603..893b66c904bb 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -117,7 +117,7 @@ def create_models(ds_config): hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=512, + max_position_embeddings=ds_config.max_seq_length, type_vocab_size=2, initializer_range=ds_config.initializer_range, fp16=ds_config.fp16) @@ -186,8 +186,13 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = 
torch.randn(bsz, seq_len, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(bsz, 1, 1, seq_len, **kwargs) + hidden_states = torch.randn(bsz, + seq_len, #ds_config.max_seq_length, + ds_config.hidden_size, + **kwargs) + input_mask = torch.randn(bsz, 1, 1, + seq_len, #ds_config.max_seq_length, + **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -208,25 +213,25 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # FP16 test cases can only run on the devices support FP16. @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (8,256,53,4,3,True,False), - (8,256,52,4,3,True,True), - (3,1024,51,16,3,True,False), - (3,1024,54,16,3,True,True), - (8,1024,381,16,3,True,False), + (8,256,128,4,3,True,False), + (8,256,128,4,3,True,True), + (64,1024,128,16,3,True,False), + (64,1024,128,16,3,True,True), + (8,1024,384,16,3,True,False), (8,1024,384,16,3,True,True), (8,1024,384,16,3,True,True), - (8,1024,119,16,3,True,False), + (8,1024,120,16,3,True,False), (8,1024,120,16,3,True,True), - (8,1024,509,16,3,True,False), + (8,1024,512,16,3,True,False), (8,1024,512,16,3,True,True), (64,1024,56,16,3,False,False), - (64,1024,53,16,3,False,True), + (64,1024,56,16,3,False,True), (64,1024,24,16,3,False,False), - (64,1024,21,16,3,False,True), + (64,1024,24,16,3,False,True), (8,1024,384,16,3,False,False), (8,1024,384,16,3,False,True), (8,1024,512,16,3,False,False), - (8,1024,511,16,3,False,True), + (8,1024,512,16,3,False,True), (8,1536,128,24,3,False,False), (8,1536,128,24,3,False,True), (8,2048,128,32,3,False,False), @@ -254,6 +259,7 @@ def test_forward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size + ds_config.max_seq_length = 128 #seq_len ds_config.intermediate_size = 4 * hidden_size ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -291,6 +297,7 @@ def test_forward_with_small_bsz(batch_size, ds_config.batch_size 
= batch_size ds_config.hidden_size = hidden_size ds_config.intermediate_size = 4 * hidden_size + ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -325,6 +332,7 @@ def test_forward_stochastic(batch_size, ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size ds_config.intermediate_size = 4 * hidden_size + ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 From 9712f10ce19c1a459b4fb30081281f7ffa9b69fd Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 28 Dec 2020 03:53:37 +0000 Subject: [PATCH 03/41] Revert "Revert "Merge branch 'master' into staging-1bit-nccl-v2"" This reverts commit 6dbdd9858bafef4d340c089fdc0e3ddde3706f47. --- .github/workflows/pre-compile-ops.yml | 47 ++++++++ DeepSpeedExamples | 2 +- csrc/transformer/ds_transformer_cuda.cpp | 13 ++- csrc/transformer/softmax_kernels.cu | 20 +++- deepspeed/__init__.py | 1 + deepspeed/constants.py | 8 ++ deepspeed/git_version_info.py | 8 +- deepspeed/launcher/constants.py | 5 - deepspeed/launcher/launch.py | 3 +- deepspeed/launcher/runner.py | 4 +- deepspeed/ops/sparse_attention/softmax.py | 4 +- deepspeed/ops/transformer/transformer.py | 28 ++++- deepspeed/runtime/constants.py | 5 - deepspeed/runtime/engine.py | 116 ++----------------- deepspeed/runtime/lr_schedules.py | 21 ++++ deepspeed/runtime/pipe/engine.py | 4 +- deepspeed/runtime/zero/config.py | 4 + deepspeed/utils/__init__.py | 3 +- deepspeed/utils/distributed.py | 129 ++++++++++++++++++++++ docs/_pages/features.md | 3 +- docs/_tutorials/getting-started.md | 30 ++--- docs/code-docs/source/conf.py | 2 +- install.sh | 2 +- op_builder/builder.py | 9 +- requirements/requirements-readthedocs.txt | 1 - tests/unit/common.py | 12 +- tests/unit/test_cuda_backward.py | 32 ++---- tests/unit/test_cuda_forward.py | 34 +++--- 28 files changed, 342 insertions(+), 208 deletions(-) create mode 
100644 .github/workflows/pre-compile-ops.yml mode change 100644 => 100755 csrc/transformer/ds_transformer_cuda.cpp create mode 100644 deepspeed/constants.py create mode 100644 deepspeed/utils/distributed.py diff --git a/.github/workflows/pre-compile-ops.yml b/.github/workflows/pre-compile-ops.yml new file mode 100644 index 000000000000..4005d4baf2fc --- /dev/null +++ b/.github/workflows/pre-compile-ops.yml @@ -0,0 +1,47 @@ +# This is a basic workflow to help you get started with Actions + +name: Tests-w-precompiled-ops + +# Controls when the action will run. +on: + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: self-hosted + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + # Runs a single command using the runners shell + - name: environment + run: | + nvidia-smi + which python + python --version + which nvcc + nvcc --version + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + # Runs a set of commands using the runners shell + - name: Install deepspeed + run: | + DS_BUILD_OPS=1 pip install .[dev] + ds_report + + - name: Formatting checks + run: | + pre-commit run --all-files + + # Runs a set of commands using the runners shell + - name: Unit tests + run: | + if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/ diff --git a/DeepSpeedExamples b/DeepSpeedExamples index fa1d1a71c486..78d69cb2f89a 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples 
@@ -1 +1 @@ -Subproject commit fa1d1a71c48623db8a091d9cf636a5fe3b8f43c7 +Subproject commit 78d69cb2f89a27b1e9b072df8c3e47d00c024fdc diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp old mode 100644 new mode 100755 index 85ec0418971c..ebd534d04ab3 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -14,6 +14,8 @@ static std::unordered_map> s_transformer_layers; +const int init_seq_length = 128; + // C++ interface template @@ -591,7 +593,6 @@ int create_transformer_layer(int layer_id, int hidden_dim, int num_heads, int intermediate_size, - int seq_length, float attn_dropout_ratio, float hidden_dropout_ratio, int seed, @@ -604,14 +605,14 @@ int create_transformer_layer(int layer_id, { Context::Instance().SetSeed(seed); Context::Instance().TestGemmFP16( - test_gemm, batch_size, seq_length, num_heads, hidden_dim / num_heads); + test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); auto layer = std::make_shared>(layer_id, batch_size, hidden_dim, num_heads, intermediate_size, - seq_length, + init_seq_length, attn_dropout_ratio, hidden_dropout_ratio, pre_or_postLayerNorm, @@ -873,6 +874,12 @@ std::vector ds_transformer_backward(int layer_id, std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); + int seq_len = layer->GetSeqLength(); + if (g_output.size(1) != seq_len) { + seq_len = g_output.size(1); + layer->SetSeqLength(seq_len, bsz); + } + auto grad_input = torch::empty_like(input); auto grad_attn_qkvw = torch::empty_like(attn_qkvw); auto grad_attn_qkvb = torch::empty_like(attn_qkvb); diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu index 582da4829f47..be776b0c074d 100644 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -80,7 +80,8 @@ __global__ void attn_softmax(float* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = 
warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { auto temp = g.shfl_xor(max_val, i); @@ -113,7 +114,8 @@ __global__ void attn_softmax(float* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } @@ -216,7 +218,8 @@ __global__ void attn_softmax(__half* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { auto temp = g.shfl_xor(max_val, i); @@ -252,7 +255,8 @@ __global__ void attn_softmax(__half* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } @@ -339,7 +343,9 @@ void launch_attn_softmax(float* vals, dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / subblock_max_workload * threads) : threads); - + iterations = + (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads + : MAX_THREAD_ITERATIONS); if (sequence_length <= 512) attn_softmax<32, (threads / 128), 128><<>>( vals, attn_mask, heads, seq_length4, iterations); @@ -408,7 +414,9 @@ void launch_attn_softmax<__half>(__half* vals, dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / subblock_max_workload * threads) : threads); - + iterations = + (sequence_length < subblock_max_workload ? 
(seq_length4 + threads - 1) / threads + : MAX_THREAD_ITERATIONS); if (sequence_length <= 512) attn_softmax<32, (threads / 128), 128><<>>( vals, attn_mask, heads, seq_length4, iterations); diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 8ac0aad05562..ba6f9b5bb6bf 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -14,6 +14,7 @@ from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .utils import log_dist +from .utils.distributed import init_distributed from .pipe import PipelineModule diff --git a/deepspeed/constants.py b/deepspeed/constants.py new file mode 100644 index 000000000000..467e85aefcb6 --- /dev/null +++ b/deepspeed/constants.py @@ -0,0 +1,8 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +############################################# +# Torch distributed constants +############################################# +TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index d17948ae41a7..f04982c74f0d 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -2,8 +2,12 @@ # This is populated by setup.py from .git_version_info_installed import * except ModuleNotFoundError: - # Will be missing from checkouts that haven't been installed (e.g., readthedocs) - version = open('version.txt', 'r').read().strip() + import os + if os.path.isfile('version.txt'): + # Will be missing from checkouts that haven't been installed (e.g., readthedocs) + version = open('version.txt', 'r').read().strip() + else: + version = "0.0.0" git_hash = '[none]' git_branch = '[none]' diff --git a/deepspeed/launcher/constants.py b/deepspeed/launcher/constants.py index f384d58b2c52..fd56facc4343 100644 --- a/deepspeed/launcher/constants.py +++ b/deepspeed/launcher/constants.py @@ -1,10 +1,5 @@ # Copyright 2020 The Microsoft DeepSpeed Team 
-############################################# -# Torch distributed constants -############################################# -TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 - PDSH_LAUNCHER = 'pdsh' PDSH_MAX_FAN_OUT = 1024 diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index 205aee2d6ac4..0958295efe06 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -16,7 +16,7 @@ from collections import defaultdict from argparse import ArgumentParser, REMAINDER -from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT +from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..utils import logger @@ -113,6 +113,7 @@ def main(): # each process's rank dist_rank = global_rank_mapping[local_node][local_rank] current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) # spawn the processes cmd = [ diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 9479bb63758c..eb03502cc3f2 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -19,8 +19,8 @@ import torch.cuda from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner -from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT, \ - PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER +from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER +from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..utils import logger DLTS_HOSTFILE = "/job/hostfile" diff --git a/deepspeed/ops/sparse_attention/softmax.py b/deepspeed/ops/sparse_attention/softmax.py index cd18fbcae71f..a0805ada4bc0 100644 --- a/deepspeed/ops/sparse_attention/softmax.py +++ b/deepspeed/ops/sparse_attention/softmax.py @@ -224,8 +224,8 @@ class Softmax: For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 """ - - sparse_softmax = _sparse_softmax.apply + def sparse_softmax(*args, **kwargs): + return _sparse_softmax.apply(*args, **kwargs) def 
make_lut(self, device): """Generates the sparsity layout used in block-sparse softmax diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index a91e5ce6f08b..ea4b98848d3c 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -18,7 +18,6 @@ class TransformerConfig(): def __init__(self, batch_size, - max_seq_length, hidden_size, intermediate_size, heads, @@ -30,7 +29,6 @@ def __init__(self, self.batch_size = batch_size self.hidden_size = hidden_size self.intermediate_size = intermediate_size - self.max_seq_length = max_seq_length self.heads = heads self.attn_dropout_ratio = attn_dropout_ratio self.hidden_dropout_ratio = hidden_dropout_ratio @@ -92,7 +90,6 @@ class DeepSpeedTransformerConfig(TransformerConfig): """ def __init__(self, batch_size=-1, - max_seq_length=-1, hidden_size=-1, intermediate_size=-1, heads=-1, @@ -112,7 +109,6 @@ def __init__(self, super(DeepSpeedTransformerConfig, self).__init__( batch_size, - max_seq_length, hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, @@ -142,7 +138,7 @@ def from_dict(cls, json_object): @classmethod def from_json_file(cls, json_file): - with open(json_file, "r", encoding='utf-8') as reader: + with open(json_file, "r", encoding='utf-16') as reader: text = reader.read() return cls.from_dict(json.loads(text)) @@ -177,6 +173,18 @@ def forward(ctx, cuda_module = stochastic_transformer_cuda_module if config.stochastic_mode else transformer_cuda_module forward_func = cuda_module.forward_fp16 if config.fp16 else cuda_module.forward_fp32 + inp_size = input.size() + if inp_size[1] % 16 != 0: + input = torch.cat((input, + torch.randn((inp_size[0], + (16 - (inp_size[1] % 16)), + inp_size[2]), + device=input.device, + dtype=input.dtype)), + 1) + input_mask = torch.cat((input_mask, torch.ones((inp_size[0], input_mask.shape[1], input_mask.shape[2], \ + (16 - (inp_size[1] % 16))), device=input_mask.device, 
dtype=input_mask.dtype) * -10000), 3) + (output, inp_norm, qkv_tf, @@ -303,11 +311,17 @@ def forward(ctx, ctx.attn_layer_norm_var = attn_layer_norm_var ctx.layer_norm_var = layer_norm_var + if inp_size[1] % 16 != 0: + output = torch.narrow(output, 1, 0, inp_size[1]) return output @staticmethod def backward(ctx, grad_output): bsz = grad_output.shape[0] + grad_output_shape = grad_output.size() + if grad_output_shape[1] % 16 != 0: + grad_output = torch.cat((grad_output, torch.zeros((bsz, (16 - (grad_output_shape[1] % 16)), \ + grad_output_shape[2]), device=grad_output.device, dtype=grad_output.dtype)), 1) if bsz > ctx.config.batch_size: raise ValueError('grad_output batch size exceeds the limit.') @@ -398,6 +412,9 @@ def backward(ctx, grad_output): norm_w, norm_b) + if grad_output_shape[1] % 16 != 0: + grad_input = torch.narrow(grad_input, 1, 0, grad_output_shape[1]) + return (grad_input, None, None, @@ -501,7 +518,6 @@ def __init__(self, layer_id, config, initial_weights=None, initial_biases=None): self.config.hidden_size, self.config.heads, self.config.intermediate_size, - self.config.max_seq_length, self.config.attn_dropout_ratio, self.config.hidden_dropout_ratio, self.config.seed, diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index a731865714fe..c56c3898f60f 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -73,11 +73,6 @@ ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer" ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False -############################################# -# Torch distributed constants -############################################# -TORCH_DISTRIBUTED_DEFAULT_PORT = "29500" - # Steps STEPS_PER_PRINT = "steps_per_print" STEPS_PER_PRINT_DEFAULT = 10 diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 7c9b920d8bb6..8b2901f8452e 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -24,12 +24,12 @@ from deepspeed.runtime.dataloader 
import DeepSpeedDataLoader from deepspeed.runtime.constants import \ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ - TORCH_DISTRIBUTED_DEFAULT_PORT, PLD_THETA, PLD_GAMMA + PLD_THETA, PLD_GAMMA from deepspeed.runtime.zero.constants import \ ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS from deepspeed.runtime.csr_tensor import CSRTensor import deepspeed.runtime.lr_schedules as lr_schedules -from deepspeed.utils import logger, log_dist +from deepspeed.utils import logger, log_dist, init_distributed from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop @@ -130,29 +130,14 @@ def __init__(self, if dist_init_required is False: assert (dist.is_initialized()==True), "Torch distributed not initialized. Please set dist_init_required to True or initialize before calling deepspeed.initialize()" - # DeepSpeed will initialize torch distributed only if the user has not already intialized it. - if dist_init_required and not dist.is_initialized(): - # discover using mpi4py if user specifies the flag - if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi: - # if in Azure ML environment and user specified this flag, notify the user to remove the flag. 
- if self._in_aml(): - logger.warning( - "Please remove the --deepspeed_mpi flag if running on AzureML.") - self._mpi_check(args, dist_init_required) - else: - # detect if we are in Azure ML environment - if self._in_aml(): - self._set_environment_variables_for_nccl_backend(args) - - logger.info("Initializing torch distributed with backend: {}".format( - self.dist_backend)) - dist.init_process_group(backend=self.dist_backend) + # Initialize torch distributed if needed + init_distributed(dist_backend=self.dist_backend) self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() - self._init_distributed(dist_init_required) + self._set_distributed_vars() if self.tensorboard_enabled() and self.global_rank == 0: self.summary_writer = self.get_summary_writer() @@ -209,87 +194,6 @@ def __init__(self, self.flatten = util_ops.flatten self.unflatten = util_ops.unflatten - def _in_aml(self): - # read AzureML environment variable to detect if we are using an Azure ML environment - if 'AZUREML_EXPERIMENT_ID' in os.environ: - return True - else: - return False - - def _set_environment_variables_for_nccl_backend(self, - args, - master_port=6105, - verbose=True): - """Helper routine to get and set environment variables. 
- This is adapted from Azure ML's documentation available from: - https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi - """ - os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] - os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] - single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( - os.environ["WORLD_SIZE"]) - if not single_node: - master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") - os.environ["MASTER_ADDR"] = master_node_params[0] - # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE - if "MASTER_PORT" not in os.environ: - os.environ["MASTER_PORT"] = str(master_port) - else: - os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] - os.environ["MASTER_PORT"] = "54965" - print("NCCL_SOCKET_IFNAME original value = {}".format( - os.environ["NCCL_SOCKET_IFNAME"])) - - os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" - args.local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) - - if verbose: - logger.info( - "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - args.local_rank, - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) - - def _mpi_check(self, args, dist_init_required): - from mpi4py import MPI - import subprocess - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - world_size = comm.Get_size() - - master_addr = None - if rank == 0: - hostname_cmd = ["hostname -I"] - result = subprocess.check_output(hostname_cmd, shell=True) - master_addr = result.decode('utf-8').split()[0] - master_addr = comm.bcast(master_addr, root=0) - - # Determine local rank by assuming hostnames are unique - proc_name = MPI.Get_processor_name() - all_procs = comm.allgather(proc_name) - local_rank = sum([i == proc_name for i in all_procs[:rank]]) - - os.environ['RANK'] = str(rank) - os.environ['WORLD_SIZE'] = 
str(world_size) - args.local_rank = local_rank - os.environ['MASTER_ADDR'] = master_addr - os.environ['MASTER_PORT'] = TORCH_DISTRIBUTED_DEFAULT_PORT - - logger.info( - "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - args.local_rank, - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) - - if not dist_init_required and dist.is_initialized(): - assert dist.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, dist.get_rank()) - assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( - world_size, dist.get_world_size()) - def pld_enabled(self): return self._config.pld_enabled @@ -497,7 +401,7 @@ def _scheduler_from_config(self, optimizer): else: return None - def _init_distributed(self, dist_init_required): + def _set_distributed_vars(self): if self.local_rank >= 0: torch.cuda.set_device(self.local_rank) self.device = torch.device("cuda", self.local_rank) @@ -979,7 +883,7 @@ def clip_fp32_gradients(self): torch.nn.utils.clip_grad_norm_(parameters=self.module.parameters(), max_norm=self.gradient_clipping()) - def _take_model_step(self): + def _take_model_step(self, lr_kwargs): if self.gradient_clipping() > 0.0: if not self.fp16_enabled() and not self.amp_enabled(): self.clip_fp32_gradients() @@ -1010,14 +914,14 @@ def _take_model_step(self): self.skipped_steps += 1 else: if self.lr_scheduler is not None: - self.lr_scheduler.step() + self.lr_scheduler.step(**(lr_kwargs or {})) if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: self._report_progress(self.global_steps + 1) self.global_steps += 1 self.global_samples += self.train_batch_size() - def step(self): + def step(self, lr_kwargs=None): r"""Execute the weight update step after forward and backward propagation on effective_train_batch. 
""" @@ -1034,7 +938,7 @@ def step(self): if self.progressive_layer_drop: self.progressive_layer_drop.update_state(self.global_steps) - self._take_model_step() + self._take_model_step(lr_kwargs) self.tput_timer.stop(report_progress) diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index 5ec106c28d67..e7e3be1e786b 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -381,6 +381,12 @@ def get_lr(self): lr_range_test_min_lr * lr_increase for lr_range_test_min_lr in self.min_lr ] + def get_last_lr(self): + """ Return last computed learning rate by current scheduler. + """ + assert getattr(self, '_last_lr', None) is not None, "need to call step() first" + return self._last_lr + def _update_optimizer(self, group_lrs): for param_group, lr in zip(self.optimizer.param_groups, group_lrs): param_group['lr'] = lr @@ -390,6 +396,7 @@ def step(self, batch_iteration=None): batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration self._update_optimizer(self.get_lr()) + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -628,12 +635,19 @@ def get_lr(self): return self._get_cycle_lr() return self._get_decay_lr(self.last_batch_iteration - self.total_size) + def get_last_lr(self): + """ Return last computed learning rate by current scheduler. 
+ """ + assert getattr(self, '_last_lr', None) is not None, "need to call step() first" + return self._last_lr + def step(self, batch_iteration=None): if batch_iteration is None: batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): param_group['lr'] = lr + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -690,12 +704,19 @@ def get_lr(self): self.delta_lrs) ] + def get_last_lr(self): + """ Return last computed learning rate by current scheduler. + """ + assert getattr(self, '_last_lr', None) is not None, "need to call step() first" + return self._last_lr + def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): param_group['lr'] = lr + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 954774e58912..5c5d896dfc0d 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -940,14 +940,14 @@ def _exec_recv_grads(self, buffer_id): if self.wall_clock_breakdown(): self.timers('pipe_recv_grad').stop() - def _exec_optimizer_step(self): + def _exec_optimizer_step(self, lr_kwargs=None): if self.wall_clock_breakdown(): self.timers('step_microstep').start() self.timers('step').start() self.mem_status('BEFORE STEP', reset_max=True) self._force_grad_boundary = True - self._take_model_step() + self._take_model_step(lr_kwargs) self._force_grad_boundary = False self.mem_status('AFTER STEP') diff --git a/deepspeed/runtime/zero/config.py 
b/deepspeed/runtime/zero/config.py index 14bfc937705c..b784f3ffdd6c 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -6,6 +6,7 @@ from deepspeed.runtime.config_utils import get_scalar_param from deepspeed.utils import logger from deepspeed.runtime.zero.constants import * +import json class DeepSpeedZeroConfig(object): @@ -54,6 +55,9 @@ def read_zero_config_deprecated(self, param_dict): def repr(self): return self.__dict__ + def __repr__(self): + return json.dumps(self.__dict__, sort_keys=True, indent=4) + def _initialize(self, zero_config_dict): self.stage = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_STAGE, diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index 37517764b375..c231edca4919 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -1,2 +1,3 @@ -from deepspeed.utils.logging import logger, log_dist +from .logging import logger, log_dist +from .distributed import init_distributed from deepspeed.runtime.dataloader import RepeatingLoader diff --git a/deepspeed/utils/distributed.py b/deepspeed/utils/distributed.py new file mode 100644 index 000000000000..e70f00b440bb --- /dev/null +++ b/deepspeed/utils/distributed.py @@ -0,0 +1,129 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +import os +import torch + +from .logging import logger +from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT + + +def init_distributed(dist_backend="nccl", + auto_mpi_discovery=True, + distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, + verbose=True): + """ + Initialize torch.distributed backend, potentially performing MPI discovery if needed + Arguments: + dist_backend (str): torch distributed backend, e.g., nccl, mpi, gloo + auto_mpi_discovery (bool): if distributed environment variables are not set, attempt to discover them from MPI + distributed_port (int, optional): torch distributed backend port + verbose (bool, optional): verbose logging + """ + + required_env = ["RANK", 
"WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] + if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)): + if verbose: + logger.info( + "Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment..." + ) + if in_aml() and not in_dlts(): + patch_aml_env_for_torch_nccl_backend(verbose=verbose) + else: + mpi_discovery(distributed_port=distributed_port, verbose=verbose) + + if not torch.distributed.is_initialized(): + if verbose: + logger.info( + "Initializing torch distributed with backend: {}".format(dist_backend)) + torch.distributed.init_process_group(backend=dist_backend) + + +def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True): + """ + Discovery MPI environment via mpi4py and map to relevant torch.distributed state + """ + from mpi4py import MPI + import subprocess + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + world_size = comm.Get_size() + + master_addr = None + if rank == 0: + hostname_cmd = ["hostname -I"] + result = subprocess.check_output(hostname_cmd, shell=True) + master_addr = result.decode('utf-8').split()[0] + master_addr = comm.bcast(master_addr, root=0) + + # Determine local rank by assuming hostnames are unique + proc_name = MPI.Get_processor_name() + all_procs = comm.allgather(proc_name) + local_rank = sum([i == proc_name for i in all_procs[:rank]]) + + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + os.environ['LOCAL_RANK'] = str(local_rank) + os.environ['MASTER_ADDR'] = master_addr + os.environ['MASTER_PORT'] = str(distributed_port) + + if verbose: + logger.info( + "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" + .format(os.environ['RANK'], + os.environ['LOCAL_RANK'], + os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) + + if torch.distributed.is_initialized(): + assert dist.get_rank() == rank, "MPI rank {} does not match 
torch rank {}".format(rank, dist.get_rank()) + assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( + world_size, dist.get_world_size()) + + +def in_aml(): + # Are we running inside an Azure Machine Learning (AML) environment? + return 'AZUREML_EXPERIMENT_ID' in os.environ + + +def in_dlts(): + # Are we running on a DLTS cluster? + return 'DLTS_JOB_ID' in os.environ + + +def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): + """Helper routine to get and set environment variables. + This is adapted from Azure ML's documentation available from: + https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi + """ + os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] + os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] + single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( + os.environ["WORLD_SIZE"]) + + if not single_node: + master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") + os.environ["MASTER_ADDR"] = master_node_params[0] + # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE + if "MASTER_PORT" not in os.environ: + os.environ["MASTER_PORT"] = str(master_port) + else: + os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] + os.environ["MASTER_PORT"] = "54965" + + if verbose: + logger.info("NCCL_SOCKET_IFNAME original value = {}".format( + os.environ["NCCL_SOCKET_IFNAME"])) + + os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" + os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] + + if verbose: + logger.info( + "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" + .format(os.environ['RANK'], + os.environ['LOCAL_RANK'], + os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) diff --git a/docs/_pages/features.md b/docs/_pages/features.md index 
ec0724e11aa4..2074bb3e3b0f 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -28,7 +28,8 @@ deepspeed --hostfile= \ \ --deepspeed --deepspeed_config ds_config.json ``` -The script `` will execute on the resources specified in ``. +The script `` will execute on the resources specified in +[``](/getting-started/#resource-configuration-multi-node). ## Pipeline Parallelism DeepSpeed provides [pipeline parallelism](/tutorials/pipeline/) for memory- diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 1f23c64d4085..21268802d6c8 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -216,25 +216,27 @@ DeepSpeed will then make sure that these environment variables are set when launching each process on every node across their training job. -### MPI Compatibility +### MPI and AzureML Compatibility As described above, DeepSpeed provides its own parallel launcher to help launch multi-node/multi-gpu training jobs. If you prefer to launch your training job using MPI (e.g., mpirun), we provide support for this. It should be noted that DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI -backend. To launch your training job with mpirun + DeepSpeed you simply pass us -an additional flag `--deepspeed_mpi`. DeepSpeed will then use -[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g., -rank, world size) and properly initialize torch distributed for training. In this -case you will explicitly invoke `python` to launch your model script instead of using -the `deepspeed` launcher, here is an example: -```bash -mpirun python \ - \ - --deepspeed_mpi --deepspeed --deepspeed_config ds_config.json -``` +backend. + +To launch your training job with mpirun + DeepSpeed or with AzureML (which uses +mpirun as a launcher backend) you simply need to install the +[mpi4py](https://pypi.org/project/mpi4py/) python package. 
DeepSpeed will use +this to discover the MPI environment and pass the necessary state (e.g., world +size, rank) to the torch distributed backend. -If you want to use this feature of DeepSpeed, please ensure that mpi4py is -installed via `pip install mpi4py`. +If you are using model parallelism, pipeline parallelism, or otherwise require +torch.distributed calls before calling `deepspeed.initialize(..)` we provide +the same MPI support with an additional DeepSpeed API call. Replace your initial +`torch.distributed.init_process_group(..)` call with: + +```python +deepspeed.init_distributed() +``` ## Resource Configuration (single-node) In the case that we are only running on a single node (with one or more GPUs) diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index 167f6427d7b4..eb9a412d8a4a 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -79,4 +79,4 @@ autoclass_content = 'both' -autodoc_mock_imports = ["torch", "apex", "mpi4py", "tensorboardX", "numpy"] +autodoc_mock_imports = ["torch", "apex", "mpi4py", "tensorboardX", "numpy", "cupy"] diff --git a/install.sh b/install.sh index b027d319cdd6..b9f1501d9cad 100755 --- a/install.sh +++ b/install.sh @@ -171,5 +171,5 @@ else pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/ pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl" pdsh -w $hosts "ds_report" - pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rmdir $tmp_wheel_path; fi" + pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rm $tmp_wheel_path/*.txt; rmdir $tmp_wheel_path; fi" fi diff --git a/op_builder/builder.py b/op_builder/builder.py index f44aee79637a..1f350065b4f6 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -221,7 +221,7 @@ def compute_capability_args(self, cross_compile_archs=None): 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`. 2. 
If neither is set default compute capabilities will be used - 3. Under `jit_mode` compute capabilities of all visible cards will be used. + 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX Format: @@ -243,6 +243,7 @@ def compute_capability_args(self, cross_compile_archs=None): if cc not in ccs: ccs.append(cc) ccs = sorted(ccs) + ccs[-1] += '+PTX' else: # Cross-compile mode, compile for various architectures # env override takes priority @@ -260,8 +261,10 @@ def compute_capability_args(self, cross_compile_archs=None): args = [] for cc in ccs: - cc = cc.replace('.', '') - args.append(f'-gencode=arch=compute_{cc},code=compute_{cc}') + num = cc[0] + cc[2] + args.append(f'-gencode=arch=compute_{num},code=sm_{num}') + if cc.endswith('+PTX'): + args.append(f'-gencode=arch=compute_{num},code=compute_{num}') return args diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index c032a8c9fdad..78620c472c9d 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1,2 +1 @@ tqdm -psutil diff --git a/tests/unit/common.py b/tests/unit/common.py index 73d7957e29f9..62b7495a025c 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -5,6 +5,8 @@ import torch.distributed as dist from torch.multiprocessing import Process +import deepspeed + import pytest # Worker timeout *after* the first worker has completed. @@ -33,10 +35,12 @@ def dist_init(local_rank, num_procs, *func_args, **func_kwargs): """Initialize torch.distributed and execute the user function. 
""" os.environ['MASTER_ADDR'] = '127.0.0.1' os.environ['MASTER_PORT'] = '29503' - dist.init_process_group(backend=backend, - init_method='env://', - rank=local_rank, - world_size=num_procs) + os.environ['LOCAL_RANK'] = str(local_rank) + # NOTE: unit tests don't support multi-node so local_rank == global rank + os.environ['RANK'] = str(local_rank) + os.environ['WORLD_SIZE'] = str(num_procs) + + deepspeed.init_distributed(dist_backend=backend) if torch.cuda.is_available(): torch.cuda.set_device(local_rank) diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 317cd7aa33c0..fd3f9887ad42 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -150,7 +150,7 @@ def create_models(ds_config): hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=ds_config.max_seq_length, + max_position_embeddings=512, type_vocab_size=2, initializer_range=ds_config.initializer_range) @@ -210,25 +210,18 @@ def set_seed(seed): torch.manual_seed(seed) -def run_backward(ds_config, atol=1e-2, verbose=False): +def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): set_seed(123) bert_encoder, ds_encoder = create_models(ds_config) # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 hidden_states = torch.randn(ds_config.batch_size, - ds_config.max_seq_length, + seq_len, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(ds_config.batch_size, - 1, - 1, - ds_config.max_seq_length, - **kwargs) - Y = torch.randn(ds_config.batch_size, - ds_config.max_seq_length, - ds_config.hidden_size, - **kwargs) + input_mask = torch.randn(ds_config.batch_size, 1, 1, seq_len, **kwargs) + Y = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -257,12 +250,12 @@ def run_backward(ds_config, atol=1e-2, verbose=False): 
#test_backward[3-1024-120-16-24-True-True-0.05] @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', [ - (3,1024,120,16,24,True,False, 0.05), - (3,1024,120,16,24,True,True, 0.05), - (3,1024,56,16,24,False,False, 0.1), - (3,1024,56,16,24,False,True, 0.2), - (3,128,56,2,24,False,False, 0.1), - (3,128,56,2,24,False,True, 0.2), + (3,1024,119,16,24,True,False, 0.05), + (3,1024,115,16,24,True,True, 0.05), + (1024,128,10,2,2,False,False, 0.1), + (3,1024,52,16,24,False,True, 0.2), + (3,128,51,2,24,False,False, 0.1), + (3,128,54,2,24,False,True, 0.2), ]) # yapf: disable def test_backward(batch_size, hidden_size, @@ -282,7 +275,6 @@ def test_backward(batch_size, ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size ds_config.intermediate_size = hidden_size - ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -291,7 +283,7 @@ def test_backward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_backward(ds_config, atol=atol) + run_backward(ds_config, seq_len, atol=atol) #@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 893b66c904bb..88cb90848603 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -117,7 +117,7 @@ def create_models(ds_config): hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=ds_config.max_seq_length, + max_position_embeddings=512, type_vocab_size=2, initializer_range=ds_config.initializer_range, fp16=ds_config.fp16) @@ -186,13 +186,8 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = 
torch.randn(bsz, - seq_len, #ds_config.max_seq_length, - ds_config.hidden_size, - **kwargs) - input_mask = torch.randn(bsz, 1, 1, - seq_len, #ds_config.max_seq_length, - **kwargs) + hidden_states = torch.randn(bsz, seq_len, ds_config.hidden_size, **kwargs) + input_mask = torch.randn(bsz, 1, 1, seq_len, **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -213,25 +208,25 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # FP16 test cases can only run on the devices support FP16. @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (8,256,128,4,3,True,False), - (8,256,128,4,3,True,True), - (64,1024,128,16,3,True,False), - (64,1024,128,16,3,True,True), - (8,1024,384,16,3,True,False), + (8,256,53,4,3,True,False), + (8,256,52,4,3,True,True), + (3,1024,51,16,3,True,False), + (3,1024,54,16,3,True,True), + (8,1024,381,16,3,True,False), (8,1024,384,16,3,True,True), (8,1024,384,16,3,True,True), - (8,1024,120,16,3,True,False), + (8,1024,119,16,3,True,False), (8,1024,120,16,3,True,True), - (8,1024,512,16,3,True,False), + (8,1024,509,16,3,True,False), (8,1024,512,16,3,True,True), (64,1024,56,16,3,False,False), - (64,1024,56,16,3,False,True), + (64,1024,53,16,3,False,True), (64,1024,24,16,3,False,False), - (64,1024,24,16,3,False,True), + (64,1024,21,16,3,False,True), (8,1024,384,16,3,False,False), (8,1024,384,16,3,False,True), (8,1024,512,16,3,False,False), - (8,1024,512,16,3,False,True), + (8,1024,511,16,3,False,True), (8,1536,128,24,3,False,False), (8,1536,128,24,3,False,True), (8,2048,128,32,3,False,False), @@ -259,7 +254,6 @@ def test_forward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.max_seq_length = 128 #seq_len ds_config.intermediate_size = 4 * hidden_size ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -297,7 +291,6 @@ def test_forward_with_small_bsz(batch_size, ds_config.batch_size 
= batch_size ds_config.hidden_size = hidden_size ds_config.intermediate_size = 4 * hidden_size - ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -332,7 +325,6 @@ def test_forward_stochastic(batch_size, ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size ds_config.intermediate_size = 4 * hidden_size - ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 From d8a23c98d886ec3690572e46c1e4e8738366a647 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 30 Dec 2020 09:56:15 +0000 Subject: [PATCH 04/41] comm optimization + 1-bit lamb --- deepspeed/runtime/comm/mpi.py | 154 +++++----- deepspeed/runtime/comm/nccl.py | 148 +++++----- deepspeed/runtime/config.py | 2 + deepspeed/runtime/engine.py | 8 +- deepspeed/runtime/fp16/onebit/lamb.py | 397 ++++++++++++++++++++++++++ 5 files changed, 542 insertions(+), 167 deletions(-) create mode 100644 deepspeed/runtime/fp16/onebit/lamb.py diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py index 532f03d940cd..862decac60fa 100644 --- a/deepspeed/runtime/comm/mpi.py +++ b/deepspeed/runtime/comm/mpi.py @@ -43,20 +43,20 @@ def gather_cuda(self, requests = [] for idx in range(world_size): req_sign = self.my_igather(rank, - world_size, - comm, - cupy_sign_list_packed[idx], - cupy_recvbuf_sign, - root=idx) + world_size, + comm, + cupy_sign_list_packed[idx], + cupy_recvbuf_sign, + root=idx) requests += req_sign for idx in range(world_size): req_scale = self.my_igather(rank, - world_size, - comm, - cupy_worker_scale, - cupy_recvbuf_scale, - root=idx) + world_size, + comm, + cupy_worker_scale, + cupy_recvbuf_scale, + root=idx) requests += req_scale MPI.Request.Waitall(requests) @@ -93,20 +93,20 @@ def gather_host(self, for idx in range(world_size): req_sign = self.my_igather(rank, - world_size, - comm, - numpy_sign_list_packed[idx], - 
numpy_recvbuf_sign, - root=idx) + world_size, + comm, + numpy_sign_list_packed[idx], + numpy_recvbuf_sign, + root=idx) requests += req_sign for idx in range(world_size): req_scale = self.my_igather(rank, - world_size, - comm, - numpy_worker_scale, - numpy_recvbuf_scale, - root=idx) + world_size, + comm, + numpy_worker_scale, + numpy_recvbuf_scale, + root=idx) requests += req_scale MPI.Request.Waitall(requests) @@ -175,33 +175,23 @@ def compressed_allreduce(self, all_start_time = time.time() original_size = buffer_m.numel() + worker_error_size = worker_error.numel() cupy.cuda.Device(local_rank).use() - if torch.numel(buffer_m) != torch.numel(worker_error): - empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + if original_size != worker_error_size: + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) buffer_m = torch.cat([buffer_m, empty_tensor]) buffer_m.add_(worker_error) worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - sign_buffer_m = buffer_m.sign().add_(1).bool() - sign_buffer_m = sign_buffer_m.float() - sign_buffer_m.add_(-0.5).mul_(2.0) - worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) - sign_buffer_m = None - - compensated_buffer_m = buffer_m - compensated_buffer_m.sign_() - compensated_buffer_m = compensated_buffer_m.add_(1).bool() - cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.compression_backend.torch2cupy( - compensated_buffer_m) - compensated_buffer_m = None + worker_error.set_(buffer_m - worker_scale * + buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) cupy_sign_list_packed = self.compression_backend.compress_by_chunk( - cupy_compensated_buffer_m, + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), self.size) - cupy_compensated_buffer_m = None + cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) cupy_recvbuf_sign = cupy.zeros( [self.size, @@ 
-213,14 +203,14 @@ def compressed_allreduce(self, gather_start = time.time() if self.cuda_aware: self.gather_cuda(self.rank, - self.size, - self.comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) else: - cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = self.gather_host(self.rank, + _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(self.rank, self.size, self.comm, cupy_sign_list_packed, @@ -229,71 +219,67 @@ def compressed_allreduce(self, cupy_recvbuf_scale) gather_end = time.time() - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1) - cupy_recvbuf_sign = None - unpacked_sign = self.compression_backend.cupy2torch(cupy_unpacked_sign).float() - cupy_unpacked_sign = None - unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( - 1 / self.size) - compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) - unpacked_sign = None + # cupy_sign_list_packed, cupy_worker_scale, worker_scale = None, None, None + cupy_sign_list_packed = None + compensated_server_m = self.compression_backend.cupy2torch( + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( + 1 / self.size)).sum(0) compensated_server_m.add_(server_error) server_scale = torch.norm(compensated_server_m) / np.sqrt( compensated_server_m.numel()) - sign_server_m = compensated_server_m.sign().add_(1).bool() - sign_server_m = sign_server_m.float() - sign_server_m.add_(-0.5).mul_(2.0) - server_error.set_(compensated_server_m - server_scale * sign_server_m) - sign_server_m = None - - compensated_server_m.sign_() - compensated_server_m = compensated_server_m.add_(1).bool() + 
server_error.set_( + compensated_server_m - server_scale * + compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + cupy_server_scale = self.compression_backend.torch2cupy(server_scale) - cupy_compensated_server_m = self.compression_backend.torch2cupy( - compensated_server_m) - compensated_server_m = None cupy_server_sign_packed = self.compression_backend.compress_by_chunk( - cupy_compensated_server_m, + self.compression_backend.torch2cupy( + compensated_server_m.sign_().add_(1).bool()), 1) + compensated_server_m = None cupy_recvbuf_sign_server = cupy.zeros( [self.size, cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) + dtype=cupy_recvbuf_sign.dtype) cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], - dtype=cupy_worker_scale.dtype) + dtype=cupy_recvbuf_scale.dtype) + # cupy_recvbuf_sign, cupy_recvbuf_scale = None, None + cupy_recvbuf_sign = None # Communication Phase 2 if self.cuda_aware: self.allgather_cuda(self.comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) else: - cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = self.allgather_host(self.comm, + _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host(self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server) - cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(self.size, - -1) - cupy_recvbuf_sign_server = None + # cupy_server_sign_packed, cupy_server_scale, server_scale = None, None, None + cupy_server_sign_packed = None - server_unpacked_sign = self.compression_backend.cupy2torch( - cupy_server_unpacked_sign) - cupy_server_unpacked_sign = None + buffer_m.data.copy_( + self.compression_backend.cupy2torch( + 
(cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch( + cupy_recvbuf_scale_server)).flatten().data) + if original_size != worker_error_size: + buffer_m = buffer_m[0:original_size] - server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server) - buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + # cupy_recvbuf_sign_server, cupy_recvbuf_scale_server = None, None return buffer_m diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 49f57f57df36..1972fda6cd73 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -4,14 +4,14 @@ import torch import torch.distributed as dist -import time -import cupy -import numpy as np +import time +import cupy +import numpy as np from deepspeed.runtime.compression.cupy import CupyBackend -class NcclBackend(object): +class NcclBackend(object): def __init__(self): self.world_group = dist.new_group(ranks=range(dist.get_world_size())) self.rank = dist.get_rank(group=self.world_group) @@ -48,32 +48,23 @@ def compressed_allreduce(self, all_start_time = time.time() original_size = buffer_m.numel() + worker_error_size = worker_error.numel() cupy.cuda.Device(local_rank).use() - if torch.numel(buffer_m) != torch.numel(worker_error): - empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + if original_size != worker_error_size: + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) buffer_m = torch.cat([buffer_m, empty_tensor]) buffer_m.add_(worker_error) worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - sign_buffer_m = buffer_m.sign().add_(1).bool() - sign_buffer_m = sign_buffer_m.float() - sign_buffer_m.add_(-0.5).mul_(2.0) - worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) - 
sign_buffer_m = None - - compensated_buffer_m = buffer_m - compensated_buffer_m.sign_() - compensated_buffer_m = compensated_buffer_m.add_(1).bool() + worker_error.set_(buffer_m - worker_scale * + buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + cupy_sign_list_packed = self.compression_backend.compress_by_chunk( + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), + self.size) cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.compression_backend.torch2cupy(compensated_buffer_m) - compensated_buffer_m = None - - cupy_sign_list_packed = self.compression_backend.compress_by_chunk(cupy_compensated_buffer_m, - self.size) - cupy_compensated_buffer_m = None cupy_recvbuf_sign = cupy.zeros( [self.size, @@ -81,14 +72,13 @@ def compressed_allreduce(self, dtype=cupy_sign_list_packed[0].dtype) cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) - sign_list_packed = [None] * self.size - - for idx in range(self.size): - sign_list_packed[idx] = self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) + sign_list_packed = [ + self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) + for idx in range(self.size) + ] + # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) - - worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) # communication phase 1 @@ -107,88 +97,84 @@ def compressed_allreduce(self, worker_scale, recvbuf_scale, root=idx) - for i in range(len(requests)): requests[i].wait() - gather_end = time.time() + # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None + cupy_sign_list_packed = None + cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign) cupy_recvbuf_scale = 
self.compression_backend.torch2cupy(recvbuf_scale) - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1) - cupy_recvbuf_sign = None - - unpacked_sign = self.compression_backend.cupy2torch(cupy_unpacked_sign).float() - cupy_unpacked_sign = None - - unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) - - compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) - unpacked_sign = None - + compensated_server_m = self.compression_backend.cupy2torch( + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( + 1 / self.size)).sum(0) compensated_server_m.add_(server_error) server_scale = torch.norm(compensated_server_m) / np.sqrt( compensated_server_m.numel()) - sign_server_m = compensated_server_m.sign().add_(1).bool() - sign_server_m = sign_server_m.float() - sign_server_m.add_(-0.5).mul_(2.0) - server_error.set_(compensated_server_m - server_scale * sign_server_m) - sign_server_m = None - - compensated_server_m.sign_() - compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.compression_backend.torch2cupy(server_scale) - cupy_compensated_server_m = self.compression_backend.torch2cupy(compensated_server_m) - compensated_server_m = None + server_error.set_( + compensated_server_m - server_scale * + compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) - cupy_server_sign_packed = self.compression_backend.compress_by_chunk(cupy_compensated_server_m, 1) + # cupy_server_scale = self.compression_backend.torch2cupy(server_scale) + + cupy_server_sign_packed = self.compression_backend.compress_by_chunk( + self.compression_backend.torch2cupy( + compensated_server_m.sign_().add_(1).bool()), + 1) + compensated_server_m = None cupy_recvbuf_sign_server = cupy.zeros( [self.size, 
cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) - - server_sign_packed = [None] * 1 - recvbuf_sign_server = [None] * self.size - - for idx in range(self.size): - recvbuf_sign_server[idx] = self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) + dtype=cupy_recvbuf_sign.dtype) + # cupy_recvbuf_sign, recvbuf_sign = None, None + cupy_recvbuf_sign = None - server_sign_packed[0] = self.compression_backend.cupy2torch(cupy_server_sign_packed[0]) + server_sign_packed = [ + self.compression_backend.cupy2torch(cupy_server_sign_packed[0]) + ] + recvbuf_sign_server = [ + self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) + for idx in range(self.size) + ] - server_scale = self.compression_backend.cupy2torch(cupy_server_scale) + # server_scale = self.compression_backend.cupy2torch(cupy_server_scale) cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], - dtype=cupy_worker_scale.dtype) + dtype=cupy_recvbuf_scale.dtype) + # cupy_recvbuf_scale, recvbuf_scale = None, None - recvbuf_scale_server = [None] * self.size - for idx in range(self.size): - recvbuf_scale_server[idx] = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) + recvbuf_scale_server = [ + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) + for idx in range(self.size) + ] # Communication Phase 2 dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) dist.all_gather(recvbuf_scale_server, server_scale) + cupy_server_sign_packed = None + # need to convert from a tensor list to a single tensor # dist.all_gather only provides a tensor list as the recv/output buffer recvbuf_sign_server = torch.stack(recvbuf_sign_server) - cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(recvbuf_sign_server) - - cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(self.size, - -1) - cupy_recvbuf_sign_server = None - - server_unpacked_sign = 
self.compression_backend.cupy2torch(cupy_server_unpacked_sign) - cupy_server_unpacked_sign = None - - server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server) - buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + cupy_recvbuf_sign_server = self.compression_backend.torch2cupy( + recvbuf_sign_server) + + buffer_m.data.copy_( + self.compression_backend.cupy2torch( + (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, + -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch( + cupy_recvbuf_scale_server)).flatten().data) + if original_size != worker_error_size: + buffer_m = buffer_m[0:original_size] return buffer_m diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 9d52dfe6d766..95790210991a 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -27,10 +27,12 @@ ADAM_OPTIMIZER = 'adam' LAMB_OPTIMIZER = 'lamb' ONEBIT_ADAM_OPTIMIZER = 'onebitadam' +ONEBIT_LAMB_OPTIMIZER = 'onebitlamb' DEEPSPEED_OPTIMIZERS = [ ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, + ONEBIT_LAMB_OPTIMIZER, ] # extra optimizer parameters for adam diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index d5607a2be60d..4bda682a1db1 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -18,7 +18,7 @@ from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ - ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \ + ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \ TORCH_ADAM_PARAM, ADAM_W_MODE_PARAM from deepspeed.runtime.dataloader import DeepSpeedDataLoader @@ -471,7 +471,8 @@ def _do_sanity_check(self): assert self.client_model_parameters, \ 
'DeepSpeed {} optimizer requires parameters in initialize() call'.format(self.optimizer_name()) - if self.optimizer_name() == LAMB_OPTIMIZER: + if self.optimizer_name() == LAMB_OPTIMIZER or self.optimizer_name( + ) == ONEBIT_LAMB_OPTIMIZER: assert self.dynamic_loss_scale(), \ 'DeepSpeed {} optimizer requires dynamic loss scaling'.format(self.optimizer_name()) @@ -590,6 +591,9 @@ def _configure_basic_optimizer(self, model_parameters): elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: from deepspeed.runtime.fp16.onebit.adam import Adam optimizer = Adam(model_parameters, self, **optimizer_parameters) + elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: + from deepspeed.runtime.fp16.onebit.lamb import Lamb + optimizer = Lamb(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py new file mode 100644 index 000000000000..a543846fa225 --- /dev/null +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -0,0 +1,397 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +import types +import torch +import importlib +import numpy as np +import time +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from deepspeed.utils.logging import logger + + +class Lamb(torch.optim.Optimizer): + """Implements the 1-bit Lamb algorithm. Currently GPU-only. + For usage example please see, https://www.deepspeed.ai/tutorials/onebit-adam/ + It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + freeze_step (int, optional): Number of steps for warmup (uncompressed) + stage before we start using compressed communication. 
(default 100000) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) + min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in 1-bit Adam! + eps_inside_sqrt (boolean, optional): in the 'update parameters' step, + adds eps to the bias-corrected second moment estimate before + evaluating square root instead of adding it to the square root of + second moment estimate as in the original paper. (default: False) + cuda_aware (boolean, required): Set True if the underlying MPI implementation + supports CUDA-Aware communication. (default: False) + comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') + from cupy. (default: 'deepspeed') + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + def __init__(self, + params, + deepspeed=None, + lr=1e-3, + freeze_step=100000, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + eps_inside_sqrt=False, + weight_decay=0., + max_grad_norm=0., + max_coeff=10.0, + min_coeff=0.01, + amsgrad=False, + cuda_aware=False, + comm_backend_name='nccl', + coeff_beta=0.99, + compress_mode=0, + ratio_max=2.5, + ratio_min=0.5, + ratio_threshold=0.1, + linear_step=1000, + extra_stats=0): + + if amsgrad: + raise RuntimeError('1-bit Lamb does not support the AMSGrad variant.') + + defaults = dict(lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, + max_coeff=max_coeff, + min_coeff=min_coeff) + + super(Lamb, self).__init__(params, defaults) + self.eps_mode = 0 if eps_inside_sqrt else 1 + assert (dist.is_initialized()) + + self.comm_time = 0.0 + self.step_time = 0.0 + self.ave_step = 1 + self.bk_time = 0.0 + + self.deepspeed = deepspeed + self.adam_freeze_key = False + self.initialize = False + self.freeze_step = freeze_step + self.cuda_aware = cuda_aware + self.coeff_beta = coeff_beta + self.compress_mode = int(compress_mode) + self.ratio_max = ratio_max + self.ratio_min = ratio_min + self.ratio_threshold = ratio_threshold + self.linear_step = int(linear_step) + self.extra_stats = int(extra_stats) + + self.comm_backend_name = comm_backend_name + + # Empty initializer. Set handle based on the comm backend as follows. + self.comm_backend_handle = None + + if self.comm_backend_name == 'nccl': + assert torch.__version__.startswith("1.8."), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
+ from deepspeed.runtime.comm.nccl import NcclBackend + self.comm_backend_handle = NcclBackend() + + elif self.comm_backend_name == 'mpi': + from deepspeed.runtime.comm.mpi import MpiBackend + self.comm_backend_handle = MpiBackend(cuda_aware) + + self.size = self.comm_backend_handle.size + + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + + self.exp_avg_flat = [] + self.dummy_exp_avg = {} + self.corrected_tensor_sizes = [] + self.server_chunk_sizes = [] + self.worker_errors = [] + self.server_errors = [] + + self.lamb_coeffs = [] + + def step(self, closure=None, grads=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + grads (list of tensors, optional): weight gradient to use for the + optimizer update. If gradients have type torch.half, parameters + are expected to be in type torch.float. (default: None) + output params (list of tensors, optional): A reduced recision copy + of the updated weights written out in addition to the regular + updated weights. Have to be of same type as gradients. (default: None) + scale (float, optional): factor to divide gradient tensor values + by before applying to weights. 
(default: 1) + """ + loss = None + if closure is not None: + loss = closure() + + if grads is None: + grads_group = [None] * len(self.param_groups) + # backward compatibility + # assuming a list/generator of parameter means single group + elif isinstance(grads, types.GeneratorType): + grads_group = [grads] + elif type(grads[0]) != list: + grads_group = [grads] + else: + grads_group = grads + + #remove the previous stats + del self.lamb_coeffs[:] + + if self.adam_freeze_key and self.compress_mode == 0: + exp_avg_back_list = [] + for group in self.param_groups: + exp_avg_back_list.append([]) + for p in group['params']: + exp_avg_back_list[-1].append( + self.state[p]['exp_avg'].detach().clone()) + + for group, grads_this_group in zip(self.param_groups, grads_group): + if grads_this_group is None: + grads_this_group = [None] * len(group['params']) + + bias_correction = 1 if group['bias_correction'] else 0 + + for p, grad in zip(group['params'], grads_this_group): + if p.grad is None and grad is None: + continue + if grad is None: + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' + ) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['lamb_coeff_freeze'] = 0.0 + state['last_ratio'] = 1.0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + if self.compress_mode == 0: + state['exp_avg_sq_back'] = torch.zeros_like(p.data) + + if not self.initialize: + self.adam_freeze_key = True + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + max_coeff = group['max_coeff'] + min_coeff = group['min_coeff'] + if self.compress_mode == 0: + exp_avg_sq_back = state['exp_avg_sq_back'] + + state['step'] += 1 + + if self.adam_freeze_key is False: + 
exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if self.compress_mode == 0 and state['step'] == self.freeze_step: + exp_avg_sq_back.data = exp_avg_sq.detach().clone() + grad = None + if self.initialize: + weight_norm = p.data.pow(2).sum().sqrt() + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + update_norm = update.pow(2).sum().sqrt() + lamb_coeff = 1.0 + if weight_norm != 0 and update_norm != 0: + lamb_coeff = (weight_norm / update_norm).item() + if lamb_coeff > max_coeff: + lamb_coeff = max_coeff + if lamb_coeff < min_coeff: + lamb_coeff = min_coeff + if lamb_coeff != 1.0: + state['lamb_coeff_freeze'] = self.coeff_beta * state[ + 'lamb_coeff_freeze'] + (1 - self.coeff_beta) * lamb_coeff + self.lamb_coeffs.append(lamb_coeff) + with torch.no_grad(): + p.add_(-group['lr'] * lamb_coeff * update) + else: + if self.initialize: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + grad = None + + # init flattened momentums, worker/server error sizes + if len(self.exp_avg_flat) == 0: + for i, param_group in enumerate(self.param_groups): + momentum_groups = [ + self.state[p]['exp_avg'] for p in param_group['params'] + ] + tensor_size = sum([torch.numel(p.data) for p in momentum_groups]) + corrected_tensor_size = tensor_size + if tensor_size % (self.size * self.divider) != 0: + difference = ((self.size * self.divider) - + (tensor_size % (self.size * self.divider))) + corrected_tensor_size += difference + self.dummy_exp_avg[i] = torch.zeros( + difference, + device=momentum_groups[0].data.device) + momentum_groups.append(self.dummy_exp_avg[i]) + self.corrected_tensor_sizes.append(corrected_tensor_size) + self.server_chunk_sizes.append(corrected_tensor_size // self.size) + + self.exp_avg_flat.append( + # _flatten_dense_tensors([p.detach().clone() + _flatten_dense_tensors([p.clone().detach() + for p in momentum_groups])) + updated_params = 
_unflatten_dense_tensors(self.exp_avg_flat[i], + momentum_groups) + for p, q in zip(momentum_groups, updated_params): + p.data = q.data + + if self.initialize and len(self.worker_errors) == 0: + torch.cuda.empty_cache() + for i in range(len(self.exp_avg_flat)): + self.worker_errors.append( + torch.zeros(self.corrected_tensor_sizes[i], + device=self.exp_avg_flat[i].device)) + self.server_errors.append( + torch.zeros(self.server_chunk_sizes[i], + device=self.exp_avg_flat[i].device)) + torch.cuda.empty_cache() + + if self.adam_freeze_key: + if self.size > 1 and self.linear_step != 0: + for i in range(len(self.exp_avg_flat)): + if not self.initialize: + torch.cuda.empty_cache() + self.worker_errors.append( + torch.zeros(self.corrected_tensor_sizes[i], + device=self.exp_avg_flat[i].device)) + self.server_errors.append( + torch.zeros(self.server_chunk_sizes[i], + device=self.exp_avg_flat[i].device)) + torch.cuda.empty_cache() + if torch.distributed.get_rank() == 0: + print("Cupy Buffers Initialized Successfully.") + + self.comm_backend_handle.compressed_allreduce( + self.exp_avg_flat[i], + self.worker_errors[0], + self.server_errors[0], + self.deepspeed.local_rank) + + if torch.distributed.get_rank() == 0: + print('Pop out errors', flush=True) + del self.worker_errors[:] + del self.server_errors[:] + else: + self.comm_backend_handle.compressed_allreduce( + self.exp_avg_flat[i], + self.worker_errors[i], + self.server_errors[i], + self.deepspeed.local_rank) + + if self.adam_freeze_key and self.initialize: + for i, group in enumerate(self.param_groups): + bias_correction = 1 if group['bias_correction'] else 0 + + for j, p in enumerate(group['params']): + state = self.state[p] + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + max_coeff = group['max_coeff'] + min_coeff = group['min_coeff'] + + if self.compress_mode == 0: + exp_avg_back = exp_avg_back_list[i][j] + exp_avg_sq_back = state['exp_avg_sq_back'] + grad_recover = 
((exp_avg - exp_avg_back * beta1) / (1 - beta1)) + exp_avg_sq_back.mul_(beta2).addcmul_(1 - beta2, + grad_recover, + grad_recover) + denom = exp_avg_sq.sqrt() + group['eps'] + update_prelim = exp_avg / denom + + if group['weight_decay'] > 0.0: + update = update_prelim + group['weight_decay'] * p.data + else: + update = update_prelim + + lamb_coeff = 1.0 + if self.compress_mode == 0: + update_norm = update.pow(2).sum().sqrt() + denom_real = exp_avg_sq_back.sqrt() + group['eps'] + ratio = (denom / denom_real).max().item() + if group['weight_decay'] > 0.0: + update_ratio = (update_prelim.pow(2).sum().sqrt() / + update_norm).item() + update_ratio = min(1.0, update_ratio) + ratio = ratio * update_ratio + (1.0 - update_ratio) + if ratio > self.ratio_max: + ratio = self.ratio_max + if ratio < self.ratio_min: + ratio = self.ratio_min + if ratio > state['last_ratio'] * (1.0 + self.ratio_threshold): + ratio = state['last_ratio'] * (1.0 + self.ratio_threshold) + if ratio < state['last_ratio'] * (1.0 - self.ratio_threshold): + ratio = state['last_ratio'] * (1.0 - self.ratio_threshold) + state['last_ratio'] = ratio + lamb_coeff = state['lamb_coeff_freeze'] * ratio + elif self.compress_mode == 1: + ratio = min( + 1.0, + float(state['step'] - self.freeze_step) / + (self.linear_step - self.freeze_step)) + factor = 1.0 + self.ratio_max * ratio + lamb_coeff = state['lamb_coeff_freeze'] * factor + else: + lamb_coeff = min_coeff + self.lamb_coeffs.append(lamb_coeff) + with torch.no_grad(): + p.add_(-group['lr'] * lamb_coeff * update) + if self.compress_mode == 0: + del exp_avg_back_list[:] + exp_avg_back_list = None + + if not self.initialize: + self.adam_freeze_key = False + self.initialize = True + print( + f"Finished the initialization step at rank {torch.distributed.get_rank()}" + ) + return loss + + if self.adam_freeze_key is False: + if state['step'] >= self.freeze_step: + self.adam_freeze_key = True + self.deepspeed.enable_backward_allreduce = False + + return loss + + def 
get_lamb_coeffs(self): + return self.lamb_coeffs From 89e1936258a7539f988f86720279aa27d7c26d07 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Fri, 5 Feb 2021 19:32:24 +0000 Subject: [PATCH 05/41] Saving/debugging commit. --- tests/onebitadam/test_mpi_backend.py | 2 +- tests/onebitadam/test_mpi_perf.py | 4 ++-- tests/onebitadam/test_nccl_perf.py | 16 +++++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/onebitadam/test_mpi_backend.py b/tests/onebitadam/test_mpi_backend.py index 7c1b59737532..f0dd4dce5bdc 100644 --- a/tests/onebitadam/test_mpi_backend.py +++ b/tests/onebitadam/test_mpi_backend.py @@ -18,7 +18,7 @@ rank=rank) # Change cuda_aware to True to test out CUDA-Aware MPI communication -backend = MpiBackend(cuda_aware=False) +backend = MpiBackend(cuda_aware=True) device = torch.device('cuda', rank % torch.cuda.device_count()) diff --git a/tests/onebitadam/test_mpi_perf.py b/tests/onebitadam/test_mpi_perf.py index 63e445e89c50..769792b92f76 100644 --- a/tests/onebitadam/test_mpi_perf.py +++ b/tests/onebitadam/test_mpi_perf.py @@ -24,7 +24,7 @@ world_size=size, rank=rank) -backend = MpiBackend(cuda_aware=False) +backend = MpiBackend(cuda_aware=True) device = torch.device('cuda', rank % torch.cuda.device_count()) @@ -44,7 +44,7 @@ server_error = torch.zeros(right_server_size, device=device) warmup = 10 -iters = 100 +iters = 10 local_rank = rank % torch.cuda.device_count() diff --git a/tests/onebitadam/test_nccl_perf.py b/tests/onebitadam/test_nccl_perf.py index e079838288a0..f7b4f123e429 100644 --- a/tests/onebitadam/test_nccl_perf.py +++ b/tests/onebitadam/test_nccl_perf.py @@ -44,7 +44,7 @@ server_error = torch.zeros(right_server_size, device=device) warmup = 10 -iters = 100 +iters = 10 local_rank = rank % torch.cuda.device_count() @@ -54,9 +54,15 @@ time_list = [] +a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) +scale = a.norm() / np.sqrt(a.numel()) +a_compressed = scale * a_sign +print(a_compressed.shape) 
+ for i in range(iters): timers('compressed_allreduce').start() backend.compressed_allreduce(a, worker_error, server_error, local_rank) + #torch.distributed.all_reduce(a_compressed) timers('compressed_allreduce').stop() time_list.append(timers('compressed_allreduce').elapsed()) @@ -76,3 +82,11 @@ maxlat = round(max(time_list) * convert) meanlat = round(mean(time_list) * convert, places) print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) +print("tensor shape", a.shape) +duration=meanlat/1e3 +tput = ((tensor_size*4)/duration) +print("algo throughput: %f Bytes/s, %f GB/s" % (tput, tput/1e9)) +size = tensor_size * 4 +n = dist.get_world_size() +busbw = (size / duration) * (2 * (n - 1) / n) +print("busbw: %f GB/s" % (busbw / 1e9)) From a1bbf78186498a0f3d468fa03ce5b448eb761ee4 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Sat, 13 Feb 2021 22:56:39 -0800 Subject: [PATCH 06/41] finalizing 1-bit lamb --- deepspeed/runtime/fp16/onebit/lamb.py | 284 ++++++++++++++------------ 1 file changed, 159 insertions(+), 125 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index a543846fa225..cfb311f12c6c 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -3,19 +3,13 @@ ''' import types import torch -import importlib import numpy as np -import time import torch.distributed as dist from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from deepspeed.utils.logging import logger - class Lamb(torch.optim.Optimizer): """Implements the 1-bit Lamb algorithm. Currently GPU-only. 
- For usage example please see, https://www.deepspeed.ai/tutorials/onebit-adam/ - It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) Arguments: params (iterable): iterable of parameters to optimize or dicts defining @@ -32,7 +26,7 @@ class Lamb(torch.optim.Optimizer): min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) NOT SUPPORTED in 1-bit Adam! + (default: False) NOT SUPPORTED in 1-bit Lamb! eps_inside_sqrt (boolean, optional): in the 'update parameters' step, adds eps to the bias-corrected second moment estimate before evaluating square root instead of adding it to the square root of @@ -40,7 +34,18 @@ class Lamb(torch.optim.Optimizer): cuda_aware (boolean, required): Set True if the underlying MPI implementation supports CUDA-Aware communication. (default: False) comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') - from cupy. (default: 'deepspeed') + coeff_beta (float, optional): coefficients used for computing + running averages of lamb coefficient (default: 0.99) note that you may want to + increase or decrease this beta depending on the freeze_step you choose: + 1/(1 - coeff_beta) should be smaller than or equal to freeze_step + factor_max (float, optional): maximum value of scaling factor to the frozen lamb + coefficient during compression stage (default: 4.5) + factor_min (float, optional): minimum value of scaling factor to the frozen lamb + coefficient during compression stage (default: 0.5) + factor_threshold (float, optional): threshold of how much the scaling factor can + fluctuate between steps (default: 0.1) + .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. 
_On the Convergence of Adam and Beyond: @@ -64,12 +69,9 @@ def __init__(self, cuda_aware=False, comm_backend_name='nccl', coeff_beta=0.99, - compress_mode=0, - ratio_max=2.5, - ratio_min=0.5, - ratio_threshold=0.1, - linear_step=1000, - extra_stats=0): + factor_max=4.5, + factor_min=0.5, + factor_threshold=0.1): if amsgrad: raise RuntimeError('1-bit Lamb does not support the AMSGrad variant.') @@ -87,23 +89,15 @@ def __init__(self, self.eps_mode = 0 if eps_inside_sqrt else 1 assert (dist.is_initialized()) - self.comm_time = 0.0 - self.step_time = 0.0 - self.ave_step = 1 - self.bk_time = 0.0 - self.deepspeed = deepspeed - self.adam_freeze_key = False + self.lamb_freeze_key = False self.initialize = False self.freeze_step = freeze_step self.cuda_aware = cuda_aware self.coeff_beta = coeff_beta - self.compress_mode = int(compress_mode) - self.ratio_max = ratio_max - self.ratio_min = ratio_min - self.ratio_threshold = ratio_threshold - self.linear_step = int(linear_step) - self.extra_stats = int(extra_stats) + self.factor_max = factor_max + self.factor_min = factor_min + self.factor_threshold = factor_threshold self.comm_backend_name = comm_backend_name @@ -130,6 +124,7 @@ def __init__(self, self.server_chunk_sizes = [] self.worker_errors = [] self.server_errors = [] + self.scaling_coeffs = [] self.lamb_coeffs = [] @@ -141,11 +136,6 @@ def step(self, closure=None, grads=None): grads (list of tensors, optional): weight gradient to use for the optimizer update. If gradients have type torch.half, parameters are expected to be in type torch.float. (default: None) - output params (list of tensors, optional): A reduced recision copy - of the updated weights written out in addition to the regular - updated weights. Have to be of same type as gradients. (default: None) - scale (float, optional): factor to divide gradient tensor values - by before applying to weights. 
(default: 1) """ loss = None if closure is not None: @@ -165,29 +155,42 @@ def step(self, closure=None, grads=None): #remove the previous stats del self.lamb_coeffs[:] - if self.adam_freeze_key and self.compress_mode == 0: - exp_avg_back_list = [] + if self.lamb_freeze_key: + exp_avg_last_step = [] for group in self.param_groups: - exp_avg_back_list.append([]) - for p in group['params']: - exp_avg_back_list[-1].append( - self.state[p]['exp_avg'].detach().clone()) - - for group, grads_this_group in zip(self.param_groups, grads_group): + exp_avg_last_step.append( + [self.state[p]['exp_avg'].detach().clone() for p in group['params']]) + if len(self.scaling_coeffs) == 0: + # compute the scaling_coeff for each momentum which is used to + # reduce compression error during compressed_allreduce + momentum_scales = [] + for group in self.param_groups: + momentum_scales.append([ + (torch.norm(self.state[p]['exp_avg']) / + np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() + for p in group['params'] + ]) + united_scale = sum([sum(x) for x in momentum_scales]) / sum( + [len(x) for x in momentum_scales]) + for i, group in enumerate(self.param_groups): + self.scaling_coeffs.append([ + united_scale / momentum_scales[i][j] + for j in range(len(group['params'])) + ]) + + for i, (group, grads_this_group) in enumerate(zip(self.param_groups, grads_group)): if grads_this_group is None: grads_this_group = [None] * len(group['params']) bias_correction = 1 if group['bias_correction'] else 0 - for p, grad in zip(group['params'], grads_this_group): + for j, (p, grad) in enumerate(zip(group['params'], grads_this_group)): if p.grad is None and grad is None: continue if grad is None: grad = p.grad.data if grad.is_sparse: - raise RuntimeError( - 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' - ) + raise RuntimeError('1-bit Lamb does not support sparse gradients') state = self.state[p] @@ -195,30 +198,28 @@ def step(self, closure=None, grads=None): if 
len(state) == 0: state['step'] = 0 state['lamb_coeff_freeze'] = 0.0 - state['last_ratio'] = 1.0 + state['last_factor'] = 1.0 # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) - if self.compress_mode == 0: - state['exp_avg_sq_back'] = torch.zeros_like(p.data) + state['exp_avg_sq_back'] = torch.zeros_like(p.data) if not self.initialize: - self.adam_freeze_key = True + self.lamb_freeze_key = True - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + exp_avg, exp_avg_sq, exp_avg_sq_back = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_back'] beta1, beta2 = group['betas'] max_coeff = group['max_coeff'] min_coeff = group['min_coeff'] - if self.compress_mode == 0: - exp_avg_sq_back = state['exp_avg_sq_back'] state['step'] += 1 - if self.adam_freeze_key is False: + if self.lamb_freeze_key is False: + # warmup stage, baseline Lamb optimization exp_avg.mul_(beta1).add_(1 - beta1, grad) exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - if self.compress_mode == 0 and state['step'] == self.freeze_step: + if state['step'] == self.freeze_step: exp_avg_sq_back.data = exp_avg_sq.detach().clone() grad = None if self.initialize: @@ -241,37 +242,39 @@ def step(self, closure=None, grads=None): with torch.no_grad(): p.add_(-group['lr'] * lamb_coeff * update) else: + # compression stage, update each momentum locally, then + # communicate based on the compressed_allreduce below if self.initialize: exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg.mul_(self.scaling_coeffs[i][j]) grad = None - # init flattened momentums, worker/server error sizes + # init fused momentum if len(self.exp_avg_flat) == 0: - for i, param_group in enumerate(self.param_groups): - momentum_groups = [ - self.state[p]['exp_avg'] for p in param_group['params'] - ] - tensor_size = sum([torch.numel(p.data) for p in momentum_groups]) - 
corrected_tensor_size = tensor_size - if tensor_size % (self.size * self.divider) != 0: - difference = ((self.size * self.divider) - - (tensor_size % (self.size * self.divider))) - corrected_tensor_size += difference - self.dummy_exp_avg[i] = torch.zeros( - difference, - device=momentum_groups[0].data.device) - momentum_groups.append(self.dummy_exp_avg[i]) - self.corrected_tensor_sizes.append(corrected_tensor_size) - self.server_chunk_sizes.append(corrected_tensor_size // self.size) - - self.exp_avg_flat.append( - # _flatten_dense_tensors([p.detach().clone() - _flatten_dense_tensors([p.clone().detach() - for p in momentum_groups])) - updated_params = _unflatten_dense_tensors(self.exp_avg_flat[i], - momentum_groups) - for p, q in zip(momentum_groups, updated_params): - p.data = q.data + momentum_groups = [] + tensor_size = 0 + for group in self.param_groups: + for p in group['params']: + momentum_groups.append(self.state[p]['exp_avg']) + tensor_size += torch.numel(p.data) + corrected_tensor_size = tensor_size + if tensor_size % (self.size * self.divider) != 0: + difference = ((self.size * self.divider) - (tensor_size % + (self.size * self.divider))) + corrected_tensor_size += difference + self.dummy_exp_avg[0] = torch.zeros( + difference, + device=momentum_groups[0].data.device) + momentum_groups.append(self.dummy_exp_avg[0]) + self.corrected_tensor_sizes.append(corrected_tensor_size) + self.server_chunk_sizes.append(corrected_tensor_size // self.size) + + self.exp_avg_flat.append( + _flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) + updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], + momentum_groups) + for p, q in zip(momentum_groups, updated_params): + p.data = q.data if self.initialize and len(self.worker_errors) == 0: torch.cuda.empty_cache() @@ -284,7 +287,7 @@ def step(self, closure=None, grads=None): device=self.exp_avg_flat[i].device)) torch.cuda.empty_cache() - if self.adam_freeze_key: + if self.lamb_freeze_key: if 
self.size > 1 and self.linear_step != 0: for i in range(len(self.exp_avg_flat)): if not self.initialize: @@ -316,24 +319,23 @@ def step(self, closure=None, grads=None): self.server_errors[i], self.deepspeed.local_rank) - if self.adam_freeze_key and self.initialize: + if self.lamb_freeze_key and self.initialize: for i, group in enumerate(self.param_groups): bias_correction = 1 if group['bias_correction'] else 0 for j, p in enumerate(group['params']): state = self.state[p] - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + exp_avg, exp_avg_sq, exp_avg_sq_back = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_back'] beta1, beta2 = group['betas'] - max_coeff = group['max_coeff'] - min_coeff = group['min_coeff'] - - if self.compress_mode == 0: - exp_avg_back = exp_avg_back_list[i][j] - exp_avg_sq_back = state['exp_avg_sq_back'] - grad_recover = ((exp_avg - exp_avg_back * beta1) / (1 - beta1)) - exp_avg_sq_back.mul_(beta2).addcmul_(1 - beta2, - grad_recover, - grad_recover) + exp_avg.div_(self.scaling_coeffs[i][j]) + if 'exp_avg_mask' in group: + exp_avg.mul_(group['exp_avg_mask']) + + grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / + (1 - beta1)) + exp_avg_sq_back.mul_(beta2).addcmul_(1 - beta2, + grad_reconstruct, + grad_reconstruct) denom = exp_avg_sq.sqrt() + group['eps'] update_prelim = exp_avg / denom @@ -343,55 +345,87 @@ def step(self, closure=None, grads=None): update = update_prelim lamb_coeff = 1.0 - if self.compress_mode == 0: - update_norm = update.pow(2).sum().sqrt() - denom_real = exp_avg_sq_back.sqrt() + group['eps'] - ratio = (denom / denom_real).max().item() - if group['weight_decay'] > 0.0: - update_ratio = (update_prelim.pow(2).sum().sqrt() / - update_norm).item() - update_ratio = min(1.0, update_ratio) - ratio = ratio * update_ratio + (1.0 - update_ratio) - if ratio > self.ratio_max: - ratio = self.ratio_max - if ratio < self.ratio_min: - ratio = self.ratio_min - if ratio > state['last_ratio'] * (1.0 + 
self.ratio_threshold): - ratio = state['last_ratio'] * (1.0 + self.ratio_threshold) - if ratio < state['last_ratio'] * (1.0 - self.ratio_threshold): - ratio = state['last_ratio'] * (1.0 - self.ratio_threshold) - state['last_ratio'] = ratio - lamb_coeff = state['lamb_coeff_freeze'] * ratio - elif self.compress_mode == 1: - ratio = min( - 1.0, - float(state['step'] - self.freeze_step) / - (self.linear_step - self.freeze_step)) - factor = 1.0 + self.ratio_max * ratio - lamb_coeff = state['lamb_coeff_freeze'] * factor - else: - lamb_coeff = min_coeff + update_norm = update.pow(2).sum().sqrt() + denom_real = exp_avg_sq_back.sqrt() + group['eps'] + factor = (denom / denom_real).max().item() + if group['weight_decay'] > 0.0: + update_ratio = min(1.0, + (update_prelim.pow(2).sum().sqrt() / + update_norm).item()) + factor = factor * update_ratio + (1.0 - update_ratio) + if factor > self.factor_max: + factor = self.factor_max + if factor < self.factor_min: + factor = self.factor_min + if factor > state['last_factor'] * (1.0 + self.factor_threshold): + factor = state['last_factor'] * (1.0 + self.factor_threshold) + if factor < state['last_factor'] * (1.0 - self.factor_threshold): + factor = state['last_factor'] * (1.0 - self.factor_threshold) + state['last_factor'] = factor + lamb_coeff = state['lamb_coeff_freeze'] * factor self.lamb_coeffs.append(lamb_coeff) with torch.no_grad(): p.add_(-group['lr'] * lamb_coeff * update) - if self.compress_mode == 0: - del exp_avg_back_list[:] - exp_avg_back_list = None + del exp_avg_last_step[:] + exp_avg_last_step = None if not self.initialize: - self.adam_freeze_key = False + self.lamb_freeze_key = False self.initialize = True print( f"Finished the initialization step at rank {torch.distributed.get_rank()}" ) return loss - if self.adam_freeze_key is False: + if self.lamb_freeze_key is False: if state['step'] >= self.freeze_step: - self.adam_freeze_key = True + self.lamb_freeze_key = True self.deepspeed.enable_backward_allreduce = False 
return loss + def state_dict(self): + """ + Overrides state_dict() to also save 1-bit Lamb states + """ + original_dict = super().state_dict() + original_dict['worker_errors'] = self.worker_errors + original_dict['server_errors'] = self.server_errors + original_dict['scaling_coeffs'] = self.scaling_coeffs + return original_dict + + def load_state_dict(self, state_dict): + """ + Overrides state_dict() to reset fused momentum and load/reset 1-bit Lamb states + """ + super().load_state_dict(state_dict) + del self.exp_avg_flat[:] + self.dummy_exp_avg.clear() + del self.corrected_tensor_sizes[:] + del self.server_chunk_sizes[:] + if self.state[self.param_groups[0]['params'][0]]['step'] >= self.freeze_step: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and compression stage continues, load 1-bit Lamb states." + ) + self.worker_errors = state_dict.pop('worker_errors') + self.server_errors = state_dict.pop('server_errors') + self.scaling_coeffs = state_dict.pop('scaling_coeffs') + else: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and warmup stage starts/continues, reset 1-bit Lamb states." 
+ ) + if self.lamb_freeze_key is True: + self.lamb_freeze_key = False + self.deepspeed.enable_backward_allreduce = True + del self.worker_errors[:] + del self.server_errors[:] + del self.scaling_coeffs[:] + for group in self.param_groups: + for p in group['params']: + self.state[p]['lamb_coeff_freeze'] = 0.0 + self.state[p]['last_factor'] = 1.0 + def get_lamb_coeffs(self): return self.lamb_coeffs From db0ca7698d82e467a809945fc33b021be908896e Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 15 Feb 2021 16:17:11 -0800 Subject: [PATCH 07/41] finalizing 1-bit lamb --- deepspeed/runtime/fp16/onebit/lamb.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index cfb311f12c6c..cc5c0f00031f 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -288,7 +288,7 @@ def step(self, closure=None, grads=None): torch.cuda.empty_cache() if self.lamb_freeze_key: - if self.size > 1 and self.linear_step != 0: + if self.size > 1: for i in range(len(self.exp_avg_flat)): if not self.initialize: torch.cuda.empty_cache() @@ -329,6 +329,9 @@ def step(self, closure=None, grads=None): beta1, beta2 = group['betas'] exp_avg.div_(self.scaling_coeffs[i][j]) if 'exp_avg_mask' in group: + if exp_avg.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to( + device=exp_avg.device) exp_avg.mul_(group['exp_avg_mask']) grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / @@ -398,7 +401,17 @@ def load_state_dict(self, state_dict): """ Overrides state_dict() to reset fused momentum and load/reset 1-bit Lamb states """ + mask = {} + for i, group in enumerate(self.param_groups): + if 'exp_avg_mask' in group: + mask[i] = group['exp_avg_mask'] super().load_state_dict(state_dict) + # Because at different stage exp_avg_mask may change (e.g., + # when loading seq 128 checkpoint for seq 512 pretraining), 
+ # we don't load the exp_avg_mask from the checkpoint but always + # use the one provided in optimizer_grouped_parameters in deepspeed_train.py. + for k, v in mask.items(): + self.param_groups[k]['exp_avg_mask'] = v del self.exp_avg_flat[:] self.dummy_exp_avg.clear() del self.corrected_tensor_sizes[:] @@ -411,6 +424,13 @@ def load_state_dict(self, state_dict): self.worker_errors = state_dict.pop('worker_errors') self.server_errors = state_dict.pop('server_errors') self.scaling_coeffs = state_dict.pop('scaling_coeffs') + for i_error in range(len(self.worker_errors)): + self.worker_errors[i_error] = self.worker_errors[i_error].to( + device=self.state[self.param_groups[0]['params'] + [0]]['exp_avg'].device) + self.server_errors[i_error] = self.server_errors[i_error].to( + device=self.state[self.param_groups[0]['params'] + [0]]['exp_avg'].device) else: if torch.distributed.get_rank() == 0: print( From 07deab84c658e1797bc2642023963696d71ce850 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 15 Feb 2021 16:19:36 -0800 Subject: [PATCH 08/41] add momentum mask and chkpt handling for 1-bit adam --- deepspeed/runtime/fp16/onebit/adam.py | 40 +++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 5cb0be7546e5..8171f9e80f35 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -27,8 +27,6 @@ class Adam(torch.optim.Optimizer): eps (float, optional): term added to the denominator to improve numerical stability. 
(default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) - min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) NOT SUPPORTED in 1-bit Adam! @@ -39,7 +37,6 @@ class Adam(torch.optim.Optimizer): cuda_aware (boolean, required): Set True if the underlying MPI implementation supports CUDA-Aware communication. (default: False) comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') - from cupy. (default: 'deepspeed') .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. _On the Convergence of Adam and Beyond: @@ -153,9 +150,7 @@ def step(self, closure=None, grads=None): if grad is None: grad = p.grad.data if grad.is_sparse: - raise RuntimeError( - 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' - ) + raise RuntimeError('1-bit Adam does not support sparse gradients') state = self.state[p] @@ -220,6 +215,11 @@ def step(self, closure=None, grads=None): state['worker_error'], state['server_error'], self.deepspeed.local_rank)) + if 'exp_avg_mask' in group: + if exp_avg.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to( + device=exp_avg.device) + exp_avg.mul_(group['exp_avg_mask']) if self.initialize: update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) @@ -249,3 +249,31 @@ def step(self, closure=None, grads=None): self.deepspeed.enable_backward_allreduce = False return loss + + def load_state_dict(self, state_dict): + """ + Overrides state_dict() to reset 1-bit Adam states when needed + """ + mask = {} + for i, group in enumerate(self.param_groups): + if 'exp_avg_mask' in group: + mask[i] = group['exp_avg_mask'] + super().load_state_dict(state_dict) + # 
Because at different stage exp_avg_mask may change (e.g., + # when loading seq 128 checkpoint for seq 512 pretraining), + # we don't load the exp_avg_mask from the checkpoint but always + # use the one provided in optimizer_grouped_parameters in deepspeed_train.py. + for k, v in mask.items(): + self.param_groups[k]['exp_avg_mask'] = v + if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and warmup stage starts/continues, reset 1-bit Adam states." + ) + if self.adam_freeze_key is True: + self.adam_freeze_key = False + self.deepspeed.enable_backward_allreduce = True + for group in self.param_groups: + for p in group['params']: + self.state[p].pop('worker_error') + self.state[p].pop('server_error') From d55fddb5248d63624e0e176e4b8902082ca422b3 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Fri, 19 Feb 2021 20:19:09 +0000 Subject: [PATCH 09/41] Cleanup and modify nccl test to be runnable with deepspeed launcher. 
--- tests/onebitadam/test_nccl_perf.py | 46 ++++++++++++++---------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/onebitadam/test_nccl_perf.py b/tests/onebitadam/test_nccl_perf.py index f7b4f123e429..511ebe557157 100644 --- a/tests/onebitadam/test_nccl_perf.py +++ b/tests/onebitadam/test_nccl_perf.py @@ -1,33 +1,32 @@ -from mpi4py import MPI import time import torch import torch.distributed as dist import numpy as np +import argparse import deepspeed from deepspeed.runtime.comm.nccl import NcclBackend - -# Configure wall clock timer from deepspeed.utils.timer import SynchronizedWallClockTimer - from statistics import mean timers = SynchronizedWallClockTimer() -comm = MPI.COMM_WORLD -size = comm.Get_size() -rank = comm.Get_rank() +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1) +args = parser.parse_args() -#TODO: Detect the hostname we are running on automatically -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-0:2245', - world_size=size, - rank=rank) +dist.init_process_group(backend='nccl') -backend = NcclBackend() +torch.cuda.set_device(args.local_rank) +device = torch.device("cuda", args.local_rank) -device = torch.device('cuda', rank % torch.cuda.device_count()) +size = dist.get_world_size() +rank = dist.get_rank() +backend = NcclBackend() +local_rank = args.local_rank + +# Setting tensor_size (BERT-Large) tensor_size = 300 * 2**20 server_size = int(tensor_size / size) if tensor_size % (8 * size) != 0: @@ -46,8 +45,6 @@ warmup = 10 iters = 10 -local_rank = rank % torch.cuda.device_count() - # Warmup for i in range(warmup): backend.compressed_allreduce(a, worker_error, server_error, local_rank) @@ -57,7 +54,8 @@ a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) scale = a.norm() / np.sqrt(a.numel()) a_compressed = scale * a_sign -print(a_compressed.shape) + +print("Shape of the compressed buffer:", a_compressed.shape) if rank == 0 else None for 
i in range(iters): timers('compressed_allreduce').start() @@ -66,8 +64,8 @@ timers('compressed_allreduce').stop() time_list.append(timers('compressed_allreduce').elapsed()) -timer_names = ['compressed_allreduce'] -timers.log(names=timer_names, normalizer=1, memory_breakdown=None) +#timer_names = ['compressed_allreduce'] +#timers.log(names=timer_names, normalizer=1, memory_breakdown=None) places = 2 convert = 1e3 @@ -81,12 +79,12 @@ minlat = round(min(time_list) * convert) maxlat = round(max(time_list) * convert) meanlat = round(mean(time_list) * convert, places) -print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) -print("tensor shape", a.shape) -duration=meanlat/1e3 +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) if rank == 0 else None +#print("tensor shape", a.shape) +duration = meanlat/1e3 tput = ((tensor_size*4)/duration) -print("algo throughput: %f Bytes/s, %f GB/s" % (tput, tput/1e9)) +print("algo throughput: %f Bytes/s, %f GB/s" % (tput, tput/1e9)) if rank == 0 else None size = tensor_size * 4 n = dist.get_world_size() busbw = (size / duration) * (2 * (n - 1) / n) -print("busbw: %f GB/s" % (busbw / 1e9)) +print("busbw: %f GB/s" % (busbw / 1e9)) if rank == 0 else None From 8cbc212ba7aef89b6c27212d607e61318c476517 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 22 Feb 2021 10:19:35 -0800 Subject: [PATCH 10/41] Fix format. 
--- deepspeed/runtime/engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index e8e425da0331..b4557a5cd04a 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -20,7 +20,8 @@ from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \ - ONEBIT_LAMB_OPTIMIZER, TORCH_ADAM_PARAM + ONEBIT_LAMB_OPTIMIZER, \ + TORCH_ADAM_PARAM from deepspeed.runtime.dataloader import DeepSpeedDataLoader from deepspeed.runtime.constants import \ From ff8c871ab905e172f5732e53a04518934e19f0fc Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 22 Feb 2021 10:22:27 -0800 Subject: [PATCH 11/41] fix formatting again. --- tests/onebitadam/test_mpi_perf.py | 2 +- tests/onebitadam/test_nccl_perf.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/onebitadam/test_mpi_perf.py b/tests/onebitadam/test_mpi_perf.py index 769792b92f76..4b572c814317 100644 --- a/tests/onebitadam/test_mpi_perf.py +++ b/tests/onebitadam/test_mpi_perf.py @@ -10,7 +10,7 @@ # Configure wall clock timer from deepspeed.utils.timer import SynchronizedWallClockTimer -from statistics import mean +from statistics import mean timers = SynchronizedWallClockTimer() diff --git a/tests/onebitadam/test_nccl_perf.py b/tests/onebitadam/test_nccl_perf.py index 511ebe557157..c45ff205621f 100644 --- a/tests/onebitadam/test_nccl_perf.py +++ b/tests/onebitadam/test_nccl_perf.py @@ -7,7 +7,7 @@ from deepspeed.runtime.comm.nccl import NcclBackend from deepspeed.utils.timer import SynchronizedWallClockTimer -from statistics import mean +from statistics import mean timers = SynchronizedWallClockTimer() @@ -20,11 +20,11 @@ torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) -size = 
dist.get_world_size() -rank = dist.get_rank() +size = dist.get_world_size() +rank = dist.get_rank() backend = NcclBackend() -local_rank = args.local_rank +local_rank = args.local_rank # Setting tensor_size (BERT-Large) tensor_size = 300 * 2**20 @@ -79,11 +79,13 @@ minlat = round(min(time_list) * convert) maxlat = round(max(time_list) * convert) meanlat = round(mean(time_list) * convert, places) -print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) if rank == 0 else None +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, + maxlat, + meanlat)) if rank == 0 else None #print("tensor shape", a.shape) -duration = meanlat/1e3 -tput = ((tensor_size*4)/duration) -print("algo throughput: %f Bytes/s, %f GB/s" % (tput, tput/1e9)) if rank == 0 else None +duration = meanlat / 1e3 +tput = ((tensor_size * 4) / duration) +print("algo throughput: %f Bytes/s, %f GB/s" % (tput, tput / 1e9)) if rank == 0 else None size = tensor_size * 4 n = dist.get_world_size() busbw = (size / duration) * (2 * (n - 1) / n) From c17041f3848266b8ab6a4e896e57850707e47c88 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 22 Feb 2021 22:39:31 +0000 Subject: [PATCH 12/41] make test runnable without mpi4py --- tests/onebitadam/test_nccl_backend.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/onebitadam/test_nccl_backend.py b/tests/onebitadam/test_nccl_backend.py index be4acc8a31d8..b7313d0155aa 100644 --- a/tests/onebitadam/test_nccl_backend.py +++ b/tests/onebitadam/test_nccl_backend.py @@ -1,26 +1,26 @@ -from mpi4py import MPI import time import torch import torch.distributed as dist import numpy as np +import argparse import deepspeed from deepspeed.runtime.comm.nccl import NcclBackend -comm = MPI.COMM_WORLD -size = comm.Get_size() -rank = comm.Get_rank() +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1) +args = parser.parse_args() -#TODO: Detect the 
hostname we are running on automatically -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-0:2245', - world_size=size, - rank=rank) +dist.init_process_group(backend='nccl') -backend = NcclBackend() +torch.cuda.set_device(args.local_rank) +device = torch.device("cuda", args.local_rank) -device = torch.device('cuda', rank % torch.cuda.device_count()) +size = dist.get_world_size() +rank = dist.get_rank() +backend = NcclBackend() +local_rank = args.local_rank # A simulated compression function using torch.distributed def torch_sim(a): @@ -44,7 +44,7 @@ def torch_sim(a): return a_server_compressed, worker_error, server_error -tensor_size = 100 * 2**20 +tensor_size = 300 * 2**20 server_size = int(tensor_size / size) if tensor_size % (8 * size) != 0: right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) @@ -61,7 +61,6 @@ def torch_sim(a): a_torch, worker_error_torch, server_error_torch = torch_sim(a) torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) From 5e01a30e3b9b5081204759a0b0d956ccd22bc743 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 22 Feb 2021 23:32:16 +0000 Subject: [PATCH 13/41] Add dist.alltoall and dist.allgather instead of custom functions. 
--- deepspeed/runtime/comm/nccl.py | 35 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 1972fda6cd73..e2b72e612383 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -79,39 +79,34 @@ def compressed_allreduce(self, # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) - recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) + #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) + recvbuf_scale = [torch.zeros(1, dtype=worker_scale.dtype, device=torch.device(local_rank)) for i in range(self.size)] + + print(f"recvbuf_scale = {recvbuf_scale}, worker_scale={worker_scale}") # communication phase 1 gather_start = time.time() - requests = [] - for idx in range(self.size): - requests += self.my_igather(self.rank, - self.size, - self.world_group, - sign_list_packed[idx], - recvbuf_sign, - root=idx) - requests += self.my_igather(self.rank, - self.size, - self.world_group, - worker_scale, - recvbuf_scale, - root=idx) - for i in range(len(requests)): - requests[i].wait() + # Alltoall for sign + dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed)) + # Allgather for scale + dist.all_gather(recvbuf_scale, worker_scale) + print(f"recvbuf_scale = {recvbuf_scale}, worker_scale={worker_scale}") + + + gather_end = time.time() # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None cupy_sign_list_packed = None cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign) - cupy_recvbuf_scale = self.compression_backend.torch2cupy(recvbuf_scale) + #cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale)) compensated_server_m = self.compression_backend.cupy2torch( (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( 
self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( - self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( + torch.stack(recvbuf_scale).mul_( 1 / self.size)).sum(0) compensated_server_m.add_(server_error) server_scale = torch.norm(compensated_server_m) / np.sqrt( @@ -146,7 +141,7 @@ def compressed_allreduce(self, # server_scale = self.compression_backend.cupy2torch(cupy_server_scale) cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], - dtype=cupy_recvbuf_scale.dtype) + dtype=cupy_worker_scale.dtype) # cupy_recvbuf_scale, recvbuf_scale = None, None recvbuf_scale_server = [ From 97a55577900d04d8d5069dc732e8d30785092a46 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 22 Feb 2021 23:52:26 +0000 Subject: [PATCH 14/41] remove debug prints. --- deepspeed/runtime/comm/nccl.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index e2b72e612383..f9c9a4d3c43d 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -82,17 +82,12 @@ def compressed_allreduce(self, #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) recvbuf_scale = [torch.zeros(1, dtype=worker_scale.dtype, device=torch.device(local_rank)) for i in range(self.size)] - - print(f"recvbuf_scale = {recvbuf_scale}, worker_scale={worker_scale}") # communication phase 1 gather_start = time.time() # Alltoall for sign dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed)) # Allgather for scale dist.all_gather(recvbuf_scale, worker_scale) - print(f"recvbuf_scale = {recvbuf_scale}, worker_scale={worker_scale}") - - gather_end = time.time() From e3e1e39b223556a8b99ba23ce8209eb543c6bc27 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 1 Mar 2021 11:22:21 -0800 Subject: [PATCH 15/41] formatting and renaming --- deepspeed/runtime/comm/nccl.py | 17 ++++++++++------- .../test_com_reduce_host.py | 0 .../{onebitadam => onebit}/test_mpi_backend.py | 0 tests/{onebitadam => 
onebit}/test_mpi_perf.py | 0 .../{onebitadam => onebit}/test_nccl_backend.py | 1 + tests/{onebitadam => onebit}/test_nccl_perf.py | 0 .../{onebitadam => onebit}/test_server_error.py | 0 7 files changed, 11 insertions(+), 7 deletions(-) rename tests/{onebitadam => onebit}/test_com_reduce_host.py (100%) rename tests/{onebitadam => onebit}/test_mpi_backend.py (100%) rename tests/{onebitadam => onebit}/test_mpi_perf.py (100%) rename tests/{onebitadam => onebit}/test_nccl_backend.py (99%) rename tests/{onebitadam => onebit}/test_nccl_perf.py (100%) rename tests/{onebitadam => onebit}/test_server_error.py (100%) diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index f9c9a4d3c43d..3a07b969f23d 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -46,7 +46,7 @@ def compressed_allreduce(self, server_error, local_rank): - all_start_time = time.time() + # all_start_time = time.time() original_size = buffer_m.numel() worker_error_size = worker_error.numel() cupy.cuda.Device(local_rank).use() @@ -70,7 +70,7 @@ def compressed_allreduce(self, [self.size, cupy_sign_list_packed[self.rank].size], dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) sign_list_packed = [ self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) @@ -80,16 +80,20 @@ def compressed_allreduce(self, # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) - recvbuf_scale = [torch.zeros(1, dtype=worker_scale.dtype, device=torch.device(local_rank)) for i in range(self.size)] + recvbuf_scale = [ + torch.zeros(1, + dtype=worker_scale.dtype, + device=torch.device(local_rank)) for i in range(self.size) + ] # communication phase 1 - gather_start = 
time.time() + # gather_start = time.time() # Alltoall for sign dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed)) # Allgather for scale dist.all_gather(recvbuf_scale, worker_scale) - gather_end = time.time() + # gather_end = time.time() # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None cupy_sign_list_packed = None @@ -101,8 +105,7 @@ def compressed_allreduce(self, (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( - torch.stack(recvbuf_scale).mul_( - 1 / self.size)).sum(0) + torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) compensated_server_m.add_(server_error) server_scale = torch.norm(compensated_server_m) / np.sqrt( compensated_server_m.numel()) diff --git a/tests/onebitadam/test_com_reduce_host.py b/tests/onebit/test_com_reduce_host.py similarity index 100% rename from tests/onebitadam/test_com_reduce_host.py rename to tests/onebit/test_com_reduce_host.py diff --git a/tests/onebitadam/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py similarity index 100% rename from tests/onebitadam/test_mpi_backend.py rename to tests/onebit/test_mpi_backend.py diff --git a/tests/onebitadam/test_mpi_perf.py b/tests/onebit/test_mpi_perf.py similarity index 100% rename from tests/onebitadam/test_mpi_perf.py rename to tests/onebit/test_mpi_perf.py diff --git a/tests/onebitadam/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py similarity index 99% rename from tests/onebitadam/test_nccl_backend.py rename to tests/onebit/test_nccl_backend.py index b7313d0155aa..7f43038d7d6d 100644 --- a/tests/onebitadam/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -22,6 +22,7 @@ backend = NcclBackend() local_rank = args.local_rank + # A simulated compression function using torch.distributed def torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) diff --git a/tests/onebitadam/test_nccl_perf.py 
b/tests/onebit/test_nccl_perf.py similarity index 100% rename from tests/onebitadam/test_nccl_perf.py rename to tests/onebit/test_nccl_perf.py diff --git a/tests/onebitadam/test_server_error.py b/tests/onebit/test_server_error.py similarity index 100% rename from tests/onebitadam/test_server_error.py rename to tests/onebit/test_server_error.py From d5b9dcc89b97cb3be40bc857f4cba5d19ae2abda Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 1 Mar 2021 12:52:43 -0800 Subject: [PATCH 16/41] renaming --- deepspeed/runtime/engine.py | 8 ++++---- deepspeed/runtime/fp16/onebit/{adam.py => onebitadam.py} | 4 ++-- deepspeed/runtime/fp16/onebit/{lamb.py => onebitlamb.py} | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) rename deepspeed/runtime/fp16/onebit/{adam.py => onebitadam.py} (99%) rename deepspeed/runtime/fp16/onebit/{lamb.py => onebitlamb.py} (99%) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index b4557a5cd04a..fd0e1463fc60 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -624,11 +624,11 @@ def _configure_basic_optimizer(self, model_parameters): from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit.adam import Adam - optimizer = Adam(model_parameters, self, **optimizer_parameters) + from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam + optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: - from deepspeed.runtime.fp16.onebit.lamb import Lamb - optimizer = Lamb(model_parameters, self, **optimizer_parameters) + from deepspeed.runtime.fp16.onebit.onebitlamb import OnebitLamb + optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) 
diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/onebitadam.py similarity index 99% rename from deepspeed/runtime/fp16/onebit/adam.py rename to deepspeed/runtime/fp16/onebit/onebitadam.py index 8171f9e80f35..2a37e239d9ce 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/onebitadam.py @@ -11,7 +11,7 @@ from deepspeed.utils.logging import logger -class Adam(torch.optim.Optimizer): +class OnebitAdam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. For usage example please see, https://www.deepspeed.ai/tutorials/onebit-adam/ It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) @@ -68,7 +68,7 @@ def __init__(self, weight_decay=weight_decay, max_grad_norm=max_grad_norm) - super(Adam, self).__init__(params, defaults) + super(OnebitAdam, self).__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 assert (dist.is_initialized()) diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/onebitlamb.py similarity index 99% rename from deepspeed/runtime/fp16/onebit/lamb.py rename to deepspeed/runtime/fp16/onebit/onebitlamb.py index cc5c0f00031f..52c871c07679 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/onebitlamb.py @@ -8,7 +8,7 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -class Lamb(torch.optim.Optimizer): +class OnebitLamb(torch.optim.Optimizer): """Implements the 1-bit Lamb algorithm. Currently GPU-only. 
Arguments: @@ -85,7 +85,7 @@ def __init__(self, max_coeff=max_coeff, min_coeff=min_coeff) - super(Lamb, self).__init__(params, defaults) + super(OnebitLamb, self).__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 assert (dist.is_initialized()) From 3d66a8a218c112d2eaa808344c51b3f79247b487 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 1 Mar 2021 13:02:05 -0800 Subject: [PATCH 17/41] renaming --- tests/onebit/test_com_reduce_host.py | 2 +- tests/onebit/test_server_error.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/onebit/test_com_reduce_host.py b/tests/onebit/test_com_reduce_host.py index 1507abc44f24..a02816c6a4c9 100644 --- a/tests/onebit/test_com_reduce_host.py +++ b/tests/onebit/test_com_reduce_host.py @@ -4,7 +4,7 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam +from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam comm = MPI.COMM_WORLD size = comm.Get_size() diff --git a/tests/onebit/test_server_error.py b/tests/onebit/test_server_error.py index 075145f84915..e4b680a6cffb 100644 --- a/tests/onebit/test_server_error.py +++ b/tests/onebit/test_server_error.py @@ -4,7 +4,7 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam +from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam comm = MPI.COMM_WORLD size = comm.Get_size() From b042467e9db8c1119910df106d63b28e280f355f Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 1 Mar 2021 22:10:11 -0800 Subject: [PATCH 18/41] add unit test, fix existing tests --- tests/onebit/test_com_reduce_host.py | 2 +- tests/onebit/test_mpi_backend.py | 2 +- tests/onebit/test_nccl_backend.py | 2 +- tests/unit/test_onebit.py | 131 +++++++++++++++++++++++++++ 4 files changed, 134 insertions(+), 3 deletions(-) create mode 100644 tests/unit/test_onebit.py diff --git 
a/tests/onebit/test_com_reduce_host.py b/tests/onebit/test_com_reduce_host.py index a02816c6a4c9..3a575828638e 100644 --- a/tests/onebit/test_com_reduce_host.py +++ b/tests/onebit/test_com_reduce_host.py @@ -79,7 +79,7 @@ def torch_sim(a): if torch.sum(diff_server_mask) == 0: print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) else: - check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold if torch.sum(check_mag_mask) == 0: print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) else: diff --git a/tests/onebit/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py index f0dd4dce5bdc..6ef7df42a81d 100644 --- a/tests/onebit/test_mpi_backend.py +++ b/tests/onebit/test_mpi_backend.py @@ -81,7 +81,7 @@ def torch_sim(a): if torch.sum(diff_server_mask) == 0: print('Successfully passed the test for MPI Backend at Rank {}'.format(rank)) else: - check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold if torch.sum(check_mag_mask) == 0: print('Successfully passed the test for MPI Backend at Rank {}'.format(rank)) else: diff --git a/tests/onebit/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py index 7f43038d7d6d..8935977ad5a2 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -80,7 +80,7 @@ def torch_sim(a): if torch.sum(diff_server_mask) == 0: print('Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) else: - check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold if torch.sum(check_mag_mask) == 0: print( 'Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py new file mode 100644 index 000000000000..1d5bd9becb71 --- /dev/null +++ b/tests/unit/test_onebit.py @@ 
-0,0 +1,131 @@ +import torch +import torch.distributed as dist +import deepspeed +import argparse +import pytest +import json +import os +import numpy as np +from common import distributed_test +from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args + +try: + from apex import amp + _amp_available = True +except ImportError: + _amp_available = False +amp_available = pytest.mark.skip(_amp_available, reason="apex/amp is not installed") + + +def test_onebitadam_fp16_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitadam_fp16_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitadam_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_compressed_allreduce_basic(tmpdir): + @distributed_test(world_size=[1, 2]) + def _test_compressed_allreduce_basic(): + from deepspeed.runtime.comm.nccl import NcclBackend + size = dist.get_world_size() + rank = dist.get_rank() + backend = NcclBackend() + local_rank = dist.get_rank() + device = torch.device("cuda", dist.get_rank()) + + # A simulated compression function using torch.distributed + def torch_sim(a): + a_sign = 
a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_( + 2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [ + chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list + ] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + torch.cuda.synchronize() + torch.distributed.barrier() + return a_server_compressed, worker_error, server_error + + tensor_size = 300 * 2**20 + server_size = int(tensor_size / size) + if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) + else: + right_tensor_size = tensor_size + right_server_size = right_tensor_size // size + + # Adding bias to the initialization of the gradient we are communicating + # In order to get rid of the case where some elements in the gradient are too small + a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + + worker_error = torch.zeros(right_tensor_size, device=device) + server_error = torch.zeros(right_server_size, device=device) + + a_torch, worker_error_torch, server_error_torch = torch_sim(a) + torch.cuda.empty_cache() + + a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + + threshold = 1e-6 + magnitude_threshold = 1e-6 + diff_mask = (a_after - a_torch) > threshold + diff_server_mask = torch.chunk(diff_mask, size)[rank] + mpi_server = torch.chunk(a_after, size)[rank] + server_error + torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + + # If the number 
in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic + # The test would skip those numbers that are too small in compensated_server_m + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold + if torch.sum(check_mag_mask) != 0: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) + assert torch.sum(diff_server_mask) == 0 or torch.sum(check_mag_mask) == 0 + + _test_compressed_allreduce_basic() From 9fa5166cb1e240bd45156413a2eb65ed27550946 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 2 Mar 2021 22:23:46 -0800 Subject: [PATCH 19/41] skip unit test when torch < 1.8 --- tests/unit/test_onebit.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 1d5bd9becb71..49fab4cf534d 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -9,6 +9,12 @@ from common import distributed_test from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +if TORCH_MAJOR < 1 or TORCH_MINOR < 8: + pytest.skip("NCCL-based 1-bit compression requires torch 1.8 or higher", + allow_module_level=True) + try: from apex import amp _amp_available = True From 65d7ec5920b1af0da3be71aeccf3d1579ce0765b Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 3 Mar 2021 13:24:02 -0800 Subject: [PATCH 20/41] revert 1-bit lamb --- deepspeed/runtime/config.py | 2 - deepspeed/runtime/engine.py | 7 +- deepspeed/runtime/fp16/onebit/onebitlamb.py | 451 -------------------- 3 files changed, 1 insertion(+), 459 deletions(-) delete mode 100644 deepspeed/runtime/fp16/onebit/onebitlamb.py diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 53a47c24b194..2aeb5135350f 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -30,13 +30,11 @@ ADAMW_OPTIMIZER = 
'adamw' LAMB_OPTIMIZER = 'lamb' ONEBIT_ADAM_OPTIMIZER = 'onebitadam' -ONEBIT_LAMB_OPTIMIZER = 'onebitlamb' DEEPSPEED_OPTIMIZERS = [ ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, - ONEBIT_LAMB_OPTIMIZER, ] # extra optimizer parameters for adam/adamw diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index dbe906b8889b..8ba40c3983d7 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -20,7 +20,6 @@ from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \ - ONEBIT_LAMB_OPTIMIZER, \ TORCH_ADAM_PARAM from deepspeed.runtime.dataloader import DeepSpeedDataLoader @@ -508,8 +507,7 @@ def _do_sanity_check(self): assert self._is_supported_optimizer(self.optimizer_name()), \ '{} is not a supported DeepSpeed Optimizer'.format(self.optimizer_name()) - if self.optimizer_name() == LAMB_OPTIMIZER or self.optimizer_name( - ) == ONEBIT_LAMB_OPTIMIZER: + if self.optimizer_name() == LAMB_OPTIMIZER: assert self.dynamic_loss_scale(), \ 'DeepSpeed {} optimizer requires dynamic loss scaling'.format(self.optimizer_name()) @@ -626,9 +624,6 @@ def _configure_basic_optimizer(self, model_parameters): elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) - elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: - from deepspeed.runtime.fp16.onebit.onebitlamb import OnebitLamb - optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) diff --git a/deepspeed/runtime/fp16/onebit/onebitlamb.py b/deepspeed/runtime/fp16/onebit/onebitlamb.py deleted file mode 100644 index 
52c871c07679..000000000000 --- a/deepspeed/runtime/fp16/onebit/onebitlamb.py +++ /dev/null @@ -1,451 +0,0 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' -import types -import torch -import numpy as np -import torch.distributed as dist -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - - -class OnebitLamb(torch.optim.Optimizer): - """Implements the 1-bit Lamb algorithm. Currently GPU-only. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups. - lr (float, optional): learning rate. (default: 1e-3) - freeze_step (int, optional): Number of steps for warmup (uncompressed) - stage before we start using compressed communication. (default 100000) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square. (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability. (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) - min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) NOT SUPPORTED in 1-bit Lamb! - eps_inside_sqrt (boolean, optional): in the 'update parameters' step, - adds eps to the bias-corrected second moment estimate before - evaluating square root instead of adding it to the square root of - second moment estimate as in the original paper. (default: False) - cuda_aware (boolean, required): Set True if the underlying MPI implementation - supports CUDA-Aware communication. (default: False) - comm_backend_name (string, optional): Set to 'mpi' if needed. 
(default: 'nccl') - coeff_beta (float, optional): coefficients used for computing - running averages of lamb coefficient (default: 0.99) not that you may want to - increase or decrease this beta depending on the freeze_step you choose: - 1/(1 - coeff_beta) should be smaller than or equal to freeze_step - factor_max (float, optional): maximum value of scaling factor to the frozen lamb - coefficient during compression stage (default: 4.5) - factor_min (float, optional): maximum value of scaling factor to the frozen lamb - coefficient during compression stage (default: 0.5) - factor_threshold (float, optional): threshold of how much the scaling factor can - fluctuate between steps (default: 0.1) - .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes: - https://arxiv.org/abs/1904.00962 - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - def __init__(self, - params, - deepspeed=None, - lr=1e-3, - freeze_step=100000, - bias_correction=True, - betas=(0.9, - 0.999), - eps=1e-8, - eps_inside_sqrt=False, - weight_decay=0., - max_grad_norm=0., - max_coeff=10.0, - min_coeff=0.01, - amsgrad=False, - cuda_aware=False, - comm_backend_name='nccl', - coeff_beta=0.99, - factor_max=4.5, - factor_min=0.5, - factor_threshold=0.1): - - if amsgrad: - raise RuntimeError('1-bit Lamb does not support the AMSGrad variant.') - - defaults = dict(lr=lr, - bias_correction=bias_correction, - betas=betas, - eps=eps, - weight_decay=weight_decay, - max_grad_norm=max_grad_norm, - max_coeff=max_coeff, - min_coeff=min_coeff) - - super(OnebitLamb, self).__init__(params, defaults) - self.eps_mode = 0 if eps_inside_sqrt else 1 - assert (dist.is_initialized()) - - self.deepspeed = deepspeed - self.lamb_freeze_key = False - self.initialize = False - self.freeze_step = freeze_step - self.cuda_aware = cuda_aware - self.coeff_beta = coeff_beta - 
self.factor_max = factor_max - self.factor_min = factor_min - self.factor_threshold = factor_threshold - - self.comm_backend_name = comm_backend_name - - # Empty initializer. Set handle based on the comm backend as follows. - self.comm_backend_handle = None - - if self.comm_backend_name == 'nccl': - assert torch.__version__.startswith("1.8."), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" - assert dist.is_initialized() == True, "Please initialize the torch distributed backend." - from deepspeed.runtime.comm.nccl import NcclBackend - self.comm_backend_handle = NcclBackend() - - elif self.comm_backend_name == 'mpi': - from deepspeed.runtime.comm.mpi import MpiBackend - self.comm_backend_handle = MpiBackend(cuda_aware) - - self.size = self.comm_backend_handle.size - - self.divider = int(self.size * 8 / np.gcd(self.size, 8)) - - self.exp_avg_flat = [] - self.dummy_exp_avg = {} - self.corrected_tensor_sizes = [] - self.server_chunk_sizes = [] - self.worker_errors = [] - self.server_errors = [] - self.scaling_coeffs = [] - - self.lamb_coeffs = [] - - def step(self, closure=None, grads=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - grads (list of tensors, optional): weight gradient to use for the - optimizer update. If gradients have type torch.half, parameters - are expected to be in type torch.float. 
(default: None) - """ - loss = None - if closure is not None: - loss = closure() - - if grads is None: - grads_group = [None] * len(self.param_groups) - # backward compatibility - # assuming a list/generator of parameter means single group - elif isinstance(grads, types.GeneratorType): - grads_group = [grads] - elif type(grads[0]) != list: - grads_group = [grads] - else: - grads_group = grads - - #remove the previous stats - del self.lamb_coeffs[:] - - if self.lamb_freeze_key: - exp_avg_last_step = [] - for group in self.param_groups: - exp_avg_last_step.append( - [self.state[p]['exp_avg'].detach().clone() for p in group['params']]) - if len(self.scaling_coeffs) == 0: - # compute the scaling_coeff for each momentum which is used to - # reduce compression error during compressed_allreduce - momentum_scales = [] - for group in self.param_groups: - momentum_scales.append([ - (torch.norm(self.state[p]['exp_avg']) / - np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() - for p in group['params'] - ]) - united_scale = sum([sum(x) for x in momentum_scales]) / sum( - [len(x) for x in momentum_scales]) - for i, group in enumerate(self.param_groups): - self.scaling_coeffs.append([ - united_scale / momentum_scales[i][j] - for j in range(len(group['params'])) - ]) - - for i, (group, grads_this_group) in enumerate(zip(self.param_groups, grads_group)): - if grads_this_group is None: - grads_this_group = [None] * len(group['params']) - - bias_correction = 1 if group['bias_correction'] else 0 - - for j, (p, grad) in enumerate(zip(group['params'], grads_this_group)): - if p.grad is None and grad is None: - continue - if grad is None: - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('1-bit Lamb does not support sparse gradients') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['lamb_coeff_freeze'] = 0.0 - state['last_factor'] = 1.0 - # Exponential moving average of gradient values - state['exp_avg'] = 
torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - state['exp_avg_sq_back'] = torch.zeros_like(p.data) - - if not self.initialize: - self.lamb_freeze_key = True - - exp_avg, exp_avg_sq, exp_avg_sq_back = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_back'] - beta1, beta2 = group['betas'] - max_coeff = group['max_coeff'] - min_coeff = group['min_coeff'] - - state['step'] += 1 - - if self.lamb_freeze_key is False: - # warmup stage, baseline Lamb optimization - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - if state['step'] == self.freeze_step: - exp_avg_sq_back.data = exp_avg_sq.detach().clone() - grad = None - if self.initialize: - weight_norm = p.data.pow(2).sum().sqrt() - update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) - if group['weight_decay'] > 0.0: - update += group['weight_decay'] * p.data - update_norm = update.pow(2).sum().sqrt() - lamb_coeff = 1.0 - if weight_norm != 0 and update_norm != 0: - lamb_coeff = (weight_norm / update_norm).item() - if lamb_coeff > max_coeff: - lamb_coeff = max_coeff - if lamb_coeff < min_coeff: - lamb_coeff = min_coeff - if lamb_coeff != 1.0: - state['lamb_coeff_freeze'] = self.coeff_beta * state[ - 'lamb_coeff_freeze'] + (1 - self.coeff_beta) * lamb_coeff - self.lamb_coeffs.append(lamb_coeff) - with torch.no_grad(): - p.add_(-group['lr'] * lamb_coeff * update) - else: - # compression stage, update each momentum locally, then - # communicate based on the compressed_allreduce below - if self.initialize: - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg.mul_(self.scaling_coeffs[i][j]) - grad = None - - # init fused momentum - if len(self.exp_avg_flat) == 0: - momentum_groups = [] - tensor_size = 0 - for group in self.param_groups: - for p in group['params']: - momentum_groups.append(self.state[p]['exp_avg']) - tensor_size += torch.numel(p.data) - corrected_tensor_size = 
tensor_size - if tensor_size % (self.size * self.divider) != 0: - difference = ((self.size * self.divider) - (tensor_size % - (self.size * self.divider))) - corrected_tensor_size += difference - self.dummy_exp_avg[0] = torch.zeros( - difference, - device=momentum_groups[0].data.device) - momentum_groups.append(self.dummy_exp_avg[0]) - self.corrected_tensor_sizes.append(corrected_tensor_size) - self.server_chunk_sizes.append(corrected_tensor_size // self.size) - - self.exp_avg_flat.append( - _flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) - updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], - momentum_groups) - for p, q in zip(momentum_groups, updated_params): - p.data = q.data - - if self.initialize and len(self.worker_errors) == 0: - torch.cuda.empty_cache() - for i in range(len(self.exp_avg_flat)): - self.worker_errors.append( - torch.zeros(self.corrected_tensor_sizes[i], - device=self.exp_avg_flat[i].device)) - self.server_errors.append( - torch.zeros(self.server_chunk_sizes[i], - device=self.exp_avg_flat[i].device)) - torch.cuda.empty_cache() - - if self.lamb_freeze_key: - if self.size > 1: - for i in range(len(self.exp_avg_flat)): - if not self.initialize: - torch.cuda.empty_cache() - self.worker_errors.append( - torch.zeros(self.corrected_tensor_sizes[i], - device=self.exp_avg_flat[i].device)) - self.server_errors.append( - torch.zeros(self.server_chunk_sizes[i], - device=self.exp_avg_flat[i].device)) - torch.cuda.empty_cache() - if torch.distributed.get_rank() == 0: - print("Cupy Buffers Initialized Successfully.") - - self.comm_backend_handle.compressed_allreduce( - self.exp_avg_flat[i], - self.worker_errors[0], - self.server_errors[0], - self.deepspeed.local_rank) - - if torch.distributed.get_rank() == 0: - print('Pop out errors', flush=True) - del self.worker_errors[:] - del self.server_errors[:] - else: - self.comm_backend_handle.compressed_allreduce( - self.exp_avg_flat[i], - self.worker_errors[i], - 
self.server_errors[i], - self.deepspeed.local_rank) - - if self.lamb_freeze_key and self.initialize: - for i, group in enumerate(self.param_groups): - bias_correction = 1 if group['bias_correction'] else 0 - - for j, p in enumerate(group['params']): - state = self.state[p] - exp_avg, exp_avg_sq, exp_avg_sq_back = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_back'] - beta1, beta2 = group['betas'] - exp_avg.div_(self.scaling_coeffs[i][j]) - if 'exp_avg_mask' in group: - if exp_avg.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=exp_avg.device) - exp_avg.mul_(group['exp_avg_mask']) - - grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / - (1 - beta1)) - exp_avg_sq_back.mul_(beta2).addcmul_(1 - beta2, - grad_reconstruct, - grad_reconstruct) - denom = exp_avg_sq.sqrt() + group['eps'] - update_prelim = exp_avg / denom - - if group['weight_decay'] > 0.0: - update = update_prelim + group['weight_decay'] * p.data - else: - update = update_prelim - - lamb_coeff = 1.0 - update_norm = update.pow(2).sum().sqrt() - denom_real = exp_avg_sq_back.sqrt() + group['eps'] - factor = (denom / denom_real).max().item() - if group['weight_decay'] > 0.0: - update_ratio = min(1.0, - (update_prelim.pow(2).sum().sqrt() / - update_norm).item()) - factor = factor * update_ratio + (1.0 - update_ratio) - if factor > self.factor_max: - factor = self.factor_max - if factor < self.factor_min: - factor = self.factor_min - if factor > state['last_factor'] * (1.0 + self.factor_threshold): - factor = state['last_factor'] * (1.0 + self.factor_threshold) - if factor < state['last_factor'] * (1.0 - self.factor_threshold): - factor = state['last_factor'] * (1.0 - self.factor_threshold) - state['last_factor'] = factor - lamb_coeff = state['lamb_coeff_freeze'] * factor - self.lamb_coeffs.append(lamb_coeff) - with torch.no_grad(): - p.add_(-group['lr'] * lamb_coeff * update) - del exp_avg_last_step[:] - exp_avg_last_step = None - - 
if not self.initialize: - self.lamb_freeze_key = False - self.initialize = True - print( - f"Finished the initialization step at rank {torch.distributed.get_rank()}" - ) - return loss - - if self.lamb_freeze_key is False: - if state['step'] >= self.freeze_step: - self.lamb_freeze_key = True - self.deepspeed.enable_backward_allreduce = False - - return loss - - def state_dict(self): - """ - Overrides state_dict() to also save 1-bit Lamb states - """ - original_dict = super().state_dict() - original_dict['worker_errors'] = self.worker_errors - original_dict['server_errors'] = self.server_errors - original_dict['scaling_coeffs'] = self.scaling_coeffs - return original_dict - - def load_state_dict(self, state_dict): - """ - Overrides state_dict() to reset fused momentum and load/reset 1-bit Lamb states - """ - mask = {} - for i, group in enumerate(self.param_groups): - if 'exp_avg_mask' in group: - mask[i] = group['exp_avg_mask'] - super().load_state_dict(state_dict) - # Because at different stage exp_avg_mask may change (e.g., - # when loading seq 128 checkpoint for seq 512 pretraining), - # we don't load the exp_avg_mask from the checkpoint but always - # use the one provided in optimizer_grouped_parameters in deepspeed_train.py. - for k, v in mask.items(): - self.param_groups[k]['exp_avg_mask'] = v - del self.exp_avg_flat[:] - self.dummy_exp_avg.clear() - del self.corrected_tensor_sizes[:] - del self.server_chunk_sizes[:] - if self.state[self.param_groups[0]['params'][0]]['step'] >= self.freeze_step: - if torch.distributed.get_rank() == 0: - print( - "Checkpoint loaded and compression stage continues, load 1-bit Lamb states." 
- ) - self.worker_errors = state_dict.pop('worker_errors') - self.server_errors = state_dict.pop('server_errors') - self.scaling_coeffs = state_dict.pop('scaling_coeffs') - for i_error in range(len(self.worker_errors)): - self.worker_errors[i_error] = self.worker_errors[i_error].to( - device=self.state[self.param_groups[0]['params'] - [0]]['exp_avg'].device) - self.server_errors[i_error] = self.server_errors[i_error].to( - device=self.state[self.param_groups[0]['params'] - [0]]['exp_avg'].device) - else: - if torch.distributed.get_rank() == 0: - print( - "Checkpoint loaded and warmup stage starts/continues, reset 1-bit Lamb states." - ) - if self.lamb_freeze_key is True: - self.lamb_freeze_key = False - self.deepspeed.enable_backward_allreduce = True - del self.worker_errors[:] - del self.server_errors[:] - del self.scaling_coeffs[:] - for group in self.param_groups: - for p in group['params']: - self.state[p]['lamb_coeff_freeze'] = 0.0 - self.state[p]['last_factor'] = 1.0 - - def get_lamb_coeffs(self): - return self.lamb_coeffs From 8376a404204b951080a23f885d0f9062ab178704 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 3 Mar 2021 15:23:32 -0800 Subject: [PATCH 21/41] flatten momentum when dimension is more than 1 --- deepspeed/runtime/comm/mpi.py | 5 +++++ deepspeed/runtime/comm/nccl.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py index 862decac60fa..9e112bccc71d 100644 --- a/deepspeed/runtime/comm/mpi.py +++ b/deepspeed/runtime/comm/mpi.py @@ -174,6 +174,9 @@ def compressed_allreduce(self, local_rank): all_start_time = time.time() + original_shape = buffer_m.size() + if len(original_shape) > 1: + buffer_m = torch.flatten(buffer_m) original_size = buffer_m.numel() worker_error_size = worker_error.numel() cupy.cuda.Device(local_rank).use() @@ -279,6 +282,8 @@ def compressed_allreduce(self, cupy_recvbuf_scale_server)).flatten().data) if original_size != worker_error_size: buffer_m = 
buffer_m[0:original_size] + if len(original_shape) > 1: + buffer_m = buffer_m.reshape(original_shape) # cupy_recvbuf_sign_server, cupy_recvbuf_scale_server = None, None diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 3a07b969f23d..0ac2646bd0d7 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -47,6 +47,9 @@ def compressed_allreduce(self, local_rank): # all_start_time = time.time() + original_shape = buffer_m.size() + if len(original_shape) > 1: + buffer_m = torch.flatten(buffer_m) original_size = buffer_m.numel() worker_error_size = worker_error.numel() cupy.cuda.Device(local_rank).use() @@ -169,5 +172,7 @@ def compressed_allreduce(self, cupy_recvbuf_scale_server)).flatten().data) if original_size != worker_error_size: buffer_m = buffer_m[0:original_size] + if len(original_shape) > 1: + buffer_m = buffer_m.reshape(original_shape) return buffer_m From 6a19f2965ffde1ebf9711f4e5a9020586e00b224 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 3 Mar 2021 15:24:15 -0800 Subject: [PATCH 22/41] add warning message for 1-bit adam under fp32 --- deepspeed/runtime/engine.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 8ba40c3983d7..ab139ae892c5 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -624,6 +624,10 @@ def _configure_basic_optimizer(self, model_parameters): elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) + if not self.fp16_enabled(): + logger.warning( + f'Currently the convergence of 1-bit Adam is only verified under FP16' + ) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) From 819043da65c8aeed098f8560b4a96afb79c9404b Mon Sep 17 00:00:00 2001 From: Conglong Li Date: 
Wed, 3 Mar 2021 16:34:59 -0800 Subject: [PATCH 23/41] improve version check --- deepspeed/runtime/fp16/onebit/onebitadam.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/fp16/onebit/onebitadam.py b/deepspeed/runtime/fp16/onebit/onebitadam.py index 2a37e239d9ce..0202500717df 100644 --- a/deepspeed/runtime/fp16/onebit/onebitadam.py +++ b/deepspeed/runtime/fp16/onebit/onebitadam.py @@ -89,7 +89,9 @@ def __init__(self, self.comm_backend_handle = None if self.comm_backend_name == 'nccl': - assert torch.__version__.startswith("1.8."), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
from deepspeed.runtime.comm.nccl import NcclBackend self.comm_backend_handle = NcclBackend() From a6943be9668cfb25b3787f42940f2f29fbfb046d Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 3 Mar 2021 16:35:58 -0800 Subject: [PATCH 24/41] add fp32 test --- tests/unit/test_onebit.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 49fab4cf534d..1d505b8d682f 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -66,6 +66,45 @@ def _test_onebitadam_fp16_basic(args, model, hidden_dim): _test_onebitadam_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) +def test_onebitadam_fp32_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitadam_fp32_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitadam_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) + + def test_compressed_allreduce_basic(tmpdir): @distributed_test(world_size=[1, 2]) def _test_compressed_allreduce_basic(): From 66a8c93028124083670d2fa9799a02890b83f554 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Thu, 4 Mar 2021 01:57:55 -0800 Subject: [PATCH 25/41] 1-bit adam doc --- docs/_tutorials/onebit-adam.md | 96 
++++++++++++++++++++++------------ docs/index.md | 1 + 2 files changed, 65 insertions(+), 32 deletions(-) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index c8eee07586aa..1af80cf833a8 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -1,7 +1,15 @@ --- -title: "1-bit Adam: Up to 5x less communication volume and up to 2x faster training" +title: "1-bit Adam: Up to 5x less communication volume and up to 3.4x faster training" --- +**Note:** +This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes include: 1) NCCL-based implementation which provides better performance and usability compared to the MPI-based implementation. 2) Add support to momentum masks for those parameters with constant zero gradients during training. 3) Bug fixes. See details below. +{: .notice--info} + +**Watch out!** +1) The NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. +{: .notice--warning} + In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. 
To illustrate the benefits and usage of 1-bit Adam optimizer in DeepSpeed, we use the following two training tasks as examples: @@ -13,7 +21,7 @@ For more details on these tasks, please refer to the tutorial posts on [BingBert ## 1. Overview -### Pre-requisites for installing DeepSpeed +### 1.1 Pre-requisites for installing DeepSpeed If you don't already have a copy of the DeepSpeed repository, please clone in now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples. @@ -25,9 +33,19 @@ git submodule update --init --recursive cd DeepSpeedExamples/ ``` -### Pre-requisites for 1-bit Adam +### 1.2 Pre-requisites for 1-bit Adam + +#### 1.2.1 (New in v2) NCCL-based implementation + +In 1-bit Adam v2, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed. This significantly improves the usability due to NCCL’s integration with PyTorch distributed. The performance of our new NCCL-based implementation is also better than our earlier MPI-based implementation for Ethernet-based systems and on-par for InfiniBand-based systems. Thus we highly recommend users to choose this implementation. + +**Watch out!** +This NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. Currently (2021/03/04) you need to install PyTorch 1.8 as a nightly version. Currently (2021/03/04) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. 
+{: .notice--warning} -1-bit Adam uses advanced communication schemes that are not yet supported by PyTorch distributed and NCCL. We rely on Message Passing Interface (MPI) for these advanced communication primitives. +#### 1.2.2 MPI-based implementation + +For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives. We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run: @@ -43,31 +61,32 @@ An example launch command for 1-bit Adam using the `deepspeed` launcher is as fo deepspeed --launcher=[mvapich|openmpi] script.py ``` -Please note that because 1-bit Adam uses MPI backend to communicate during the compression stage, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher. +Please note that for MPI-based implementation of 1-bit Adam, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher. Alternatively, the standard mpirun launcher can also be used as follows: ```shell -mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] bash [training_script.sh] +mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py] ``` -### 1-bit Algorithm +### 1.3 1-bit Algorithm -The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html). +The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). -### Configuration of 1-bit Adam +### 1.4 Configuration of 1-bit Adam The 1-bit Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below. 
```json { "train_batch_size": 4096, - "train_micro_batch_size_per_gpu": 64, + "train_micro_batch_size_per_gpu": 16, "optimizer": { "type": "OneBitAdam", "params": { - "lr": 2e-4, - "freeze_step": 400, - "cuda_aware": true + "lr": 4e-4, + "freeze_step": 23000, + "cuda_aware": false, + "comm_backend_name": "nccl" } }, "fp16": { @@ -75,12 +94,16 @@ The 1-bit Adam feature can be used by setting the optimizer configuration option } } ``` -Please note two new parameters `freeze_step` and `cuda_aware` that have been added to support the 1-bit Adam feature. +Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_name` that have been added to support the 1-bit Adam feature. + +`freeze_step` is the number of warm up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm up steps, one strategy is to set 15-25% of the total training steps for a given model (This is related to Adam's variance/second moment term. See detailed analysis in our [paper](https://arxiv.org/abs/2102.02888)). If it provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In future, we plan to introduce a threshold that can automatically search and decide for the number of warm up steps for different models. The examples below have been tuned for the number of warm up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts. -`cuda_aware` is used to indicate that the underlying MPI library support CUDA-Aware communication. -This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. 
However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. +`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. -`freeze_step` is the number of warm up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm up steps, one strategy is to set 15-25% of the total training steps for a given model. If it provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In future, we plan to introduce a threshold that can automatically search and decide for the number of warm up steps for different models. The examples below have been tuned for the number of warm up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts. +(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`. + +#### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients +Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. 
For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. ## 2. BingBertSQuAD Fine-tuning with 1-bit Adam @@ -93,9 +116,13 @@ This feature is only supported on systems with InfiniBand interconnect and a CUD You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [HuggingFace](https://github.com/huggingface/transformers), or [TensorFlow](https://github.com/google-research/bert#pre-trained-models) to run the fine-tuning. +**Note:** For details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial. + ### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam -The main part of training is done in `nvidia_run_squad_deepspeed.py`, which has +We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. + + ### 2.2 Configuration for BingBertSQuAD with DeepSpeed and 1-bit Adam enabled @@ -148,18 +175,17 @@ Table 1 shows the fine-tuning configuration we used in our experiments. 
| ------------------------------ | ---------------------| | Total batch size | 96 | | Train micro batch size per GPU | 3 | -| Optimizer | **OnebitAdam** | +| Optimizer | **"OnebitAdam"** | | Learning rate | 3e-5 | | Sequence-length | 384 | | Weight-decay | 0.0 | | Epoch count | 2 | | **freeze_step** | 400 | -| **cuda_aware** | True | +| **cuda_aware** | false | +| **comm_backend_name** | "nccl" | Table 1. Fine-tuning configuration -**Note:** For more details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial. - ### 2.3 Performance Results for BingBertSQuAD Fine-tuning ***Accuracy:*** @@ -174,19 +200,24 @@ We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scor ***Training Speed and Scalability:*** -1-bit Adam enables up to 2.7x overall speedup in training speed for SQuAD fine-tuning. This is made possible by up to 6.2x faster throughput during the compressed stage of the algorithm as shown in Figure 1. + + +Performance results of SQuAD Fine-tuning can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). + ## 3. BERT Pre-training with 1-bit Adam -For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) post. +For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) tutorial. ### 3.1 Running Pre-training with DeepSpeed and 1-bit Adam -The main part of training is done in `deepspeed_train.py`, which has +We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_adam). 
There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. + + ### 3.2 Configuration for BERT Pre-training with DeepSpeed and 1-bit Adam enabled -The `deepspeed_bsz4k_onebit_config_seq128.json` file gives the user the ability to specify DeepSpeed +The `deepspeed_bsz4k_onebit_config_seq128_*.json` file gives the user the ability to specify DeepSpeed options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters. Below is the DeepSpeed configuration file for running BERT-large pre-training with sequence length of 128 using the 1-bit Adam optimizer. @@ -240,7 +271,8 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi "weight_decay": 0.01, "bias_correction": false, "freeze_step": 23000, - "cuda_aware": true + "cuda_aware": false, + "comm_backend_name": "nccl" } }, "gradient_clipping": 1.0, @@ -251,8 +283,8 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi } } ``` -The above file is for BERT-large but for BERT-base training (sequence length 128), the suggested `freeze_step` will need to be changed to 16000. For the rest of the pre-training using sequence 512, we suggest to use a `freeze_step` of 1500. And make sure to set the `cuda_aware` correctly as described above. +The above file is for BERT-large. For BERT-base training (sequence length 128), the suggested `freeze_step` is 16000. For sequence 512 pre-training, we suggest to use a `freeze_step` of 1500 for both BERT-base and BERT-large. And make sure to set the `comm_backend_name` and `cuda_aware` correctly as described above. 
### 3.3 Performance Results for BERT Pre-training -Performance results of BERT Pre-training can be seen from our detailed [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html). +Performance results of BERT Pre-training can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). diff --git a/docs/index.md b/docs/index.md index 7bbe79c836a5..03e5a09d8df3 100755 --- a/docs/index.md +++ b/docs/index.md @@ -28,6 +28,7 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? +* [2021/03/04] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }}) From fb329a9c2a2e696809f3d16b238c1542eed8bc9b Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Thu, 4 Mar 2021 15:25:36 -0800 Subject: [PATCH 26/41] fix file name --- deepspeed/runtime/engine.py | 2 +- deepspeed/runtime/fp16/onebit/{onebitadam.py => adam.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename deepspeed/runtime/fp16/onebit/{onebitadam.py => adam.py} (100%) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index ab139ae892c5..0093e7ce5839 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -622,7 +622,7 @@ def _configure_basic_optimizer(self, model_parameters): from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == 
ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam + from deepspeed.runtime.fp16.onebit.adam import OnebitAdam optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): logger.warning( diff --git a/deepspeed/runtime/fp16/onebit/onebitadam.py b/deepspeed/runtime/fp16/onebit/adam.py similarity index 100% rename from deepspeed/runtime/fp16/onebit/onebitadam.py rename to deepspeed/runtime/fp16/onebit/adam.py From 0b3c1d76cdd4945fd3f0bdb02b22044ecb0d0559 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Thu, 4 Mar 2021 15:37:47 -0800 Subject: [PATCH 27/41] doc fix --- docs/_tutorials/onebit-adam.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 1af80cf833a8..3190f4cae327 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -181,7 +181,6 @@ Table 1 shows the fine-tuning configuration we used in our experiments. | Weight-decay | 0.0 | | Epoch count | 2 | | **freeze_step** | 400 | -| **cuda_aware** | false | | **comm_backend_name** | "nccl" | Table 1. 
Fine-tuning configuration @@ -271,7 +270,6 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi "weight_decay": 0.01, "bias_correction": false, "freeze_step": 23000, - "cuda_aware": false, "comm_backend_name": "nccl" } }, From 0bffa9b1219afdc32ff7508d655dff9570582daa Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Thu, 4 Mar 2021 20:01:36 -0800 Subject: [PATCH 28/41] torch 1.8 is released --- docs/_tutorials/onebit-adam.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 3190f4cae327..09fc42af20d8 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -40,7 +40,7 @@ cd DeepSpeedExamples/ In 1-bit Adam v2, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed. This significantly improves the usability due to NCCL’s integration with PyTorch distributed. The performance of our new NCCL-based implementation is also better than our earlier MPI-based implementation for Ethernet-based systems and on-par for InfiniBand-based systems. Thus we highly recommend users to choose this implementation. **Watch out!** -This NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. Currently (2021/03/04) you need to install PyTorch 1.8 as a nightly version. Currently (2021/03/04) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. 
+This NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. Currently (2021/03/04) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. {: .notice--warning} #### 1.2.2 MPI-based implementation From 294c2d6fdf12e7d7412a659aad6216e4d72452ba Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Fri, 5 Mar 2021 10:58:24 -0800 Subject: [PATCH 29/41] doc fix --- docs/_tutorials/onebit-adam.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 09fc42af20d8..bba0e863a08a 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -7,7 +7,7 @@ This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes inc {: .notice--info} **Watch out!** -1) The NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. +1) The NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. {: .notice--warning} In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed.
1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. From 003981a4eb2670d9edd6bcc92b8b14f847aad47f Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Sun, 7 Mar 2021 17:18:06 -0800 Subject: [PATCH 30/41] fix tests --- tests/onebit/test_com_reduce_host.py | 2 +- tests/onebit/test_server_error.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/onebit/test_com_reduce_host.py b/tests/onebit/test_com_reduce_host.py index 3a575828638e..eaa7b368895a 100644 --- a/tests/onebit/test_com_reduce_host.py +++ b/tests/onebit/test_com_reduce_host.py @@ -4,7 +4,7 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam +from deepspeed.runtime.fp16.onebit.adam import OnebitAdam comm = MPI.COMM_WORLD size = comm.Get_size() diff --git a/tests/onebit/test_server_error.py b/tests/onebit/test_server_error.py index e4b680a6cffb..20ad1f128d25 100644 --- a/tests/onebit/test_server_error.py +++ b/tests/onebit/test_server_error.py @@ -4,7 +4,7 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam +from deepspeed.runtime.fp16.onebit.adam import OnebitAdam comm = MPI.COMM_WORLD size = comm.Get_size() From 877f8d71fc5bc92bccc984046088484c923962ba Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 8 Mar 2021 14:56:19 -0800 Subject: [PATCH 31/41] update news --- README.md | 1 + 1 
file changed, 1 insertion(+) diff --git a/README.md b/README.md index 768cfc50c4dd..39ea2b420f04 100755 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) +* [2021/03/04] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) From 2ed029e3db23e7a03a8bbeedbc6f4e8b749d2cce Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 9 Mar 2021 11:37:44 -0800 Subject: [PATCH 32/41] add doc for momentum mask --- deepspeed/runtime/fp16/onebit/adam.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 0202500717df..32ed56c72be6 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -217,6 +217,12 @@ def step(self, closure=None, grads=None): state['worker_error'], state['server_error'], self.deepspeed.local_rank)) + # Because 1-bit compression cannot represent exact zero, it is required to + # provide a momentum mask for those params that have constant exact zeros in their + # momentums, otherwise the compression error would keep accumulating. 
+ # For example, for bert pre-training seq 128, bert.embeddings.position_embeddings.weight + # always have exact zeros in its momentum for row 129 to 512, because it only + # learns up to seq length 128 while the model supports up to 512 seq length. if 'exp_avg_mask' in group: if exp_avg.device != group['exp_avg_mask'].device: group['exp_avg_mask'] = group['exp_avg_mask'].to( From 3b53c90a6902853f84d28d7f8867bfd30b02a580 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Thu, 11 Mar 2021 23:51:33 -0800 Subject: [PATCH 33/41] fix checkpoing handling, add unit test --- deepspeed/runtime/fp16/onebit/adam.py | 52 +++++-- tests/unit/test_onebit.py | 197 ++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 15 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 32ed56c72be6..5f6551c00063 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -220,9 +220,10 @@ def step(self, closure=None, grads=None): # Because 1-bit compression cannot represent exact zero, it is required to # provide a momentum mask for those params that have constant exact zeros in their # momentums, otherwise the compression error would keep accumulating. - # For example, for bert pre-training seq 128, bert.embeddings.position_embeddings.weight + # For example, for BERT pre-training seq 128, bert.embeddings.position_embeddings.weight # always have exact zeros in its momentum for row 129 to 512, because it only # learns up to seq length 128 while the model supports up to 512 seq length. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) 
if 'exp_avg_mask' in group: if exp_avg.device != group['exp_avg_mask'].device: group['exp_avg_mask'] = group['exp_avg_mask'].to( @@ -260,28 +261,49 @@ def step(self, closure=None, grads=None): def load_state_dict(self, state_dict): """ - Overrides state_dict() to reset 1-bit Adam states when needed + Overrides load_state_dict() to add special handling when loading checkpoints """ - mask = {} + # Because at different stage exp_avg_mask may change (e.g., + # BERT pre-training seqlen 128 and 512 ), we don't save the exp_avg_mask + # in checkpoints but always use the one user provided in training script. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) + # Thus here we keep the exp_avg_mask unchanged when loading checkpoint for i, group in enumerate(self.param_groups): if 'exp_avg_mask' in group: - mask[i] = group['exp_avg_mask'] + state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ + 'param_groups'][i]: + state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) - # Because at different stage exp_avg_mask may change (e.g., - # when loading seq 128 checkpoint for seq 512 pretraining), - # we don't load the exp_avg_mask from the checkpoint but always - # use the one provided in optimizer_grouped_parameters in deepspeed_train.py. - for k, v in mask.items(): - self.param_groups[k]['exp_avg_mask'] = v if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: if torch.distributed.get_rank() == 0: - print( - "Checkpoint loaded and warmup stage starts/continues, reset 1-bit Adam states." 
- ) + print("Checkpoint loaded and 1-bit Adam warmup stage starts/continues.") if self.adam_freeze_key is True: self.adam_freeze_key = False self.deepspeed.enable_backward_allreduce = True - for group in self.param_groups: - for p in group['params']: + else: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and 1-bit Adam compression stage starts/continues." + ) + if self.adam_freeze_key is False: + self.adam_freeze_key = True + self.deepspeed.enable_backward_allreduce = False + # We reset the compression errors when loading checkpoints for 3 reasons: + # 1) The worker and server error at each GPU are distinct, so in current implementation + # only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. + # If we want to save them correctly we need O(num_gpu*model_size) memory in order to + # gather all the error, which is a very large memory requirement. It's possible to save + # them in a distributed way, but it will make the checkpoint saving/loading much more complicated. + # 2) Even if we are able to save the compression errors correctly, you need to have the + # exact same number of GPUs in order to load them correctly. + # 3) We verified on BERT pre-training that occasionally resetting the compression error + # at checkpoint loading does not affect the convergence. + # However, please avoid frequent checkpoint loading which could break the error + # compensation mechanism thus affect the convergence. 
+ for group in self.param_groups: + for p in group['params']: + if 'worker_error' in self.state[p]: self.state[p].pop('worker_error') + if 'server_error' in self.state[p]: self.state[p].pop('server_error') diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 1d505b8d682f..ca9411a2b372 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -105,6 +105,203 @@ def _test_onebitadam_fp32_basic(args, model, hidden_dim): _test_onebitadam_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) +def test_onebitadam_exp_avg_mask(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask1 = torch.flatten(mask1) + optimizer_grouped_parameters = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitadam_exp_avg_mask(args, model, hidden_dim): + model, optimizer, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + # Test whether the momentum mask works + for v in optimizer.state.values(): + if v['exp_avg'].size() == mask1.size(): + 
assert torch.allclose(v['exp_avg'], v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)), atol=1e-07), f"Momentum mask is not working properly" + + _test_onebitadam_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitadam_checkpointing(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + mask2 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask2[1][col] += 1 + mask1 = torch.flatten(mask1) + mask2 = torch.flatten(mask2) + + optimizer_grouped_parameters_1 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_2 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask2 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_3 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): + model_1, optimizer_1, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_1) + data_loader = random_dataloader(model=model_1, + total_samples=10, + hidden_dim=hidden_dim, + device=model_1.device) + for n, batch in 
enumerate(data_loader): + loss = model_1(batch[0], batch[1]) + model_1.backward(loss) + model_1.step() + # Test whether momentum mask still exist after saving checkpoint + assert optimizer_1.optimizer.adam_freeze_key is True + mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + # optimizer_1.optimizer.gather_compression_errors() + model_1.save_checkpoint(save_folder, tag=None) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" + + + model_2, optimizer_2, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_2) + # Test whether momentum mask stays the same after loading checkpoint + mask2 = mask2.to(device=optimizer_2.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Incorrect momentum mask" + model_2.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + for v in optimizer_2.state.values(): + assert 'worker_error' not in v, f"Incorrect worker error" + assert 'server_error' not in v, f"Incorrect server error" + assert optimizer_2.optimizer.adam_freeze_key is True + + model_3, optimizer_3, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_3) + optimizer_3.optimizer.freeze_step = 20 + data_loader = random_dataloader(model=model_3, + total_samples=50, + hidden_dim=hidden_dim, + device=model_3.device) + for n, batch in enumerate(data_loader): + loss = 
model_3(batch[0], batch[1]) + model_3.backward(loss) + model_3.step() + assert optimizer_3.optimizer.adam_freeze_key is True + # Test whether momentum mask stays the same after loading checkpoint + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Incorrect momentum mask" + model_3.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + for v in optimizer_3.state.values(): + assert 'worker_error' not in v, f"Incorrect worker error" + assert 'server_error' not in v, f"Incorrect server error" + assert optimizer_3.optimizer.adam_freeze_key is False + + _test_onebitadam_checkpointing(mask1, + mask2, + args=args, + model=model, + hidden_dim=hidden_dim) + + def test_compressed_allreduce_basic(tmpdir): @distributed_test(world_size=[1, 2]) def _test_compressed_allreduce_basic(): From 424072945540c0da37ee68081ec61e2534a3c16c Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Fri, 12 Mar 2021 11:46:47 -0800 Subject: [PATCH 34/41] checkpoint handling doc --- docs/_tutorials/onebit-adam.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index bba0e863a08a..5bac51688964 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -7,7 +7,7 @@ This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes inc {: .notice--info} **Watch out!** -1) The NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. +1) The NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. 
See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. {: .notice--warning} In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. @@ -103,7 +103,11 @@ Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_ (New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`. #### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients -Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. 
Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. +Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note that we don't load this momentum mask from checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. + +**Watch out!** +1-bit Adam replies on an error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 
2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. +{: .notice--warning} ## 2. BingBertSQuAD Fine-tuning with 1-bit Adam From 968a53fd00954020dfde684e457ba70b50e6d7d4 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Fri, 12 Mar 2021 14:49:32 -0800 Subject: [PATCH 35/41] doc final cleanup --- deepspeed/runtime/fp16/onebit/adam.py | 2 +- docs/_tutorials/onebit-adam.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 5f6551c00063..f260fda50c82 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -264,7 +264,7 @@ def load_state_dict(self, state_dict): Overrides load_state_dict() to add special handling when loading checkpoints """ # Because at different stage exp_avg_mask may change (e.g., - # BERT pre-training seqlen 128 and 512 ), we don't save the exp_avg_mask + # BERT pre-training seqlen 128 and 512 ), we don't use the exp_avg_mask # in checkpoints but always use the one user provided in training script. # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) # Thus here we keep the exp_avg_mask unchanged when loading checkpoint diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 5bac51688964..6673aaf2d4a1 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -103,10 +103,10 @@ Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_ (New in v2) `comm_backend_name` is used to indicate which backend implementation to use. 
You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`. #### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients -Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note that we don't load this momentum mask from checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. +Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. 
One thing to note is that we don't use the momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. **Watch out!** -1-bit Adam replies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. +1-bit Adam relies on a compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated.
2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. {: .notice--warning} ## 2. BingBertSQuAD Fine-tuning with 1-bit Adam From 1221aec8560db87d8558a9cafdef08364e588016 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 16 Mar 2021 10:40:41 -0700 Subject: [PATCH 36/41] bump dates --- README.md | 2 +- docs/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 39ea2b420f04..da8bccc383d4 100755 --- a/README.md +++ b/README.md @@ -31,8 +31,8 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News +* [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) -* [2021/03/04] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) diff --git a/docs/index.md b/docs/index.md index 7b1b62746a85..a30848246e07 100755 --- a/docs/index.md +++ b/docs/index.md @@ -28,8 +28,8 @@ initiative to enable 
next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? +* [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) -* [2021/03/04] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }}) From 8cfd2b7805a45e1da5891e17e6af7230f4bfe02b Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 16 Mar 2021 13:05:45 -0700 Subject: [PATCH 37/41] update tests --- tests/onebit/test_com_reduce_host.py | 86 --------------------------- tests/onebit/test_mpi_backend.py | 8 +-- tests/onebit/test_mpi_perf.py | 10 +--- tests/onebit/test_nccl_backend.py | 4 +- tests/onebit/test_nccl_perf.py | 4 +- tests/onebit/test_server_error.py | 87 ---------------------------- 6 files changed, 11 insertions(+), 188 deletions(-) delete mode 100644 tests/onebit/test_com_reduce_host.py delete mode 100644 tests/onebit/test_server_error.py diff --git a/tests/onebit/test_com_reduce_host.py b/tests/onebit/test_com_reduce_host.py deleted file mode 100644 index eaa7b368895a..000000000000 --- a/tests/onebit/test_com_reduce_host.py +++ /dev/null @@ -1,86 +0,0 @@ -from mpi4py import MPI -import time -import torch -import torch.distributed as dist -import numpy as np -import deepspeed -from deepspeed.runtime.fp16.onebit.adam import OnebitAdam - 
-comm = MPI.COMM_WORLD -size = comm.Get_size() -rank = comm.Get_rank() - -#TODO: Detect the hostname we are running on automatically -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-1:2245', - world_size=size, - rank=rank) - -dummy_model = [torch.nn.Parameter(torch.ones(10))] - -# Set cuda_aware to False to use host buffers for communication -dummy_optim = OnebitAdam(dummy_model, cuda_aware=False) - -device = torch.device('cuda', rank % torch.cuda.device_count()) - - -def torch_sim(a): - a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) - scale = a.norm() / np.sqrt(a.numel()) - a_compressed = scale * a_sign - a_sign = None - worker_error = a - a_compressed - dist.all_reduce(a_compressed) - a_compressed.mul_(1 / dist.get_world_size()) - a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) - a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) - server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] - a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) - rank = dist.get_rank() - server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() - torch.distributed.barrier() - return a_server_compressed, worker_error, server_error - - -tensor_size = 100 * 2**20 -server_size = int(tensor_size / size) -if tensor_size % (8 * size) != 0: - right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) -else: - right_tensor_size = tensor_size -right_server_size = right_tensor_size // size -# Adding bias to the initialization of the gradient we are communicating -# In order to get rid of the case where some elements in the gradient are too small -a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank -worker_error = torch.zeros(right_tensor_size, device=device) -server_error = 
torch.zeros(right_server_size, device=device) -a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() -a_after = dummy_optim.Compressed_Allreduce(a, - worker_error, - server_error, - rank, - size, - comm, - local_rank) -threshold = 1e-6 -magnitude_threshold = 1e-6 -diff_mask = (a_after - a_torch) > threshold -diff_server_mask = torch.chunk(diff_mask, size)[rank] -mpi_server = torch.chunk(a_after, size)[rank] + server_error -torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch - -# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic -# The test would skip those numbers that are too small in compensated_server_m -if torch.sum(diff_server_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) -else: - check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold - if torch.sum(check_mag_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) - else: - print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebit/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py index 6ef7df42a81d..785021cf0935 100644 --- a/tests/onebit/test_mpi_backend.py +++ b/tests/onebit/test_mpi_backend.py @@ -11,14 +11,10 @@ size = comm.Get_size() rank = comm.Get_rank() -#TODO: Detect the hostname we are running on automatically -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-0:2245', - world_size=size, - rank=rank) +deepspeed.init_distributed(dist_backend='nccl') # Change cuda_aware to True to test out CUDA-Aware MPI communication -backend = MpiBackend(cuda_aware=True) +backend = MpiBackend(cuda_aware=False) device = torch.device('cuda', rank % torch.cuda.device_count()) diff --git a/tests/onebit/test_mpi_perf.py b/tests/onebit/test_mpi_perf.py index 4b572c814317..6017ec873c21 100644 --- 
a/tests/onebit/test_mpi_perf.py +++ b/tests/onebit/test_mpi_perf.py @@ -18,13 +18,9 @@ size = comm.Get_size() rank = comm.Get_rank() -#TODO: Detect the hostname we are running on automatically -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-0:2245', - world_size=size, - rank=rank) - -backend = MpiBackend(cuda_aware=True) +deepspeed.init_distributed(dist_backend='nccl') +# Change cuda_aware to True to test out CUDA-Aware MPI communication +backend = MpiBackend(cuda_aware=False) device = torch.device('cuda', rank % torch.cuda.device_count()) diff --git a/tests/onebit/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py index 8935977ad5a2..16de37174c10 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -4,6 +4,7 @@ import numpy as np import argparse import deepspeed +import os from deepspeed.runtime.comm.nccl import NcclBackend @@ -11,7 +12,8 @@ parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -dist.init_process_group(backend='nccl') +deepspeed.init_distributed(dist_backend='nccl') +args.local_rank = int(os.environ['LOCAL_RANK']) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) diff --git a/tests/onebit/test_nccl_perf.py b/tests/onebit/test_nccl_perf.py index c45ff205621f..1374cda4ddce 100644 --- a/tests/onebit/test_nccl_perf.py +++ b/tests/onebit/test_nccl_perf.py @@ -4,6 +4,7 @@ import numpy as np import argparse import deepspeed +import os from deepspeed.runtime.comm.nccl import NcclBackend from deepspeed.utils.timer import SynchronizedWallClockTimer @@ -15,7 +16,8 @@ parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -dist.init_process_group(backend='nccl') +deepspeed.init_distributed(dist_backend='nccl') +args.local_rank = int(os.environ['LOCAL_RANK']) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) diff --git a/tests/onebit/test_server_error.py 
b/tests/onebit/test_server_error.py deleted file mode 100644 index 20ad1f128d25..000000000000 --- a/tests/onebit/test_server_error.py +++ /dev/null @@ -1,87 +0,0 @@ -from mpi4py import MPI -import time -import torch -import torch.distributed as dist -import numpy as np -import deepspeed -from deepspeed.runtime.fp16.onebit.adam import OnebitAdam - -comm = MPI.COMM_WORLD -size = comm.Get_size() -rank = comm.Get_rank() - -torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-0:2245', - world_size=size, - rank=rank) - -dummy_model = [torch.nn.Parameter(torch.ones(10))] -dummy_optim = OnebitAdam(dummy_model, cuda_aware=False) - -device = torch.device('cuda', rank % torch.cuda.device_count()) - - -def torch_sim(a): - a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) - scale = a.norm() / np.sqrt(a.numel()) - a_compressed = scale * a_sign - a_sign = None - worker_error = a - a_compressed - dist.all_reduce(a_compressed) - a_compressed.mul_(1 / dist.get_world_size()) - a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) - a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) - server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] - a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) - rank = dist.get_rank() - server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() - torch.distributed.barrier() - return a_server_compressed, worker_error, server_error - - -# Input Tensor size -tensor_size = 100 * 2**20 - -server_size = int(tensor_size / size) -if tensor_size % (8 * size) != 0: - right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) -else: - right_tensor_size = tensor_size - -right_server_size = right_tensor_size // size - -# The -0.5 is required for avoiding sign flips/errors -a = 
torch.rand(tensor_size, device=device) - 0.5 - -worker_error = torch.zeros(right_tensor_size, device=device) -server_error = torch.zeros(right_server_size, device=device) -a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() - -# Test the 1-bit Adam optimizer -a_after = dummy_optim.Compressed_Allreduce(a, - worker_error, - server_error, - rank, - size, - comm, - local_rank) - -# If the error is below the threshold, it is acceptable for training -threshold = 1e-6 - -diff_pos = ((a_after - a_torch) > threshold) - -if rank == 0: - before_diff = torch.chunk(a_after - a_torch, - size)[rank] + server_error - server_error_torch - if torch.norm(before_diff) / torch.norm(torch.chunk(a_after, - size)[rank]) < threshold: - print('Successfully passed the test') - else: - print('The difference for the tensor before allgather is {}'.format( - torch.norm(before_diff))) From 38ff08a93fe24083b264767f085b64da9ca0eaf7 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 16 Mar 2021 13:12:38 -0700 Subject: [PATCH 38/41] url change --- deepspeed/runtime/fp16/onebit/adam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index f260fda50c82..e3417fea9d6f 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -13,8 +13,8 @@ class OnebitAdam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. 
- For usage example please see, https://www.deepspeed.ai/tutorials/onebit-adam/ - It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) + For usage example please see https://www.deepspeed.ai/tutorials/onebit-adam/ + For technical details please read https://arxiv.org/abs/2102.02888 Arguments: params (iterable): iterable of parameters to optimize or dicts defining From de036564289167f2f6f608f9e06e5fdd4a82f2f2 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 16 Mar 2021 13:16:44 -0700 Subject: [PATCH 39/41] doc fix --- docs/_tutorials/onebit-adam.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 6673aaf2d4a1..1a15000135c9 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -7,7 +7,7 @@ This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes inc {: .notice--info} **Watch out!** -1) The NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. +1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. {: .notice--warning} In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 
1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. @@ -40,7 +40,7 @@ cd DeepSpeedExamples/ In 1-bit Adam v2, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed. This significantly improves the usability due to NCCL’s integration with PyTorch distributed. The performance of our new NCCL-based implementation is also better than our earlier MPI-based implementation for Ethernet-based systems and on-par for InfiniBand-based systems. Thus we highly recommend users to choose this implementation. **Watch out!** -This NCCL-based implementation requires PyTorch >= 1.8 and NCCL >= 2.8.3. Currently (2021/03/04) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. +This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. 
The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. {: .notice--warning} #### 1.2.2 MPI-based implementation From 5957bce956445f6cf45447910e4fa2c261b9144b Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 16 Mar 2021 13:57:08 -0700 Subject: [PATCH 40/41] fix test --- tests/unit/test_onebit.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index ca9411a2b372..8e0056be0cff 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -6,6 +6,7 @@ import json import os import numpy as np +import time from common import distributed_test from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args @@ -15,13 +16,6 @@ pytest.skip("NCCL-based 1-bit compression requires torch 1.8 or higher", allow_module_level=True) -try: - from apex import amp - _amp_available = True -except ImportError: - _amp_available = False -amp_available = pytest.mark.skip(_amp_available, reason="apex/amp is not installed") - def test_onebitadam_fp16_basic(tmpdir): config_dict = { @@ -249,6 +243,7 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): save_folder = os.path.join(tmpdir, 'saved_checkpoint') # optimizer_1.optimizer.gather_compression_errors() model_1.save_checkpoint(save_folder, tag=None) + time.sleep(5) assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" From ef51ac699637f689477c2180758b9c5492bb7a14 
Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 16 Mar 2021 15:32:16 -0700 Subject: [PATCH 41/41] doc update --- docs/_pages/config-json.md | 13 +++++++++++-- docs/code-docs/source/optimizers.rst | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 40f31310d57e..9a9554cbd75f 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -60,7 +60,7 @@ The Adam optimizer also supports the following two params keys/values in additio | torch\_adam | Use torch's implementation of adam instead of our fused adam implementation | false | | adam\_w\_mode | Apply L2 regularization (also known as AdamW) | true | - Another example of ***optimizer*** with 1-bit Adam specific parameters is as follows. + Another example of ***optimizer*** with 1-bit Adam ```json "optimizer": { @@ -74,11 +74,20 @@ The Adam optimizer also supports the following two params keys/values in additio "eps": 1e-8, "weight_decay": 3e-7, "freeze_step": 400, - "cuda_aware": true + "cuda_aware": false, + "comm_backend_name": "nccl" } } ``` +The 1-bit Adam optimizer supports the following three params keys/values in addition to the standard Adam (learn more in our [tutorial](/tutorials/onebit-adam/)): + +| "params" key | Description | Default | +| ------------- | --------------------------------------------------------------------------- | ------- | +| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | +| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | +| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | + ### Scheduler Parameters ***scheduler***: [dictionary] diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index 89fc47ac547b..d7b338561b96 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst 
@@ -17,4 +17,4 @@ FusedLamb (GPU) OneBitAdam (GPU) ---------------------------- -.. autoclass:: deepspeed.runtime.fp16.OneBitAdam +.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam