From f4f28b94c0b59d82ddf0c00a567f5c024606ce3d Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 20:24:19 +0000
Subject: [PATCH 01/17] Adding tf32 and fp32 support for ZeRO Stage 3

---
 deepspeed/runtime/config.py                   |  3 +-
 deepspeed/runtime/engine.py                   |  2 +
 .../runtime/zero/partition_parameters.py      | 52 +++++++++++++++----
 deepspeed/runtime/zero/stage3.py              | 24 +++++----
 4 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py
index 9e33876994f9..f5f98a0fa534 100755
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -759,7 +759,8 @@ def _do_error_check(self):
             GRADIENT_ACCUMULATION_STEPS)
 
         if self.zero_enabled:
-            assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
+            if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION:
+                assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
             assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION)
             #if self.zero_config.cpu_offload is True:
             #    assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS)
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index f71a7324585a..dd08fbe920dd 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -573,6 +573,8 @@ def _configure_distributed_model(self, model):
         self.module = model
         if self.fp16_enabled():
             self.module.half()
+        else:
+            assert all([param.dtype == torch.float for param in self.module.parameters()]), f"The fp16 is not enabled but dtype on parameters not fp16"
 
         if not self.dont_change_device:
             self.module.to(self.device)
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index c8bde6390b3c..22e7c089e5ba 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -153,7 +153,7 @@ class ZeroParamStatus(Enum):
 _orig_torch_empty = torch.empty
 
 
-def empty_cuda_tensor(*size, **kwargs):
+def empty_cuda_tensor_half(*size, **kwargs):
     if not 'device' in kwargs.keys():
         kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
     tensor = _orig_torch_empty(*size, **kwargs)
@@ -163,7 +163,7 @@ def empty_cuda_tensor(*size, **kwargs):
         return tensor
 
 
-def new_cuda_tensor(cls, *args):
+def new_cuda_tensor_half(cls, *args):
     device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
     tensor = torch.ones((1, 1), device=device).new_empty(*args).half()
     if tensor.is_floating_point():
@@ -172,6 +172,19 @@ def new_cuda_tensor(cls, *args):
         return tensor
 
 
+def empty_cuda_tensor(*size, **kwargs):
+    if not 'device' in kwargs.keys():
+        kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
+    tensor = _orig_torch_empty(*size, **kwargs)
+    return tensor
+
+
+def new_cuda_tensor(cls, *args):
+    device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
+    tensor = torch.ones((1, 1), device=device).new_empty(*args)
+    return tensor
+
+
 reuse_buffers = False
 temp_contiguous_tensor = None
 empty_buffers = {}
@@ -180,9 +193,15 @@ def new_cuda_tensor(cls, *args):
 # Inserts _post_init_method at the end of init method
 # for all sub classes of torch.nn.Module
 class InsertPostInitMethodToModuleSubClasses(object):
-    def __init__(self, enabled=True, mem_efficient_linear=True):
+    def __init__(self,
+                 enabled=True,
+                 mem_efficient_linear=True,
+                 deepspeed_config=None,
+                 dtype=None):
         self.mem_efficient_linear = mem_efficient_linear
         self.enabled = enabled
+        self._set_dtype(deepspeed_config, dtype)
+        assert self.dtype in [torch.half, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.float]"
 
     def __enter__(self):
         if not self.enabled:
@@ -218,8 +237,12 @@ def _init_subclass(cls, **kwargs):
 
         # Replace .__init__() for future subclasses of torch.nn.Module
         torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass)
-        torch.Tensor.__new__ = new_cuda_tensor
-        torch.empty = empty_cuda_tensor
+        if self.dtype == torch.half:
+            torch.Tensor.__new__ = new_cuda_tensor_half
+            torch.empty = empty_cuda_tensor_half
+        else:
+            torch.Tensor.__new__ = new_cuda_tensor
+            torch.empty = empty_cuda_tensor
 
         if self.mem_efficient_linear:
             print_rank_0(
@@ -259,6 +282,13 @@ def _disable_class(cls):
     def _post_init_method(self, module):
         pass
 
+    def _set_dtype(self, ds_config, dtype):
+        if ds_config is not None and dtype is None:
+            _ds_config = DeepSpeedConfig(ds_config)
+            self.dtype = torch.half if _ds_config.fp16_enabled else torch.float
+        elif dtype is None:
+            self.dtype = torch.half
+
 
 # Replaces all parameters in module with Scattered Parameters
 class Init(InsertPostInitMethodToModuleSubClasses):
@@ -271,7 +301,8 @@ def __init__(self,
                  remote_device=None,
                  pin_memory=False,
                  deepspeed_config=None,
-                 enabled=True):
+                 enabled=True,
+                 dtype=None):
         """A context to enable massive model construction for training with
         ZeRO-3. Models are automatically partitioned (or, sharded) across the
         system and converted to half precision.
@@ -366,7 +397,10 @@ def get_model():
                 model = deepspeed.zero.Init(module=model)
         """
 
-        super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear)
+        super().__init__(enabled=enabled,
+                         mem_efficient_linear=mem_efficient_linear,
+                         deepspeed_config=deepspeed_config,
+                         dtype=dtype)
         if not torch.distributed.is_initialized():
             init_distributed()
             assert torch.distributed.is_initialized(), "Parameters cannot be scattered without initializing torch.distributed"
@@ -631,7 +665,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
                     f'Before partitioning param {param.ds_id} {param.shape}',
                     force=False)
                 #param.data does not store anything meaningful in partitioned state
-                param.data = torch.ones(1).half().to(param.device)
+                param.data = torch.ones(1, dtype=self.dtype).to(param.device)
                 see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}',
                                  force=False)
 
@@ -712,7 +746,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
 
             see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}',
                              force=False)
-            param.data = torch.ones(1).half().to(param.device)
+            param.data = torch.ones(1, dtype=self.dtype).to(param.device)
             see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}',
                              force=False)
 
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index f8b526952de8..5ba1376b3355 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -844,6 +844,7 @@ def __init__(self,
 
         # simplified param id
         self.param_id = {}
+        self.dtype = self.fp16_groups[0][0].dtype
 
         count = 0
         for i, params_group in enumerate(self.fp16_groups):
@@ -874,10 +875,11 @@ def __init__(self,
             self.local_overflow = False
             self.temp_grad_buffer_for_gpu_offload = torch.zeros(
                 largest_partitioned_param_numel,
-                device=torch.cuda.current_device()).half()
-            self.temp_grad_gpu_buffer = torch.zeros(
-                largest_partitioned_param_numel,
-                device=torch.cuda.current_device()).half()
+                device=torch.cuda.current_device(),
+                dtype=self.dtype)
+            self.temp_grad_gpu_buffer = torch.zeros(largest_partitioned_param_numel,
+                                                    device=torch.cuda.current_device(),
+                                                    dtype=self.dtype)
         see_memory_usage(f"After CPU Offload initialization", force=False)
 
         # stores if a partition has been reduced in this step
@@ -1059,7 +1061,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self):
                              force=False)
                 self.param_groups_fp16_flat_cpu_memory.append(
                     torch.empty(int(flat_buffer_size),
-                                dtype=torch.half,
+                                dtype=self.dtype,
                                 pin_memory=True))
             else:
                 print_rank_0(
@@ -1068,7 +1070,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self):
 
                 self.param_groups_fp16_flat_cpu_memory.append(
                     torch.empty(1,
-                                dtype=torch.half))
+                                dtype=self.dtype))
 
     def _create_fp16_partitions_with_defragmentation(self):
         dist.barrier()
@@ -1170,7 +1172,7 @@ def _create_fp16_partitions_with_defragmentation(self):
                         -1] is None and self.param_group_fp16_flat_reuse_buffer is None:
                     self.param_group_fp16_flat_reuse_buffer = torch.empty(
                         max(self.fp16_partitioned_groups_flat_numel),
-                        dtype=torch.half,
+                        dtype=self.dtype,
                         device='cpu',
                         pin_memory=True)
 
@@ -2077,12 +2079,12 @@ def partition_previous_reduced_grads(self):
                 if self.offload_param_pin_memory:
                     self.grads_in_partition.append(
                         torch.zeros(int(total_size),
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=self.device).pin_memory())
                 else:
                     self.grads_in_partition.append(
                         torch.zeros(int(total_size),
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=self.device))
                 see_memory_usage(
                     f"group {i} after creating {total_size} reduced gradients into partition",
@@ -2930,14 +2932,14 @@ def backward(self, loss, retain_graph=False):
         if self.contiguous_gradients:
             self.ipg_buffer = []
             buf_0 = torch.empty(self.reduce_bucket_size,
-                                dtype=torch.half,
+                                dtype=self.dtype,
                                 device=torch.cuda.current_device())
             self.ipg_buffer.append(buf_0)
 
             # Use double buffers to avoid data access conflict when overlap_comm is enabled.
             if self.overlap_comm:
                 buf_1 = torch.empty(self.reduce_bucket_size,
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=torch.cuda.current_device())
                 self.ipg_buffer.append(buf_1)
             self.ipg_index = 0

From 08fc9e6794b5a34213222b04c26eff78255270d1 Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 21:04:27 +0000
Subject: [PATCH 02/17] Changing the location of self.dtype assignment

---
 deepspeed/runtime/zero/stage3.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 5ba1376b3355..fd1ba9cb1a1f 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -791,7 +791,7 @@ def __init__(self,
         self.sub_group_size = sub_group_size
 
         self.sub_group_to_group_id = {}
-
+        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
         see_memory_usage("Before creating fp16 partitions", force=True)
         self._create_fp16_partitions_with_defragmentation()
         num_fp16_subgroups = len(self.fp16_partitioned_groups_flat)
@@ -844,7 +844,6 @@ def __init__(self,
 
         # simplified param id
         self.param_id = {}
-        self.dtype = self.fp16_groups[0][0].dtype
 
         count = 0
         for i, params_group in enumerate(self.fp16_groups):

From ed679c96cc109ca0c5905162545e10cf752a414b Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 21:07:47 +0000
Subject: [PATCH 03/17] Exhaustive setting of self.dtype

---
 deepspeed/runtime/zero/partition_parameters.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 22e7c089e5ba..835d38f75222 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -288,6 +288,8 @@ def _set_dtype(self, ds_config, dtype):
             self.dtype = torch.half if _ds_config.fp16_enabled else torch.float
         elif dtype is None:
             self.dtype = torch.half
+        else:
+            self.dtype = dtype
 
 
 # Replaces all parameters in module with Scattered Parameters

From 2eda1616512fb3f26b0fa77476864d32fcef49f2 Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 22:50:51 +0000
Subject: [PATCH 04/17] Adding fp32 and tf32 support for ZeRO Stage 2

---
 deepspeed/runtime/config.py      |  2 +-
 deepspeed/runtime/zero/stage2.py | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py
index f5f98a0fa534..f4bd6899070a 100755
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -759,7 +759,7 @@ def _do_error_check(self):
             GRADIENT_ACCUMULATION_STEPS)
 
         if self.zero_enabled:
-            if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION:
+            if self.zero_optimization_stage < ZERO_OPTIMIZATION_GRADIENTS:
                 assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
             assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION)
             #if self.zero_config.cpu_offload is True:
diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index 39d780e55574..c59e87517add 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -187,6 +187,7 @@ def __init__(self,
         partition_id = dist.get_rank(group=self.dp_process_group)
 
         self.all_reduce_print = False
+        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
 
         # padding on each partition for alignment purposes
         self.groups_padding = []
@@ -306,10 +307,12 @@ def __init__(self,
             self.grad_position = {}
             self.temp_grad_buffer_for_cpu_offload = torch.zeros(
                 largest_param_numel,
-                device=self.device).half().pin_memory()
+                device=self.device,
+                dtype=self.dtype).pin_memory()
             self.temp_grad_buffer_for_gpu_offload = torch.zeros(
                 largest_param_numel,
-                device=torch.cuda.current_device()).half()
+                device=torch.cuda.current_device(),
+                dtype=self.dtype)
 
             for i, params_group in enumerate(self.fp16_groups):
                 self.get_grad_position(i,
@@ -464,14 +467,14 @@ def independent_gradient_partition_epilogue(self):
                         self.params_in_partition[i],
                         self.first_offset[i],
                         self.partition_size[i],
-                        dtype=torch.half,
+                        dtype=self.dtype,
                         device=torch.cuda.current_device(),
                         return_tensor_list=True)
                 else:
                     avg_new = self.get_flat_partition(self.params_in_partition[i],
                                                       self.first_offset[i],
                                                       self.partition_size[i],
-                                                      dtype=torch.half,
+                                                      dtype=self.dtype,
                                                       device=torch.cuda.current_device(),
                                                       return_tensor_list=True)
 
@@ -931,7 +934,7 @@ def copy_grads_in_partition(self, param):
 
             see_memory_usage(f"before copying {total_size} gradients into partition")
             self.grads_in_partition = torch.empty(int(total_size),
-                                                  dtype=torch.half,
+                                                  dtype=self.dtype,
                                                   device=torch.cuda.current_device())
             see_memory_usage(f"after copying {total_size} gradients into partition")
 
@@ -1617,14 +1620,14 @@ def backward(self, loss, retain_graph=False):
         if self.contiguous_gradients:
             self.ipg_buffer = []
             buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5),
-                                dtype=torch.half,
+                                dtype=self.dtype,
                                 device=torch.cuda.current_device())
             self.ipg_buffer.append(buf_0)
 
             # Use double buffers to avoid data access conflict when overlap_comm is enabled.
             if self.overlap_comm:
                 buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5),
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=torch.cuda.current_device())
                 self.ipg_buffer.append(buf_1)
             self.ipg_index = 0

From 5ce4d3bdcc337d68b4854e3d6f3c4a127d132bdb Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Fri, 23 Apr 2021 22:37:48 +0000
Subject: [PATCH 05/17] fix loss scale value for static loss scale

---
 deepspeed/runtime/zero/stage2.py | 13 +++++++------
 deepspeed/runtime/zero/stage3.py | 17 +++++++++--------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index c59e87517add..cf6d4876d401 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -357,7 +357,13 @@ def __init__(self,
         self.create_reduce_and_remove_grad_hooks()
 
         # we may have a way of fusing dynamic scale. Do not support for now
-        if dynamic_loss_scale:
+        if self.dtype == torch.float or not dynamic_loss_scale:
+            loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale
+
+            self.dynamic_loss_scale = False
+            self.loss_scaler = LossScaler(scale=loss_scale_value)
+            cur_iter = 0
+        else:
             if dynamic_loss_args is None:
                 self.loss_scaler = DynamicLossScaler()
             else:
@@ -365,11 +371,6 @@ def __init__(self,
 
             self.dynamic_loss_scale = True
 
-        else:
-            self.dynamic_loss_scale = False
-            self.loss_scaler = LossScaler(scale=static_loss_scale)
-            self.cur_iter = 0
-
         see_memory_usage("Before initializing optimizer states")
         self.initialize_optimizer_states()
         see_memory_usage("After initializing optimizer states")
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index fd1ba9cb1a1f..51e758203532 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -640,12 +640,13 @@ def __init__(self,
         util_ops = UtilsBuilder().load()
         self.flatten = util_ops.flatten
         self.unflatten = util_ops.unflatten
+        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
 
         if not all(is_zero_param(p) for p in module.parameters()):
             group = None
             if mpu:
                 group = mpu.get_data_parallel_group()
-            Init(module=module, data_parallel_group=group)
+            Init(module=module, data_parallel_group=group, dtype=self.dtype)
 
         for m in module.modules():
             _init_external_params(m)
@@ -791,7 +792,6 @@ def __init__(self,
         self.sub_group_size = sub_group_size
 
         self.sub_group_to_group_id = {}
-        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
         see_memory_usage("Before creating fp16 partitions", force=True)
         self._create_fp16_partitions_with_defragmentation()
         num_fp16_subgroups = len(self.fp16_partitioned_groups_flat)
@@ -896,7 +896,13 @@ def __init__(self,
         #exit(0)
 
         # we may have a way of fusing dynamic scale. Do not support for now
-        if dynamic_loss_scale:
+        if self.dtype == torch.float or not dynamic_loss_scale:
+            loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale
+
+            self.dynamic_loss_scale = False
+            self.loss_scaler = LossScaler(scale=loss_scale_value)
+            cur_iter = 0
+        else:
             if dynamic_loss_args is None:
                 self.loss_scaler = DynamicLossScaler()
             else:
@@ -904,11 +910,6 @@ def __init__(self,
 
             self.dynamic_loss_scale = True
 
-        else:
-            self.dynamic_loss_scale = False
-            self.loss_scaler = LossScaler(scale=static_loss_scale)
-            self.cur_iter = 0
-
         self.debug_fp16_grads = [{} for _ in self.fp16_groups]
 
         if dist.get_rank(group=self.dp_process_group) == 0:

From 639d4875fa44be90e6aba691adcf6cf9e8f916bf Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 20:24:19 +0000
Subject: [PATCH 06/17] Adding tf32 and fp32 support for ZeRO Stage 3

---
 deepspeed/runtime/config.py                   |  3 +-
 deepspeed/runtime/engine.py                   |  2 +
 .../runtime/zero/partition_parameters.py      | 52 +++++++++++++++----
 deepspeed/runtime/zero/stage3.py              | 24 +++++----
 4 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py
index 3fa0b32a6032..23dc37c0b345 100755
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -761,7 +761,8 @@ def _do_error_check(self):
             GRADIENT_ACCUMULATION_STEPS)
 
         if self.zero_enabled:
-            assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
+            if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION:
+                assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
             assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION)
             #if self.zero_config.cpu_offload is True:
             #    assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS)
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 646e492cc3dd..293c6c72bdd9 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -574,6 +574,8 @@ def _configure_distributed_model(self, model):
         self.module = model
         if self.fp16_enabled():
             self.module.half()
+        else:
+            assert all([param.dtype == torch.float for param in self.module.parameters()]), f"The fp16 is not enabled but dtype on parameters not fp16"
 
         if not self.dont_change_device:
             self.module.to(self.device)
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index e831911efd62..a18ca47ca586 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -154,7 +154,7 @@ class ZeroParamStatus(Enum):
 _orig_torch_empty = torch.empty
 
 
-def empty_cuda_tensor(*size, **kwargs):
+def empty_cuda_tensor_half(*size, **kwargs):
     if not 'device' in kwargs.keys():
         kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
     tensor = _orig_torch_empty(*size, **kwargs)
@@ -164,7 +164,7 @@ def empty_cuda_tensor(*size, **kwargs):
         return tensor
 
 
-def new_cuda_tensor(cls, *args):
+def new_cuda_tensor_half(cls, *args):
     device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
     tensor = torch.ones((1, 1), device=device).new_empty(*args).half()
     if tensor.is_floating_point():
@@ -173,6 +173,19 @@ def new_cuda_tensor(cls, *args):
         return tensor
 
 
+def empty_cuda_tensor(*size, **kwargs):
+    if not 'device' in kwargs.keys():
+        kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
+    tensor = _orig_torch_empty(*size, **kwargs)
+    return tensor
+
+
+def new_cuda_tensor(cls, *args):
+    device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"]))
+    tensor = torch.ones((1, 1), device=device).new_empty(*args)
+    return tensor
+
+
 reuse_buffers = False
 temp_contiguous_tensor = None
 empty_buffers = {}
@@ -181,9 +194,15 @@ def new_cuda_tensor(cls, *args):
 # Inserts _post_init_method at the end of init method
 # for all sub classes of torch.nn.Module
 class InsertPostInitMethodToModuleSubClasses(object):
-    def __init__(self, enabled=True, mem_efficient_linear=True):
+    def __init__(self,
+                 enabled=True,
+                 mem_efficient_linear=True,
+                 deepspeed_config=None,
+                 dtype=None):
         self.mem_efficient_linear = mem_efficient_linear
         self.enabled = enabled
+        self._set_dtype(deepspeed_config, dtype)
+        assert self.dtype in [torch.half, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.float]"
 
     def __enter__(self):
         if not self.enabled:
@@ -219,8 +238,12 @@ def _init_subclass(cls, **kwargs):
 
         # Replace .__init__() for future subclasses of torch.nn.Module
         torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass)
-        torch.Tensor.__new__ = new_cuda_tensor
-        torch.empty = empty_cuda_tensor
+        if self.dtype == torch.half:
+            torch.Tensor.__new__ = new_cuda_tensor_half
+            torch.empty = empty_cuda_tensor_half
+        else:
+            torch.Tensor.__new__ = new_cuda_tensor
+            torch.empty = empty_cuda_tensor
 
         if self.mem_efficient_linear:
             print_rank_0(
@@ -260,6 +283,12 @@ def _disable_class(cls):
     def _post_init_method(self, module):
         pass
 
+    def _set_dtype(self, ds_config, dtype):
+        if ds_config is not None and dtype is None:
+            _ds_config = DeepSpeedConfig(ds_config)
+            self.dtype = torch.half if _ds_config.fp16_enabled else torch.float
+        elif dtype is None:
+            self.dtype = torch.half
 
 # Replaces all parameters in module with Scattered Parameters
 class Init(InsertPostInitMethodToModuleSubClasses):
@@ -273,7 +302,8 @@ def __init__(self,
                  pin_memory=False,
                  deepspeed_config=None,
                  param_dict=None,
-                 enabled=True):
+                 enabled=True,
+                 dtype=None):
         """A context to enable massive model construction for training with
         ZeRO-3. Models are automatically partitioned (or, sharded) across the
         system and converted to half precision.
@@ -370,7 +400,10 @@ def get_model():
                 model = deepspeed.zero.Init(module=model)
         """
 
-        super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear)
+        super().__init__(enabled=enabled,
+                         mem_efficient_linear=mem_efficient_linear,
+                         deepspeed_config=deepspeed_config,
+                         dtype=dtype)
         if not torch.distributed.is_initialized():
             init_distributed()
             assert torch.distributed.is_initialized(), "Parameters cannot be scattered without initializing torch.distributed"
@@ -635,8 +668,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
                     f'Before partitioning param {param.ds_id} {param.shape}',
                     force=False)
                 #param.data does not store anything meaningful in partitioned state
-                param.data = torch.ones(partitioned_param_data_shape).half().to(
-                    param.device)
+                param.data = torch.ones(1, dtype=self.dtype).to(param.device)
                 see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}',
                                  force=False)
 
@@ -717,7 +749,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
 
             see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}',
                              force=False)
-            param.data = torch.ones(partitioned_param_data_shape).half().to(param.device)
+            param.data = torch.ones(1, dtype=self.dtype).to(param.device)
             see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}',
                              force=False)
 
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 2b16887ff60d..ff26d1d39ddd 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -844,6 +844,7 @@ def __init__(self,
 
         # simplified param id
         self.param_id = {}
+        self.dtype = self.fp16_groups[0][0].dtype
 
         count = 0
         for i, params_group in enumerate(self.fp16_groups):
@@ -874,10 +875,11 @@ def __init__(self,
             self.local_overflow = False
             self.temp_grad_buffer_for_gpu_offload = torch.zeros(
                 largest_partitioned_param_numel,
-                device=torch.cuda.current_device()).half()
-            self.temp_grad_gpu_buffer = torch.zeros(
-                largest_partitioned_param_numel,
-                device=torch.cuda.current_device()).half()
+                device=torch.cuda.current_device(),
+                dtype=self.dtype)
+            self.temp_grad_gpu_buffer = torch.zeros(largest_partitioned_param_numel,
+                                                    device=torch.cuda.current_device(),
+                                                    dtype=self.dtype)
         see_memory_usage(f"After CPU Offload initialization", force=False)
 
         # stores if a partition has been reduced in this step
@@ -1059,7 +1061,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self):
                              force=False)
                 self.param_groups_fp16_flat_cpu_memory.append(
                     torch.empty(int(flat_buffer_size),
-                                dtype=torch.half,
+                                dtype=self.dtype,
                                 pin_memory=True))
             else:
                 print_rank_0(
@@ -1068,7 +1070,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self):
 
                 self.param_groups_fp16_flat_cpu_memory.append(
                     torch.empty(1,
-                                dtype=torch.half))
+                                dtype=self.dtype))
 
     def _create_fp16_partitions_with_defragmentation(self):
         dist.barrier()
@@ -1170,7 +1172,7 @@ def _create_fp16_partitions_with_defragmentation(self):
                         -1] is None and self.param_group_fp16_flat_reuse_buffer is None:
                     self.param_group_fp16_flat_reuse_buffer = torch.empty(
                         max(self.fp16_partitioned_groups_flat_numel),
-                        dtype=torch.half,
+                        dtype=self.dtype,
                         device='cpu',
                         pin_memory=True)
 
@@ -2076,12 +2078,12 @@ def partition_previous_reduced_grads(self):
                 if self.offload_param_pin_memory:
                     self.grads_in_partition.append(
                         torch.zeros(int(total_size),
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=self.device).pin_memory())
                 else:
                     self.grads_in_partition.append(
                         torch.zeros(int(total_size),
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=self.device))
                 see_memory_usage(
                     f"group {i} after creating {total_size} reduced gradients into partition",
@@ -2929,14 +2931,14 @@ def backward(self, loss, retain_graph=False):
         if self.contiguous_gradients:
             self.ipg_buffer = []
             buf_0 = torch.empty(self.reduce_bucket_size,
-                                dtype=torch.half,
+                                dtype=self.dtype,
                                 device=torch.cuda.current_device())
             self.ipg_buffer.append(buf_0)
 
             # Use double buffers to avoid data access conflict when overlap_comm is enabled.
             if self.overlap_comm:
                 buf_1 = torch.empty(self.reduce_bucket_size,
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=torch.cuda.current_device())
                 self.ipg_buffer.append(buf_1)
             self.ipg_index = 0

From e3b35342f2c9990a076dc35b1a3ffe1201b66c24 Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 21:04:27 +0000
Subject: [PATCH 07/17] Changing the location of self.dtype assignment

---
 deepspeed/runtime/zero/stage3.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index ff26d1d39ddd..931282d5362b 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -791,7 +791,7 @@ def __init__(self,
         self.sub_group_size = sub_group_size
 
         self.sub_group_to_group_id = {}
-
+        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
         see_memory_usage("Before creating fp16 partitions", force=True)
         self._create_fp16_partitions_with_defragmentation()
         num_fp16_subgroups = len(self.fp16_partitioned_groups_flat)
@@ -844,7 +844,6 @@ def __init__(self,
 
         # simplified param id
         self.param_id = {}
-        self.dtype = self.fp16_groups[0][0].dtype
 
         count = 0
         for i, params_group in enumerate(self.fp16_groups):

From bed15a015535400cc3b9dcffa9f31a486c6a5ede Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 21:07:47 +0000
Subject: [PATCH 08/17] Exhaustive setting of self.dtype

---
 deepspeed/runtime/zero/partition_parameters.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index a18ca47ca586..b910417f3d50 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -289,6 +289,8 @@ def _set_dtype(self, ds_config, dtype):
             self.dtype = torch.half if _ds_config.fp16_enabled else torch.float
         elif dtype is None:
             self.dtype = torch.half
+        else:
+            self.dtype = dtype
 
 # Replaces all parameters in module with Scattered Parameters
 class Init(InsertPostInitMethodToModuleSubClasses):

From bfe1e844a9ea0db43cea45fdff83902d46b3a873 Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Tue, 20 Apr 2021 22:50:51 +0000
Subject: [PATCH 09/17] Adding fp32 and tf32 support for ZeRO Stage 2

---
 deepspeed/runtime/config.py      |  2 +-
 deepspeed/runtime/zero/stage2.py | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py
index 23dc37c0b345..661c9d03e943 100755
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -761,7 +761,7 @@ def _do_error_check(self):
             GRADIENT_ACCUMULATION_STEPS)
 
         if self.zero_enabled:
-            if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION:
+            if self.zero_optimization_stage < ZERO_OPTIMIZATION_GRADIENTS:
                 assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
             assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION)
             #if self.zero_config.cpu_offload is True:
diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index 39d780e55574..c59e87517add 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -187,6 +187,7 @@ def __init__(self,
         partition_id = dist.get_rank(group=self.dp_process_group)
 
         self.all_reduce_print = False
+        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
 
         # padding on each partition for alignment purposes
         self.groups_padding = []
@@ -306,10 +307,12 @@ def __init__(self,
             self.grad_position = {}
             self.temp_grad_buffer_for_cpu_offload = torch.zeros(
                 largest_param_numel,
-                device=self.device).half().pin_memory()
+                device=self.device,
+                dtype=self.dtype).pin_memory()
             self.temp_grad_buffer_for_gpu_offload = torch.zeros(
                 largest_param_numel,
-                device=torch.cuda.current_device()).half()
+                device=torch.cuda.current_device(),
+                dtype=self.dtype)
 
             for i, params_group in enumerate(self.fp16_groups):
                 self.get_grad_position(i,
@@ -464,14 +467,14 @@ def independent_gradient_partition_epilogue(self):
                         self.params_in_partition[i],
                         self.first_offset[i],
                         self.partition_size[i],
-                        dtype=torch.half,
+                        dtype=self.dtype,
                         device=torch.cuda.current_device(),
                         return_tensor_list=True)
                 else:
                     avg_new = self.get_flat_partition(self.params_in_partition[i],
                                                       self.first_offset[i],
                                                       self.partition_size[i],
-                                                      dtype=torch.half,
+                                                      dtype=self.dtype,
                                                       device=torch.cuda.current_device(),
                                                       return_tensor_list=True)
 
@@ -931,7 +934,7 @@ def copy_grads_in_partition(self, param):
 
             see_memory_usage(f"before copying {total_size} gradients into partition")
             self.grads_in_partition = torch.empty(int(total_size),
-                                                  dtype=torch.half,
+                                                  dtype=self.dtype,
                                                   device=torch.cuda.current_device())
             see_memory_usage(f"after copying {total_size} gradients into partition")
 
@@ -1617,14 +1620,14 @@ def backward(self, loss, retain_graph=False):
         if self.contiguous_gradients:
             self.ipg_buffer = []
             buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5),
-                                dtype=torch.half,
+                                dtype=self.dtype,
                                 device=torch.cuda.current_device())
             self.ipg_buffer.append(buf_0)
 
             # Use double buffers to avoid data access conflict when overlap_comm is enabled.
             if self.overlap_comm:
                 buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5),
-                                    dtype=torch.half,
+                                    dtype=self.dtype,
                                     device=torch.cuda.current_device())
                 self.ipg_buffer.append(buf_1)
             self.ipg_index = 0

From 7d762799cd43352e81153181bf3f54fc265ad5e7 Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Fri, 23 Apr 2021 22:37:48 +0000
Subject: [PATCH 10/17] fix loss scale value for static loss scale

---
 deepspeed/runtime/zero/stage2.py | 13 +++++++------
 deepspeed/runtime/zero/stage3.py | 17 +++++++++--------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index c59e87517add..cf6d4876d401 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -357,7 +357,13 @@ def __init__(self,
         self.create_reduce_and_remove_grad_hooks()
 
         # we may have a way of fusing dynamic scale. Do not support for now
-        if dynamic_loss_scale:
+        if self.dtype == torch.float or not dynamic_loss_scale:
+            loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale
+
+            self.dynamic_loss_scale = False
+            self.loss_scaler = LossScaler(scale=loss_scale_value)
+            self.cur_iter = 0
+        else:
             if dynamic_loss_args is None:
                 self.loss_scaler = DynamicLossScaler()
             else:
@@ -365,11 +371,6 @@ def __init__(self,
 
             self.dynamic_loss_scale = True
 
-        else:
-            self.dynamic_loss_scale = False
-            self.loss_scaler = LossScaler(scale=static_loss_scale)
-            self.cur_iter = 0
-
         see_memory_usage("Before initializing optimizer states")
         self.initialize_optimizer_states()
         see_memory_usage("After initializing optimizer states")
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 931282d5362b..8b7aee16c4ee 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -640,12 +640,13 @@ def __init__(self,
         util_ops = UtilsBuilder().load()
         self.flatten = util_ops.flatten
         self.unflatten = util_ops.unflatten
+        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
 
         if not all(is_zero_param(p) for p in module.parameters()):
             group = None
             if mpu:
                 group = mpu.get_data_parallel_group()
-            Init(module=module, data_parallel_group=group)
+            Init(module=module, data_parallel_group=group, dtype=self.dtype)
 
         for m in module.modules():
             _init_external_params(m)
@@ -791,7 +792,6 @@ def __init__(self,
         self.sub_group_size = sub_group_size
 
         self.sub_group_to_group_id = {}
-        self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
         see_memory_usage("Before creating fp16 partitions", force=True)
         self._create_fp16_partitions_with_defragmentation()
         num_fp16_subgroups = len(self.fp16_partitioned_groups_flat)
@@ -896,7 +896,13 @@ def __init__(self,
         #exit(0)
 
         # we may have a way of fusing dynamic scale. Do not support for now
-        if dynamic_loss_scale:
+        if self.dtype == torch.float or not dynamic_loss_scale:
+            loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale
+
+            self.dynamic_loss_scale = False
+            self.loss_scaler = LossScaler(scale=loss_scale_value)
+            self.cur_iter = 0
+        else:
             if dynamic_loss_args is None:
                 self.loss_scaler = DynamicLossScaler()
             else:
@@ -904,11 +910,6 @@ def __init__(self,
 
             self.dynamic_loss_scale = True
 
-        else:
-            self.dynamic_loss_scale = False
-            self.loss_scaler = LossScaler(scale=static_loss_scale)
-            self.cur_iter = 0
-
         self.debug_fp16_grads = [{} for _ in self.fp16_groups]
 
         if dist.get_rank(group=self.dp_process_group) == 0:

From 0bf7efd93f71b9e8a9dc7dcaf0d0c483215b6ced Mon Sep 17 00:00:00 2001
From: Samyam <samyamr@microsoft.com>
Date: Fri, 23 Apr 2021 22:50:48 +0000
Subject: [PATCH 11/17] adding documentation for dtype in zero init

---
 deepspeed/runtime/zero/partition_parameters.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index b910417f3d50..8926ba1ca8ca 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -292,6 +292,7 @@ def _set_dtype(self, ds_config, dtype):
         else:
             self.dtype = dtype
 
+
 # Replaces all parameters in module with Scattered Parameters
 class Init(InsertPostInitMethodToModuleSubClasses):
     param_id = 0
@@ -331,6 +332,8 @@ def __init__(self,
                 as a dictionary instead for swapping fp16 params to NVMe.
             enabled (bool, optional): If ``False``, this context has no
                 effect. Defaults to ``True``.
+            dtype (``torch.dtype``, optional): Can be used to change the data type of the parameters.
+                Supported options are ``torch.half`` and ``torch.float``. Defaults to ``torch.half``.
 
         This context accelerates model initialization and enables models that
         are too large to allocate in their entirety in CPU memory. It has the

From 7260cb2d26ecc9289a6b1e6ac05ecefbf7f992e5 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Thu, 29 Apr 2021 19:36:51 +0000
Subject: [PATCH 12/17] Disable cpu-adam update_copy api for fp32

---
 deepspeed/runtime/config.py                    |  2 +-
 deepspeed/runtime/zero/partition_parameters.py | 13 +++++--------
 deepspeed/runtime/zero/stage2.py               |  2 +-
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py
index b09dbebf99d8..727c0810290e 100755
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -534,7 +534,7 @@ def __init__(self, config: Union[str, dict], mpu=None):
                 object_pairs_hook=dict_raise_error_on_duplicate_keys)
         else:
             raise ValueError(
-                f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {ds_config}"
+                f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {config}"
             )
         try:
             self.global_rank = torch.distributed.get_rank()
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 8a9c204981f8..3cacc524bd69 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -194,14 +194,10 @@ def new_cuda_tensor(cls, *args):
 # Inserts _post_init_method at the end of init method
 # for all sub classes of torch.nn.Module
 class InsertPostInitMethodToModuleSubClasses(object):
-    def __init__(self,
-                 enabled=True,
-                 mem_efficient_linear=True,
-                 deepspeed_config=None,
-                 dtype=None):
+    def __init__(self, enabled=True, mem_efficient_linear=True, config=None, dtype=None):
         self.mem_efficient_linear = mem_efficient_linear
         self.enabled = enabled
-        self._set_dtype(deepspeed_config, dtype)
+        self._set_dtype(config, dtype)
         assert self.dtype in [torch.half, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.float]"
 
     def __enter__(self):
@@ -304,7 +300,8 @@ def __init__(self,
                  remote_device=None,
                  pin_memory=False,
                  config=None,
-                 enabled=True):
+                 enabled=True,
+                 dtype=torch.half):
         """A context to enable massive model construction for training with
         ZeRO-3. Models are automatically partitioned (or, sharded) across the
         system and converted to half precision.
@@ -403,7 +400,7 @@ def get_model():
 
         super().__init__(enabled=enabled,
                          mem_efficient_linear=mem_efficient_linear,
-                         deepspeed_config=deepspeed_config,
+                         config=config,
                          dtype=dtype)
         if not torch.distributed.is_initialized():
             init_distributed()
diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index cf3e81f3034b..9bf06a585bf1 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -1459,7 +1459,7 @@ def step(self, closure=None):
         self.start_timers([OPTIMIZER_STEP])
         if self.deepspeed_adam_offload:
             from deepspeed.ops.adam import DeepSpeedCPUAdam
-            if type(self.optimizer) == DeepSpeedCPUAdam:
+            if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half:
                 fp16_param_groups = [
                     fp16_partitions[partition_id]
                     for fp16_partitions in self.parallel_partitioned_fp16_groups

From b31088b800e8668cd52a686e7c866e35e9ab70bf Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Thu, 29 Apr 2021 19:46:40 +0000
Subject: [PATCH 13/17] Disable gradient clipping in engine for ZeRO

---
 deepspeed/runtime/engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 6fae47baa1bd..1d7723de3168 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -1095,7 +1095,8 @@ def clip_fp32_gradients(self):
 
     def _take_model_step(self, lr_kwargs):
         if self.gradient_clipping() > 0.0:
-            if not self.fp16_enabled() and not self.amp_enabled():
+            if not (self.fp16_enabled() or self.amp_enabled()
+                    or self.zero_optimization()):
                 self.clip_fp32_gradients()
             elif self.amp_enabled():
                 # AMP's recommended way of doing clipping

From cec23774aa3b7be48f02d255af29e9d0e68e38d9 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Thu, 29 Apr 2021 20:03:43 +0000
Subject: [PATCH 14/17] fp16 mode init required for ZeRO-3

---
 deepspeed/runtime/engine.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 1d7723de3168..83413fe6f81a 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -582,6 +582,8 @@ def is_replicated(p):
     def _configure_distributed_model(self, model):
         self.module = model
         if self.fp16_enabled():
+            if self.zero_optimization_partition_weights():
+                assert all([param.dtype == torch.half for param in self.module.parameters()]), f"Model must initialized in fp16 mode for ZeRO Stage 3."
             self.module.half()
         else:
             assert all([param.dtype == torch.float for param in self.module.parameters()]), f"The fp16 is not enabled but dtype on parameters not fp16"

From de76124fc2389467ad7ba52c302a77ebd4828ba1 Mon Sep 17 00:00:00 2001
From: Samyam Rajbhandari <samyamr@microsoft.com>
Date: Thu, 29 Apr 2021 13:24:51 -0700
Subject: [PATCH 15/17] Update engine.py

The assert that checks param.dtype is torch.half for ZeRO-3 should only run if the model was initialized in the ZeRO-3 context.
---
 deepspeed/runtime/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 83413fe6f81a..1cfe245c65a4 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -582,7 +582,7 @@ def is_replicated(p):
     def _configure_distributed_model(self, model):
         self.module = model
         if self.fp16_enabled():
-            if self.zero_optimization_partition_weights():
+            if self.zero_optimization_partition_weights() and any([hasattr(param,'ds_id') for param in self.module.parameters()]):
                 assert all([param.dtype == torch.half for param in self.module.parameters()]), f"Model must initialized in fp16 mode for ZeRO Stage 3."
             self.module.half()
         else:

From bb46f5880ce92ed9286af18f5f47bacfda98be86 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Thu, 29 Apr 2021 20:34:17 +0000
Subject: [PATCH 16/17] Formatting fix

---
 deepspeed/runtime/engine.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 1cfe245c65a4..46f969ab44ec 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -582,7 +582,9 @@ def is_replicated(p):
     def _configure_distributed_model(self, model):
         self.module = model
         if self.fp16_enabled():
-            if self.zero_optimization_partition_weights() and any([hasattr(param,'ds_id') for param in self.module.parameters()]):
+            if self.zero_optimization_partition_weights() and any(
+                [hasattr(param,
+                         'ds_id') for param in self.module.parameters()]):
                 assert all([param.dtype == torch.half for param in self.module.parameters()]), f"Model must initialized in fp16 mode for ZeRO Stage 3."
             self.module.half()
         else:

From ab35410a71361485dd4f234b9354bb41d5152913 Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Thu, 29 Apr 2021 21:27:25 +0000
Subject: [PATCH 17/17] bump DSE

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index bdf8e59aede8..127372571189 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit bdf8e59aede8c8e0577e8d4d557298ca8515268f
+Subproject commit 127372571189ac905c8c92f4fe55a3d85c80324e