From f4f28b94c0b59d82ddf0c00a567f5c024606ce3d Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 20:24:19 +0000 Subject: [PATCH 01/17] Adding tf32 and fp32 support for ZeRO Stage 3 --- deepspeed/runtime/config.py | 3 +- deepspeed/runtime/engine.py | 2 + .../runtime/zero/partition_parameters.py | 52 +++++++++++++++---- deepspeed/runtime/zero/stage3.py | 24 +++++---- 4 files changed, 60 insertions(+), 21 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 9e33876994f9..f5f98a0fa534 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -759,7 +759,8 @@ def _do_error_check(self): GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" + if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION: + assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) #if self.zero_config.cpu_offload is True: # assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index f71a7324585a..dd08fbe920dd 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -573,6 +573,8 @@ def _configure_distributed_model(self, model): self.module = model if self.fp16_enabled(): self.module.half() + else: + assert all([param.dtype == torch.float for param in self.module.parameters()]), f"The fp16 is not enabled but dtype on parameters not fp16" if not self.dont_change_device: self.module.to(self.device) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index c8bde6390b3c..22e7c089e5ba 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -153,7 +153,7 @@ class ZeroParamStatus(Enum): _orig_torch_empty = torch.empty -def empty_cuda_tensor(*size, **kwargs): +def empty_cuda_tensor_half(*size, **kwargs): if not 'device' in kwargs.keys(): kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) tensor = _orig_torch_empty(*size, **kwargs) @@ -163,7 +163,7 @@ def empty_cuda_tensor(*size, **kwargs): return tensor -def new_cuda_tensor(cls, *args): +def new_cuda_tensor_half(cls, *args): device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) tensor = torch.ones((1, 1), device=device).new_empty(*args).half() if tensor.is_floating_point(): @@ -172,6 +172,19 @@ def new_cuda_tensor(cls, *args): return tensor +def empty_cuda_tensor(*size, **kwargs): + if not 'device' in kwargs.keys(): + kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + tensor = _orig_torch_empty(*size, **kwargs) + return tensor + + +def new_cuda_tensor(cls, *args): + device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + tensor = torch.ones((1, 1), device=device).new_empty(*args) + return tensor + + reuse_buffers = False temp_contiguous_tensor = None empty_buffers = {} @@ -180,9 +193,15 @@ def new_cuda_tensor(cls, *args): # Inserts _post_init_method at the end of init method # for all sub classes of torch.nn.Module class InsertPostInitMethodToModuleSubClasses(object): - def __init__(self, enabled=True, mem_efficient_linear=True): + def __init__(self, + enabled=True, + mem_efficient_linear=True, + deepspeed_config=None, + dtype=None): self.mem_efficient_linear = mem_efficient_linear self.enabled = enabled + self._set_dtype(deepspeed_config, dtype) + assert self.dtype in [torch.half, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.float]" def __enter__(self): if not self.enabled: @@ -218,8 +237,12 @@ def _init_subclass(cls, **kwargs): # Replace .__init__() for future subclasses of torch.nn.Module torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass) - torch.Tensor.__new__ = new_cuda_tensor - torch.empty = empty_cuda_tensor + if self.dtype == torch.half: + torch.Tensor.__new__ = new_cuda_tensor_half + torch.empty = empty_cuda_tensor_half + else: + torch.Tensor.__new__ = new_cuda_tensor + torch.empty = empty_cuda_tensor if self.mem_efficient_linear: print_rank_0( @@ -259,6 +282,13 @@ def _disable_class(cls): def _post_init_method(self, module): pass + def _set_dtype(self, ds_config, dtype): + if ds_config is not None and dtype is None: + _ds_config = DeepSpeedConfig(ds_config) + self.dtype = torch.half if _ds_config.fp16_enabled else torch.float + elif dtype is None: + self.dtype = torch.half + # Replaces all parameters in module with Scattered Parameters class Init(InsertPostInitMethodToModuleSubClasses): @@ -271,7 +301,8 @@ def __init__(self, remote_device=None, pin_memory=False, deepspeed_config=None, - enabled=True): + enabled=True, + dtype=None): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the system and converted to half precision. @@ -366,7 +397,10 @@ def get_model(): model = deepspeed.zero.Init(module=model) """ - super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear) + super().__init__(enabled=enabled, + mem_efficient_linear=mem_efficient_linear, + deepspeed_config=deepspeed_config, + dtype=dtype) if not torch.distributed.is_initialized(): init_distributed() assert torch.distributed.is_initialized(), "Parameters cannot be scattered without initializing torch.distributed" @@ -631,7 +665,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): f'Before partitioning param {param.ds_id} {param.shape}', force=False) #param.data does not store anything meaningful in partitioned state - param.data = torch.ones(1).half().to(param.device) + param.data = torch.ones(1, dtype=self.dtype).to(param.device) see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) @@ -712,7 +746,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) - param.data = torch.ones(1).half().to(param.device) + param.data = torch.ones(1, dtype=self.dtype).to(param.device) see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index f8b526952de8..5ba1376b3355 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -844,6 +844,7 @@ def __init__(self, # simplified param id self.param_id = {} + self.dtype = self.fp16_groups[0][0].dtype count = 0 for i, params_group in enumerate(self.fp16_groups): @@ -874,10 +875,11 @@ def __init__(self, self.local_overflow = False self.temp_grad_buffer_for_gpu_offload = torch.zeros( largest_partitioned_param_numel, - device=torch.cuda.current_device()).half() - self.temp_grad_gpu_buffer = torch.zeros( - largest_partitioned_param_numel, - device=torch.cuda.current_device()).half() + device=torch.cuda.current_device(), + dtype=self.dtype) + self.temp_grad_gpu_buffer = torch.zeros(largest_partitioned_param_numel, + device=torch.cuda.current_device(), + dtype=self.dtype) see_memory_usage(f"After CPU Offload initialization", force=False) # stores if a partition has been reduced in this step @@ -1059,7 +1061,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self): force=False) self.param_groups_fp16_flat_cpu_memory.append( torch.empty(int(flat_buffer_size), - dtype=torch.half, + dtype=self.dtype, pin_memory=True)) else: print_rank_0( @@ -1068,7 +1070,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self): self.param_groups_fp16_flat_cpu_memory.append( torch.empty(1, - dtype=torch.half)) + dtype=self.dtype)) def _create_fp16_partitions_with_defragmentation(self): dist.barrier() @@ -1170,7 +1172,7 @@ def _create_fp16_partitions_with_defragmentation(self): -1] is None and self.param_group_fp16_flat_reuse_buffer is None: self.param_group_fp16_flat_reuse_buffer = torch.empty( max(self.fp16_partitioned_groups_flat_numel), - dtype=torch.half, + dtype=self.dtype, device='cpu', pin_memory=True) @@ -2077,12 +2079,12 @@ def partition_previous_reduced_grads(self): if self.offload_param_pin_memory: self.grads_in_partition.append( torch.zeros(int(total_size), - dtype=torch.half, + dtype=self.dtype, device=self.device).pin_memory()) else: self.grads_in_partition.append( torch.zeros(int(total_size), - dtype=torch.half, + dtype=self.dtype, device=self.device)) see_memory_usage( f"group {i} after creating {total_size} reduced gradients into partition", @@ -2930,14 +2932,14 @@ def backward(self, loss, retain_graph=False): if self.contiguous_gradients: self.ipg_buffer = [] buf_0 = torch.empty(self.reduce_bucket_size, - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_0) # Use double buffers to avoid data access conflict when overlap_comm is enabled. if self.overlap_comm: buf_1 = torch.empty(self.reduce_bucket_size, - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_1) self.ipg_index = 0 From 08fc9e6794b5a34213222b04c26eff78255270d1 Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 21:04:27 +0000 Subject: [PATCH 02/17] Changing to location of self.dtype assignment --- deepspeed/runtime/zero/stage3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 5ba1376b3355..fd1ba9cb1a1f 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -791,7 +791,7 @@ def __init__(self, self.sub_group_size = sub_group_size self.sub_group_to_group_id = {} - + self.dtype = self.optimizer.param_groups[0]['params'][0].dtype see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation() num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) @@ -844,7 +844,6 @@ def __init__(self, # simplified param id self.param_id = {} - self.dtype = self.fp16_groups[0][0].dtype count = 0 for i, params_group in enumerate(self.fp16_groups): From ed679c96cc109ca0c5905162545e10cf752a414b Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 21:07:47 +0000 Subject: [PATCH 03/17] Exhaustive setting of self.dtype --- deepspeed/runtime/zero/partition_parameters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 22e7c089e5ba..835d38f75222 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -288,6 +288,8 @@ def _set_dtype(self, ds_config, dtype): self.dtype = torch.half if _ds_config.fp16_enabled else torch.float elif dtype is None: self.dtype = torch.half + else: + self.dtype = dtype # Replaces all parameters in module with Scattered Parameters From 2eda1616512fb3f26b0fa77476864d32fcef49f2 Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 22:50:51 +0000 Subject: [PATCH 04/17] Adding fp32 and tf32 support for ZeRO Stage 2 --- deepspeed/runtime/config.py | 2 +- deepspeed/runtime/zero/stage2.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index f5f98a0fa534..f4bd6899070a 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -759,7 +759,7 @@ def _do_error_check(self): GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION: + if self.zero_optimization_stage < ZERO_OPTIMIZATION_GRADIENTS: assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) #if self.zero_config.cpu_offload is True: diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 39d780e55574..c59e87517add 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -187,6 +187,7 @@ def __init__(self, partition_id = dist.get_rank(group=self.dp_process_group) self.all_reduce_print = False + self.dtype = self.optimizer.param_groups[0]['params'][0].dtype # padding on each partition for alignment purposes self.groups_padding = [] @@ -306,10 +307,12 @@ def __init__(self, self.grad_position = {} self.temp_grad_buffer_for_cpu_offload = torch.zeros( largest_param_numel, - device=self.device).half().pin_memory() + device=self.device, + dtype=self.dtype).pin_memory() self.temp_grad_buffer_for_gpu_offload = torch.zeros( largest_param_numel, - device=torch.cuda.current_device()).half() + device=torch.cuda.current_device(), + dtype=self.dtype) for i, params_group in enumerate(self.fp16_groups): self.get_grad_position(i, @@ -464,14 +467,14 @@ def independent_gradient_partition_epilogue(self): self.params_in_partition[i], self.first_offset[i], self.partition_size[i], - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device(), return_tensor_list=True) else: avg_new = self.get_flat_partition(self.params_in_partition[i], self.first_offset[i], self.partition_size[i], - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device(), return_tensor_list=True) @@ -931,7 +934,7 @@ def copy_grads_in_partition(self, param): see_memory_usage(f"before copying {total_size} gradients into partition") self.grads_in_partition = torch.empty(int(total_size), - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) see_memory_usage(f"after copying {total_size} gradients into partition") @@ -1617,14 +1620,14 @@ def backward(self, loss, retain_graph=False): if self.contiguous_gradients: self.ipg_buffer = [] buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_0) # Use double buffers to avoid data access conflict when overlap_comm is enabled. if self.overlap_comm: buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_1) self.ipg_index = 0 From 5ce4d3bdcc337d68b4854e3d6f3c4a127d132bdb Mon Sep 17 00:00:00 2001 From: Samyam Date: Fri, 23 Apr 2021 22:37:48 +0000 Subject: [PATCH 05/17] fix loss scale value for static loss scale --- deepspeed/runtime/zero/stage2.py | 13 +++++++------ deepspeed/runtime/zero/stage3.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index c59e87517add..cf6d4876d401 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -357,7 +357,13 @@ def __init__(self, self.create_reduce_and_remove_grad_hooks() # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: + if self.dtype == torch.float or not dynamic_loss_scale: + loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale + + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(scale=loss_scale_value) + cur_iter = 0 + else: if dynamic_loss_args is None: self.loss_scaler = DynamicLossScaler() else: @@ -365,11 +371,6 @@ def __init__(self, self.dynamic_loss_scale = True - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - see_memory_usage("Before initializing optimizer states") self.initialize_optimizer_states() see_memory_usage("After initializing optimizer states") diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index fd1ba9cb1a1f..51e758203532 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -640,12 +640,13 @@ def __init__(self, util_ops = UtilsBuilder().load() self.flatten = util_ops.flatten self.unflatten = util_ops.unflatten + self.dtype = self.optimizer.param_groups[0]['params'][0].dtype if not all(is_zero_param(p) for p in module.parameters()): group = None if mpu: group = mpu.get_data_parallel_group() - Init(module=module, data_parallel_group=group) + Init(module=module, data_parallel_group=group, dtype=self.dtype) for m in module.modules(): _init_external_params(m) @@ -791,7 +792,6 @@ def __init__(self, self.sub_group_size = sub_group_size self.sub_group_to_group_id = {} - self.dtype = self.optimizer.param_groups[0]['params'][0].dtype see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation() num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) @@ -896,7 +896,13 @@ def __init__(self, #exit(0) # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: + if self.dtype == torch.float or not dynamic_loss_scale: + loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale + + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(scale=loss_scale_value) + cur_iter = 0 + else: if dynamic_loss_args is None: self.loss_scaler = DynamicLossScaler() else: @@ -904,11 +910,6 @@ def __init__(self, self.dynamic_loss_scale = True - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - self.debug_fp16_grads = [{} for _ in self.fp16_groups] if dist.get_rank(group=self.dp_process_group) == 0: From 639d4875fa44be90e6aba691adcf6cf9e8f916bf Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 20:24:19 +0000 Subject: [PATCH 06/17] Adding tf32 and fp32 support for ZeRO Stage 3 --- deepspeed/runtime/config.py | 3 +- deepspeed/runtime/engine.py | 2 + .../runtime/zero/partition_parameters.py | 52 +++++++++++++++---- deepspeed/runtime/zero/stage3.py | 24 +++++---- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 3fa0b32a6032..23dc37c0b345 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -761,7 +761,8 @@ def _do_error_check(self): GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" + if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION: + assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) #if self.zero_config.cpu_offload is True: # assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 646e492cc3dd..293c6c72bdd9 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -574,6 +574,8 @@ def _configure_distributed_model(self, model): self.module = model if self.fp16_enabled(): self.module.half() + else: + assert all([param.dtype == torch.float for param in self.module.parameters()]), f"The fp16 is not enabled but dtype on parameters not fp16" if not self.dont_change_device: self.module.to(self.device) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index e831911efd62..a18ca47ca586 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -154,7 +154,7 @@ class ZeroParamStatus(Enum): _orig_torch_empty = torch.empty -def empty_cuda_tensor(*size, **kwargs): +def empty_cuda_tensor_half(*size, **kwargs): if not 'device' in kwargs.keys(): kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) tensor = _orig_torch_empty(*size, **kwargs) @@ -164,7 +164,7 @@ def empty_cuda_tensor(*size, **kwargs): return tensor -def new_cuda_tensor(cls, *args): +def new_cuda_tensor_half(cls, *args): device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) tensor = torch.ones((1, 1), device=device).new_empty(*args).half() if tensor.is_floating_point(): @@ -173,6 +173,19 @@ def new_cuda_tensor(cls, *args): return tensor +def empty_cuda_tensor(*size, **kwargs): + if not 'device' in kwargs.keys(): + kwargs['device'] = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + tensor = _orig_torch_empty(*size, **kwargs) + return tensor + + +def new_cuda_tensor(cls, *args): + device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + tensor = torch.ones((1, 1), device=device).new_empty(*args) + return tensor + + reuse_buffers = False temp_contiguous_tensor = None empty_buffers = {} @@ -181,9 +194,15 @@ def new_cuda_tensor(cls, *args): # Inserts _post_init_method at the end of init method # for all sub classes of torch.nn.Module class InsertPostInitMethodToModuleSubClasses(object): - def __init__(self, enabled=True, mem_efficient_linear=True): + def __init__(self, + enabled=True, + mem_efficient_linear=True, + deepspeed_config=None, + dtype=None): self.mem_efficient_linear = mem_efficient_linear self.enabled = enabled + self._set_dtype(deepspeed_config, dtype) + assert self.dtype in [torch.half, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.float]" def __enter__(self): if not self.enabled: @@ -219,8 +238,12 @@ def _init_subclass(cls, **kwargs): # Replace .__init__() for future subclasses of torch.nn.Module torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass) - torch.Tensor.__new__ = new_cuda_tensor - torch.empty = empty_cuda_tensor + if self.dtype == torch.half: + torch.Tensor.__new__ = new_cuda_tensor_half + torch.empty = empty_cuda_tensor_half + else: + torch.Tensor.__new__ = new_cuda_tensor + torch.empty = empty_cuda_tensor if self.mem_efficient_linear: print_rank_0( @@ -260,6 +283,12 @@ def _disable_class(cls): def _post_init_method(self, module): pass + def _set_dtype(self, ds_config, dtype): + if ds_config is not None and dtype is None: + _ds_config = DeepSpeedConfig(ds_config) + self.dtype = torch.half if _ds_config.fp16_enabled else torch.float + elif dtype is None: + self.dtype = torch.half # Replaces all parameters in module with Scattered Parameters class Init(InsertPostInitMethodToModuleSubClasses): @@ -273,7 +302,8 @@ def __init__(self, pin_memory=False, deepspeed_config=None, param_dict=None, - enabled=True): + enabled=True, + dtype=None): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the system and converted to half precision. @@ -370,7 +400,10 @@ def get_model(): model = deepspeed.zero.Init(module=model) """ - super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear) + super().__init__(enabled=enabled, + mem_efficient_linear=mem_efficient_linear, + deepspeed_config=deepspeed_config, + dtype=dtype) if not torch.distributed.is_initialized(): init_distributed() assert torch.distributed.is_initialized(), "Parameters cannot be scattered without initializing torch.distributed" @@ -635,8 +668,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): f'Before partitioning param {param.ds_id} {param.shape}', force=False) #param.data does not store anything meaningful in partitioned state - param.data = torch.ones(partitioned_param_data_shape).half().to( - param.device) + param.data = torch.ones(1, dtype=self.dtype).to(param.device) see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) @@ -717,7 +749,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) - param.data = torch.ones(partitioned_param_data_shape).half().to(param.device) + param.data = torch.ones(1, dtype=self.dtype).to(param.device) see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 2b16887ff60d..ff26d1d39ddd 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -844,6 +844,7 @@ def __init__(self, # simplified param id self.param_id = {} + self.dtype = self.fp16_groups[0][0].dtype count = 0 for i, params_group in enumerate(self.fp16_groups): @@ -874,10 +875,11 @@ def __init__(self, self.local_overflow = False self.temp_grad_buffer_for_gpu_offload = torch.zeros( largest_partitioned_param_numel, - device=torch.cuda.current_device()).half() - self.temp_grad_gpu_buffer = torch.zeros( - largest_partitioned_param_numel, - device=torch.cuda.current_device()).half() + device=torch.cuda.current_device(), + dtype=self.dtype) + self.temp_grad_gpu_buffer = torch.zeros(largest_partitioned_param_numel, + device=torch.cuda.current_device(), + dtype=self.dtype) see_memory_usage(f"After CPU Offload initialization", force=False) # stores if a partition has been reduced in this step @@ -1059,7 +1061,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self): force=False) self.param_groups_fp16_flat_cpu_memory.append( torch.empty(int(flat_buffer_size), - dtype=torch.half, + dtype=self.dtype, pin_memory=True)) else: print_rank_0( @@ -1068,7 +1070,7 @@ def _create_param_groups_fp16_flat_cpu_memory(self): self.param_groups_fp16_flat_cpu_memory.append( torch.empty(1, - dtype=torch.half)) + dtype=self.dtype)) def _create_fp16_partitions_with_defragmentation(self): dist.barrier() @@ -1170,7 +1172,7 @@ def _create_fp16_partitions_with_defragmentation(self): -1] is None and self.param_group_fp16_flat_reuse_buffer is None: self.param_group_fp16_flat_reuse_buffer = torch.empty( max(self.fp16_partitioned_groups_flat_numel), - dtype=torch.half, + dtype=self.dtype, device='cpu', pin_memory=True) @@ -2076,12 +2078,12 @@ def partition_previous_reduced_grads(self): if self.offload_param_pin_memory: self.grads_in_partition.append( torch.zeros(int(total_size), - dtype=torch.half, + dtype=self.dtype, device=self.device).pin_memory()) else: self.grads_in_partition.append( torch.zeros(int(total_size), - dtype=torch.half, + dtype=self.dtype, device=self.device)) see_memory_usage( f"group {i} after creating {total_size} reduced gradients into partition", @@ -2929,14 +2931,14 @@ def backward(self, loss, retain_graph=False): if self.contiguous_gradients: self.ipg_buffer = [] buf_0 = torch.empty(self.reduce_bucket_size, - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_0) # Use double buffers to avoid data access conflict when overlap_comm is enabled. if self.overlap_comm: buf_1 = torch.empty(self.reduce_bucket_size, - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_1) self.ipg_index = 0 From e3b35342f2c9990a076dc35b1a3ffe1201b66c24 Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 21:04:27 +0000 Subject: [PATCH 07/17] Changing to location of self.dtype assignment --- deepspeed/runtime/zero/stage3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index ff26d1d39ddd..931282d5362b 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -791,7 +791,7 @@ def __init__(self, self.sub_group_size = sub_group_size self.sub_group_to_group_id = {} - + self.dtype = self.optimizer.param_groups[0]['params'][0].dtype see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation() num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) @@ -844,7 +844,6 @@ def __init__(self, # simplified param id self.param_id = {} - self.dtype = self.fp16_groups[0][0].dtype count = 0 for i, params_group in enumerate(self.fp16_groups): From bed15a015535400cc3b9dcffa9f31a486c6a5ede Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 21:07:47 +0000 Subject: [PATCH 08/17] Exhaustive setting of self.dtype --- deepspeed/runtime/zero/partition_parameters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index a18ca47ca586..b910417f3d50 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -289,6 +289,8 @@ def _set_dtype(self, ds_config, dtype): self.dtype = torch.half if _ds_config.fp16_enabled else torch.float elif dtype is None: self.dtype = torch.half + else: + self.dtype = dtype # Replaces all parameters in module with Scattered Parameters class Init(InsertPostInitMethodToModuleSubClasses): From bfe1e844a9ea0db43cea45fdff83902d46b3a873 Mon Sep 17 00:00:00 2001 From: Samyam Date: Tue, 20 Apr 2021 22:50:51 +0000 Subject: [PATCH 09/17] Adding fp32 and tf32 support for ZeRO Stage 2 --- deepspeed/runtime/config.py | 2 +- deepspeed/runtime/zero/stage2.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 23dc37c0b345..661c9d03e943 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -761,7 +761,7 @@ def _do_error_check(self): GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - if self.zero_optimization_stage != MAX_STAGE_ZERO_OPTIMIZATION: + if self.zero_optimization_stage < ZERO_OPTIMIZATION_GRADIENTS: assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) #if self.zero_config.cpu_offload is True: diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 39d780e55574..c59e87517add 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -187,6 +187,7 @@ def __init__(self, partition_id = dist.get_rank(group=self.dp_process_group) self.all_reduce_print = False + self.dtype = self.optimizer.param_groups[0]['params'][0].dtype # padding on each partition for alignment purposes self.groups_padding = [] @@ -306,10 +307,12 @@ def __init__(self, self.grad_position = {} self.temp_grad_buffer_for_cpu_offload = torch.zeros( largest_param_numel, - device=self.device).half().pin_memory() + device=self.device, + dtype=self.dtype).pin_memory() self.temp_grad_buffer_for_gpu_offload = torch.zeros( largest_param_numel, - device=torch.cuda.current_device()).half() + device=torch.cuda.current_device(), + dtype=self.dtype) for i, params_group in enumerate(self.fp16_groups): self.get_grad_position(i, @@ -464,14 +467,14 @@ def independent_gradient_partition_epilogue(self): self.params_in_partition[i], self.first_offset[i], self.partition_size[i], - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device(), return_tensor_list=True) else: avg_new = self.get_flat_partition(self.params_in_partition[i], self.first_offset[i], self.partition_size[i], - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device(), return_tensor_list=True) @@ -931,7 +934,7 @@ def copy_grads_in_partition(self, param): see_memory_usage(f"before copying {total_size} gradients into partition") self.grads_in_partition = torch.empty(int(total_size), - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) see_memory_usage(f"after copying {total_size} gradients into partition") @@ -1617,14 +1620,14 @@ def backward(self, loss, retain_graph=False): if self.contiguous_gradients: self.ipg_buffer = [] buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_0) # Use double buffers to avoid data access conflict when overlap_comm is enabled. if self.overlap_comm: buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, + dtype=self.dtype, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_1) self.ipg_index = 0 From 7d762799cd43352e81153181bf3f54fc265ad5e7 Mon Sep 17 00:00:00 2001 From: Samyam Date: Fri, 23 Apr 2021 22:37:48 +0000 Subject: [PATCH 10/17] fix loss scale value for static loss scale --- deepspeed/runtime/zero/stage2.py | 13 +++++++------ deepspeed/runtime/zero/stage3.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index c59e87517add..cf6d4876d401 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -357,7 +357,13 @@ def __init__(self, self.create_reduce_and_remove_grad_hooks() # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: + if self.dtype == torch.float or not dynamic_loss_scale: + loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale + + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(scale=loss_scale_value) + cur_iter = 0 + else: if dynamic_loss_args is None: self.loss_scaler = DynamicLossScaler() else: @@ -365,11 +371,6 @@ def __init__(self, self.dynamic_loss_scale = True - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - see_memory_usage("Before initializing optimizer states") self.initialize_optimizer_states() see_memory_usage("After initializing optimizer states") diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 931282d5362b..8b7aee16c4ee 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -640,12 +640,13 @@ def __init__(self, util_ops = UtilsBuilder().load() self.flatten = util_ops.flatten self.unflatten = util_ops.unflatten + self.dtype = self.optimizer.param_groups[0]['params'][0].dtype if not all(is_zero_param(p) for p in module.parameters()): group = None if mpu: group = mpu.get_data_parallel_group() - Init(module=module, data_parallel_group=group) + Init(module=module, data_parallel_group=group, dtype=self.dtype) for m in module.modules(): _init_external_params(m) @@ -791,7 +792,6 @@ def __init__(self, self.sub_group_size = sub_group_size self.sub_group_to_group_id = {} - self.dtype = self.optimizer.param_groups[0]['params'][0].dtype see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation() num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) @@ -896,7 +896,13 @@ def __init__(self, #exit(0) # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: + if self.dtype == torch.float or not dynamic_loss_scale: + loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale + + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(scale=loss_scale_value) + cur_iter = 0 + else: if dynamic_loss_args is None: self.loss_scaler = DynamicLossScaler() else: @@ -904,11 +910,6 @@ def __init__(self, self.dynamic_loss_scale = True - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - self.debug_fp16_grads = [{} for _ in self.fp16_groups] if dist.get_rank(group=self.dp_process_group) == 0: From 0bf7efd93f71b9e8a9dc7dcaf0d0c483215b6ced Mon Sep 17 00:00:00 2001 From: Samyam Date: Fri, 23 Apr 2021 22:50:48 +0000 Subject: [PATCH 11/17] adding documentation for dtype in zero init --- deepspeed/runtime/zero/partition_parameters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index b910417f3d50..8926ba1ca8ca 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -292,6 +292,7 @@ def _set_dtype(self, ds_config, dtype): else: self.dtype = dtype + # Replaces all parameters in module with Scattered Parameters class Init(InsertPostInitMethodToModuleSubClasses): param_id = 0 @@ -331,6 +332,8 @@ def __init__(self, as a dictionary instead for swapping fp16 params to NVMe. enabled (bool, optional): If ``False``, this context has no effect. Defaults to ``True``. + dtype (``torch.dtype``, optional): Can be used to change the data type of the parameters. + Supported options are ``torch.half`` and ``torch.float``. Defaults to ``torch.half`` This context accelerates model initialization and enables models that are too large to allocate in their entirety in CPU memory. It has the From 7260cb2d26ecc9289a6b1e6ac05ecefbf7f992e5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 29 Apr 2021 19:36:51 +0000 Subject: [PATCH 12/17] Disable cpu-adam update_copy api for fp32 --- deepspeed/runtime/config.py | 2 +- deepspeed/runtime/zero/partition_parameters.py | 13 +++++-------- deepspeed/runtime/zero/stage2.py | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index b09dbebf99d8..727c0810290e 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -534,7 +534,7 @@ def __init__(self, config: Union[str, dict], mpu=None): object_pairs_hook=dict_raise_error_on_duplicate_keys) else: raise ValueError( - f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {ds_config}" + f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {config}" ) try: self.global_rank = torch.distributed.get_rank() diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 8a9c204981f8..3cacc524bd69 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -194,14 +194,10 @@ def new_cuda_tensor(cls, *args): # Inserts _post_init_method at the end of init method # for all sub classes of torch.nn.Module class InsertPostInitMethodToModuleSubClasses(object): - def __init__(self, - enabled=True, - mem_efficient_linear=True, - deepspeed_config=None, - dtype=None): + def __init__(self, enabled=True, mem_efficient_linear=True, config=None, dtype=None): self.mem_efficient_linear = mem_efficient_linear self.enabled = enabled - self._set_dtype(deepspeed_config, dtype) + self._set_dtype(config, dtype) assert self.dtype in [torch.half, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.float]" def __enter__(self): @@ -304,7 +300,8 @@ def __init__(self, remote_device=None, pin_memory=False, config=None, - enabled=True): + enabled=True, + dtype=torch.half): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the system and converted to half precision. @@ -403,7 +400,7 @@ def get_model(): super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear, - deepspeed_config=deepspeed_config, + config=config, dtype=dtype) if not torch.distributed.is_initialized(): init_distributed() diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index cf3e81f3034b..9bf06a585bf1 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1459,7 +1459,7 @@ def step(self, closure=None): self.start_timers([OPTIMIZER_STEP]) if self.deepspeed_adam_offload: from deepspeed.ops.adam import DeepSpeedCPUAdam - if type(self.optimizer) == DeepSpeedCPUAdam: + if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half: fp16_param_groups = [ fp16_partitions[partition_id] for fp16_partitions in self.parallel_partitioned_fp16_groups From b31088b800e8668cd52a686e7c866e35e9ab70bf Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 29 Apr 2021 19:46:40 +0000 Subject: [PATCH 13/17] Disable gradient clipping in engine for ZeRO --- deepspeed/runtime/engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 6fae47baa1bd..1d7723de3168 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1095,7 +1095,8 @@ def clip_fp32_gradients(self): def _take_model_step(self, lr_kwargs): if self.gradient_clipping() > 0.0: - if not self.fp16_enabled() and not self.amp_enabled(): + if not (self.fp16_enabled() or self.amp_enabled() + or self.zero_optimization()): self.clip_fp32_gradients() elif self.amp_enabled(): # AMP's recommended way of doing clipping From cec23774aa3b7be48f02d255af29e9d0e68e38d9 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 29 Apr 2021 20:03:43 +0000 Subject: [PATCH 14/17] fp16 mode init required for ZeRO-3 --- deepspeed/runtime/engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 1d7723de3168..83413fe6f81a 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -582,6 +582,8 @@ def is_replicated(p): def _configure_distributed_model(self, model): self.module = model if self.fp16_enabled(): + if self.zero_optimization_partition_weights(): + assert all([param.dtype == torch.half for param in self.module.parameters()]), f"Model must initialized in fp16 mode for ZeRO Stage 3." self.module.half() else: assert all([param.dtype == torch.float for param in self.module.parameters()]), f"The fp16 is not enabled but dtype on parameters not fp16" From de76124fc2389467ad7ba52c302a77ebd4828ba1 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Thu, 29 Apr 2021 13:24:51 -0700 Subject: [PATCH 15/17] Update engine.py Assert to check if param.dtype is torch.half for ZeRO3 should only happen if the model was initialized in ZeRO3 context. --- deepspeed/runtime/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 83413fe6f81a..1cfe245c65a4 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -582,7 +582,7 @@ def is_replicated(p): def _configure_distributed_model(self, model): self.module = model if self.fp16_enabled(): - if self.zero_optimization_partition_weights(): + if self.zero_optimization_partition_weights() and any([hasattr(param,'ds_id') for param in self.module.parameters()]): assert all([param.dtype == torch.half for param in self.module.parameters()]), f"Model must initialized in fp16 mode for ZeRO Stage 3." self.module.half() else: From bb46f5880ce92ed9286af18f5f47bacfda98be86 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 29 Apr 2021 20:34:17 +0000 Subject: [PATCH 16/17] Formatting fix --- deepspeed/runtime/engine.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 1cfe245c65a4..46f969ab44ec 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -582,7 +582,9 @@ def is_replicated(p): def _configure_distributed_model(self, model): self.module = model if self.fp16_enabled(): - if self.zero_optimization_partition_weights() and any([hasattr(param,'ds_id') for param in self.module.parameters()]): + if self.zero_optimization_partition_weights() and any( + [hasattr(param, + 'ds_id') for param in self.module.parameters()]): assert all([param.dtype == torch.half for param in self.module.parameters()]), f"Model must initialized in fp16 mode for ZeRO Stage 3." self.module.half() else: From ab35410a71361485dd4f234b9354bb41d5152913 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 29 Apr 2021 21:27:25 +0000 Subject: [PATCH 17/17] bump DSE --- DeepSpeedExamples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DeepSpeedExamples b/DeepSpeedExamples index bdf8e59aede8..127372571189 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit bdf8e59aede8c8e0577e8d4d557298ca8515268f +Subproject commit 127372571189ac905c8c92f4fe55a3d85c80324e