From 2ffaecbbf32773c907a27b718141bee4257a1ac7 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Wed, 10 Mar 2021 14:53:14 -0800 Subject: [PATCH 1/5] Fix mis-aligned-grad When a parameter is not divisible by world size, the partitioned gradients are mis-aligned due to incorrect padding handling. This PR should fix that. --- deepspeed/runtime/zero/partition_parameters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 05825fc90688..4216e5606a76 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -807,8 +807,9 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): if start < param.ds_numel: elements = min(param.ds_numel - start, partition_size) - dest_tensor = partition_buffer.view(-1).narrow(0, 0, elements) + dest_tensor_full_buffer = partition_buffer.view(-1).narrow(0, 0, partition_size) + dest_tensor = dest_tensor_full_buffer.narrow(0, 0, elements) src_tensor = param.grad.view(-1).narrow(0, start, elements) # just copy the grad partition to the buffer @@ -841,7 +842,7 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): # elements)) #print("after partition gradients") - param.grad.data = dest_tensor.data + param.grad.data = dest_tensor_full_buffer.data see_memory_usage("After partitioning gradients", force=False) From ac9266eef286235eb593ed0110fb7231dbd2c936 Mon Sep 17 00:00:00 2001 From: Samyam Date: Wed, 10 Mar 2021 23:11:49 +0000 Subject: [PATCH 2/5] Formatting fix --- deepspeed/runtime/zero/partition_parameters.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 4216e5606a76..e6cb9199899a 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ 
b/deepspeed/runtime/zero/partition_parameters.py @@ -807,7 +807,10 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): if start < param.ds_numel: elements = min(param.ds_numel - start, partition_size) - dest_tensor_full_buffer = partition_buffer.view(-1).narrow(0, 0, partition_size) + dest_tensor_full_buffer = partition_buffer.view(-1).narrow( + 0, + 0, + partition_size) dest_tensor = dest_tensor_full_buffer.narrow(0, 0, elements) src_tensor = param.grad.view(-1).narrow(0, start, elements) From bf53561cad23f47ffed58887bb17d0aed8acc0b4 Mon Sep 17 00:00:00 2001 From: Samyam Date: Wed, 10 Mar 2021 23:26:46 +0000 Subject: [PATCH 3/5] Adding static_scale test back for Z3, and also changing hidden size to be not divisible by world_size --- tests/unit/test_fp16.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 5012614f97b0..038ccacc471f 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -347,9 +347,6 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") - if zero_stage == 3: - pytest.skip("skip for now") - config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -372,7 +369,8 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): @distributed_test(world_size=2) def _test_zero_static_scale(args, zero_stage): - hidden_dim = 10 + #making hidden size not divisible by DP for covering this scenario + hidden_dim = 9 model = SimpleModel(hidden_dim) model, optim, _, _ = deepspeed.initialize(args=args, From 9cd813d252e963bd44915b0e5e98fa5a83973dda Mon Sep 17 00:00:00 2001 From: Samyam Date: Thu, 11 Mar 2021 02:08:58 +0000 Subject: [PATCH 4/5] also removing alignment from flat fp16 buffers --- deepspeed/runtime/zero/stage3.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index d2c197fa93c8..10208c7b5b49 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -961,10 +961,9 @@ def _create_fp16_partitions_with_defragmentation(self): #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned( - self.fp16_partitioned_groups[i], - dist.get_world_size(group=self.dp_process_group)).cuda( - torch.cuda.current_device())) + flatten_dense_tensors_aligned(self.fp16_partitioned_groups[i], + 1).cuda( + torch.cuda.current_device())) see_memory_usage( f"After flattening and moving param group {i} to GPU", force=False) From 5692c62b60e81e7212c5f7d848e7a0fd9d744eea Mon Sep 17 00:00:00 2001 From: Samyam Date: Thu, 11 Mar 2021 02:21:52 +0000 Subject: [PATCH 5/5] Testing for hidden dim alignment --- tests/unit/test_fp16.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 038ccacc471f..dbd40c322be9 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -368,9 +368,9 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): args = args_from_dict(tmpdir, config_dict) @distributed_test(world_size=2) - def _test_zero_static_scale(args, zero_stage): + def _test_zero_static_scale(args, zero_stage, hidden_dim): #making hidden size not divisible by DP for covering this scenario - hidden_dim = 9 + hidden_dim = hidden_dim model = SimpleModel(hidden_dim) model, optim, _, _ = deepspeed.initialize(args=args, @@ -391,7 +391,10 @@ def _test_zero_static_scale(args, zero_stage): model.backward(loss) model.step() - _test_zero_static_scale(args=args, zero_stage=zero_stage) + #test when hidden_dim is not aligned with world size + _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9) + #test when hidden_dim is aligned with world size + _test_zero_static_scale(args=args, zero_stage=zero_stage, 
hidden_dim=10) def test_zero_static_scale_deprecated_format(tmpdir):