From 4ff2cfe8e7a5978be459ad17b3e9c3e0322736f1 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 17 Apr 2021 15:27:53 +0000 Subject: [PATCH 1/2] Make reduce scatter optional for ZeRO-1 as workaround --- deepspeed/runtime/engine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 5081adb0b021..06ecdb072766 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -741,7 +741,6 @@ def _configure_zero_optimizer(self, optimizer): timers = self.timers if self.wall_clock_breakdown() else None if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter(), 'Stage 1 only supports reduce scatter mode' optimizer = FP16_DeepSpeedZeroOptimizer_Stage1( optimizer, static_loss_scale=self.loss_scale(), @@ -941,8 +940,8 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): #Communicate only at gradient accumulation boundaries elif self.is_gradient_accumulation_boundary(): - if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter() + if self.zero_optimization_stage( + ) == ZERO_OPTIMIZATION_OPTIMIZER_STATES and self.zero_reduce_scatter(): self.optimizer.reduce_scatter_gradients( postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), From ff581f35b7a1c3b2a6e69d5c26e12546644ba9e1 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 17 Apr 2021 18:25:05 +0000 Subject: [PATCH 2/2] Make allreduce default for ZeRO 1 --- deepspeed/runtime/zero/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index e5812980a337..21642fe4c68e 100755 --- a/deepspeed/runtime/zero/constants.py +++ b/deepspeed/runtime/zero/constants.py @@ -48,7 +48,7 @@ ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' -ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True +ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = False ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False