From e423d104f952998d936b63182ba1cf78621e4863 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Mon, 8 Mar 2021 09:09:52 -0800
Subject: [PATCH 1/3] less scary overflow notice

---
 deepspeed/runtime/fp16/fused_optimizer.py   | 2 +-
 deepspeed/runtime/fp16/loss_scaler.py       | 2 +-
 deepspeed/runtime/fp16/unfused_optimizer.py | 4 ++--
 deepspeed/runtime/zero/stage1.py            | 2 +-
 deepspeed/runtime/zero/stage2.py            | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py
index 8c1d2003cb1b..1fc24ef60e53 100755
--- a/deepspeed/runtime/fp16/fused_optimizer.py
+++ b/deepspeed/runtime/fp16/fused_optimizer.py
@@ -153,7 +153,7 @@ def step_fused_adam(self, closure=None):
 
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
+                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.cur_scale))
diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py
index ad6f8f6227f9..bbc718a47fe6 100755
--- a/deepspeed/runtime/fp16/loss_scaler.py
+++ b/deepspeed/runtime/fp16/loss_scaler.py
@@ -213,7 +213,7 @@ def update_scale(self, overflow):
                 optimizer.step()
             # Otherwise, don't do anything -- ie, skip iteration
             else:
-                print('OVERFLOW!')
+                print('Overflow!')
 
             # Update loss scale for next iteration
             loss_scaler.update_scale(has_overflow)
diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py
index 37edf9d5002d..72846092df6c 100755
--- a/deepspeed/runtime/fp16/unfused_optimizer.py
+++ b/deepspeed/runtime/fp16/unfused_optimizer.py
@@ -139,7 +139,7 @@ def step_fused_lamb(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
+                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.cur_scale))
@@ -165,7 +165,7 @@ def step(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
+                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.cur_scale))
diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py
index d5c7616ff87e..bcf45298673e 100755
--- a/deepspeed/runtime/zero/stage1.py
+++ b/deepspeed/runtime/zero/stage1.py
@@ -630,7 +630,7 @@ def step(self, closure=None):
         if self.overflow:
             self.zero_grad()
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
+                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.loss_scale))
diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index 4cf39cb5ccd6..d09e624fe9b1 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -1355,7 +1355,7 @@ def step(self, closure=None):
                 see_memory_usage('After overflow after clearing gradients')
 
                 logger.info(
-                    "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, "
+                    "[deepscale] Overflow! Rank {} Skipping step. Attempted loss scale: {}, "
                     "reducing to {}".format(dist.get_rank(),
                                             prev_scale,
                                             self.loss_scale))

From f82716d3fe95b759e4da33e1e1d5d887c97aaaaf Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 10 Mar 2021 19:06:14 -0800
Subject: [PATCH 2/3] make the message more user-friendly

---
 deepspeed/runtime/fp16/fused_optimizer.py   | 2 +-
 deepspeed/runtime/fp16/loss_scaler.py       | 2 +-
 deepspeed/runtime/fp16/unfused_optimizer.py | 4 ++--
 deepspeed/runtime/zero/stage1.py            | 2 +-
 deepspeed/runtime/zero/stage2.py            | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py
index 1fc24ef60e53..4d8fcb6392fe 100755
--- a/deepspeed/runtime/fp16/fused_optimizer.py
+++ b/deepspeed/runtime/fp16/fused_optimizer.py
@@ -153,7 +153,7 @@ def step_fused_adam(self, closure=None):
 
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
+                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.cur_scale))
diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py
index bbc718a47fe6..954d0ea61585 100755
--- a/deepspeed/runtime/fp16/loss_scaler.py
+++ b/deepspeed/runtime/fp16/loss_scaler.py
@@ -213,7 +213,7 @@ def update_scale(self, overflow):
                 optimizer.step()
             # Otherwise, don't do anything -- ie, skip iteration
             else:
-                print('Overflow!')
+                print('fp16 dynamic loss scale overflow!')
 
             # Update loss scale for next iteration
             loss_scaler.update_scale(has_overflow)
diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py
index 72846092df6c..6b22afff8ab4 100755
--- a/deepspeed/runtime/fp16/unfused_optimizer.py
+++ b/deepspeed/runtime/fp16/unfused_optimizer.py
@@ -139,7 +139,7 @@ def step_fused_lamb(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
+                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.cur_scale))
@@ -165,7 +165,7 @@ def step(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
+                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.cur_scale))
diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py
index bcf45298673e..3bc6ff9c5f9c 100755
--- a/deepspeed/runtime/zero/stage1.py
+++ b/deepspeed/runtime/zero/stage1.py
@@ -630,7 +630,7 @@ def step(self, closure=None):
         if self.overflow:
             self.zero_grad()
             if self.verbose:
-                logger.info("[deepspeed] Overflow! Skipping step. Attempted loss "
+                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
                             "scale: {}, reducing to {}".format(
                                 prev_scale,
                                 self.loss_scale))
diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index d09e624fe9b1..6f3fb1cd6509 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -1355,7 +1355,7 @@ def step(self, closure=None):
                 see_memory_usage('After overflow after clearing gradients')
 
                 logger.info(
-                    "[deepscale] Overflow! Rank {} Skipping step. Attempted loss scale: {}, "
+                    "[deepspeed] fp16 dynamic loss scale overflow! Rank {} Skipping step. Attempted loss scale: {}, "
                     "reducing to {}".format(dist.get_rank(),
                                             prev_scale,
                                             self.loss_scale))

From 8d50c8525e1896f62bf0b92f003cea5988189403 Mon Sep 17 00:00:00 2001
From: Jeff Rasley
Date: Wed, 10 Mar 2021 18:16:22 -0800
Subject: [PATCH 3/3] formatting

---
 deepspeed/runtime/fp16/fused_optimizer.py   |  8 ++++----
 deepspeed/runtime/fp16/unfused_optimizer.py | 16 ++++++++--------
 deepspeed/runtime/zero/stage1.py            |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py
index 4d8fcb6392fe..5f35c1884a41 100755
--- a/deepspeed/runtime/fp16/fused_optimizer.py
+++ b/deepspeed/runtime/fp16/fused_optimizer.py
@@ -153,10 +153,10 @@ def step_fused_adam(self, closure=None):
 
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                      self.cur_scale))
             return self.overflow
         combined_scale = self.unscale_and_clip_grads(grads_groups_flat,
                                                      norm_groups,
diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py
index 6b22afff8ab4..c0cef6a56ba7 100755
--- a/deepspeed/runtime/fp16/unfused_optimizer.py
+++ b/deepspeed/runtime/fp16/unfused_optimizer.py
@@ -139,10 +139,10 @@ def step_fused_lamb(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                      self.cur_scale))
             return self.overflow
 
         combined_scale = self.unscale_and_clip_grads(norm_groups, apply_scale=False)
@@ -165,10 +165,10 @@ def step(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                      self.cur_scale))
             return self.overflow
 
         norm_groups = []
diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py
index 3bc6ff9c5f9c..7cd37f904faa 100755
--- a/deepspeed/runtime/zero/stage1.py
+++ b/deepspeed/runtime/zero/stage1.py
@@ -630,10 +630,10 @@ def step(self, closure=None):
         if self.overflow:
             self.zero_grad()
             if self.verbose:
-                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.loss_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                      self.loss_scale))
             return self.overflow
 
         norm_groups = []