diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py
index 451451c51a32..dc82c0adf24f 100755
--- a/deepspeed/runtime/fp16/loss_scaler.py
+++ b/deepspeed/runtime/fp16/loss_scaler.py
@@ -23,6 +23,7 @@
 
 import torch
 from deepspeed import comm as dist
+from deepspeed.runtime.constants import FP16_INITIAL_SCALE_POWER_DEFAULT
 from deepspeed.utils import logger
 
 INITIAL_LOSS_SCALE = 'init_scale'
@@ -109,14 +110,14 @@ class DynamicLossScaler(LossScalerBase):
     always using the highest loss scale possible without incurring overflow.
 
     Args:
-        init_scale (float, optional, default=2**32):  Initial loss scale attempted by :class:`DynamicLossScaler.`
+        init_scale (float, optional, default=2**16):  Initial loss scale attempted by :class:`DynamicLossScaler.`
         scale_factor (float, optional, default=2.0):  Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``.  If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
         scale_window (int, optional, default=1000):  Number of consecutive iterations without an overflow to wait before increasing the loss scale.
         consecutive_hysteresis (bool, optional, default=False): Whether to refill hysteresis if we reach an iteration that doesn't overflow
     """
 
     def __init__(self,
-                 init_scale=2**32,
+                 init_scale=2**FP16_INITIAL_SCALE_POWER_DEFAULT,
                  scale_factor=2.,
                  scale_window=1000,
                  min_scale=1,