diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index e6cb9199899a..4465adfd7c16 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -279,6 +279,9 @@ def __init__(self,
         For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
         parameter model with 4 nodes and 32 GPUs.
 
+        Important: If the fp16 weights of the model can't fit into the memory of a
+        single GPU, this feature must be used.
+
         .. note::
             Initializes ``torch.distributed`` if it has not already been done so.
             See :meth:`deepseed.init_distributed` for more information.