diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index e6cb9199899a..4465adfd7c16 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -279,6 +279,9 @@ def __init__(self, For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion parameter model with 4 nodes and 32 GPUs. + Important: If the fp16 weights of the model cannot fit in the memory of a single GPU, + this feature must be used. + .. note:: Initializes ``torch.distributed`` if it has not already been initialized. See :meth:`deepspeed.init_distributed` for more information.