From bd9c78126e8ad323a9353fc15f6b8c5ed10d62f1 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 19 Mar 2021 11:36:06 -0700 Subject: [PATCH 1/4] zero.Init() clarification Clarify that if `model.half()` can't fit into GPU memory, `zero.Init()` is a must. This proposal is via @samyam's clarification shared elsewhere. Thank you. --- deepspeed/runtime/zero/partition_parameters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index e6cb9199899a..2cfb5871d68e 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -278,6 +278,9 @@ def __init__(self, allows for a linear increase in model size with the aggregate system memory. For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion parameter model with 4 nodes and 32 GPUs. + + Important: If the fp16 weights of the model can't fit onto the gpu memory this + feature must be used. .. note:: Initializes ``torch.distributed`` if it has not already been done so. From c02227567f0114a3dc071c255ccb3d8f30578f03 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 19 Mar 2021 11:39:14 -0700 Subject: [PATCH 2/4] style --- deepspeed/runtime/zero/partition_parameters.py | 4 ++-- docs/_tutorials/getting-started.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 2cfb5871d68e..e5aca2c75842 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -278,8 +278,8 @@ def __init__(self, allows for a linear increase in model size with the aggregate system memory. For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion parameter model with 4 nodes and 32 GPUs. 
- - Important: If the fp16 weights of the model can't fit onto the gpu memory this + + Important: If the fp16 weights of the model can't fit onto the gpu memory this feature must be used. .. note:: diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index e12388aaf973..e9b9aa0e627e 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -265,8 +265,8 @@ local machine to discover the number of slots available. The `--include` and `--exclude` arguments work as normal, but the user should specify 'localhost' as the hostname. -Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control -which devices should be used. For example, to use only gpu1 of the current +Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control +which devices should be used. For example, to use only gpu1 of the current node, do: ```bash deepspeed --include localhost:1 ... From 40060909be8f65b61730f05219b00851851116ce Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 19 Mar 2021 11:42:40 -0700 Subject: [PATCH 3/4] add clarity --- deepspeed/runtime/zero/partition_parameters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index e5aca2c75842..e80c87e91d77 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -279,8 +279,8 @@ def __init__(self, For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion parameter model with 4 nodes and 32 GPUs. - Important: If the fp16 weights of the model can't fit onto the gpu memory this - feature must be used. + Important: If the fp16 weights of the model can't fit onto a single GPU memory + this feature must be used. .. note:: Initializes ``torch.distributed`` if it has not already been done so. 
From cdc439c36fe19d9e1776dda7ef67911b3987f22d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 19 Mar 2021 11:44:05 -0700 Subject: [PATCH 4/4] style --- deepspeed/runtime/zero/partition_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index e80c87e91d77..4465adfd7c16 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -279,7 +279,7 @@ def __init__(self, For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion parameter model with 4 nodes and 32 GPUs. - Important: If the fp16 weights of the model can't fit onto a single GPU memory + Important: If the fp16 weights of the model can't fit onto a single GPU memory this feature must be used. .. note::