From bd9c78126e8ad323a9353fc15f6b8c5ed10d62f1 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Fri, 19 Mar 2021 11:36:06 -0700
Subject: [PATCH 1/4] zero.Init() clarification

Clarify that `zero.Init()` is required when the `model.half()` weights cannot fit into GPU memory.

This proposal is based on @samyam's clarification shared elsewhere.

Thank you.
---
 deepspeed/runtime/zero/partition_parameters.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index e6cb9199899a..2cfb5871d68e 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -278,6 +278,9 @@ def __init__(self,
         allows for a linear increase in model size with the aggregate system memory.
         For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
         parameter model with 4 nodes and 32 GPUs.
+        
+        Important: If the fp16 weights of the model can't fit onto the gpu memory this 
+        feature must be used.
 
         .. note::
             Initializes ``torch.distributed`` if it has not already been done so.

From c02227567f0114a3dc071c255ccb3d8f30578f03 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Fri, 19 Mar 2021 11:39:14 -0700
Subject: [PATCH 2/4] style

---
 deepspeed/runtime/zero/partition_parameters.py | 4 ++--
 docs/_tutorials/getting-started.md             | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 2cfb5871d68e..e5aca2c75842 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -278,8 +278,8 @@ def __init__(self,
         allows for a linear increase in model size with the aggregate system memory.
         For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
         parameter model with 4 nodes and 32 GPUs.
-        
-        Important: If the fp16 weights of the model can't fit onto the gpu memory this 
+
+        Important: If the fp16 weights of the model can't fit onto the gpu memory this
         feature must be used.
 
         .. note::
diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md
index e12388aaf973..e9b9aa0e627e 100644
--- a/docs/_tutorials/getting-started.md
+++ b/docs/_tutorials/getting-started.md
@@ -265,8 +265,8 @@ local machine to discover the number of slots available. The `--include` and
 `--exclude` arguments work as normal, but the user should specify 'localhost'
 as the hostname.
 
-Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control 
-which devices should be used. For example, to use only gpu1 of the current 
+Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control
+which devices should be used. For example, to use only gpu1 of the current
 node, do:
 ```bash
 deepspeed --include localhost:1 ...

From 40060909be8f65b61730f05219b00851851116ce Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Fri, 19 Mar 2021 11:42:40 -0700
Subject: [PATCH 3/4] add clarity

---
 deepspeed/runtime/zero/partition_parameters.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index e5aca2c75842..e80c87e91d77 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -279,8 +279,8 @@ def __init__(self,
         For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
         parameter model with 4 nodes and 32 GPUs.
 
-        Important: If the fp16 weights of the model can't fit onto the gpu memory this
-        feature must be used.
+        Important: If the fp16 weights of the model can't fit onto a single GPU memory 
+        this feature must be used.
 
         .. note::
             Initializes ``torch.distributed`` if it has not already been done so.

From cdc439c36fe19d9e1776dda7ef67911b3987f22d Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Fri, 19 Mar 2021 11:44:05 -0700
Subject: [PATCH 4/4] style

---
 deepspeed/runtime/zero/partition_parameters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index e80c87e91d77..4465adfd7c16 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -279,7 +279,7 @@ def __init__(self,
         For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
         parameter model with 4 nodes and 32 GPUs.
 
-        Important: If the fp16 weights of the model can't fit onto a single GPU memory 
+        Important: If the fp16 weights of the model can't fit onto a single GPU's memory,
         this feature must be used.
 
         .. note::