From 8d2e33d42630ca265844de3ade25931be56fe3f6 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Fri, 6 Jan 2023 11:30:36 +0800
Subject: [PATCH 1/6] [example] add google doc for benchmark results of GPT

---
 examples/language/gpt/README.md | 53 +--------------------------------
 1 file changed, 1 insertion(+), 52 deletions(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index 07905b0cb3d0..b3d2bb53eba8 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -62,58 +62,7 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the pla
 Testbed: a cluster of 8xA100 (80GB) and 1xAMD EPYC 7543 32-Core Processor (512 GB). GPUs are connected via PCI-e.
 ColossalAI version 0.1.13.
 
-How dose Batch Size affect the efficency.
-
-| model | #GPU | policy | TP | batch per DP | Tflops |
-| ---------- | --------- |--------- |--------- |--------- |--------- |
-| gpt2_10b |  2  | cpu | 1 | 32 | 122.046 |
-| gpt2_10b |  2  | cpu | 1 | 16 | 82.649 |
-| gpt2_10b |  2  | cpu | 1 | 8 | 61.354 |
-
-
-How dose the Placement Policy affect the efficency.
-
-| model | #GPU | policy | TP | batch per DP | Tflops |
-| ---------- | --------- |--------- |--------- |--------- |--------- |
-| gpt2_10b |  4  | auto | 1 | 8 | 88.657 |
-| gpt2_10b |  4  | cuda | 1 | 8 | OOM |
-| gpt2_10b |  4  | cpu | 1 | 8 | 61.354 |
-| gpt2_10b |  4  | const | 1 | 8 | 82.137 |
-
-How dose the Tensor Parallel Degree affect the efficency.
-
-| model | #GPU | policy | TP | batch per DP | Tflops |
-| ---------- | --------- |--------- |--------- |--------- |--------- |
-| gpt2_10b |  4  | auto | 1 | 8 | 88.657 |
-| gpt2_10b |  4  | auto | 2 | 8 | 56.687 |
-| gpt2_10b |  4  | auto | 4 | 8 | 29.019 |
-| gpt2_10b |  4  | auto | 4 | 64 | 50.411 |
-| gpt2_20b |  1  | cpu | 1 | 8 | 43.102 |
-| gpt2_20b |  4  | cpu | 4 | 8 | 28.491 |
-
-
-Touch the bar of model scale and batch size.
-
-1. `cpu` is the most stable policy for large model and large batch size. One 8 GPU with TP=2, largest batch size of `auto`, `const`
- `cpu` is 64, 32 and 16, respectively.
-
-2. Tensor parallel is necessary for 20B model to reduce model data memory requirement on each GPU.
-
-| model | #GPU | policy | TP | batch per DP | Tflops |
-| ---------- | --------- |--------- |--------- |--------- |--------- |
-| gpt2_20b |  4  | cpu | 1 | 64 | CUDA OOM |
-| gpt2_20b |  4  | auto | 1/2 | 64 | CUDA OOM |
-| gpt2_20b |  4  | cpu | 2 | 8 | 43.102 |
-| gpt2_20b |  4  | cpu | 2 | 64 | 121.394 |
-| gpt2_20b |  8  | auto | 2 | 16 | 99.871 |
-| gpt2_20b |  8  | cpu | 2 | 64 | 125.170 |
-| gpt2_20b |  8  | const | 2 | 32 | 105.415 |
-
-
-| model | #GPU | policy | TP | batch per DP | Tflops |
-| ---------- | --------- |--------- |--------- |--------- |--------- |
-| gpt2_20b |  8  | cpu | 2 | 8 | 46.895 |
-
+[benchmark results on google doc](https://docs.google.com/spreadsheets/d/15A2j3RwyHh-UobAPv_hJgT4W_d7CnlPm5Fp4yEzH5K4/edit#gid=0)
 
 ### Experimental Features
 

From b3199df9f56d84698cdbc925d1ad53a295560aa4 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Fri, 6 Jan 2023 11:36:14 +0800
Subject: [PATCH 2/6] add Tencent doc

---
 examples/language/gpt/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index b3d2bb53eba8..8fdf6be3b6d9 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -64,6 +64,8 @@ ColossalAI version 0.1.13.
 
 [benchmark results on google doc](https://docs.google.com/spreadsheets/d/15A2j3RwyHh-UobAPv_hJgT4W_d7CnlPm5Fp4yEzH5K4/edit#gid=0)
 
+[benchmark results on Tencent doc (for China)](https://docs.qq.com/sheet/DUVpqeVdxS3RKRldk?tab=BB08J2)
+
 ### Experimental Features
 
 #### [Pipeline Parallel](./experiments/pipeline_parallel/)

From 41ad66da65a04efda232453d85fd1d08ebc1412f Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Fri, 6 Jan 2023 14:53:23 +0800
Subject: [PATCH 3/6] [example] gpt, shard init on all processes

---
 colossalai/tensor/colo_tensor.py              |  8 +++----
 examples/language/gpt/gemini/run_gemini.sh    |  2 +-
 .../language/gpt/gemini/train_gpt_demo.py     | 22 ++++++++++++-------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py
index 93ab982ccb8f..136afebac193 100644
--- a/colossalai/tensor/colo_tensor.py
+++ b/colossalai/tensor/colo_tensor.py
@@ -117,7 +117,7 @@ def get_process_group(self) -> 'ProcessGroup':
     def set_process_group(self, pg: ProcessGroup):
         """set_process_group
         change the pg of the ColoTensor. Note that the valid use cases is limited.
-        Only existing pg is DP and dist spec is REPLICaTE is valid.
+        Only existing pg is DP and dist spec is Replica is valid.
 
         Args:
             pg (ProcessGroup): target pg
@@ -127,10 +127,10 @@ def set_process_group(self, pg: ProcessGroup):
         # if the new pg is the same as the old pg, just returns
         if self.process_group == pg:
             return
-        assert self.process_group.tp_world_size() == 1, \
-            "Can not set_process_group on a ColoTensor whose process_group has tp world group"
+        assert self.process_group.tp_world_size() == 1 or self.process_group.dp_world_size() == 1, \
+            "Can not set_process_group on a ColoTensor whose process_group is both tp > 1 and world group > 1"
         assert self.dist_spec.placement.value == 'r', \
-            "Can not set_process_group on a ColoTensor whose dist spec is not REPLICATE"
+            "Can not set_process_group on a ColoTensor whose dist spec is not Replica"
 
         self.process_group = pg
 
diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh
index ad577c350d39..b0770e28e931 100644
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
@@ -6,7 +6,7 @@ export DISTPAN=${DISTPAN:-"colossalai"}
 export GPUNUM=${GPUNUM:-1}
 export TPDEGREE=${TPDEGREE:-1}
 export PLACEMENT=${PLACEMENT:-"cpu"}
-export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
+export USE_SHARD_INIT=${USE_SHARD_INIT:-True}
 export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
 
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 14200bff7b7e..29f8c8ef1215 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -148,10 +148,16 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
     """
     for mn, module in model.named_modules():
         for pn, param in module.named_parameters(recurse=False):
-            # NOTE() a param maybe shared by tow modules
+            # NOTE() a param may be shared by two modules
             if hasattr(param, 'visited'):
                 continue
+
+            # if shard init, then convert param to replica and use the dp-only ProcessGroup
+            param: ColoParameter = param
             param.set_dist_spec(ReplicaSpec())
+            param.set_process_group(pg)
+
+            # shard it w.r.t tp pattern
             if 'mlp.c_fc' in mn:
                 if 'weight' in pn or 'bias' in pn:
                     split_param_col_tp1d(param, pg)    # colmn slice
@@ -170,7 +176,6 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
                 split_param_col_tp1d(param, pg)    # colmn slice
             else:
                 param.set_dist_spec(ReplicaSpec())
-
             param.visited = True
 
 
@@ -248,27 +253,28 @@ def main():
     torch.manual_seed(123)
     if args.distplan == "colossalai":
         # all param must use the same process group.
-        default_pg = ProcessGroup(tp_degree=args.tp_degree)
-        default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
+        world_size = torch.distributed.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size)
+        default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
 
         # build GPT model
         if version.parse(CAI_VERSION) > version.parse("0.1.10"):
             with ColoInitContext(device=get_current_device(),
                                  dtype=torch.half,
                                  default_dist_spec=default_dist_spec,
-                                 default_pg=default_pg):
+                                 default_pg=shard_pg):
                 model = model_builder(args.model_type)(checkpoint=True)
         else:
             with ColoInitContext(device=get_current_device()):
                 model = model_builder(args.model_type)(checkpoint=True)
 
-        pg = default_pg
+        tp_pg = ProcessGroup(tp_degree=args.tp_degree)
         # Tensor Parallelism (TP)
-        tensor_parallelize(model, pg)
+        tensor_parallelize(model, tp_pg)
 
         # build a Gemini model and a highly optimized cpu optimizer
         # Gemini + ZeRO DP, Note it must be used after TP
-        model, optimizer = build_gemini(model, pg, args.placement)
+        model, optimizer = build_gemini(model, tp_pg, args.placement)
 
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:

From 9d945aff401b467cdf6d3f39ea1a6340f2f9af1c Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Fri, 6 Jan 2023 14:58:18 +0800
Subject: [PATCH 4/6] polish comments

---
 colossalai/tensor/colo_tensor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py
index 136afebac193..3712d6a0acea 100644
--- a/colossalai/tensor/colo_tensor.py
+++ b/colossalai/tensor/colo_tensor.py
@@ -117,7 +117,7 @@ def get_process_group(self) -> 'ProcessGroup':
     def set_process_group(self, pg: ProcessGroup):
         """set_process_group
         change the pg of the ColoTensor. Note that the valid use cases is limited.
-        Only existing pg is DP and dist spec is Replica is valid.
+        It only works when the tensor's current pg is DP-only or TP-only and its current dist spec is Replica.
 
         Args:
             pg (ProcessGroup): target pg

From 29c25d9ca9c1977e85ea14982593f754d3e6a8d5 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Fri, 6 Jan 2023 15:27:30 +0800
Subject: [PATCH 5/6] polish code

---
 examples/language/gpt/gemini/run_gemini.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh
index b0770e28e931..ad577c350d39 100644
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
@@ -6,7 +6,7 @@ export DISTPAN=${DISTPAN:-"colossalai"}
 export GPUNUM=${GPUNUM:-1}
 export TPDEGREE=${TPDEGREE:-1}
 export PLACEMENT=${PLACEMENT:-"cpu"}
-export USE_SHARD_INIT=${USE_SHARD_INIT:-True}
+export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
 export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
 

From c811b516edd731d4c07ad8856e5a74ebce84c67e Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Fri, 6 Jan 2023 16:29:04 +0800
Subject: [PATCH 6/6] [builder] update readme

---
 README.md | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 6ffbc85ba9eb..1b0ca7e973e0 100644
--- a/README.md
+++ b/README.md
@@ -5,10 +5,10 @@
 
    Colossal-AI: A Unified Deep Learning System for Big Model Era
 
-   <h3> <a href="https://arxiv.org/abs/2110.14883"> Paper </a> | 
-   <a href="https://www.colossalai.org/"> Documentation </a> | 
-   <a href="https://github.com/hpcaitech/ColossalAI-Examples"> Examples </a> |   
-   <a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> | 
+   <h3> <a href="https://arxiv.org/abs/2110.14883"> Paper </a> |
+   <a href="https://www.colossalai.org/"> Documentation </a> |
+   <a href="https://github.com/hpcaitech/ColossalAI-Examples"> Examples </a> |
+   <a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
    <a href="https://medium.com/@hpcaitech"> Blog </a></h3>
 
    [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml)
@@ -17,7 +17,7 @@
    [![HuggingFace badge](https://img.shields.io/badge/%F0%9F%A4%97HuggingFace-Join-yellow)](https://huggingface.co/hpcai-tech)
    [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&amp)](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w)
    [![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&amp)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png)
-   
+
 
    | [English](README.md) | [中文](README-zh-Hans.md) |
 
@@ -35,7 +35,7 @@
  <li><a href="#Why-Colossal-AI">Why Colossal-AI</a> </li>
  <li><a href="#Features">Features</a> </li>
  <li>
-   <a href="#Parallel-Training-Demo">Parallel Training Demo</a> 
+   <a href="#Parallel-Training-Demo">Parallel Training Demo</a>
    <ul>
      <li><a href="#GPT-3">GPT-3</a></li>
      <li><a href="#GPT-2">GPT-2</a></li>
@@ -47,14 +47,14 @@
    </ul>
  </li>
  <li>
-   <a href="#Single-GPU-Training-Demo">Single GPU Training Demo</a> 
+   <a href="#Single-GPU-Training-Demo">Single GPU Training Demo</a>
    <ul>
      <li><a href="#GPT-2-Single">GPT-2</a></li>
      <li><a href="#PaLM-Single">PaLM</a></li>
    </ul>
  </li>
  <li>
-   <a href="#Inference-Energon-AI-Demo">Inference (Energon-AI) Demo</a> 
+   <a href="#Inference-Energon-AI-Demo">Inference (Energon-AI) Demo</a>
    <ul>
      <li><a href="#GPT-3-Inference">GPT-3</a></li>
      <li><a href="#OPT-Serving">OPT-175B Online Serving for Text Generation</a></li>
@@ -62,7 +62,7 @@
    </ul>
  </li>
    <li>
-   <a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a> 
+   <a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
    <ul>
      <li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
      <li><a href="#Biomedicine">Biomedicine: Acceleration of AlphaFold Protein Structure</a></li>
@@ -106,7 +106,7 @@ distributed training and inference in a few lines.
   - [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054)
   - [Auto-Parallelism](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt)
 
-- Heterogeneous Memory Management 
+- Heterogeneous Memory Management
   - [PatrickStar](https://arxiv.org/abs/2108.05818)
 
 - Friendly Usage
@@ -115,7 +115,7 @@ distributed training and inference in a few lines.
 - Inference
   - [Energon-AI](https://github.com/hpcaitech/EnergonAI)
 
-- Colossal-AI in the Real World 
+- Colossal-AI in the Real World
   - Biomedicine: [FastFold](https://github.com/hpcaitech/FastFold) accelerates training and inference of AlphaFold protein structure
 <p align="right">(<a href="#top">back to top</a>)</p>
 
@@ -149,7 +149,7 @@ distributed training and inference in a few lines.
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT_update.png" width=800/>
 
 - [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-Billion parameter AI language model released by Meta, which stimulates AI programmers to perform various downstream tasks and application deployments because public pretrained model weights.
-- 45% speedup fine-tuning OPT at low cost in lines. [[Example]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[Online Serving]](https://service.colossalai.org/opt) 
+- 45% speedup fine-tuning OPT at low cost in lines. [[Example]](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/opt) [[Online Serving]](https://service.colossalai.org/opt)
 
 Please visit our [documentation](https://www.colossalai.org/) and [examples](https://github.com/hpcaitech/ColossalAI-Examples) for more details.
 
@@ -277,10 +277,11 @@ pip install -r requirements/requirements.txt
 pip install .
 ```
 
-If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):
+By default, we do not compile CUDA/C++ kernels. ColossalAI will build them during runtime.
+If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):
 
 ```shell
-NO_CUDA_EXT=1 pip install .
+CUDA_EXT=1 pip install .
 ```
 
 <p align="right">(<a href="#top">back to top</a>)</p>