From bfbf6505885fce90af66d0c82a6fdb12604becc6 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Thu, 4 May 2023 15:31:09 +0800 Subject: [PATCH 01/20] fix spelling error fix spelling error with evaluate.py --- applications/Chat/evaluate/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/evaluate/evaluate.py b/applications/Chat/evaluate/evaluate.py index 9f17704426e2..2f9c9ce8e10d 100644 --- a/applications/Chat/evaluate/evaluate.py +++ b/applications/Chat/evaluate/evaluate.py @@ -130,7 +130,7 @@ def evaluate(args): assert answer1_jsons[i]['id'] == answer2_jsons[i]['id'] answer_id = answer1_jsons[i]['id'] - ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instuction'] + \ + ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \ " " + answer1_jsons[i]['input'] cat = answer1_jsons[i]['category'] ans1 = answer1_jsons[i]['output'] From 8ba7858753bab60bfc368e6ade561196598b48b7 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Thu, 4 May 2023 15:34:16 +0800 Subject: [PATCH 02/20] Update generate_gpt35_answers.py fix spelling error with generate_gpt35_answers.py --- applications/Chat/evaluate/generate_gpt35_answers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/evaluate/generate_gpt35_answers.py b/applications/Chat/evaluate/generate_gpt35_answers.py index 852a7cb19dfa..db95cd2febf4 100644 --- a/applications/Chat/evaluate/generate_gpt35_answers.py +++ b/applications/Chat/evaluate/generate_gpt35_answers.py @@ -35,7 +35,7 @@ def get_answer(question: str, max_tokens: int): answer = question - prompt = question['instruction'] if question['input'] == "" else question['instuction'] + \ + prompt = question['instruction'] if question['input'] == "" else question['instruction'] + \ " " + question['input'] for _ in range(MAX_API_RETRY): try: From 7bd0bee8ea0ca7d713c34792dc99c1970a2c6701 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Thu, 4 May 2023 16:03:33 +0800 Subject: [PATCH 03/20] [chat] add opt attn kernel (#3655) * [chat] add opt attn kernel * [chat] disable xformer during fwd --- .../benchmarks/benchmark_opt_lora_dummy.py | 6 ++ applications/Chat/coati/kernels/__init__.py | 6 ++ applications/Chat/coati/kernels/opt_attn.py | 87 +++++++++++++++++++ applications/Chat/coati/kernels/wrapper.py | 18 ++++ 4 files changed, 117 insertions(+) create mode 100644 applications/Chat/coati/kernels/__init__.py create mode 100644 applications/Chat/coati/kernels/opt_attn.py create mode 100644 applications/Chat/coati/kernels/wrapper.py diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py index a991e8558aee..7a47624f74d8 100644 --- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py +++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py @@ -101,6 +101,11 @@ def main(args): initial_model = deepcopy(actor).cuda().half() reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda().half() + if args.use_kernels: + from coati.kernels import convert_to_xformer_model + actor, critic, initial_model, reward_model = map(convert_to_xformer_model, + (actor, critic, initial_model, reward_model)) + actor_numel = get_model_numel(actor, strategy) critic_numel = get_model_numel(critic, strategy) initial_model_numel = get_model_numel(initial_model, strategy) @@ -184,5 +189,6 @@ def main(args): parser.add_argument('--lora_rank', type=int, 
default=0) parser.add_argument('--cuda_mem_frac', type=float, default=1.0) parser.add_argument('--offload_inference_models', action='store_true', default=False) + parser.add_argument('--use_kernels', action='store_true', default=False) args = parser.parse_args() main(args) diff --git a/applications/Chat/coati/kernels/__init__.py b/applications/Chat/coati/kernels/__init__.py new file mode 100644 index 000000000000..230eedf7ecba --- /dev/null +++ b/applications/Chat/coati/kernels/__init__.py @@ -0,0 +1,6 @@ +from .wrapper import convert_to_xformer_model, recover_from_xformer_model + +__all__ = [ + 'convert_to_xformer_model', + 'recover_from_xformer_model', +] diff --git a/applications/Chat/coati/kernels/opt_attn.py b/applications/Chat/coati/kernels/opt_attn.py new file mode 100644 index 000000000000..c10f341e94a3 --- /dev/null +++ b/applications/Chat/coati/kernels/opt_attn.py @@ -0,0 +1,87 @@ +from typing import Optional, Tuple + +import torch +import xformers.ops as xops +from torch import Tensor +from transformers.models.opt.modeling_opt import OPTAttention + + +# This is modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py +class XOPTAttention(OPTAttention): + # def _shape(self, tensor: Tensor, seq_len: int, bsz: int): + # return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).contiguous() + + def forward( + self, + hidden_states: Tensor, + key_value_states: Optional[Tensor] = None, + past_key_value: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + layer_head_mask: Optional[Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + if not self.training: + return super().forward(hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, + output_attentions) + """Input shape: Batch x Time x Channel""" + assert layer_head_mask is None, 'Xformers attention does not support layer_head_mask' + assert not output_attentions, 'Xformers attention does not support output_attentions' + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + query_states = self._shape(query_states, tgt_len, bsz).transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = xops.memory_efficient_attention(query_states, + key_states, + value_states, + attn_bias=xops.LowerTriangularMask(), + p=self.dropout if self.training else 0.0, + scale=self.scaling) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + attn_weights_reshaped = None + + return attn_output, attn_weights_reshaped, past_key_value diff --git a/applications/Chat/coati/kernels/wrapper.py b/applications/Chat/coati/kernels/wrapper.py new file mode 100644 index 000000000000..c55bda600230 --- /dev/null +++ b/applications/Chat/coati/kernels/wrapper.py @@ -0,0 +1,18 @@ +import torch.nn as nn +from transformers.models.opt.modeling_opt import OPTAttention + +from .opt_attn import XOPTAttention + + +def convert_to_xformer_model(model: nn.Module) -> nn.Module: + for module in model.modules(): + if isinstance(module, OPTAttention): + module.__class__ = XOPTAttention + return model + + +def recover_from_xformer_model(model: nn.Module) -> nn.Module: + for module in model.modules(): + if isinstance(module, XOPTAttention): + module.__class__ = OPTAttention + return model From 6650daeb0ae43ab93e2341cce75975f3e553c02a Mon Sep 17 00:00:00 2001 From: digger-yu Date: Fri, 5 May 2023 11:37:35 +0800 Subject: [PATCH 04/20] [doc] fix chat spelling error (#3671) * Update README.md change "huggingaface" to "huggingface" * Update README.md change "Colossa-AI" to "Colossal-AI" --- applications/Chat/README.md | 2 +- applications/Chat/examples/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/Chat/README.md b/applications/Chat/README.md index 2a9c916d45c9..9ba831973b6c 100644 --- a/applications/Chat/README.md +++ b/applications/Chat/README.md @@ -59,7 +59,7 @@ The Coati package provides a unified large language model framework that has imp Image source: https://openai.com/blog/chatgpt -**As Colossa-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.** +**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.** More details can be found in the latest news. 
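
A minimal usage sketch for the attention-kernel wrapper introduced in patch 03, assuming xformers is installed; the OPT checkpoint name is illustrative and not taken from the patch:

    from transformers import OPTForCausalLM
    from coati.kernels import convert_to_xformer_model, recover_from_xformer_model

    # Swap every OPTAttention module over to XOPTAttention in place.
    model = OPTForCausalLM.from_pretrained('facebook/opt-125m').cuda().half()
    model = convert_to_xformer_model(model)
    model.train()    # the xformers path only runs while self.training is True
    # ... RLHF / fine-tuning steps ...
    model = recover_from_xformer_model(model)    # restore vanilla attention before export
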
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md index 3e85bfe2d170..a466d415d15e 100644 --- a/applications/Chat/examples/README.md +++ b/applications/Chat/examples/README.md @@ -233,7 +233,7 @@ If you want to support your own model in Coati, please refer the pull request fo You should complete the implementation of four model classes, including Reward model, Critic model, LM model, Actor model here are some example code for a NewModel named `Coati`. -if it is supported in huggingaface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o +if it is supported in huggingface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o r you can build your own model by yourself. ### Actor model From 0f785cb1f332bfa1643b4dedd2ce38950b4a3179 Mon Sep 17 00:00:00 2001 From: Camille Zhong <44392324+Camille7777@users.noreply.github.com> Date: Fri, 5 May 2023 13:36:56 +0800 Subject: [PATCH 05/20] [chat] PPO stage3 doc enhancement (#3679) * Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci Update test_ci.sh Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci Update test_ci.sh Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. update roberta with coati chat ci update Revert "chat ci update" This reverts commit 17ae7ae01fa752bd3289fc39069868fde99cf846. * Update README.md Update README.md * update readme * Update test_ci.sh * update readme and add a script update readme and add a script modify readme Update README.md --- applications/Chat/examples/README.md | 2 +- applications/Chat/examples/example_data_reformat.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 applications/Chat/examples/example_data_reformat.py diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md index a466d415d15e..561ace2205ed 100644 --- a/applications/Chat/examples/README.md +++ b/applications/Chat/examples/README.md @@ -154,7 +154,7 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \ --rm_path /your/rm/model/path ``` -Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild. +Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. 
you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/example_data_reformat.py) to reformat [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild. Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning. ### Arg List diff --git a/applications/Chat/examples/example_data_reformat.py b/applications/Chat/examples/example_data_reformat.py new file mode 100644 index 000000000000..dc83b29b525b --- /dev/null +++ b/applications/Chat/examples/example_data_reformat.py @@ -0,0 +1,12 @@ +jsonl_file = 'seed_prompts_xx.jsonl' # seed_prompts_en.jsonl or seed_prompts_ch.json from InstructionWild +reformat_file = 'prompts_xx.jsonl' # reformat jsonl file used as Prompt dataset in Stage3 + +data = '' +with open(jsonl_file, 'r', encoding="utf-8") as f1: + for jsonstr in f1.readlines(): + jsonstr = '\t' + jsonstr.strip('\n') + ',\n' + data = data + jsonstr + data = '[\n' + data + ']' + +with open(reformat_file, 'w') as f2: + f2.write(data) \ No newline at end of file From 307894f74dd63d71f4b95272fe149ca607e2aafa Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Fri, 5 May 2023 14:37:21 +0800 Subject: [PATCH 06/20] [booster] gemini plugin support shard checkpoint (#3610) * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin add shard checkpoint save/load * gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint * [API Refactoring]gemini plugin support shard checkpoint --------- Co-authored-by: luchen Co-authored-by: luchen --- colossalai/booster/plugin/gemini_plugin.py | 44 +++++++++ .../checkpoint_io/checkpoint_io_base.py | 2 +- .../checkpoint_io/general_checkpoint_io.py | 63 ++++++------ colossalai/checkpoint_io/index_file.py | 16 ++- colossalai/checkpoint_io/utils.py | 86 ++++++++-------- colossalai/zero/gemini/gemini_ddp.py | 51 +++++++--- pytest.ini | 2 +- .../test_general_checkpoint_io.py | 99 ++++++++++++++++++- .../test_zeroddp_state_dict_shard.py | 3 +- 9 
files changed, 269 insertions(+), 97 deletions(-) diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index deda00d8a7b3..dfdd7be26eaa 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -1,6 +1,9 @@ import random import warnings from typing import Callable, List, Optional, Tuple, Union +from pathlib import Path +import os +import logging import numpy as np import torch @@ -20,6 +23,13 @@ from colossalai.zero import GeminiDDP, zero_model_wrapper, zero_optim_wrapper from colossalai.zero.gemini.memory_tracer import MemStats +from colossalai.checkpoint_io.utils import ( + get_base_filenames, + get_shard_filename + ) + +from colossalai.checkpoint_io import CheckpointIndexFile + from .plugin_base import Plugin __all__ = ['GeminiPlugin'] @@ -62,6 +72,40 @@ def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str): if self.coordinator.is_master(): super().save_lr_scheduler(lr_scheduler, checkpoint) + def save_sharded_model(self, model: GeminiDDP, checkpoint_path: str, gather_dtensor: bool = False, variant: Optional[str] = None, max_shard_size: int = 1024, use_safetensors: bool = False): + """ + Save sharded model + """ + state_dict_shard = model.state_dict_shard(max_shard_size=max_shard_size, only_rank_0=True, dtype=torch.float32) + weights_name, save_index_file = get_base_filenames(variant, use_safetensors) + total_size = 0 + index_file = CheckpointIndexFile(checkpoint_path) + for idx, shard_pair in enumerate(state_dict_shard): + if not self.coordinator.is_master(): + continue + shard = shard_pair[0] + shard_file = get_shard_filename(weights_name, idx) + total_size = total_size + shard_pair[1] + for key in shard.keys(): + index_file.append_weight_map(key, shard_file) + + checkpoint_file_path = os.path.join(checkpoint_path, shard_file) + save_state_dict(shard, checkpoint_file_path, use_safetensors) + + index_file.append_meta_data("total_size", total_size) + index_file.write_index_file(save_index_file) + logging.info( + f"The model is going to be split to checkpoint shards. " + f"You can find where each parameters has been saved in the " + f"index located at {save_index_file}." 
+ ) + + + def load_sharded_model(self, model: GeminiDDP, checkpoint_index_file: Path, strict: bool = False, use_safetensors: bool = False): + """ + load shard model, load model from multiple files + """ + return super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module=False) class GeminiModel(ModelWrapper): diff --git a/colossalai/checkpoint_io/checkpoint_io_base.py b/colossalai/checkpoint_io/checkpoint_io_base.py index cb853559c48c..9cf344ecc41b 100644 --- a/colossalai/checkpoint_io/checkpoint_io_base.py +++ b/colossalai/checkpoint_io/checkpoint_io_base.py @@ -86,7 +86,7 @@ def load_model(self, # the existence of index file means it is a sharded checkpoint ckpt_path = Path(checkpoint) index_file_exists, index_file_path = has_index_file(checkpoint) - + # return the origin model instead of the unwrapped model origin_model = model diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py index bf584f45d045..96a883fdb42a 100644 --- a/colossalai/checkpoint_io/general_checkpoint_io.py +++ b/colossalai/checkpoint_io/general_checkpoint_io.py @@ -1,12 +1,12 @@ from pathlib import Path +from functools import reduce import torch.nn as nn from torch.optim import Optimizer import logging import os -import json import gc -from typing import Optional +from typing import Optional, Iterator, OrderedDict, Tuple from .checkpoint_io_base import CheckpointIO from .index_file import CheckpointIndexFile @@ -18,10 +18,9 @@ shard_checkpoint, load_shard_state_dict, load_state_dict_into_model, - add_variant + get_shard_filename, + get_base_filenames ) -from .utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME, WEIGHTS_INDEX_NAME - __all__ = ['GeneralCheckpointIO'] @@ -85,30 +84,32 @@ def save_sharded_model(self, model: nn.Module, checkpoint_path: str, gather_dten # shard checkpoint state_dict = model.state_dict() - weights_name = SAFE_WEIGHTS_NAME if use_safetensors else WEIGHTS_NAME - weights_name = add_variant(weights_name, variant) - shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=weights_name) - - # Save the model - for shard_file, shard in shards.items(): + state_dict_shard = shard_checkpoint(state_dict, max_shard_size=max_shard_size) + + weights_name, save_index_file = get_base_filenames(variant, use_safetensors) + total_size = 0 + index_file = CheckpointIndexFile(checkpoint_path) + for idx, shard_pair in enumerate(state_dict_shard): + shard = shard_pair[0] + shard_file = get_shard_filename(weights_name, idx) + total_size = total_size + shard_pair[1] + for key in shard.keys(): + index_file.append_weight_map(key, shard_file) + checkpoint_file_path = os.path.join(checkpoint_path, shard_file) save_state_dict(shard, checkpoint_file_path, use_safetensors) - - # save index file - save_index_file = SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME - - save_index_file = os.path.join(checkpoint_path, add_variant(save_index_file, variant)) - with open(save_index_file, "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) + + index_file.append_meta_data("total_size", total_size) + index_file.write_index_file(save_index_file) logging.info( - f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " - f"split in {len(shards)} checkpoint shards. 
You can find where each parameters has been saved in the "
+            f"The model is going to be split to checkpoint shards. "
+            f"You can find where each parameter has been saved in the "
             f"index located at {save_index_file}."
         )

-    def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, strict: bool = False, use_safetensors: bool = False):
+    def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, strict: bool = False,
+                           use_safetensors: bool = False, load_sub_module: bool = True):
         """
         load shard model, load model from multiple files
         """
@@ -122,17 +123,21 @@ def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, stri
         # read checkpoint index file
         ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file)
         checkpoint_files, _ = ckpt_index_file.get_checkpoint_fileanames()
-        missing_keys = ckpt_index_file.get_all_param_names()
+        missing_keys = []

         for shard_file in checkpoint_files:
             state_dict = load_shard_state_dict(Path(shard_file), use_safetensors)
-            load_state_dict_into_model(model, state_dict, missing_keys, strict)
+            load_state_dict_into_model(model, state_dict, missing_keys, strict, load_sub_module)
             del state_dict
             gc.collect()

-        if strict and len(missing_keys) > 0:
-            error_msgs = 'Missing key(s) in state_dict: {}. '.format(
-                ', '.join('"{}"'.format(k) for k in missing_keys))
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                self.__class__.__name__, "\n\t".join(error_msgs)))
+        if strict:
+            remain_keys = reduce(lambda a, b: a & b, map(set, missing_keys))
+            if len(remain_keys) > 0:
+                error_msgs = 'Missing key(s) in state_dict: {}. '.format(
+                    ', '.join('"{}"'.format(k) for k in missing_keys))
+                raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                    self.__class__.__name__, "\n\t".join(error_msgs)))
+
+
diff --git a/colossalai/checkpoint_io/index_file.py b/colossalai/checkpoint_io/index_file.py
index 89224787a91b..15a6d09f3b5e 100644
--- a/colossalai/checkpoint_io/index_file.py
+++ b/colossalai/checkpoint_io/index_file.py
@@ -1,6 +1,8 @@
 import json
 from pathlib import Path
 from typing import Any, List, Union
+import os
+import json

 from .utils import is_dtensor_checkpoint

@@ -18,8 +20,8 @@ class CheckpointIndexFile:
     >>> index.export('new_index.json')
     """

-    def __init__(self) -> None:
-        self.root_path = None
+    def __init__(self, root_path=None) -> None:
+        self.root_path = root_path
         self.metadata: dict = dict()
         self.weight_map: dict = dict()

@@ -154,3 +156,13 @@ def get_all_param_names(self):
         Get all the weight keys.
         """
         return list(self.weight_map.keys())
+
+    def write_index_file(self, save_index_file):
+        """
+        Write index file.
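+        Joins ``save_index_file`` with ``self.root_path`` and dumps the
+        {"metadata", "weight_map"} index to it as indented, key-sorted JSON.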
+ """ + save_index_file = os.path.join(self.root_path, save_index_file) + index = {"metadata": self.metadata, "weight_map": self.weight_map} + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 37d22d08df40..16e41631f0d5 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -2,7 +2,7 @@ from pathlib import Path import torch import torch.nn as nn -from typing import List, Dict, Mapping, OrderedDict, Optional, Tuple +from typing import List, Mapping, OrderedDict, Optional, Tuple, Iterator from colossalai.tensor.d_tensor.d_tensor import DTensor import re @@ -77,55 +77,35 @@ def is_safetensor_checkpoint(checkpoint_file_path: str) -> bool: # ====================================== # Helper functions for saving shard file # ====================================== -def shard_checkpoint(state_dict: torch.Tensor, max_shard_size: int = 1024, weights_name: str = WEIGHTS_NAME): +def shard_checkpoint(state_dict: torch.Tensor, max_shard_size: int = 1024) -> Iterator[Tuple[OrderedDict, int]]: """ Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a given size. """ - sharded_state_dicts = [] current_block = {} current_block_size = 0 - total_size = 0 for key, weight in state_dict.items(): + ret_block = None + ret_block_size = 0 if type(weight) != DTensor: weight_size = calculate_tensor_size(weight) # If this weight is going to tip up over the maximal size, we split. if current_block_size + weight_size > max_shard_size: - sharded_state_dicts.append(current_block) + ret_block = current_block + ret_block_size = current_block_size current_block = {} current_block_size = 0 - current_block[key] = weight current_block_size += weight_size - total_size += weight_size + + if ret_block != None: + yield ret_block, ret_block_size - # Add the last block - sharded_state_dicts.append(current_block) + yield current_block, current_block_size - # If we only have one shard, we return it - if len(sharded_state_dicts) == 1: - return {weights_name: sharded_state_dicts[0]}, None - - # Otherwise, let's build the index - weight_map = {} - shards = {} - - for idx, shard in enumerate(sharded_state_dicts): - shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin") - shard_file = shard_file.replace( - ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors" - ) - shards[shard_file] = shard - for key in shard.keys(): - weight_map[key] = shard_file - - # Add the metadata - metadata = {"total_size": total_size} - index = {"metadata": metadata, "weight_map": weight_map} - return shards, index def load_shard_state_dict(checkpoint_file: Path, use_safetensors: bool =False): """ @@ -146,7 +126,7 @@ def load_shard_state_dict(checkpoint_file: Path, use_safetensors: bool =False): else: return torch.load(checkpoint_file) -def load_state_dict_into_model(model: nn.Module, state_dict: torch.Tensor, missing_keys: List, strict: bool = False): +def load_state_dict_into_model(model: nn.Module, state_dict: torch.Tensor, missing_keys: List, strict: bool = False, load_sub_module: bool = True): r"""Copies parameters and buffers from :attr:`state_dict` into this module and its descendants. 
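
Since shard_checkpoint is now a generator of (shard, size) pairs, callers can stream shards to disk instead of materializing them all at once. A minimal sketch, assuming calculate_tensor_size reports sizes in MB as it does elsewhere in ColossalAI; the shapes and the 8 MB cap are illustrative:

    import torch
    from colossalai.checkpoint_io.utils import shard_checkpoint

    state_dict = {f'fc{i}.weight': torch.zeros(1024, 1024) for i in range(4)}    # ~4 MB each
    for shard, shard_size in shard_checkpoint(state_dict, max_shard_size=8):
        # each `shard` holds as many tensors as fit under the size cap
        print(len(shard), shard_size)
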
@@ -167,29 +147,22 @@ def load_state_dict_into_model(model: nn.Module, state_dict: torch.Tensor, missi if metadata is not None: state_dict._metadata = metadata - def load(module: nn.Module, state_dict, prefix=""): + def load(module: nn.Module, state_dict, prefix="", load_sub_module: bool = True): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) + args = (state_dict, prefix, local_metadata, True, sub_missing_keys, [], error_msgs) # Parameters of module and children will start with prefix. We can exit early if there are none in this # state_dict if len([key for key in state_dict if key.startswith(prefix)]) > 0: module._load_from_state_dict(*args) + if load_sub_module: + for name, child in module._modules.items(): + if child is not None: + load(child, state_dict, prefix + name + ".") - for name, child in module._modules.items(): - if child is not None: - load(child, state_dict, prefix + name + ".") - - load(model, state_dict, "") + load(model, state_dict, "", load_sub_module) del load - # deal with missing key - if len(missing_keys) > 0: - deleted_keys = [] - for key in missing_keys: - if key not in sub_missing_keys: - deleted_keys.append(key) - for key in deleted_keys: - missing_keys.remove(key) + missing_keys = missing_keys.append(sub_missing_keys) if strict: if len(unexpected_keys) > 0: @@ -417,3 +390,24 @@ def add_variant(weights_name: str, variant: Optional[str] = None) -> str: weights_name = ".".join(splits) return weights_name + + +def get_base_filenames(variant: str=None, use_safetensors: bool=False): + """ + generate base weight filenames + """ + weights_name = SAFE_WEIGHTS_NAME if use_safetensors else WEIGHTS_NAME + weights_name = add_variant(weights_name, variant) + + save_index_file = SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME + save_index_file = add_variant(save_index_file, variant) + + return weights_name, save_index_file + +def get_shard_filename(weights_name: str, idx: int): + """ + get shard file name + """ + shard_file = weights_name.replace(".bin", f"-{idx+1:05d}.bin") + shard_file = shard_file.replace(".safetensors", f"-{idx + 1:05d}.safetensors") + return shard_file \ No newline at end of file diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 8a001b114e9a..878c25be7094 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -2,7 +2,7 @@ from collections import OrderedDict from contextlib import nullcontext from functools import partial -from typing import Dict, Iterator, List, Optional, Union +from typing import Dict, Iterator, List, Optional, Union, Tuple, Set import torch import torch.distributed as dist @@ -96,8 +96,35 @@ def __init__(self, param_name = m_name + '.' 
+ p_name if m_name else p_name self.name2param[param_name] = p_var super().__init__(module, process_group=ColoProcessGroup()) + self._non_persistent_buffers_set=self._get_non_persistent_buffers_set(module) self._cast_buffers() + def _get_non_persistent_buffers_set(self, module, memo: Optional[Set[nn.Module]] = None, prefix: str = '', remove_duplicate: bool = True): + + r""" + Args: + memo: a memo to store the set of modules already added to the result + prefix: a prefix that will be added to the name of the module + remove_duplicate: whether to remove the duplicated module instances in the result + or not + """ + + if memo is None: + memo = set() + self_non_persistent_set = set() + if module not in memo: + if remove_duplicate: + memo.add(module) + self_non_persistent_set = set(map(lambda key: prefix + ('.' if prefix else '') + key, module._non_persistent_buffers_set)) + for name, sub_module in module._modules.items(): + if sub_module is None: + continue + submodule_prefix = prefix + ('.' if prefix else '') + name + child_non_persistent_set = self._get_non_persistent_buffers_set(sub_module, memo, submodule_prefix, remove_duplicate) + self_non_persistent_set = set.union(self_non_persistent_set, child_non_persistent_set) + return self_non_persistent_set + + def _post_forward(self): """This function is only triggered for inference. """ @@ -604,7 +631,7 @@ def state_dict_shard(self, keep_vars: bool = False, max_shard_size: int = 1024, only_rank_0: bool = True, - dtype: torch.dtype = torch.float16) -> Iterator[OrderedDict]: + dtype: torch.dtype = torch.float16) -> Iterator[Tuple[OrderedDict, int]]: """Returns dictionaries containing a whole state of the module one by one. The max size of dictionary shard is specified by ``max_shard_size``. Both parameters and persistent buffers (e.g. running averages) are included. 
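
state_dict_shard now yields (shard, size) pairs as well, which is what GeminiCheckpointIO.save_sharded_model above consumes. A hedged sketch, where `model` is assumed to be a ZeroDDP/GeminiDDP instance and save_one_shard is a hypothetical callback:

    total_size = 0
    for shard, shard_size in model.state_dict_shard(max_shard_size=1024, only_rank_0=True):
        total_size += shard_size    # feeds the "total_size" entry of the index file
        save_one_shard(shard)       # e.g. save_state_dict(shard, path, use_safetensors)
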
@@ -644,9 +671,9 @@ def state_dict_shard(self, gathered_param_buffer.update(self._get_chunk_to_save_data(chunk, only_rank_0, dtype)) gathered_param = gathered_param_buffer.pop(fp32_param) - block = sharder.append(prefix + name, gathered_param) + block, block_size = sharder.append(prefix + name, gathered_param) if block is not None: - yield block + yield block, block_size del fp16_to_fp32 del gathered_param_buffer @@ -655,19 +682,19 @@ def state_dict_shard(self, for name, buf in self.named_buffers(): if buf is not None and name not in self._non_persistent_buffers_set: buffer = buf if keep_vars else buf.detach() - block = sharder.append(prefix + name, buffer) + block, block_size = sharder.append(prefix + name, buffer) if block is not None: - yield block + yield block, block_size # save extra states extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX if getattr(self.__class__, "get_extra_state", torch.nn.Module.get_extra_state) is not torch.nn.Module.get_extra_state: extra_state = self.get_extra_state() - block = sharder.append(extra_state_key, extra_state) + block, block_size = sharder.append(extra_state_key, extra_state) if block is not None: - yield block + yield block, block_size - yield sharder.current_block + yield sharder.current_block, sharder.current_block_size class _StateDictSharder: @@ -677,16 +704,18 @@ def __init__(self, max_shard_size: int) -> None: self.current_block = OrderedDict() self.current_block_size = 0 - def append(self, name: str, tensor: torch.Tensor) -> Optional[OrderedDict]: + def append(self, name: str, tensor: torch.Tensor) -> Tuple[Optional[OrderedDict], int]: tensor_size = calculate_tensor_size(tensor) ret_block = None + ret_block_size = 0 if self.current_block_size + tensor_size > self.max_shard_size: ret_block = self.current_block + ret_block_size = self.current_block_size self.current_block = OrderedDict() self.current_block_size = 0 self.current_block[name] = tensor self.current_block_size += tensor_size - return ret_block + return ret_block, ret_block_size class GeminiDDP(ZeroDDP): diff --git a/pytest.ini b/pytest.ini index ac31ace4bfae..01e5cd217c5d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,4 +3,4 @@ markers = cpu: tests which can run on CPU gpu: tests which requires a single GPU dist: tests which are run in a multi-GPU or multi-machine environment - experiment: tests for experimental features \ No newline at end of file + experiment: tests for experimental features diff --git a/tests/test_checkpoint_io/test_general_checkpoint_io.py b/tests/test_checkpoint_io/test_general_checkpoint_io.py index ca5ce10054f7..752ca706bfd4 100644 --- a/tests/test_checkpoint_io/test_general_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_general_checkpoint_io.py @@ -1,16 +1,21 @@ import tempfile import pytest import torch -import logging from torch.optim import Adam from torchvision.models import resnet18 -from pathlib import Path -import os -import subprocess from colossalai.checkpoint_io import GeneralCheckpointIO +from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO from colossalai.testing import clear_cache_before_run, parameterize +import colossalai +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.utils.cuda import get_current_device +from colossalai.zero import ColoInitContext, ZeroDDP +from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration +from colossalai.zero.gemini.gemini_mgr import GeminiManager +from tests.components_to_test.registry import 
non_distributed_component_funcs + # ======== # Note: # 1. due to checkpoint IO can be quite slow if tested with all models, we will only test on resnet for now @@ -83,7 +88,6 @@ def test_sharded_checkpoint(use_safetensors: bool): suffix = ".bin" WEIGHTS_INDEX_NAME = "model.bin.index.json" - # model_ckpt_dir = tempfile.TemporaryDirectory(suffix=suffix) model_ckpt_dir = tempfile.TemporaryDirectory() optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile() @@ -104,6 +108,87 @@ def test_sharded_checkpoint(use_safetensors: bool): recursive_check(model.state_dict(), new_model.state_dict()) recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) +@parameterize('placement_policy', ['cuda', 'cpu']) +@parameterize('model_name', ['bert']) +@parameterize('use_safetensors', [True, False]) +def hf_load_colossalai_checkpoint(placement_policy, model_name, use_safetensors: bool): + from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, BertForSequenceClassification + + model_ckpt_dir = tempfile.TemporaryDirectory() + get_components_func = non_distributed_component_funcs.get_callable(model_name) + model_builder, *_ = get_components_func() + + with ColoInitContext(device=get_current_device()): + bert_model = model_builder() + bert_model.config.save_pretrained(save_directory=model_ckpt_dir.name) + config_dict, *_ = search_chunk_configuration(bert_model, search_range_mb=1, search_interval_byte=100) + chunk_manager = ChunkManager(config_dict) + gemini_manager = GeminiManager(placement_policy, chunk_manager) + bert_model = ZeroDDP(bert_model, gemini_manager) + bert_model.train() + + ckpt_io = GeminiCheckpointIO() + if ckpt_io.coordinator.is_master(): + model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2 + ckpt_io.save_model(bert_model, model_ckpt_dir.name, True, True, "", (model_size / 3), use_safetensors=use_safetensors) + new_bert_model = BertForSequenceClassification.from_pretrained(model_ckpt_dir.name) + recursive_check(bert_model.state_dict(only_rank_0=True, dtype=torch.float32), new_bert_model.state_dict()) + + model_ckpt_dir.cleanup() + + + +@parameterize('placement_policy', ['cuda', 'cpu']) +@parameterize('model_name', ['gpt2', 'bert']) +@parameterize('use_safetensors', [True, False]) +def exam_state_dict(placement_policy, model_name: str, use_safetensors: bool): + get_components_func = non_distributed_component_funcs.get_callable(model_name) + model_builder, *_ = get_components_func() + + with ColoInitContext(device=get_current_device()): + model = model_builder() + new_model = model_builder() + + config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100) + chunk_manager = ChunkManager(config_dict) + gemini_manager = GeminiManager(placement_policy, chunk_manager) + model = ZeroDDP(model, gemini_manager) + model.train() + + new_config_dict, *_ = search_chunk_configuration(new_model, search_range_mb=1, search_interval_byte=100) + new_chunk_manager = ChunkManager(new_config_dict) + new_gemini_manager = GeminiManager(placement_policy, new_chunk_manager) + new_model = ZeroDDP(new_model, new_gemini_manager) + + model_ckpt_dir = tempfile.TemporaryDirectory() + + ckpt_io = GeminiCheckpointIO() + model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2 + ckpt_io.save_model(model, model_ckpt_dir.name, True, True, "epoch", (model_size / 3), use_safetensors=use_safetensors) + + # load model + if ckpt_io.coordinator.is_master(): + ckpt_io.load_model(new_model, 
model_ckpt_dir.name, strict=True) + model_dict = model.state_dict(only_rank_0=True) + new_model_dict = new_model.state_dict(only_rank_0=True) + recursive_check(model_dict, new_model_dict) + + model_ckpt_dir.cleanup() + + +def run_dist(rank, world_size, port): + config = {} + colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + exam_state_dict() + hf_load_colossalai_checkpoint() + + +@pytest.mark.dist +@pytest.mark.parametrize('world_size', [4, 4]) +@rerun_if_address_is_in_use() +def test_gemini_ckpIO(world_size): + spawn(run_dist, world_size) + # do recursive check for the optimizer state dict # if the value is a dict, compare its values @@ -117,10 +202,14 @@ def recursive_check(d1, d2): elif isinstance(v, list): for i in range(len(v)): if isinstance(v[i], torch.Tensor): + v[i] = v[i].to("cpu") + d2[k][i] = d2[k][i].to("cpu") assert torch.equal(v[i], d2[k][i]) else: assert v[i] == d2[k][i] elif isinstance(v, torch.Tensor): + v = v.to("cpu") + d2[k] = d2[k].to("cpu") assert torch.equal(v, d2[k]) else: assert v == d2[k] diff --git a/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py b/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py index 96c26a1de4df..ad7d3a5a4859 100644 --- a/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py +++ b/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py @@ -31,14 +31,13 @@ def exam_state_dict(placement_policy, model_name: str): zero_dict = model.state_dict(only_rank_0=False) accumulated_keys = set() # ensure number of shards > 1 - for shard in model.state_dict_shard(max_shard_size=(model_size / 3), only_rank_0=False): + for shard, _ in model.state_dict_shard(max_shard_size=(model_size / 3), only_rank_0=False): for key, value in shard.items(): assert key not in accumulated_keys, f"key `{key}` is duplicated." accumulated_keys.add(key) assert key in zero_dict, f"{key} not in ZeRO dictionary." assert torch.equal(value, zero_dict[key]), f"{key} not equal." 
- def run_dist(rank, world_size, port): config = {} colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') From b49020c1b1ba6e660e1675178b7edca081efbf4d Mon Sep 17 00:00:00 2001 From: digger-yu Date: Fri, 5 May 2023 18:57:27 +0800 Subject: [PATCH 07/20] [CI] Update test_sharded_optim_with_sync_bn.py (#3688) fix spelling error in line23 change "cudnn_determinstic"=True to "cudnn_deterministic=True" --- tests/test_zero/test_legacy/test_sharded_optim_with_sync_bn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_zero/test_legacy/test_sharded_optim_with_sync_bn.py b/tests/test_zero/test_legacy/test_sharded_optim_with_sync_bn.py index 61d850d06080..0223f18c29d6 100644 --- a/tests/test_zero/test_legacy/test_sharded_optim_with_sync_bn.py +++ b/tests/test_zero/test_legacy/test_sharded_optim_with_sync_bn.py @@ -20,7 +20,7 @@ def run_dist(rank, world_size, port): # need to configure cudnn deterministic so that # randomness of convolution layers will be disabled zero_config = dict(model_config=dict(shard_strategy=TensorShardStrategy())) - colossalai.launch(config=dict(zero=zero_config, cudnn_determinstic=True, cudnn_benchmark=False), + colossalai.launch(config=dict(zero=zero_config, cudnn_deterministic=True, cudnn_benchmark=False), rank=rank, world_size=world_size, host='localhost', From d0915f54f4a1fa8f5995e27144a4b94c2b43263d Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Fri, 5 May 2023 19:36:10 +0800 Subject: [PATCH 08/20] [booster] refactor all dp fashion plugins (#3684) * [booster] add dp plugin base * [booster] inherit dp plugin base * [booster] refactor unit tests --- colossalai/booster/plugin/dp_plugin_base.py | 72 +++++++++++ colossalai/booster/plugin/gemini_plugin.py | 112 +++++------------- .../booster/plugin/low_level_zero_plugin.py | 66 +---------- colossalai/booster/plugin/torch_ddp_plugin.py | 66 +---------- .../test_plugin/test_dp_plugin_base.py | 85 +++++++++++++ .../test_plugin/test_gemini_plugin.py | 25 ---- .../test_plugin/test_low_level_zero_plugin.py | 24 ---- .../test_plugin/test_torch_ddp_plugin.py | 48 -------- 8 files changed, 190 insertions(+), 308 deletions(-) create mode 100644 colossalai/booster/plugin/dp_plugin_base.py create mode 100644 tests/test_booster/test_plugin/test_dp_plugin_base.py diff --git a/colossalai/booster/plugin/dp_plugin_base.py b/colossalai/booster/plugin/dp_plugin_base.py new file mode 100644 index 000000000000..4021b31754b4 --- /dev/null +++ b/colossalai/booster/plugin/dp_plugin_base.py @@ -0,0 +1,72 @@ +import random + +import numpy as np +import torch +import torch.distributed as dist +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from .plugin_base import Plugin + + +class DPPluginBase(Plugin): + """This is a base class for all DP plugins. It sets up world size and rank, and provides data loader creation. + """ + + def __init__(self) -> None: + super().__init__() + assert dist.is_initialized( + ), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment' + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + + def prepare_train_dataloader(self, + dataset, + batch_size, + shuffle=False, + seed=1024, + drop_last=False, + pin_memory=False, + num_workers=0, + **kwargs): + r""" + Prepare a dataloader for distributed training. The dataloader will be wrapped by + `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`. 
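+        The wrapped ``DistributedSampler`` partitions sample indices across ranks,
+        so each process draws a distinct shard of ``dataset``.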
+ + Note: + 1. Evaluation datasets should not be passed to this function. + + Args: + dataset (`torch.utils.data.Dataset`): The dataset to be loaded. + shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False. + seed (int, optional): Random worker seed for sampling, defaults to 1024. + add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True. + drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size + is not divisible by the batch size. If False and the size of dataset is not divisible by + the batch size, then the last batch will be smaller, defaults to False. + pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False. + num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0. + kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in + `DataLoader `_. + + Returns: + :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing. + """ + _kwargs = kwargs.copy() + sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle) + + # Deterministic dataloader + def seed_worker(worker_id): + worker_seed = seed + np.random.seed(worker_seed) + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + return DataLoader(dataset, + batch_size=batch_size, + sampler=sampler, + worker_init_fn=seed_worker, + drop_last=drop_last, + pin_memory=pin_memory, + num_workers=num_workers, + **_kwargs) diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index dfdd7be26eaa..fde8912a648f 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -1,36 +1,25 @@ -import random +import logging +import os import warnings -from typing import Callable, List, Optional, Tuple, Union from pathlib import Path -import os -import logging +from typing import Callable, List, Optional, Tuple, Union -import numpy as np import torch -import torch.distributed as dist import torch.nn as nn from torch import Tensor from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler -from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO -from colossalai.checkpoint_io.utils import save_state_dict +from colossalai.checkpoint_io import CheckpointIndexFile, CheckpointIO, GeneralCheckpointIO +from colossalai.checkpoint_io.utils import get_base_filenames, get_shard_filename, save_state_dict from colossalai.cluster import DistCoordinator from colossalai.interface import ModelWrapper, OptimizerWrapper from colossalai.utils import get_current_device from colossalai.zero import GeminiDDP, zero_model_wrapper, zero_optim_wrapper from colossalai.zero.gemini.memory_tracer import MemStats -from colossalai.checkpoint_io.utils import ( - get_base_filenames, - get_shard_filename - ) - -from colossalai.checkpoint_io import CheckpointIndexFile - -from .plugin_base import Plugin +from .dp_plugin_base import DPPluginBase __all__ = ['GeminiPlugin'] @@ -72,7 +61,13 @@ def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str): if self.coordinator.is_master(): super().save_lr_scheduler(lr_scheduler, checkpoint) - def save_sharded_model(self, model: GeminiDDP, checkpoint_path: str, gather_dtensor: bool = False, variant: Optional[str] = 
None, max_shard_size: int = 1024, use_safetensors: bool = False): + def save_sharded_model(self, + model: GeminiDDP, + checkpoint_path: str, + gather_dtensor: bool = False, + variant: Optional[str] = None, + max_shard_size: int = 1024, + use_safetensors: bool = False): """ Save sharded model """ @@ -88,25 +83,27 @@ def save_sharded_model(self, model: GeminiDDP, checkpoint_path: str, gather_dten total_size = total_size + shard_pair[1] for key in shard.keys(): index_file.append_weight_map(key, shard_file) - + checkpoint_file_path = os.path.join(checkpoint_path, shard_file) save_state_dict(shard, checkpoint_file_path, use_safetensors) - + index_file.append_meta_data("total_size", total_size) index_file.write_index_file(save_index_file) - logging.info( - f"The model is going to be split to checkpoint shards. " - f"You can find where each parameters has been saved in the " - f"index located at {save_index_file}." - ) - - - def load_sharded_model(self, model: GeminiDDP, checkpoint_index_file: Path, strict: bool = False, use_safetensors: bool = False): + logging.info(f"The model is going to be split to checkpoint shards. " + f"You can find where each parameters has been saved in the " + f"index located at {save_index_file}.") + + def load_sharded_model(self, + model: GeminiDDP, + checkpoint_index_file: Path, + strict: bool = False, + use_safetensors: bool = False): """ load shard model, load model from multiple files """ return super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module=False) + class GeminiModel(ModelWrapper): def __init__(self, module: nn.Module, gemini_config: dict, verbose: bool = False) -> None: @@ -148,7 +145,7 @@ def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None: raise NotImplementedError('Gemini does not support clip_grad_by_value') -class GeminiPlugin(Plugin): +class GeminiPlugin(DPPluginBase): """ Plugin for Gemini. @@ -217,11 +214,7 @@ def __init__( norm_type: float = 2.0, verbose: bool = False, ) -> None: - - assert dist.is_initialized( - ), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment' - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() + super().__init__() self.gemini_config = dict( device=(device or get_current_device()), placement_policy=placement_policy, @@ -260,57 +253,6 @@ def control_device(self) -> bool: def supported_devices(self) -> List[str]: return ['cuda'] - def prepare_train_dataloader(self, - dataset, - batch_size, - shuffle=False, - seed=1024, - drop_last=False, - pin_memory=False, - num_workers=0, - **kwargs): - r""" - Prepare a dataloader for distributed training. The dataloader will be wrapped by - `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`. - - Note: - 1. Evaluation datasets should not be passed to this function. - - Args: - dataset (`torch.utils.data.Dataset`): The dataset to be loaded. - shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False. - seed (int, optional): Random worker seed for sampling, defaults to 1024. - add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True. - drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size - is not divisible by the batch size. If False and the size of dataset is not divisible by - the batch size, then the last batch will be smaller, defaults to False. - pin_memory (bool, optional): Whether to pin memory address in CPU memory. 
Defaults to False. - num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0. - kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in - `DataLoader `_. - - Returns: - :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing. - """ - _kwargs = kwargs.copy() - sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle) - - # Deterministic dataloader - def seed_worker(worker_id): - worker_seed = seed - np.random.seed(worker_seed) - torch.manual_seed(worker_seed) - random.seed(worker_seed) - - return DataLoader(dataset, - batch_size=batch_size, - sampler=sampler, - worker_init_fn=seed_worker, - drop_last=drop_last, - pin_memory=pin_memory, - num_workers=num_workers, - **_kwargs) - def configure( self, model: nn.Module, diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 969c430bd317..828d8b27422f 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -1,24 +1,20 @@ -import random import warnings from typing import Callable, List, Optional, Tuple, Union -import numpy as np import torch -import torch.distributed as dist import torch.nn as nn from torch import Tensor from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.utils._pytree import tree_map from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler from colossalai.checkpoint_io import CheckpointIO from colossalai.interface import ModelWrapper, OptimizerWrapper from colossalai.utils import get_current_device from colossalai.zero import zero_model_wrapper, zero_optim_wrapper -from .plugin_base import Plugin +from .dp_plugin_base import DPPluginBase from .torch_ddp_plugin import TorchDDPCheckpointIO __all__ = ['LowLevelZeroPlugin'] @@ -88,7 +84,7 @@ def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None: raise NotImplementedError('LowLevelZero does not support clip_grad_by_value') -class LowLevelZeroPlugin(Plugin): +class LowLevelZeroPlugin(DPPluginBase): """ Plugin for low level zero. @@ -142,15 +138,10 @@ def __init__( cpu_offload: bool = False, verbose: bool = False, ) -> None: - - assert dist.is_initialized( - ), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment' + super().__init__() assert stage in (1, 2), f'LowLevelZeroPlugin only supports stage 1/2 training' assert precision in ('fp16', 'fp32'), f'LowLevelZeroPlugin only supports fp16/fp32 training' - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - self.stage = stage self.precision = precision self.zero_optim_config = dict(reduce_bucket_size=reduce_bucket_size_in_m * 1024 * 1024, @@ -183,57 +174,6 @@ def control_device(self) -> bool: def supported_devices(self) -> List[str]: return ['cuda'] - def prepare_train_dataloader(self, - dataset, - batch_size, - shuffle=False, - seed=1024, - drop_last=False, - pin_memory=False, - num_workers=0, - **kwargs): - r""" - Prepare a dataloader for distributed training. The dataloader will be wrapped by - `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`. - - Note: - 1. Evaluation datasets should not be passed to this function. - - Args: - dataset (`torch.utils.data.Dataset`): The dataset to be loaded. 
- shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False. - seed (int, optional): Random worker seed for sampling, defaults to 1024. - add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True. - drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size - is not divisible by the batch size. If False and the size of dataset is not divisible by - the batch size, then the last batch will be smaller, defaults to False. - pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False. - num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0. - kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in - `DataLoader `_. - - Returns: - :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing. - """ - _kwargs = kwargs.copy() - sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle) - - # Deterministic dataloader - def seed_worker(worker_id): - worker_seed = seed - np.random.seed(worker_seed) - torch.manual_seed(worker_seed) - random.seed(worker_seed) - - return DataLoader(dataset, - batch_size=batch_size, - sampler=sampler, - worker_init_fn=seed_worker, - drop_last=drop_last, - pin_memory=pin_memory, - num_workers=num_workers, - **_kwargs) - def configure( self, model: nn.Module, diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py index c5e310c7e769..d30d266c0048 100644 --- a/colossalai/booster/plugin/torch_ddp_plugin.py +++ b/colossalai/booster/plugin/torch_ddp_plugin.py @@ -1,21 +1,16 @@ -import random from typing import Callable, List, Tuple, Union -import numpy as np -import torch -import torch.distributed as dist import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO from colossalai.cluster import DistCoordinator from colossalai.interface import ModelWrapper, OptimizerWrapper -from .plugin_base import Plugin +from .dp_plugin_base import DPPluginBase __all__ = ['TorchDDPPlugin'] @@ -66,7 +61,7 @@ def unwrap(self): return self.module.module -class TorchDDPPlugin(Plugin): +class TorchDDPPlugin(DPPluginBase): """ Plugin for PyTorch DDP. @@ -97,11 +92,7 @@ def __init__(self, check_reduction: bool = False, gradient_as_bucket_view: bool = False, static_graph: bool = False) -> None: - - assert dist.is_initialized( - ), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment' - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() + super().__init__() self.ddp_kwargs = dict(broadcast_buffers=broadcast_buffers, bucket_cap_mb=bucket_cap_mb, find_unused_parameters=find_unused_parameters, @@ -124,57 +115,6 @@ def control_device(self) -> bool: def supported_devices(self) -> List[str]: return ['cuda'] - def prepare_train_dataloader(self, - dataset, - batch_size, - shuffle=False, - seed=1024, - drop_last=False, - pin_memory=False, - num_workers=0, - **kwargs): - r""" - Prepare a dataloader for distributed training. The dataloader will be wrapped by - `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`. 
- - Note: - 1. Evaluation datasets should not be passed to this function. - - Args: - dataset (`torch.utils.data.Dataset`): The dataset to be loaded. - shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False. - seed (int, optional): Random worker seed for sampling, defaults to 1024. - add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True. - drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size - is not divisible by the batch size. If False and the size of dataset is not divisible by - the batch size, then the last batch will be smaller, defaults to False. - pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False. - num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0. - kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in - `DataLoader `_. - - Returns: - :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing. - """ - _kwargs = kwargs.copy() - sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle) - - # Deterministic dataloader - def seed_worker(worker_id): - worker_seed = seed - np.random.seed(worker_seed) - torch.manual_seed(worker_seed) - random.seed(worker_seed) - - return DataLoader(dataset, - batch_size=batch_size, - sampler=sampler, - worker_init_fn=seed_worker, - drop_last=drop_last, - pin_memory=pin_memory, - num_workers=num_workers, - **_kwargs) - def configure( self, model: nn.Module, diff --git a/tests/test_booster/test_plugin/test_dp_plugin_base.py b/tests/test_booster/test_plugin/test_dp_plugin_base.py new file mode 100644 index 000000000000..a2b94ba6ca81 --- /dev/null +++ b/tests/test_booster/test_plugin/test_dp_plugin_base.py @@ -0,0 +1,85 @@ +from typing import Callable, List, Tuple, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler as LRScheduler +from torch.utils.data import DataLoader, TensorDataset + +import colossalai +from colossalai.booster.plugin.dp_plugin_base import DPPluginBase +from colossalai.checkpoint_io import CheckpointIO +from colossalai.interface import OptimizerWrapper +from colossalai.testing import rerun_if_address_is_in_use, spawn + + +class DPPluginWrapper(DPPluginBase): + """This is a wrapper class for testing DP plugin initialization and dataloader creation. 
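+    All abstract plugin hooks below are implemented as no-op stubs, so only the dataloader
+    preparation logic inherited from ``DPPluginBase`` is exercised by this test.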
+    """
+
+    def configure(
+        self,
+        model: nn.Module,
+        optimizer: Optimizer,
+        criterion: Callable = None,
+        dataloader: DataLoader = None,
+        lr_scheduler: LRScheduler = None,
+    ) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader]]:
+        pass
+
+    def control_checkpoint_io(self) -> bool:
+        pass
+
+    def control_device(self) -> bool:
+        pass
+
+    def control_precision(self) -> bool:
+        pass
+
+    def get_checkpoint_io(self) -> CheckpointIO:
+        pass
+
+    def support_no_sync(self) -> bool:
+        pass
+
+    def supported_devices(self) -> List[str]:
+        pass
+
+    def supported_precisions(self) -> List[str]:
+        pass
+
+
+def check_dataloader_sharding():
+    plugin = DPPluginWrapper()
+
+    # create a custom dataset with values 0 to 9
+    dataset = TensorDataset(torch.arange(0, 10))
+    train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2)
+
+    # get the first batch of data
+    batch = next(iter(train_dataloader))[0].cuda()
+    is_rank_0 = dist.get_rank() == 0
+
+    if is_rank_0:
+        batch_to_compare = batch.clone()
+    else:
+        batch_to_compare = batch
+    # broadcast the batch from rank 1 so that rank 0 can compare it with its own
+    dist.broadcast(batch_to_compare, src=1)
+
+    # compare on rank 0
+    if is_rank_0:
+        assert not torch.equal(batch,
+                               batch_to_compare), 'Same number was found across ranks but expected it to be different'
+
+
+def run_dist(rank, world_size, port):
+    # init dist env
+    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+    check_dataloader_sharding()
+
+
+@rerun_if_address_is_in_use()
+def test_dp_plugin_dataloader():
+    spawn(run_dist, 2)
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index 985d7989fc9d..c7b3676fb478 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -117,34 +117,9 @@ def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True):
     assert len(failed_info) == 0, '\n'.join([f'{k}: {v}' for k, v in failed_info.items()])


-def check_dataloader_sharding():
-    plugin = GeminiPlugin()
-
-    # create a custom dasetset with 0 to 10
-    dataset = torch.utils.data.TensorDataset(torch.arange(0, 10))
-    train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2)
-
-    # get the first batch of data
-    batch = next(iter(train_dataloader))[0].cuda()
-    is_rank_0 = dist.get_rank() == 0
-
-    if is_rank_0:
-        batch_to_compare = batch.clone()
-    else:
-        batch_to_compare = batch
-    # pass to the rank 1 value to rank 0
-    dist.broadcast(batch_to_compare, src=1)
-
-    # compare on rank 0
-    if is_rank_0:
-        assert not torch.equal(batch,
-                               batch_to_compare), 'Same number was found across ranks but expected it to be different'
-
-
 def run_dist(rank, world_size, port, early_stop: bool = True):
     # init dist env
     colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
-    check_dataloader_sharding()
     check_gemini_plugin(early_stop=early_stop)

diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index e24196a14917..d84b96f77a75 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -83,30 +83,6 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
     assert len(failed_info) == 0, '\n'.join([f'{k}: {v}' for k, v in failed_info.items()])


-def check_dataloader_sharding():
-    plugin = LowLevelZeroPlugin()
-
-    # create a
custom dasetset with 0 to 10 - dataset = torch.utils.data.TensorDataset(torch.arange(0, 10)) - train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2) - - # get the first batch of data - batch = next(iter(train_dataloader))[0].cuda() - is_rank_0 = dist.get_rank() == 0 - - if is_rank_0: - batch_to_compare = batch.clone() - else: - batch_to_compare = batch - # pass to the rank 1 value to rank 0 - dist.broadcast(batch_to_compare, src=1) - - # compare on rank 0 - if is_rank_0: - assert not torch.equal(batch, - batch_to_compare), 'Same number was found across ranks but expected it to be different' - - def run_dist(rank, world_size, port, early_stop: bool = True): # init dist env colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py index 5354eae01d40..30c4db12309f 100644 --- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py @@ -44,57 +44,9 @@ def check_torch_ddp_plugin(): torch.cuda.empty_cache() -def check_dataloader_sharding(): - plugin = TorchDDPPlugin() - - # create a custom dasetset with 0 to 10 - dataset = torch.utils.data.TensorDataset(torch.arange(0, 10)) - train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2) - - # get the first batch of data - batch = next(iter(train_dataloader))[0].cuda() - is_rank_0 = dist.get_rank() == 0 - - if is_rank_0: - batch_to_compare = batch.clone() - else: - batch_to_compare = batch - # pass to the rank 1 value to rank 0 - dist.broadcast(batch_to_compare, src=1) - - # compare on rank 0 - if is_rank_0: - assert not torch.equal(batch, - batch_to_compare), 'Same number was found across ranks but expected it to be different' - - -def check_checkpoint_save_and_load(): - model_fn, data_gen_fn, output_transform_fn, _ = model_zoo['timm_resnet'] - - plugin = TorchDDPPlugin() - booster = Booster(plugin=plugin) - - model = model_fn() - optimizer = SGD(model.parameters(), lr=1e-3) - criterion = lambda x: x.mean() - data = data_gen_fn() - - data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()} - - model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) - - output = model(**data) - output = output_transform_fn(output) - output_key = list(output.keys())[0] - loss = criterion(output[output_key]) - - booster.backward(loss, optimizer) - - def run_dist(rank, world_size, port): # init dist env colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') - check_dataloader_sharding() check_torch_ddp_plugin() From 65bdc3159f2971c71780bc1b9ca5ff9d2e8efc83 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Sat, 6 May 2023 11:27:23 +0800 Subject: [PATCH 09/20] fix some spelling error with applications/Chat/examples/ (#3692) * fix spelling error with examples/comminity/ * fix spelling error with example/ --- applications/Chat/examples/README.md | 3 +-- .../Chat/examples/community/peft/easy_dataset.py | 10 +++++----- .../Chat/examples/community/peft/train_peft_prompts.py | 2 +- .../Chat/examples/community/peft/train_peft_sft.py | 2 +- applications/Chat/examples/train_sft.py | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md index 561ace2205ed..2a2128e25a62 100644 --- a/applications/Chat/examples/README.md 
+++ b/applications/Chat/examples/README.md @@ -24,7 +24,6 @@ - [LLaMA](#llama) - [Add your own models](#add-your-own-models) - [Actor model](#actor-model) - - [LM model](#lm-model) - [Reward model](#reward-model) - [Critic model](#critic-model) @@ -150,7 +149,7 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \ --strategy colossalai_zero2 \ --prompt_dataset /path/to/your/prompt_dataset \ --pretrain_dataset /path/to/your/pretrain_dataset \ - --rm_pretrain /your/pretrain/rm/defination \ + --rm_pretrain /your/pretrain/rm/definition \ --rm_path /your/rm/model/path ``` diff --git a/applications/Chat/examples/community/peft/easy_dataset.py b/applications/Chat/examples/community/peft/easy_dataset.py index 24ea4f0a8618..2fe293957079 100644 --- a/applications/Chat/examples/community/peft/easy_dataset.py +++ b/applications/Chat/examples/community/peft/easy_dataset.py @@ -188,7 +188,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_ else: raw_input_ids.append(encoded_ids) - grouped_inpup_ids = [] + grouped_input_ids = [] current_input_ids = [] attention_mask = [] if tokenizer.pad_token_id is None: @@ -199,7 +199,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_ #pad the current_input_ids to max_length with tokenizer.pad_token_id padded_length = max_length - len(current_input_ids) current_input_ids.extend([tokenizer.pad_token_id] * padded_length) - grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long)) + grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long)) attention_mask.append( torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)) current_input_ids = [] @@ -208,7 +208,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_ if len(current_input_ids) > 0: padded_length = max_length - len(current_input_ids) current_input_ids.extend([tokenizer.pad_token_id] * padded_length) - grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long)) + grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long)) attention_mask.append( torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)) else: @@ -218,8 +218,8 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_ input_ids.extend([tokenizer.pad_token_id] * padded_length) attention_mask.append( torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)) - grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long)) - self.input_ids = grouped_inpup_ids + grouped_input_ids.append(torch.tensor(input_ids, dtype=torch.long)) + self.input_ids = grouped_input_ids self.labels = copy.deepcopy(self.input_ids) self.file_name = data_file self.attention_mask = attention_mask diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py index 0e277021e917..ba8470f38fad 100644 --- a/applications/Chat/examples/community/peft/train_peft_prompts.py +++ b/applications/Chat/examples/community/peft/train_peft_prompts.py @@ -41,7 +41,7 @@ def main(args): # configure model if args.model == 'bloom': # initial_model = BLOOMActor(pretrained=args.pretrain) - print('Using peft lora to load Bloom model as inital_model') + print('Using peft lora to load Bloom model as initial_model') initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path) print('Using peft lora to load 
Bloom model as initial_model (Done)') else: diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py index 9bd0ebc12a83..d2b08b72ca95 100644 --- a/applications/Chat/examples/community/peft/train_peft_sft.py +++ b/applications/Chat/examples/community/peft/train_peft_sft.py @@ -86,7 +86,7 @@ def train(args): if args.strategy == 'colossalai_gemini': # this is a hack to deal with the resized embedding - # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity + # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility for name, param in model.named_parameters(): if not isinstance(param, ColoParameter): sub_module_name = '.'.join(name.split('.')[:-1]) diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py index da499f068b17..7fcd026fb538 100644 --- a/applications/Chat/examples/train_sft.py +++ b/applications/Chat/examples/train_sft.py @@ -84,7 +84,7 @@ def train(args): if args.strategy == 'colossalai_gemini': # this is a hack to deal with the resized embedding - # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity + # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility for name, param in model.named_parameters(): if not isinstance(param, ColoParameter): sub_module_name = '.'.join(name.split('.')[:-1]) From d5566488852b4a08f901820a608c5b56f221010e Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Sat, 6 May 2023 11:53:13 +0800 Subject: [PATCH 10/20] [example] add finetune bert with booster example (#3693) --- examples/tutorial/new_api/glue_bert/README.md | 33 +++ examples/tutorial/new_api/glue_bert/data.py | 127 +++++++++++ .../tutorial/new_api/glue_bert/finetune.py | 198 ++++++++++++++++++ .../tutorial/new_api/glue_bert/test_ci.sh | 6 + 4 files changed, 364 insertions(+) create mode 100644 examples/tutorial/new_api/glue_bert/README.md create mode 100644 examples/tutorial/new_api/glue_bert/data.py create mode 100644 examples/tutorial/new_api/glue_bert/finetune.py create mode 100755 examples/tutorial/new_api/glue_bert/test_ci.sh diff --git a/examples/tutorial/new_api/glue_bert/README.md b/examples/tutorial/new_api/glue_bert/README.md new file mode 100644 index 000000000000..d65c7705b5da --- /dev/null +++ b/examples/tutorial/new_api/glue_bert/README.md @@ -0,0 +1,33 @@ +# Finetune BERT on GLUE + +## 🚀 Quick Start + +This example provides a training script, which provides an example of finetuning BERT on GLUE dataset. + +- Training Arguments + - `-t`, `--task`: GLUE task to run. Defaults to `mrpc`. + - `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `gemini`, `low_level_zero`. Defaults to `torch_ddp`. + - `--target_f1`: Target f1 score. Raise exception if not reached. Defaults to `None`. 
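+
+For instance, to finetune on MRPC with the Gemini plugin and assert a minimum score, the flags above can be combined as follows (a sketch; the `0.86` threshold simply mirrors the value used by `test_ci.sh`):
+
+```bash
+colossalai run --nproc_per_node 4 finetune.py -t mrpc -p gemini --target_f1 0.86
+```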
+ + +### Train + +```bash +# train with torch DDP with fp32 +colossalai run --nproc_per_node 4 finetune.py + +# train with torch DDP with mixed precision training +colossalai run --nproc_per_node 4 finetune.py -p torch_ddp_fp16 + +# train with gemini +colossalai run --nproc_per_node 4 finetune.py -p gemini + +# train with low level zero +colossalai run --nproc_per_node 4 finetune.py -p low_level_zero +``` + +Expected F1-score will be: + +| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Gemini | Booster Low Level Zero | +| ----------------- | ------------------------ | --------------------- | --------------------- |--------------- | ---------------------- | +| bert-base-uncased | 0.86 | 0.88 | 0.87 | 0.88 | 0.89 | diff --git a/examples/tutorial/new_api/glue_bert/data.py b/examples/tutorial/new_api/glue_bert/data.py new file mode 100644 index 000000000000..e43312aebc7c --- /dev/null +++ b/examples/tutorial/new_api/glue_bert/data.py @@ -0,0 +1,127 @@ +import datasets +from transformers import AutoTokenizer, PreTrainedTokenizer + +from colossalai.booster.plugin.dp_plugin_base import DPPluginBase + + +class GLUEDataBuilder: + + task_text_field_map = { + "cola": ["sentence"], + "sst2": ["sentence"], + "mrpc": ["sentence1", "sentence2"], + "qqp": ["question1", "question2"], + "stsb": ["sentence1", "sentence2"], + "mnli": ["premise", "hypothesis"], + "qnli": ["question", "sentence"], + "rte": ["sentence1", "sentence2"], + "wnli": ["sentence1", "sentence2"], + "ax": ["premise", "hypothesis"], + } + + glue_task_num_labels = { + "cola": 2, + "sst2": 2, + "mrpc": 2, + "qqp": 2, + "stsb": 1, + "mnli": 3, + "qnli": 2, + "rte": 2, + "wnli": 2, + "ax": 3, + } + + loader_columns = [ + "datasets_idx", + "input_ids", + "token_type_ids", + "attention_mask", + "start_positions", + "end_positions", + "labels", + ] + + def __init__( + self, + model_name_or_path: str, + plugin: DPPluginBase, + task_name: str = "mrpc", + max_seq_length: int = 128, + train_batch_size: int = 32, + eval_batch_size: int = 32, + **kwargs, + ): + super().__init__() + self.model_name_or_path = model_name_or_path + self.task_name = task_name + self.max_seq_length = max_seq_length + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.plugin = plugin + + self.text_fields = self.task_text_field_map[task_name] + self.num_labels = self.glue_task_num_labels[task_name] + self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + self.setup() + + def setup(self): + self.dataset = datasets.load_dataset("glue", self.task_name) + + for split in self.dataset.keys(): + self.dataset[split] = self.dataset[split].map( + self.convert_to_features, + batched=True, + remove_columns=["label"], + ) + self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns] + self.dataset[split].set_format(type="torch", columns=self.columns) + + self.eval_splits = [x for x in self.dataset.keys() if "validation" in x] + + def prepare_data(self): + datasets.load_dataset("glue", self.task_name) + AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + + def train_dataloader(self): + return self.plugin.prepare_train_dataloader(self.dataset["train"], + batch_size=self.train_batch_size, + shuffle=True, + drop_last=True) + + def val_dataloader(self): + if len(self.eval_splits) == 1: + return self.plugin.prepare_train_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size) + elif 
len(self.eval_splits) > 1: + return [ + self.plugin.prepare_train_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + for x in self.eval_splits + ] + + def test_dataloader(self): + if len(self.eval_splits) == 1: + return self.plugin.prepare_train_dataloader(self.dataset["test"], batch_size=self.eval_batch_size) + elif len(self.eval_splits) > 1: + return [ + self.plugin.prepare_train_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + for x in self.eval_splits + ] + + def convert_to_features(self, example_batch): + + # Either encode single sentence or sentence pairs + if len(self.text_fields) > 1: + texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]])) + else: + texts_or_text_pairs = example_batch[self.text_fields[0]] + + # Tokenize the text/text pairs + features = self.tokenizer.batch_encode_plus(texts_or_text_pairs, + max_length=self.max_seq_length, + padding='max_length', + truncation=True) + + # Rename label to labels to make it easier to pass to model forward + features["labels"] = example_batch["label"] + + return features diff --git a/examples/tutorial/new_api/glue_bert/finetune.py b/examples/tutorial/new_api/glue_bert/finetune.py new file mode 100644 index 000000000000..63bdfc5d02cf --- /dev/null +++ b/examples/tutorial/new_api/glue_bert/finetune.py @@ -0,0 +1,198 @@ +import argparse +from typing import List, Union + +import datasets +import torch +import torch.distributed as dist +import torch.nn as nn +from data import GLUEDataBuilder +from torch.optim import Optimizer +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoConfig, BertForSequenceClassification, get_linear_schedule_with_warmup + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.nn.optimizer import HybridAdam +from colossalai.utils import get_current_device + +# ============================== +# Prepare Hyperparameters +# ============================== +NUM_EPOCHS = 3 +BATCH_SIZE = 32 +LEARNING_RATE = 2.4e-5 +WEIGHT_DECAY = 0.01 +WARMUP_FRACTION = 0.1 + + +def move_to_cuda(batch): + return {k: v.cuda() for k, v in batch.items()} + + +@torch.no_grad() +def evaluate(model: nn.Module, test_dataloader: Union[DataLoader, List[DataLoader]], num_labels: int, task_name: str, + eval_splits: List[str], coordinator: DistCoordinator): + metric = datasets.load_metric("glue", task_name, process_id=coordinator.rank, num_process=coordinator.world_size) + model.eval() + + def evaluate_subset(dataloader: DataLoader): + accum_loss = torch.zeros(1, device=get_current_device()) + for batch in dataloader: + batch = move_to_cuda(batch) + outputs = model(**batch) + val_loss, logits = outputs[:2] + accum_loss.add_(val_loss) + + if num_labels > 1: + preds = torch.argmax(logits, axis=1) + elif num_labels == 1: + preds = logits.squeeze() + + labels = batch["labels"] + + metric.add_batch(predictions=preds, references=labels) + + results = metric.compute() + dist.all_reduce(accum_loss.div_(len(dataloader))) + if coordinator.is_master(): + results['loss'] = accum_loss.item() / coordinator.world_size + return results + + if isinstance(test_dataloader, DataLoader): + return evaluate_subset(test_dataloader) + else: + assert len(test_dataloader) == len(eval_splits) + final_results = {} + for split, sub_loader in zip(eval_splits, test_dataloader): + results = 
evaluate_subset(sub_loader) + final_results.update({f'{k}_{split}': v for k, v in results.items()}) + return final_results + + +def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, lr_scheduler, train_dataloader: DataLoader, + booster: Booster, coordinator: DistCoordinator): + model.train() + with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar: + for batch in pbar: + # Forward pass + batch = move_to_cuda(batch) + outputs = model(**batch) + loss = outputs[0] + + # Backward and optimize + booster.backward(loss, optimizer) + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step() + + # Print log info + pbar.set_postfix({'loss': loss.item()}) + + +def main(): + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run") + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") + parser.add_argument('--target_f1', type=float, default=None, help="target f1 score. Raise exception if not reached") + args = parser.parse_args() + + # ============================== + # Launch Distributed Environment + # ============================== + colossalai.launch_from_torch(config={}, seed=42) + coordinator = DistCoordinator() + + # local_batch_size = BATCH_SIZE // coordinator.world_size + lr = LEARNING_RATE * coordinator.world_size + model_name = 'bert-base-uncased' + + # ============================== + # Instantiate Plugin and Booster + # ============================== + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + + booster = Booster(plugin=plugin, **booster_kwargs) + + # ============================== + # Prepare Dataloader + # ============================== + data_builder = GLUEDataBuilder(model_name, + plugin, + args.task, + train_batch_size=BATCH_SIZE, + eval_batch_size=BATCH_SIZE) + train_dataloader = data_builder.train_dataloader() + test_dataloader = data_builder.test_dataloader() + + # ==================================== + # Prepare model, optimizer + # ==================================== + # bert pretrained model + config = AutoConfig.from_pretrained(model_name, num_labels=data_builder.num_labels) + model = BertForSequenceClassification.from_pretrained(model_name, config=config) + + # optimizer + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": WEIGHT_DECAY, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8) + + # lr scheduler + total_steps = len(train_dataloader) * NUM_EPOCHS + num_warmup_steps = int(WARMUP_FRACTION * total_steps) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=total_steps, + ) + + # ============================== + # Boost with ColossalAI + # 
============================== + model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler) + + # ============================== + # Train model + # ============================== + for epoch in range(NUM_EPOCHS): + train_epoch(epoch, model, optimizer, lr_scheduler, train_dataloader, booster, coordinator) + + results = evaluate(model, test_dataloader, data_builder.num_labels, args.task, data_builder.eval_splits, + coordinator) + + if coordinator.is_master(): + print(results) + if args.target_f1 is not None and 'f1' in results: + assert results['f1'] >= args.target_f1, f'f1 score {results["f1"]} is lower than target {args.target_f1}' + + +if __name__ == '__main__': + main() diff --git a/examples/tutorial/new_api/glue_bert/test_ci.sh b/examples/tutorial/new_api/glue_bert/test_ci.sh new file mode 100755 index 000000000000..f8c2dfbe9eb9 --- /dev/null +++ b/examples/tutorial/new_api/glue_bert/test_ci.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -xe + +for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do + torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin +done From 2da5d81dec7a94576ee04e69b4149368749573f4 Mon Sep 17 00:00:00 2001 From: zhang-yi-chi <673865549@qq.com> Date: Sat, 6 May 2023 16:46:38 +0800 Subject: [PATCH 11/20] [chat] fix train_prompts.py gemini strategy bug (#3666) * fix gemini strategy bug * add comment * add comment * better solution --- applications/Chat/examples/train_prompts.py | 70 ++++++++++----------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py index a584991cd34e..134f21f80ef1 100644 --- a/applications/Chat/examples/train_prompts.py +++ b/applications/Chat/examples/train_prompts.py @@ -36,45 +36,45 @@ def main(args): if args.rm_path is not None: state_dict = torch.load(args.rm_path, map_location='cpu') - # configure model - if args.model == 'gpt2': - initial_model = GPTActor(pretrained=args.pretrain) - elif args.model == 'bloom': - initial_model = BLOOMActor(pretrained=args.pretrain) - elif args.model == 'opt': - initial_model = OPTActor(pretrained=args.pretrain) - elif args.model == 'llama': - initial_model = LlamaActor(pretrained=args.pretrain) - elif args.model == 'roberta': - initial_model = RoBERTaActor(pretrained=args.pretrain) - else: - raise ValueError(f'Unsupported actor model "{args.model}"') + with strategy.model_init_context(): + # configure model + if args.model == 'gpt2': + initial_model = GPTActor(pretrained=args.pretrain) + elif args.model == 'bloom': + initial_model = BLOOMActor(pretrained=args.pretrain) + elif args.model == 'opt': + initial_model = OPTActor(pretrained=args.pretrain) + elif args.model == 'llama': + initial_model = LlamaActor(pretrained=args.pretrain) + elif args.model == 'roberta': + initial_model = RoBERTaActor(pretrained=args.pretrain) + else: + raise ValueError(f'Unsupported actor model "{args.model}"') - if args.rm_model == None: - rm_model_name = args.model - else: - rm_model_name = args.rm_model - - if rm_model_name == 'gpt2': - reward_model = GPTRM(pretrained=args.rm_pretrain) - elif rm_model_name == 'bloom': - reward_model = BLOOMRM(pretrained=args.rm_pretrain) - elif rm_model_name == 'opt': - reward_model = OPTRM(pretrained=args.rm_pretrain) - elif rm_model_name == 'llama': - reward_model = LlamaRM(pretrained=args.rm_pretrain) - elif rm_model_name == 'roberta': - reward_model = RoBERTaRM(pretrained=args.rm_pretrain) - else: - raise 
ValueError(f'Unsupported reward model "{rm_model_name}"') + if args.rm_model == None: + rm_model_name = args.model + else: + rm_model_name = args.rm_model - if args.rm_path is not None: - reward_model.load_state_dict(state_dict) + if rm_model_name == 'gpt2': + reward_model = GPTRM(pretrained=args.rm_pretrain) + elif rm_model_name == 'bloom': + reward_model = BLOOMRM(pretrained=args.rm_pretrain) + elif rm_model_name == 'opt': + reward_model = OPTRM(pretrained=args.rm_pretrain) + elif rm_model_name == 'llama': + reward_model = LlamaRM(pretrained=args.rm_pretrain) + elif rm_model_name == 'roberta': + reward_model = RoBERTaRM(pretrained=args.rm_pretrain) + else: + raise ValueError(f'Unsupported reward model "{rm_model_name}"') - initial_model.to(torch.float16).to(torch.cuda.current_device()) - reward_model.to(torch.float16).to(torch.cuda.current_device()) + if args.rm_path is not None: + reward_model.load_state_dict(state_dict) + + initial_model.to(torch.float16).to(torch.cuda.current_device()) + reward_model.to(torch.float16).to(torch.cuda.current_device()) - with strategy.model_init_context(): if args.model == 'gpt2': actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank) elif args.model == 'bloom': From 2629f9717dbb1f0dc0ecad3f3bda18a4b44e3785 Mon Sep 17 00:00:00 2001 From: YH <100389977+yhna940@users.noreply.github.com> Date: Sat, 6 May 2023 18:55:37 +0900 Subject: [PATCH 12/20] [tensor] Refactor handle_trans_spec in DistSpecManager --- colossalai/tensor/dist_spec_mgr.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/tensor/dist_spec_mgr.py index 8657989235db..c968050de49d 100644 --- a/colossalai/tensor/dist_spec_mgr.py +++ b/colossalai/tensor/dist_spec_mgr.py @@ -4,10 +4,8 @@ import torch.distributed as dist # from colossalai.nn.layer.utils import divide from numpy import prod -from packaging import version -from colossalai.logging import get_dist_logger -from colossalai.tensor.distspec import _DistSpec +from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec from colossalai.tensor.process_group import ProcessGroup @@ -171,11 +169,21 @@ def handle_trans_spec(tensor: torch.Tensor, old_dist_spec: _DistSpec, dist_spec: pg: ProcessGroup) -> torch.Tensor: assert isinstance(old_dist_spec, _DistSpec), f"{type(old_dist_spec)} should be _DistSpec" assert isinstance(dist_spec, _DistSpec), f"{type(dist_spec)} should be _DistSpec" - forward_trans_handle = getattr(DistSpecManager, f'_{old_dist_spec.placement.value}2{dist_spec.placement.value}') + + trans_func_key = (old_dist_spec.placement, dist_spec.placement) + trans_funcs = { + (DistPlacementPattern.REPLICATE, DistPlacementPattern.REPLICATE): DistSpecManager._r2r, + (DistPlacementPattern.REPLICATE, DistPlacementPattern.SHARD): DistSpecManager._r2s, + (DistPlacementPattern.SHARD, DistPlacementPattern.REPLICATE): DistSpecManager._s2r, + (DistPlacementPattern.SHARD, DistPlacementPattern.SHARD): DistSpecManager._s2s + } + + forward_trans_handle = trans_funcs[trans_func_key] if not DistSpecManager._use_autograd_function: return forward_trans_handle(tensor, old_dist_spec, dist_spec, pg) - backward_trans_handle = getattr(DistSpecManager, - f'_{dist_spec.placement.value}2{old_dist_spec.placement.value}') + + backward_trans_handle = trans_funcs[(dist_spec.placement, old_dist_spec.placement)] + return TransformDistSpec.apply(tensor, old_dist_spec, dist_spec, pg, forward_trans_handle, backward_trans_handle) From 
f83ea813f5bb6ecc5597c7f1bf97870d46de1c49 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Mon, 8 May 2023 10:42:30 +0800 Subject: [PATCH 13/20] [example] add train resnet/vit with booster example (#3694) * [example] add train vit with booster example * [example] update readme * [example] add train resnet with booster example * [example] enable ci * [example] enable ci * [example] add requirements * [hotfix] fix analyzer init * [example] update requirements --- colossalai/_analyzer/__init__.py | 0 .../{torch_ddp => cifar_resnet}/.gitignore | 0 .../tutorial/new_api/cifar_resnet/README.md | 56 +++++ .../{torch_ddp => cifar_resnet}/eval.py | 0 .../new_api/cifar_resnet/requirements.txt | 4 + .../tutorial/new_api/cifar_resnet/test_ci.sh | 10 + .../tutorial/new_api/cifar_resnet/train.py | 210 ++++++++++++++++ examples/tutorial/new_api/cifar_vit/README.md | 37 +++ .../new_api/cifar_vit/requirements.txt | 5 + .../tutorial/new_api/cifar_vit/test_ci.sh | 10 + examples/tutorial/new_api/cifar_vit/train.py | 225 ++++++++++++++++++ examples/tutorial/new_api/glue_bert/README.md | 6 + .../new_api/glue_bert/requirements.txt | 7 + .../tutorial/new_api/glue_bert/test_ci.sh | 2 + examples/tutorial/new_api/test_ci.sh | 8 +- examples/tutorial/new_api/torch_ddp/README.md | 44 ---- examples/tutorial/new_api/torch_ddp/train.py | 128 ---------- 17 files changed, 578 insertions(+), 174 deletions(-) create mode 100644 colossalai/_analyzer/__init__.py rename examples/tutorial/new_api/{torch_ddp => cifar_resnet}/.gitignore (100%) create mode 100644 examples/tutorial/new_api/cifar_resnet/README.md rename examples/tutorial/new_api/{torch_ddp => cifar_resnet}/eval.py (100%) create mode 100644 examples/tutorial/new_api/cifar_resnet/requirements.txt create mode 100755 examples/tutorial/new_api/cifar_resnet/test_ci.sh create mode 100644 examples/tutorial/new_api/cifar_resnet/train.py create mode 100644 examples/tutorial/new_api/cifar_vit/README.md create mode 100644 examples/tutorial/new_api/cifar_vit/requirements.txt create mode 100755 examples/tutorial/new_api/cifar_vit/test_ci.sh create mode 100644 examples/tutorial/new_api/cifar_vit/train.py create mode 100644 examples/tutorial/new_api/glue_bert/requirements.txt delete mode 100644 examples/tutorial/new_api/torch_ddp/README.md delete mode 100644 examples/tutorial/new_api/torch_ddp/train.py diff --git a/colossalai/_analyzer/__init__.py b/colossalai/_analyzer/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/examples/tutorial/new_api/torch_ddp/.gitignore b/examples/tutorial/new_api/cifar_resnet/.gitignore similarity index 100% rename from examples/tutorial/new_api/torch_ddp/.gitignore rename to examples/tutorial/new_api/cifar_resnet/.gitignore diff --git a/examples/tutorial/new_api/cifar_resnet/README.md b/examples/tutorial/new_api/cifar_resnet/README.md new file mode 100644 index 000000000000..4ed86aa7a0ad --- /dev/null +++ b/examples/tutorial/new_api/cifar_resnet/README.md @@ -0,0 +1,56 @@ +# Train ResNet on CIFAR-10 from scratch + +## 🚀 Quick Start + +This example provides a training script and an evaluation script. The training script provides an example of training ResNet on CIFAR10 dataset from scratch. + +- Training Arguments + - `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `low_level_zero`. Defaults to `torch_ddp`. + - `-r`, `--resume`: Resume from checkpoint file path. Defaults to `-1`, which means not resuming. + - `-c`, `--checkpoint`: The folder to save checkpoints. Defaults to `./checkpoint`. 
+ - `-i`, `--interval`: Epoch interval to save checkpoints. Defaults to `5`. If set to `0`, no checkpoint will be saved. + - `--target_acc`: Target accuracy. Raise exception if not reached. Defaults to `None`. + +- Eval Arguments + - `-e`, `--epoch`: select the epoch to evaluate + - `-c`, `--checkpoint`: the folder where checkpoints are found + +### Install requirements + +```bash +pip install -r requirements.txt +``` + +### Train + +```bash +# train with torch DDP with fp32 +colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp32 + +# train with torch DDP with mixed precision training +colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp16 -p torch_ddp_fp16 + +# train with low level zero +colossalai run --nproc_per_node 2 train.py -c ./ckpt-low_level_zero -p low_level_zero +``` + +### Eval + +```bash +# evaluate fp32 training +python eval.py -c ./ckpt-fp32 -e 80 + +# evaluate fp16 mixed precision training +python eval.py -c ./ckpt-fp16 -e 80 + +# evaluate low level zero training +python eval.py -c ./ckpt-low_level_zero -e 80 +``` + +Expected accuracy performance will be: + +| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero | +| --------- | ------------------------ | --------------------- | --------------------- | ---------------------- | +| ResNet-18 | 85.85% | 84.91% | 85.46% | 84.50% | + +**Note: the baseline is adapted from the [script](https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter03_intermediate/3_2_2_cnn_resnet_cifar10/) to use `torchvision.models.resnet18`** diff --git a/examples/tutorial/new_api/torch_ddp/eval.py b/examples/tutorial/new_api/cifar_resnet/eval.py similarity index 100% rename from examples/tutorial/new_api/torch_ddp/eval.py rename to examples/tutorial/new_api/cifar_resnet/eval.py diff --git a/examples/tutorial/new_api/cifar_resnet/requirements.txt b/examples/tutorial/new_api/cifar_resnet/requirements.txt new file mode 100644 index 000000000000..85522f4129c4 --- /dev/null +++ b/examples/tutorial/new_api/cifar_resnet/requirements.txt @@ -0,0 +1,4 @@ +colossalai +torch +torchvision +tqdm diff --git a/examples/tutorial/new_api/cifar_resnet/test_ci.sh b/examples/tutorial/new_api/cifar_resnet/test_ci.sh new file mode 100755 index 000000000000..3954b84ff1ba --- /dev/null +++ b/examples/tutorial/new_api/cifar_resnet/test_ci.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -xe + +export DATA=/data/scratch/cifar-10 + +pip install -r requirements.txt + +for plugin in "torch_ddp" "torch_ddp_fp16" "low_level_zero"; do + colossalai run --nproc_per_node 4 train.py --interval 0 --target_acc 0.84 --plugin $plugin +done diff --git a/examples/tutorial/new_api/cifar_resnet/train.py b/examples/tutorial/new_api/cifar_resnet/train.py new file mode 100644 index 000000000000..e64e95fc2baf --- /dev/null +++ b/examples/tutorial/new_api/cifar_resnet/train.py @@ -0,0 +1,210 @@ +import argparse +import os +from pathlib import Path + +import torch +import torch.distributed as dist +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms +from torch.optim import Optimizer +from torch.optim.lr_scheduler import MultiStepLR +from torch.utils.data import DataLoader +from tqdm import tqdm + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.booster.plugin.dp_plugin_base import DPPluginBase +from colossalai.cluster import DistCoordinator +from colossalai.nn.optimizer import HybridAdam +from 
colossalai.utils import get_current_device
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 80
+LEARNING_RATE = 1e-3
+
+
+def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
+    # transform
+    transform_train = transforms.Compose(
+        [transforms.Pad(4),
+         transforms.RandomHorizontalFlip(),
+         transforms.RandomCrop(32),
+         transforms.ToTensor()])
+    transform_test = transforms.ToTensor()
+
+    # CIFAR-10 dataset
+    data_path = os.environ.get('DATA', './data')
+    with coordinator.priority_execution():
+        train_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                     train=True,
+                                                     transform=transform_train,
+                                                     download=True)
+        test_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                    train=False,
+                                                    transform=transform_test,
+                                                    download=True)
+
+    # Data loader
+    train_dataloader = plugin.prepare_train_dataloader(train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=True,
+                                                       drop_last=True)
+    test_dataloader = plugin.prepare_train_dataloader(test_dataset,
+                                                      batch_size=batch_size,
+                                                      shuffle=False,
+                                                      drop_last=False)
+    return train_dataloader, test_dataloader
+
+
+@torch.no_grad()
+def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
+    model.eval()
+    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    for images, labels in test_dataloader:
+        images = images.cuda()
+        labels = labels.cuda()
+        outputs = model(images)
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+    dist.all_reduce(correct)
+    dist.all_reduce(total)
+    accuracy = correct.item() / total.item()
+    if coordinator.is_master():
+        print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
+    return accuracy
+
+
+def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
+                booster: Booster, coordinator: DistCoordinator):
+    model.train()
+    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
+        for images, labels in pbar:
+            images = images.cuda()
+            labels = labels.cuda()
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            optimizer.zero_grad()
+
+            # Print log info
+            pbar.set_postfix({'loss': loss.item()})
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    # FIXME(ver217): gemini does not support resnet yet
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
+    parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
+    parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
+    parser.add_argument('--target_acc',
+                        type=float,
+                        default=None,
+                        help="target accuracy. Raise exception if not reached")
+    args = parser.parse_args()
+
+    # ==============================
+    # Prepare Checkpoint Directory
+    # ==============================
+    if args.interval > 0:
+        Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
+
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={})
+    coordinator = DistCoordinator()
+
+    # update the learning rate with linear scaling
+    # old_gpu_num / old_lr = new_gpu_num / new_lr
+    global LEARNING_RATE
+    LEARNING_RATE *= coordinator.world_size
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+    train_dataloader, test_dataloader = build_dataloader(100, coordinator, plugin)
+
+    # ====================================
+    # Prepare model, optimizer, criterion
+    # ====================================
+    # resnet18
+    model = torchvision.models.resnet18(num_classes=10)
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
+
+    # lr scheduler
+    lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
+                                                                 optimizer,
+                                                                 criterion=criterion,
+                                                                 lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Resume from checkpoint
+    # ==============================
+    if args.resume >= 0:
+        booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
+        booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
+        booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
+
+    # ==============================
+    # Train model
+    # ==============================
+    start_epoch = args.resume if args.resume >= 0 else 0
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
+        lr_scheduler.step()
+
+        # save checkpoint
+        if args.interval > 0 and (epoch + 1) % args.interval == 0:
+            booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
+            booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
+            booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
+
+    accuracy = evaluate(model, test_dataloader, coordinator)
+    if args.target_acc is not None:
+        assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/tutorial/new_api/cifar_vit/README.md b/examples/tutorial/new_api/cifar_vit/README.md
new file mode 100644
index 000000000000..fa76447c508f
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/README.md
@@ -0,0 +1,37 @@
+# Train ViT on CIFAR-10 from scratch
+
+## 🚀 Quick Start
+
+This example provides
a training script that trains ViT on the CIFAR-10 dataset from scratch.
+
+- Training Arguments
+  - `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `low_level_zero`. Defaults to `torch_ddp`.
+  - `-r`, `--resume`: Resume from checkpoint file path. Defaults to `-1`, which means not resuming.
+  - `-c`, `--checkpoint`: The folder to save checkpoints. Defaults to `./checkpoint`.
+  - `-i`, `--interval`: Epoch interval to save checkpoints. Defaults to `5`. If set to `0`, no checkpoint will be saved.
+  - `--target_acc`: Target accuracy. Raise exception if not reached. Defaults to `None`.
+
+### Install requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### Train
+
+```bash
+# train with torch DDP with fp32
+colossalai run --nproc_per_node 4 train.py -c ./ckpt-fp32
+
+# train with torch DDP with mixed precision training
+colossalai run --nproc_per_node 4 train.py -c ./ckpt-fp16 -p torch_ddp_fp16
+
+# train with low level zero
+colossalai run --nproc_per_node 4 train.py -c ./ckpt-low_level_zero -p low_level_zero
+```
+
+Expected accuracy performance will be:
+
+| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero |
+| ----- | ------------------------ | --------------------- | --------------------- | ---------------------- |
+| ViT   | 83.00%                   | 84.03%                | 84.00%                | 84.43%                 |
diff --git a/examples/tutorial/new_api/cifar_vit/requirements.txt b/examples/tutorial/new_api/cifar_vit/requirements.txt
new file mode 100644
index 000000000000..6d53ce7b5a7d
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/requirements.txt
@@ -0,0 +1,5 @@
+colossalai
+timm
+torch
+torchvision
+tqdm
diff --git a/examples/tutorial/new_api/cifar_vit/test_ci.sh b/examples/tutorial/new_api/cifar_vit/test_ci.sh
new file mode 100755
index 000000000000..43239d400586
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/test_ci.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -xe
+
+export DATA=/data/scratch/cifar-10
+
+pip install -r requirements.txt
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "low_level_zero"; do
+    colossalai run --nproc_per_node 4 train.py --interval 0 --target_acc 0.83 --plugin $plugin
+done
diff --git a/examples/tutorial/new_api/cifar_vit/train.py b/examples/tutorial/new_api/cifar_vit/train.py
new file mode 100644
index 000000000000..fee53df07086
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/train.py
@@ -0,0 +1,225 @@
+import argparse
+import os
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+from timm.models.vision_transformer import _cfg, _create_vision_transformer
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
+from colossalai.cluster import DistCoordinator
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 60
+WARMUP_EPOCHS = 5
+LEARNING_RATE = 1e-3
+
+
+def vit_cifar(**kwargs):
+    pretrained_cfg = _cfg(num_classes=10, input_size=(3, 32, 32), crop_pct=1.0)
+    model_kwargs = dict(patch_size=4, embed_dim=512, depth=6, num_heads=8, drop_rate=0.1, mlp_ratio=1.0, **kwargs)
+    model = _create_vision_transformer('vit_cifar', pretrained_cfg=pretrained_cfg, **model_kwargs)
+    return model
+
+
+def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
+    # transform
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(32, padding=4),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(32),
+        transforms.ToTensor(),
+        transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768)),
+    ])
+
+    # CIFAR-10 dataset
+    data_path = os.environ.get('DATA', './data')
+    with coordinator.priority_execution():
+        train_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                     train=True,
+                                                     transform=transform_train,
+                                                     download=True)
+        test_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                    train=False,
+                                                    transform=transform_test,
+                                                    download=True)
+
+    # Data loader
+    train_dataloader = plugin.prepare_train_dataloader(train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=True,
+                                                       drop_last=True)
+    test_dataloader = plugin.prepare_train_dataloader(test_dataset,
+                                                      batch_size=batch_size,
+                                                      shuffle=False,
+                                                      drop_last=False)
+    return train_dataloader, test_dataloader
+
+
+@torch.no_grad()
+def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
+    model.eval()
+    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    for images, labels in test_dataloader:
+        images = images.cuda()
+        labels = labels.cuda()
+        outputs = model(images)
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+    dist.all_reduce(correct)
+    dist.all_reduce(total)
+    accuracy = correct.item() / total.item()
+    if coordinator.is_master():
+        print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
+    return accuracy
+
+
+def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
+                booster: Booster, coordinator: DistCoordinator):
+    model.train()
+    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
+        for images, labels in pbar:
+            images = images.cuda()
+            labels = labels.cuda()
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            optimizer.zero_grad()
+
+            # Print log info
+            pbar.set_postfix({'loss': loss.item()})
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    # FIXME(ver217): gemini does not support this model yet
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
+    parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
+    parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
+    parser.add_argument('--target_acc',
+                        type=float,
+                        default=None,
+                        help="target accuracy. Raise exception if not reached")
+    args = parser.parse_args()
+
+    # ==============================
+    # Prepare Checkpoint Directory
+    # ==============================
+    if args.interval > 0:
+        Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
+
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={})
+    coordinator = DistCoordinator()
+
+    # update the learning rate with linear scaling
+    # old_gpu_num / old_lr = new_gpu_num / new_lr
+    global LEARNING_RATE
+    LEARNING_RATE *= coordinator.world_size
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+    train_dataloader, test_dataloader = build_dataloader(512, coordinator, plugin)
+
+    # ====================================
+    # Prepare model, optimizer, criterion
+    # ====================================
+    # build the ViT model defined above
+    model = vit_cifar()
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
+
+    # lr scheduler
+    lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCHS)
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model,
+                                                                                optimizer,
+                                                                                criterion=criterion,
+                                                                                dataloader=train_dataloader,
+                                                                                lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Resume from checkpoint
+    # ==============================
+    if args.resume >= 0:
+        booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
+        booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
+        booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
+
+    # ==============================
+    # Train model
+    # ==============================
+    start_epoch = args.resume if args.resume >= 0 else 0
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
+        lr_scheduler.step()
+
+        # save checkpoint
+        if args.interval > 0 and (epoch + 1) % args.interval == 0:
+            booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
+            booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
+            booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
+
+    accuracy = evaluate(model, test_dataloader, coordinator)
+    if args.target_acc is not None:
+        assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/tutorial/new_api/glue_bert/README.md b/examples/tutorial/new_api/glue_bert/README.md
index d65c7705b5da..0030eead9f5b 100644
--- a/examples/tutorial/new_api/glue_bert/README.md
+++ b/examples/tutorial/new_api/glue_bert/README.md
@@ -10,6 +10,12 @@ This example provides
a training script, which provides an example of finetuning - `--target_f1`: Target f1 score. Raise exception if not reached. Defaults to `None`. +### Install requirements + +```bash +pip install -r requirements.txt +``` + ### Train ```bash diff --git a/examples/tutorial/new_api/glue_bert/requirements.txt b/examples/tutorial/new_api/glue_bert/requirements.txt new file mode 100644 index 000000000000..950c2d378f08 --- /dev/null +++ b/examples/tutorial/new_api/glue_bert/requirements.txt @@ -0,0 +1,7 @@ +colossalai +datasets +torch +tqdm +transformers +scipy +scikit-learn diff --git a/examples/tutorial/new_api/glue_bert/test_ci.sh b/examples/tutorial/new_api/glue_bert/test_ci.sh index f8c2dfbe9eb9..c2c097f8d026 100755 --- a/examples/tutorial/new_api/glue_bert/test_ci.sh +++ b/examples/tutorial/new_api/glue_bert/test_ci.sh @@ -1,6 +1,8 @@ #!/bin/bash set -xe +pip install -r requirements.txt + for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin done diff --git a/examples/tutorial/new_api/test_ci.sh b/examples/tutorial/new_api/test_ci.sh index 8b4475e9f147..a08844dbe5fa 100644 --- a/examples/tutorial/new_api/test_ci.sh +++ b/examples/tutorial/new_api/test_ci.sh @@ -1,2 +1,6 @@ -#!/usr/bin/env -echo "The CI integration will be completed when the API is stable" +#!/bin/bash +set -xe + +# FIXME(ver217): only run bert finetune to save time + +cd glue_bert && bash ./test_ci.sh && cd .. diff --git a/examples/tutorial/new_api/torch_ddp/README.md b/examples/tutorial/new_api/torch_ddp/README.md deleted file mode 100644 index e120bacb0c84..000000000000 --- a/examples/tutorial/new_api/torch_ddp/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Distributed Data Parallel - -## 🚀 Quick Start - -This example provides a training script and an evaluation script. The training script provides an example of training ResNet on CIFAR10 dataset from scratch. 
- -- Training Arguments - - `-r`, `--resume`: resume from checkpoint file path - - `-c`, `--checkpoint`: the folder to save checkpoints - - `-i`, `--interval`: epoch interval to save checkpoints - - `-f`, `--fp16`: use fp16 - -- Eval Arguments - - `-e`, `--epoch`: select the epoch to evaluate - - `-c`, `--checkpoint`: the folder where checkpoints are found - - -### Train - -```bash -# train with torch DDP with fp32 -colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp32 - -# train with torch DDP with mixed precision training -colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp16 --fp16 -``` - -### Eval - -```bash -# evaluate fp32 training -python eval.py -c ./ckpt-fp32 -e 80 - -# evaluate fp16 mixed precision training -python eval.py -c ./ckpt-fp16 -e 80 -``` - -Expected accuracy performance will be: - -| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | -| --------- | ------------------------ | --------------------- | --------------------- | -| ResNet-18 | 85.85% | 85.03% | 85.12% | - -**Note: the baseline is adapted from the [script](https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter03_intermediate/3_2_2_cnn_resnet_cifar10/) to use `torchvision.models.resnet18`** diff --git a/examples/tutorial/new_api/torch_ddp/train.py b/examples/tutorial/new_api/torch_ddp/train.py deleted file mode 100644 index 4741c3151cbb..000000000000 --- a/examples/tutorial/new_api/torch_ddp/train.py +++ /dev/null @@ -1,128 +0,0 @@ -import argparse -from pathlib import Path - -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms -from torch.optim.lr_scheduler import MultiStepLR - -import colossalai -from colossalai.booster import Booster -from colossalai.booster.plugin import TorchDDPPlugin -from colossalai.cluster import DistCoordinator - -# ============================== -# Parse Arguments -# ============================== -parser = argparse.ArgumentParser() -parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint") -parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory") -parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint") -parser.add_argument('-f', '--fp16', action='store_true', help="use fp16") -args = parser.parse_args() - -# ============================== -# Prepare Checkpoint Directory -# ============================== -Path(args.checkpoint).mkdir(parents=True, exist_ok=True) - -# ============================== -# Prepare Hyperparameters -# ============================== -NUM_EPOCHS = 80 -LEARNING_RATE = 1e-3 -START_EPOCH = args.resume if args.resume >= 0 else 0 - -# ============================== -# Launch Distributed Environment -# ============================== -colossalai.launch_from_torch(config={}) -coordinator = DistCoordinator() - -# update the learning rate with linear scaling -# old_gpu_num / old_lr = new_gpu_num / new_lr -LEARNING_RATE *= coordinator.world_size - -# ============================== -# Prepare Booster -# ============================== -plugin = TorchDDPPlugin() -if args.fp16: - booster = Booster(mixed_precision='fp16', plugin=plugin) -else: - booster = Booster(plugin=plugin) - -# ============================== -# Prepare Train Dataset -# ============================== -transform = transforms.Compose( - [transforms.Pad(4), - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32), - transforms.ToTensor()]) - -# CIFAR-10 dataset -with 
coordinator.priority_execution(): - train_dataset = torchvision.datasets.CIFAR10(root='./data/', train=True, transform=transform, download=True) - -# ==================================== -# Prepare model, optimizer, criterion -# ==================================== -# resent50 -model = torchvision.models.resnet18(num_classes=10).cuda() - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) - -# lr scheduler -lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3) - -# prepare dataloader with torch ddp plugin -train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=100, shuffle=True) - -# ============================== -# Resume from checkpoint -# ============================== -if args.resume >= 0: - booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth') - booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth') - booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth') - -# ============================== -# Boost with ColossalAI -# ============================== -model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model, optimizer, criterion, - train_dataloader, lr_scheduler) - -# ============================== -# Train model -# ============================== -total_step = len(train_dataloader) - -for epoch in range(START_EPOCH, NUM_EPOCHS): - for i, (images, labels) in enumerate(train_dataloader): - images = images.cuda() - labels = labels.cuda() - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - booster.backward(loss, optimizer) - optimizer.step() - - if (i + 1) % 100 == 0: - print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(epoch + 1, NUM_EPOCHS, i + 1, total_step, - loss.item())) - - lr_scheduler.step() - - # save checkpoint every 5 epoch - if (epoch + 1) % args.interval == 0: - booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth') - booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth') - booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth') From 3bf09efe74589faae26918104bd4f790742a3c3b Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Mon, 8 May 2023 15:44:03 +0800 Subject: [PATCH 14/20] [booster] update prepare dataloader method for plugin (#3706) * [booster] add prepare dataloader method for plug * [booster] update examples and docstr --- colossalai/booster/plugin/dp_plugin_base.py | 20 +++++++++---------- colossalai/booster/plugin/gemini_plugin.py | 2 +- .../booster/plugin/low_level_zero_plugin.py | 2 +- colossalai/booster/plugin/plugin_base.py | 17 +++++++++++++++- colossalai/booster/plugin/torch_ddp_plugin.py | 2 +- .../tutorial/new_api/cifar_resnet/train.py | 10 ++-------- examples/tutorial/new_api/cifar_vit/train.py | 10 ++-------- examples/tutorial/new_api/glue_bert/data.py | 16 +++++++-------- .../test_plugin/test_dp_plugin_base.py | 2 +- 9 files changed, 41 insertions(+), 40 deletions(-) diff --git a/colossalai/booster/plugin/dp_plugin_base.py b/colossalai/booster/plugin/dp_plugin_base.py index 4021b31754b4..d5da5938bfd9 100644 --- a/colossalai/booster/plugin/dp_plugin_base.py +++ b/colossalai/booster/plugin/dp_plugin_base.py @@ -20,21 +20,19 @@ def __init__(self) -> None: self.rank = dist.get_rank() self.world_size = dist.get_world_size() - def prepare_train_dataloader(self, - dataset, - 
batch_size, - shuffle=False, - seed=1024, - drop_last=False, - pin_memory=False, - num_workers=0, - **kwargs): + def prepare_dataloader(self, + dataset, + batch_size, + shuffle=False, + seed=1024, + drop_last=False, + pin_memory=False, + num_workers=0, + **kwargs): r""" Prepare a dataloader for distributed training. The dataloader will be wrapped by `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`. - Note: - 1. Evaluation datasets should not be passed to this function. Args: dataset (`torch.utils.data.Dataset`): The dataset to be loaded. diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index fde8912a648f..4850b52defaf 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -156,7 +156,7 @@ class GeminiPlugin(DPPluginBase): >>> model, train_dataset, optimizer, criterion = ... >>> plugin = GeminiPlugin() - >>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8) + >>> train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=8) >>> booster = Booster(plugin=plugin) >>> model, optimizer, train_dataloader, criterion = booster.boost(model, optimizer, train_dataloader, criterion) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 828d8b27422f..f0f5768560a7 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -95,7 +95,7 @@ class LowLevelZeroPlugin(DPPluginBase): >>> model, train_dataset, optimizer, criterion = ... >>> plugin = LowLevelZeroPlugin() - >>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8) + >>> train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=8) >>> booster = Booster(plugin=plugin) >>> model, optimizer, train_dataloader, criterion = booster.boost(model, optimizer, train_dataloader, criterion) diff --git a/colossalai/booster/plugin/plugin_base.py b/colossalai/booster/plugin/plugin_base.py index 7a222022c1b2..eb5478595542 100644 --- a/colossalai/booster/plugin/plugin_base.py +++ b/colossalai/booster/plugin/plugin_base.py @@ -4,7 +4,7 @@ import torch.nn as nn from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler as LRScheduler -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Dataset from colossalai.checkpoint_io import CheckpointIO from colossalai.interface import OptimizerWrapper @@ -59,3 +59,18 @@ def get_checkpoint_io(self) -> CheckpointIO: Get checkpoint io object for this plugin, only invoked when control_checkpoint_io is True. """ pass + + @abstractmethod + def prepare_dataloader(self, + dataset: Dataset, + batch_size: int, + shuffle: bool = False, + seed: int = 1024, + drop_last: bool = False, + pin_memory: bool = False, + num_workers: int = 0, + **kwargs): + """Prepare a dataloader for distributed training. The dataloader will be wrapped by + `torch.utils.data.DataLoader` + """ + pass diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py index d30d266c0048..76906d844ef1 100644 --- a/colossalai/booster/plugin/torch_ddp_plugin.py +++ b/colossalai/booster/plugin/torch_ddp_plugin.py @@ -72,7 +72,7 @@ class TorchDDPPlugin(DPPluginBase): >>> model, train_dataset, optimizer, criterion = ... 
>>> plugin = TorchDDPPlugin() - >>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8) + >>> train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=8) >>> booster = Booster(plugin=plugin) >>> model, optimizer, train_dataloader, criterion = booster.boost(model, optimizer, train_dataloader, criterion) diff --git a/examples/tutorial/new_api/cifar_resnet/train.py b/examples/tutorial/new_api/cifar_resnet/train.py index e64e95fc2baf..a96a4b640a22 100644 --- a/examples/tutorial/new_api/cifar_resnet/train.py +++ b/examples/tutorial/new_api/cifar_resnet/train.py @@ -49,14 +49,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl download=True) # Data loader - train_dataloader = plugin.prepare_train_dataloader(train_dataset, - batch_size=batch_size, - shuffle=True, - drop_last=True) - test_dataloader = plugin.prepare_train_dataloader(test_dataset, - batch_size=batch_size, - shuffle=False, - drop_last=False) + train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) + test_dataloader = plugin.prepare_dataloader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False) return train_dataloader, test_dataloader diff --git a/examples/tutorial/new_api/cifar_vit/train.py b/examples/tutorial/new_api/cifar_vit/train.py index fee53df07086..2405fdfc60d5 100644 --- a/examples/tutorial/new_api/cifar_vit/train.py +++ b/examples/tutorial/new_api/cifar_vit/train.py @@ -63,14 +63,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl download=True) # Data loader - train_dataloader = plugin.prepare_train_dataloader(train_dataset, - batch_size=batch_size, - shuffle=True, - drop_last=True) - test_dataloader = plugin.prepare_train_dataloader(test_dataset, - batch_size=batch_size, - shuffle=False, - drop_last=False) + train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) + test_dataloader = plugin.prepare_dataloader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False) return train_dataloader, test_dataloader diff --git a/examples/tutorial/new_api/glue_bert/data.py b/examples/tutorial/new_api/glue_bert/data.py index e43312aebc7c..981cedcca8c2 100644 --- a/examples/tutorial/new_api/glue_bert/data.py +++ b/examples/tutorial/new_api/glue_bert/data.py @@ -84,26 +84,26 @@ def prepare_data(self): AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) def train_dataloader(self): - return self.plugin.prepare_train_dataloader(self.dataset["train"], - batch_size=self.train_batch_size, - shuffle=True, - drop_last=True) + return self.plugin.prepare_dataloader(self.dataset["train"], + batch_size=self.train_batch_size, + shuffle=True, + drop_last=True) def val_dataloader(self): if len(self.eval_splits) == 1: - return self.plugin.prepare_train_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size) + return self.plugin.prepare_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size) elif len(self.eval_splits) > 1: return [ - self.plugin.prepare_train_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits ] def test_dataloader(self): if len(self.eval_splits) == 1: - return self.plugin.prepare_train_dataloader(self.dataset["test"], batch_size=self.eval_batch_size) + return self.plugin.prepare_dataloader(self.dataset["test"], 
batch_size=self.eval_batch_size) elif len(self.eval_splits) > 1: return [ - self.plugin.prepare_train_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits ] diff --git a/tests/test_booster/test_plugin/test_dp_plugin_base.py b/tests/test_booster/test_plugin/test_dp_plugin_base.py index a2b94ba6ca81..eab949828db9 100644 --- a/tests/test_booster/test_plugin/test_dp_plugin_base.py +++ b/tests/test_booster/test_plugin/test_dp_plugin_base.py @@ -55,7 +55,7 @@ def check_dataloader_sharding(): # create a custom dasetset with 0 to 10 dataset = TensorDataset(torch.arange(0, 10)) - train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2) + train_dataloader = plugin.prepare_dataloader(dataset, batch_size=2) # get the first batch of data batch = next(iter(train_dataloader))[0].cuda() From 6552cbf8e1bf0fb60e189bdcc2467e07f4c1f08e Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Tue, 9 May 2023 11:10:02 +0800 Subject: [PATCH 15/20] [booster] fix no_sync method (#3709) * [booster] fix no_sync method * [booster] add test for ddp no_sync * [booster] fix merge * [booster] update unit test * [booster] update unit test * [booster] update unit test --- colossalai/booster/plugin/gemini_plugin.py | 5 +- .../booster/plugin/low_level_zero_plugin.py | 5 +- colossalai/booster/plugin/plugin_base.py | 9 ++- colossalai/booster/plugin/torch_ddp_plugin.py | 6 +- .../test_plugin/test_dp_plugin_base.py | 5 +- .../test_plugin/test_torch_ddp_plugin.py | 60 +++++++++++++++++++ 6 files changed, 85 insertions(+), 5 deletions(-) diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index 4850b52defaf..a3789a39d94b 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -2,7 +2,7 @@ import os import warnings from pathlib import Path -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Iterator, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -286,3 +286,6 @@ def control_checkpoint_io(self) -> bool: def get_checkpoint_io(self) -> CheckpointIO: return GeminiCheckpointIO() + + def no_sync(self, model: nn.Module) -> Iterator[None]: + raise NotImplementedError diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index f0f5768560a7..edc0b7679686 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Iterator, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -197,3 +197,6 @@ def control_checkpoint_io(self) -> bool: def get_checkpoint_io(self) -> CheckpointIO: return LowLevelZeroCheckpointIO() + + def no_sync(self, model: nn.Module) -> Iterator[None]: + raise NotImplementedError diff --git a/colossalai/booster/plugin/plugin_base.py b/colossalai/booster/plugin/plugin_base.py index eb5478595542..561f58bc5570 100644 --- a/colossalai/booster/plugin/plugin_base.py +++ b/colossalai/booster/plugin/plugin_base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Callable, List, Tuple, Union +from typing import Callable, Iterator, List, Tuple, Union import torch.nn as nn from torch.optim import Optimizer @@ -60,6 +60,13 @@ def get_checkpoint_io(self) -> CheckpointIO: """ pass + 
@abstractmethod + def no_sync(self, model: nn.Module) -> Iterator[None]: + """ + Context manager to disable gradient synchronization. + """ + pass + @abstractmethod def prepare_dataloader(self, dataset: Dataset, diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py index 76906d844ef1..99cd2f7791d3 100644 --- a/colossalai/booster/plugin/torch_ddp_plugin.py +++ b/colossalai/booster/plugin/torch_ddp_plugin.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, Iterator, List, Tuple, Union import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP @@ -142,3 +142,7 @@ def control_checkpoint_io(self) -> bool: def get_checkpoint_io(self) -> CheckpointIO: return TorchDDPCheckpointIO() + + def no_sync(self, model: nn.Module) -> Iterator[None]: + assert isinstance(model, TorchDDPModel), 'Model is not boosted by TorchDDPPlugin.' + return model.module.no_sync() diff --git a/tests/test_booster/test_plugin/test_dp_plugin_base.py b/tests/test_booster/test_plugin/test_dp_plugin_base.py index eab949828db9..61aeded12203 100644 --- a/tests/test_booster/test_plugin/test_dp_plugin_base.py +++ b/tests/test_booster/test_plugin/test_dp_plugin_base.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Tuple, Union +from typing import Callable, Iterator, List, Tuple, Union import torch import torch.distributed as dist @@ -49,6 +49,9 @@ def supported_devices(self) -> List[str]: def supported_precisions(self) -> List[str]: pass + def no_sync(self, model: nn.Module) -> Iterator[None]: + pass + def check_dataloader_sharding(): plugin = DPPluginWrapper() diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py index 30c4db12309f..fbe44e5ce6fb 100644 --- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py @@ -1,5 +1,8 @@ +from contextlib import nullcontext + import torch import torch.distributed as dist +import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import SGD @@ -44,10 +47,67 @@ def check_torch_ddp_plugin(): torch.cuda.empty_cache() +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.weight = nn.Parameter(torch.rand(1)) + + def forward(self, x): + return self.weight * x + + +def check_torch_ddp_no_sync(): + plugin = TorchDDPPlugin() + booster = Booster(plugin=plugin) + + model = DummyModel() + criterion = lambda x: x.mean() + optimizer = SGD(model.parameters(), lr=1e-3) + # create a custom dataset with 0 to 10 + dataset = torch.arange(0, 10) + train_dataloader = plugin.prepare_dataloader(dataset, batch_size=2) + model, optimizer, criterion, train_dataloader, _ = booster.boost(model, + optimizer, + criterion, + dataloader=train_dataloader) + + def fwd_bwd(): + output = model(batch.cuda()) + loss = criterion(output) + booster.backward(loss, optimizer) + + def get_grad_set_over_all_ranks(): + for p in model.parameters(): + # grad shape is (1, ) + assert p.grad.shape == (1,) + grad_list = [torch.empty_like(p.grad) for _ in range(dist.get_world_size())] + dist.all_gather(grad_list, p.grad) + # get grad set of all ranks + grad_set = set([grad.item() for grad in grad_list]) + # as the model only has one parameter, we can return here + return grad_set + + for i, batch in enumerate(train_dataloader): + if i > 1: + # only check the first two batches + break + # no_sync for the first batch,
sync for the second batch + ctx = booster.no_sync(model) if i == 0 else nullcontext() + with ctx: + fwd_bwd() + grad_set = get_grad_set_over_all_ranks() + # for the first batch, all ranks should have different grads + # for the second batch, as grad is synchronized, all ranks should have the same grads + target_num_different_grad = dist.get_world_size() if i == 0 else 1 + assert len(grad_set) == target_num_different_grad + + def run_dist(rank, world_size, port): # init dist env colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') check_torch_ddp_plugin() + check_torch_ddp_no_sync() @rerun_if_address_is_in_use() From 20068ba188b916982e2e67cc9ebe120ccd4eb6ce Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Wed, 10 May 2023 12:17:02 +0800 Subject: [PATCH 16/20] [booster] add tests for ddp and low level zero's checkpointio (#3715) * [booster] update tests for booster * [booster] update tests for booster * [booster] update tests for booster * [booster] update tests for booster * [booster] update tests for booster * [booster] update booster tutorials #3717, fix recursive check --- colossalai/testing/__init__.py | 11 +- colossalai/testing/comparison.py | 24 ++++ .../test_gemini_checkpoint_io.py | 98 +++++++++++++ .../test_general_checkpoint_io.py | 133 ++---------------- .../test_low_level_zero_checkpoint_io.py | 57 ++++++++ .../test_torch_ddp_checkpoint_io.py | 63 +++++++++ 6 files changed, 261 insertions(+), 125 deletions(-) create mode 100644 tests/test_checkpoint_io/test_gemini_checkpoint_io.py create mode 100644 tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py create mode 100644 tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py diff --git a/colossalai/testing/__init__.py b/colossalai/testing/__init__.py index c53e0f44c7e0..9d0475ed064c 100644 --- a/colossalai/testing/__init__.py +++ b/colossalai/testing/__init__.py @@ -1,4 +1,11 @@ -from .comparison import assert_close, assert_close_loose, assert_equal, assert_equal_in_group, assert_not_equal +from .comparison import ( + assert_close, + assert_close_loose, + assert_equal, + assert_equal_in_group, + assert_not_equal, + check_state_dict_equal, +) from .pytest_wrapper import run_on_environment_flag from .utils import ( clear_cache_before_run, @@ -13,5 +20,5 @@ __all__ = [ 'assert_equal', 'assert_not_equal', 'assert_close', 'assert_close_loose', 'assert_equal_in_group', 'parameterize', 'rerun_on_exception', 'rerun_if_address_is_in_use', 'skip_if_not_enough_gpus', 'free_port', 'spawn', - 'clear_cache_before_run', 'run_on_environment_flag' + 'clear_cache_before_run', 'run_on_environment_flag', 'check_state_dict_equal' ] diff --git a/colossalai/testing/comparison.py b/colossalai/testing/comparison.py index e00d0da168c7..faf61638d8bb 100644 --- a/colossalai/testing/comparison.py +++ b/colossalai/testing/comparison.py @@ -1,3 +1,5 @@ +from typing import OrderedDict + import torch import torch.distributed as dist from torch import Tensor @@ -28,3 +30,25 @@ def assert_equal_in_group(tensor: Tensor, process_group: ProcessGroup = None): a = tensor_list[i] b = tensor_list[i + 1] assert torch.all(a == b), f'expected tensors on rank {i} and {i + 1} to be equal but they are not, {a} vs {b}' + + +def check_state_dict_equal(d1: OrderedDict, d2: OrderedDict, ignore_device: bool = True): + for k, v in d1.items(): + if isinstance(v, dict): + check_state_dict_equal(v, d2[k]) + elif isinstance(v, list): + for i in range(len(v)): + if isinstance(v[i], torch.Tensor): + if not
ignore_device: + v[i] = v[i].to("cpu") + d2[k][i] = d2[k][i].to("cpu") + assert torch.equal(v[i], d2[k][i]) + else: + assert v[i] == d2[k][i] + elif isinstance(v, torch.Tensor): + if not ignore_device: + v = v.to("cpu") + d2[k] = d2[k].to("cpu") + assert torch.equal(v, d2[k]) + else: + assert v == d2[k] diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py new file mode 100644 index 000000000000..1e5a2e1c4b44 --- /dev/null +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -0,0 +1,98 @@ +import tempfile + +import pytest +import torch + +import colossalai +from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO +from colossalai.testing import check_state_dict_equal, parameterize, rerun_if_address_is_in_use, spawn +from colossalai.utils.cuda import get_current_device +from colossalai.zero import ColoInitContext, ZeroDDP +from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration +from colossalai.zero.gemini.gemini_mgr import GeminiManager +from tests.components_to_test.registry import non_distributed_component_funcs + + +@parameterize('placement_policy', ['cuda', 'cpu']) +@parameterize('model_name', ['bert']) +@parameterize('use_safetensors', [True, False]) +def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: bool): + from transformers import BertForSequenceClassification + + model_ckpt_dir = tempfile.TemporaryDirectory() + get_components_func = non_distributed_component_funcs.get_callable(model_name) + model_builder, *_ = get_components_func() + with ColoInitContext(device=(get_current_device())): + bert_model = model_builder() + bert_model.config.save_pretrained(save_directory=(model_ckpt_dir.name)) + + config_dict, *_ = search_chunk_configuration(bert_model, search_range_mb=1, search_interval_byte=100) + chunk_manager = ChunkManager(config_dict) + gemini_manager = GeminiManager(placement_policy, chunk_manager) + bert_model = ZeroDDP(bert_model, gemini_manager) + bert_model.train() + + ckpt_io = GeminiCheckpointIO() + if ckpt_io.coordinator.is_master(): + model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2 + ckpt_io.save_model(bert_model, (model_ckpt_dir.name), + True, + True, + '', (model_size / 3), + use_safetensors=use_safetensors) + new_bert_model = BertForSequenceClassification.from_pretrained(model_ckpt_dir.name) + check_state_dict_equal(bert_model.state_dict(only_rank_0=True, dtype=(torch.float32)), + new_bert_model.state_dict(), False) + model_ckpt_dir.cleanup() + + +@parameterize('placement_policy', ['cuda', 'cpu']) +@parameterize('model_name', ['gpt2', 'bert']) +@parameterize('use_safetensors', [True, False]) +def exam_state_dict(placement_policy, model_name: str, use_safetensors: bool): + get_components_func = non_distributed_component_funcs.get_callable(model_name) + model_builder, *_ = get_components_func() + with ColoInitContext(device=(get_current_device())): + model = model_builder() + new_model = model_builder() + config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100) + chunk_manager = ChunkManager(config_dict) + gemini_manager = GeminiManager(placement_policy, chunk_manager) + model = ZeroDDP(model, gemini_manager) + + model.train() + #new model + new_config_dict, *_ = search_chunk_configuration(new_model, search_range_mb=1, search_interval_byte=100) + new_chunk_manager = ChunkManager(new_config_dict) + new_gemini_manager = 
GeminiManager(placement_policy, new_chunk_manager) + new_model = ZeroDDP(new_model, new_gemini_manager) + + model_ckpt_dir = tempfile.TemporaryDirectory() + ckpt_io = GeminiCheckpointIO() + model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2 + ckpt_io.save_model(model, (model_ckpt_dir.name), + True, + True, + 'epoch', (model_size / 3), + use_safetensors=use_safetensors) + + if ckpt_io.coordinator.is_master(): + ckpt_io.load_model(new_model, (model_ckpt_dir.name), strict=True) + model_dict = model.state_dict(only_rank_0=True) + new_model_dict = new_model.state_dict(only_rank_0=True) + check_state_dict_equal(model_dict, new_model_dict, False) + model_ckpt_dir.cleanup() + + +def run_dist(rank, world_size, port): + config = {} + colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + exam_state_dict() + exam_state_dict_with_origin() + + +@pytest.mark.dist +@pytest.mark.parametrize('world_size', [4, 4]) +@rerun_if_address_is_in_use() +def test_gemini_ckpIO(world_size): + spawn(run_dist, world_size) diff --git a/tests/test_checkpoint_io/test_general_checkpoint_io.py b/tests/test_checkpoint_io/test_general_checkpoint_io.py index 752ca706bfd4..9e973bb23e0b 100644 --- a/tests/test_checkpoint_io/test_general_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_general_checkpoint_io.py @@ -1,20 +1,13 @@ import tempfile + import pytest import torch from torch.optim import Adam from torchvision.models import resnet18 -from colossalai.checkpoint_io import GeneralCheckpointIO from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO -from colossalai.testing import clear_cache_before_run, parameterize - -import colossalai -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from colossalai.utils.cuda import get_current_device -from colossalai.zero import ColoInitContext, ZeroDDP -from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration -from colossalai.zero.gemini.gemini_mgr import GeminiManager -from tests.components_to_test.registry import non_distributed_component_funcs +from colossalai.checkpoint_io import GeneralCheckpointIO +from colossalai.testing import check_state_dict_equal, clear_cache_before_run, parameterize # ======== # Note: @@ -61,10 +54,10 @@ def test_unsharded_checkpoint(use_safetensors: bool): ckpt_io.load_model(new_model, model_ckpt_tempfile.name) ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) - # check for model and optimizer state dict recursively - recursive_check(model.state_dict(), new_model.state_dict()) - recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) + check_state_dict_equal(model.state_dict(), new_model.state_dict()) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict()) + @pytest.mark.parametrize('use_safetensors', [True, False]) def test_sharded_checkpoint(use_safetensors: bool): @@ -87,7 +80,7 @@ def test_sharded_checkpoint(use_safetensors: bool): else: suffix = ".bin" WEIGHTS_INDEX_NAME = "model.bin.index.json" - + model_ckpt_dir = tempfile.TemporaryDirectory() optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile() @@ -96,7 +89,7 @@ def test_sharded_checkpoint(use_safetensors: bool): ckpt_io.save_model(model, model_ckpt_dir.name, True, True, "", 10, use_safetensors=use_safetensors) ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name, shard=False) - + # create new model new_model = resnet18() new_optimizer = Adam(new_model.parameters(), 
lr=0.001) @@ -105,111 +98,5 @@ def test_sharded_checkpoint(use_safetensors: bool): ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) # check for model and optimizer state dict recursively - recursive_check(model.state_dict(), new_model.state_dict()) - recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) - -@parameterize('placement_policy', ['cuda', 'cpu']) -@parameterize('model_name', ['bert']) -@parameterize('use_safetensors', [True, False]) -def hf_load_colossalai_checkpoint(placement_policy, model_name, use_safetensors: bool): - from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, BertForSequenceClassification - - model_ckpt_dir = tempfile.TemporaryDirectory() - get_components_func = non_distributed_component_funcs.get_callable(model_name) - model_builder, *_ = get_components_func() - - with ColoInitContext(device=get_current_device()): - bert_model = model_builder() - bert_model.config.save_pretrained(save_directory=model_ckpt_dir.name) - config_dict, *_ = search_chunk_configuration(bert_model, search_range_mb=1, search_interval_byte=100) - chunk_manager = ChunkManager(config_dict) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - bert_model = ZeroDDP(bert_model, gemini_manager) - bert_model.train() - - ckpt_io = GeminiCheckpointIO() - if ckpt_io.coordinator.is_master(): - model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2 - ckpt_io.save_model(bert_model, model_ckpt_dir.name, True, True, "", (model_size / 3), use_safetensors=use_safetensors) - new_bert_model = BertForSequenceClassification.from_pretrained(model_ckpt_dir.name) - recursive_check(bert_model.state_dict(only_rank_0=True, dtype=torch.float32), new_bert_model.state_dict()) - - model_ckpt_dir.cleanup() - - - -@parameterize('placement_policy', ['cuda', 'cpu']) -@parameterize('model_name', ['gpt2', 'bert']) -@parameterize('use_safetensors', [True, False]) -def exam_state_dict(placement_policy, model_name: str, use_safetensors: bool): - get_components_func = non_distributed_component_funcs.get_callable(model_name) - model_builder, *_ = get_components_func() - - with ColoInitContext(device=get_current_device()): - model = model_builder() - new_model = model_builder() - - config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100) - chunk_manager = ChunkManager(config_dict) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - model = ZeroDDP(model, gemini_manager) - model.train() - - new_config_dict, *_ = search_chunk_configuration(new_model, search_range_mb=1, search_interval_byte=100) - new_chunk_manager = ChunkManager(new_config_dict) - new_gemini_manager = GeminiManager(placement_policy, new_chunk_manager) - new_model = ZeroDDP(new_model, new_gemini_manager) - - model_ckpt_dir = tempfile.TemporaryDirectory() - - ckpt_io = GeminiCheckpointIO() - model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2 - ckpt_io.save_model(model, model_ckpt_dir.name, True, True, "epoch", (model_size / 3), use_safetensors=use_safetensors) - - # load model - if ckpt_io.coordinator.is_master(): - ckpt_io.load_model(new_model, model_ckpt_dir.name, strict=True) - model_dict = model.state_dict(only_rank_0=True) - new_model_dict = new_model.state_dict(only_rank_0=True) - recursive_check(model_dict, new_model_dict) - - model_ckpt_dir.cleanup() - - -def run_dist(rank, world_size, port): - config = {} - colossalai.launch(config=config, rank=rank, 
world_size=world_size, host='localhost', port=port, backend='nccl') - exam_state_dict() - hf_load_colossalai_checkpoint() - - -@pytest.mark.dist -@pytest.mark.parametrize('world_size', [4, 4]) -@rerun_if_address_is_in_use() -def test_gemini_ckpIO(world_size): - spawn(run_dist, world_size) - - -# do recursive check for the optimizer state dict -# if the value is a dict, compare its values -# if the value is a list, comapre all elements one-by-one -# if the value is a torch.Tensor, use torch.equal -# otherwise use assertEqual -def recursive_check(d1, d2): - for k, v in d1.items(): - if isinstance(v, dict): - recursive_check(v, d2[k]) - elif isinstance(v, list): - for i in range(len(v)): - if isinstance(v[i], torch.Tensor): - v[i] = v[i].to("cpu") - d2[k][i] = d2[k][i].to("cpu") - assert torch.equal(v[i], d2[k][i]) - else: - assert v[i] == d2[k][i] - elif isinstance(v, torch.Tensor): - v = v.to("cpu") - d2[k] = d2[k].to("cpu") - assert torch.equal(v, d2[k]) - else: - assert v == d2[k] + check_state_dict_equal(model.state_dict(), new_model.state_dict()) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict()) diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py new file mode 100644 index 000000000000..217a950d8155 --- /dev/null +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -0,0 +1,57 @@ +import tempfile + +import pytest +import torch +from torchvision.models import resnet18 + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import LowLevelZeroPlugin +from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroCheckpointIO +from colossalai.nn.optimizer import HybridAdam +from colossalai.testing import ( + check_state_dict_equal, + clear_cache_before_run, + parameterize, + rerun_if_address_is_in_use, + spawn, +) + + +@clear_cache_before_run() +@parameterize('stage', [2]) +def check_low_level_zero_checkpointIO(stage: int): + plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32) + booster = Booster(plugin=plugin) + model = resnet18() + criterion = lambda x: x.mean() + optimizer = HybridAdam((model.parameters()), lr=0.001) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + + x = torch.randn(4, 3, 224, 224) + x = x.to('cuda') + output = model(x) + loss = criterion(output) + booster.backward(loss, optimizer) + optimizer.step() + + optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile() + ckpt_io = LowLevelZeroCheckpointIO() + ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name) + + if ckpt_io.coordinator.is_master(): + new_model = resnet18() + new_optimizer = HybridAdam((new_model.parameters()), lr=0.001) + _, new_optimizer, _, _, _ = booster.boost(new_model, new_optimizer) + ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), False) + + +def run_dist(rank, world_size, port): + colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost') + check_low_level_zero_checkpointIO() + + +@rerun_if_address_is_in_use() +def test_low_level_zero_checkpointIO(): + spawn(run_dist, 2) diff --git a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py new file mode 100644 index 000000000000..9128f8c0fe9e --- /dev/null +++ b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py @@ 
-0,0 +1,63 @@ +import tempfile + +import torch +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.optim import SGD +from torchvision.models import resnet18 + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPCheckpointIO +from colossalai.interface import OptimizerWrapper +from colossalai.testing import check_state_dict_equal, rerun_if_address_is_in_use, spawn + + +def check_torch_ddp_checkpointIO(): + plugin = TorchDDPPlugin() + booster = Booster(plugin=plugin) + model = resnet18() + criterion = lambda x: x.mean() + optimizer = SGD((model.parameters()), lr=0.001) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion, lr_scheduler=scheduler) + + assert isinstance(model.module, DDP) + assert isinstance(optimizer, OptimizerWrapper) + + x = torch.randn(4, 3, 224, 224) + x = x.to('cuda') + output = model(x) + loss = criterion(output) + booster.backward(loss, optimizer) + optimizer.clip_grad_by_norm(1.0) + optimizer.step() + scheduler.step() + + optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile() + lr_scheduler_ckpt_tempfile = tempfile.NamedTemporaryFile() + ckpt_io = TorchDDPCheckpointIO() + ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name) + ckpt_io.save_lr_scheduler(scheduler, lr_scheduler_ckpt_tempfile.name) + + if ckpt_io.coordinator.is_master(): + new_model = resnet18() + new_optimizer = SGD((new_model.parameters()), lr=0.001) + new_scheduler = torch.optim.lr_scheduler.StepLR(new_optimizer, step_size=1, gamma=0.1) + _, new_optimizer, _, _, new_scheduler = booster.boost(new_model, new_optimizer, lr_scheduler=new_scheduler) + + ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), False) + + ckpt_io.load_lr_scheduler(new_scheduler, lr_scheduler_ckpt_tempfile.name) + check_state_dict_equal(scheduler.state_dict(), new_scheduler.state_dict(), False) + + +def run_dist(rank, world_size, port): + colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost') + check_torch_ddp_checkpointIO() + + +@rerun_if_address_is_in_use() +def test_torch_ddp_checkpointIO(): + spawn(run_dist, 2) From f7361ee1bd31e57004d28418133e3714b08a53b2 Mon Sep 17 00:00:00 2001 From: MisterLin1995 <16671583+MisterLin1995@users.noreply.github.com> Date: Wed, 10 May 2023 13:36:09 +0800 Subject: [PATCH 17/20] [chat] fix community example ray (#3719) Co-authored-by: jiangwen --- applications/Chat/examples/community/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/examples/community/README.md b/applications/Chat/examples/community/README.md index c9c645032288..cd7b9d99bf06 100644 --- a/applications/Chat/examples/community/README.md +++ b/applications/Chat/examples/community/README.md @@ -17,7 +17,7 @@ Community examples consist of both inference and training examples that have bee | Example | Description | Code Example | Colab | Author | 
|:---------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------:| | Peft | Adding Peft support for SFT and Prompts model training | [Huggingface Peft](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/community/peft) | - | [YY Lin](https://github.com/yynil) | -| Train prompts on Ray | A Ray based implementation of Train prompts example | [Huggingface Peft](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/community/ray) | - | [MisterLin1995](https://github.com/MisterLin1995) | +| Train prompts on Ray | A Ray based implementation of Train prompts example | [Training On Ray](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/community/ray) | - | [MisterLin1995](https://github.com/MisterLin1995) | |...|...|...|...|...| ### How to get involved From b7141c36dd84d025b4aef09da74c7b7ac29010b5 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Wed, 10 May 2023 17:12:03 +0800 Subject: [PATCH 18/20] [CI] fix some spelling errors (#3707) * fix spelling error with examples/community/ * fix spelling error with tests/ * fix some spelling error with tests/ colossalai/ etc. --- applications/Chat/coati/kernels/opt_attn.py | 2 +- colossalai/communication/p2p.py | 2 +- colossalai/communication/p2p_v2.py | 2 +- .../initializer_sequence.py | 4 ++-- .../tutorial/new_api/cifar_resnet/train.py | 2 +- examples/tutorial/new_api/cifar_vit/train.py | 6 ++--- op_builder/utils.py | 2 +- tests/components_to_test/albert.py | 10 ++++----- tests/components_to_test/beit.py | 4 ++-- tests/components_to_test/bert.py | 16 +++++++------- tests/components_to_test/registry.py | 8 +++---- .../test_activation_checkpointing.py | 2 +- .../test_checkpoint_io/test_load.py | 22 +++++++++---------- .../test_checkpoint_io/test_merge.py | 4 ++-- .../test_checkpoint_io/test_redist.py | 4 ++-- .../test_checkpoint_io/test_save.py | 8 +++---- tests/test_utils/test_lazy_init/utils.py | 4 ++-- 17 files changed, 51 insertions(+), 51 deletions(-) diff --git a/applications/Chat/coati/kernels/opt_attn.py b/applications/Chat/coati/kernels/opt_attn.py index c10f341e94a3..e99f9c2247d1 100644 --- a/applications/Chat/coati/kernels/opt_attn.py +++ b/applications/Chat/coati/kernels/opt_attn.py @@ -77,7 +77,7 @@ def forward( scale=self.scaling) # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned aross GPUs when using tensor-parallelism. + # partitioned across GPUs when using tensor-parallelism.
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py index 0200cd3c6553..1f20fca4f74d 100644 --- a/colossalai/communication/p2p.py +++ b/colossalai/communication/p2p.py @@ -217,7 +217,7 @@ def recv_backward(output_grad_shape, next_rank (int, optional): The rank of the source of the tensor. Returns: - Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradident tensor list. + Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradient tensor list. """ if gpc.is_pipeline_last_stage(): output_tensor_grad = None diff --git a/colossalai/communication/p2p_v2.py b/colossalai/communication/p2p_v2.py index 0dacd8c3c9b5..090311cb35f2 100644 --- a/colossalai/communication/p2p_v2.py +++ b/colossalai/communication/p2p_v2.py @@ -19,7 +19,7 @@ def init_process_group(): - """intialise process group by dist.new_group in the adjacent stages + """initialise process group by dist.new_group in the adjacent stages Args: None diff --git a/colossalai/context/process_group_initializer/initializer_sequence.py b/colossalai/context/process_group_initializer/initializer_sequence.py index eaacb14d2282..251a2940778a 100644 --- a/colossalai/context/process_group_initializer/initializer_sequence.py +++ b/colossalai/context/process_group_initializer/initializer_sequence.py @@ -91,11 +91,11 @@ def init_dist_group(self): parallel_setting = [] - local_rank, group_world_size, process_group, cpu_grop, ranks_in_group, mode = \ + local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode = \ self._sequence_initializer.init_dist_group() # change mode to sequence mode = ParallelMode.SEQUENCE - parallel_setting.append((local_rank, group_world_size, process_group, cpu_grop, ranks_in_group, mode)) + parallel_setting.append((local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode)) parallel_setting.append(self._sequence_dp_initializer.init_dist_group()) return parallel_setting diff --git a/examples/tutorial/new_api/cifar_resnet/train.py b/examples/tutorial/new_api/cifar_resnet/train.py index a96a4b640a22..fe0dabf08377 100644 --- a/examples/tutorial/new_api/cifar_resnet/train.py +++ b/examples/tutorial/new_api/cifar_resnet/train.py @@ -28,7 +28,7 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase): - # trainsform + # transform transform_train = transforms.Compose( [transforms.Pad(4), transforms.RandomHorizontalFlip(), diff --git a/examples/tutorial/new_api/cifar_vit/train.py b/examples/tutorial/new_api/cifar_vit/train.py index 2405fdfc60d5..82a8f2ed97e4 100644 --- a/examples/tutorial/new_api/cifar_vit/train.py +++ b/examples/tutorial/new_api/cifar_vit/train.py @@ -25,7 +25,7 @@ # Prepare Hyperparameters # ============================== NUM_EPOCHS = 60 -WARMUP_EPOCSH = 5 +WARMUP_EPOCHS = 5 LEARNING_RATE = 1e-3 @@ -37,7 +37,7 @@ def vit_cifar(**kwargs): def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase): - # trainsform + # transform transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), @@ -177,7 +177,7 @@ def main(): optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE) # lr scheduler - lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCSH) + lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCHS) # ============================== # 
Boost with ColossalAI diff --git a/op_builder/utils.py b/op_builder/utils.py index 1b1bd5f49970..2dbd976fbcbb 100644 --- a/op_builder/utils.py +++ b/op_builder/utils.py @@ -36,7 +36,7 @@ def get_cuda_version_in_pytorch() -> List[int]: torch_cuda_minor = torch.version.cuda.split(".")[1] except: raise ValueError( - "[extension] Cannot retrive the CUDA version in the PyTorch binary given by torch.version.cuda") + "[extension] Cannot retrieve the CUDA version in the PyTorch binary given by torch.version.cuda") return torch_cuda_major, torch_cuda_minor diff --git a/tests/components_to_test/albert.py b/tests/components_to_test/albert.py index d5b6bc89a83e..52b2275ec4f8 100644 --- a/tests/components_to_test/albert.py +++ b/tests/components_to_test/albert.py @@ -28,7 +28,7 @@ def bert_model_builder(checkpoint: bool = False): print('building AlbertForSequenceClassification model') # adapting huggingface BertForSequenceClassification for single unitest calling interface - class ModelAaptor(AlbertForSequenceClassification): + class ModelAdaptor(AlbertForSequenceClassification): def forward(self, input_ids, labels): """ @@ -37,23 +37,23 @@ def forward(self, input_ids, labels): """ return super().forward(input_ids=input_ids, labels=labels)[0] - model = ModelAaptor(config) + model = ModelAdaptor(config) # if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"): # model.gradient_checkpointing_enable() return model - is_distrbuted = torch.distributed.is_initialized() + is_distributed = torch.distributed.is_initialized() trainloader = get_bert_data_loader(n_class=vocab_size, batch_size=2, total_samples=10000, sequence_length=sequence_length, - is_distrbuted=is_distrbuted) + is_distributed=is_distributed) testloader = get_bert_data_loader(n_class=vocab_size, batch_size=2, total_samples=10000, sequence_length=sequence_length, - is_distrbuted=is_distrbuted) + is_distributed=is_distributed) criterion = None return bert_model_builder, trainloader, testloader, torch.optim.Adam, criterion diff --git a/tests/components_to_test/beit.py b/tests/components_to_test/beit.py index 1252071f4075..2021ae6f6e35 100644 --- a/tests/components_to_test/beit.py +++ b/tests/components_to_test/beit.py @@ -27,7 +27,7 @@ def generate(self): @non_distributed_component_funcs.register(name='beit') def get_training_components(): - def model_buider(checkpoint=False): + def model_builder(checkpoint=False): model = Beit(img_size=DummyDataLoader.img_size, num_classes=DummyDataLoader.num_class, embed_dim=32, @@ -39,4 +39,4 @@ def model_buider(checkpoint=False): testloader = DummyDataLoader() criterion = torch.nn.CrossEntropyLoss() - return model_buider, trainloader, testloader, torch.optim.Adam, criterion + return model_builder, trainloader, testloader, torch.optim.Adam, criterion diff --git a/tests/components_to_test/bert.py b/tests/components_to_test/bert.py index c1faa6f9d892..e7d1d50806b8 100644 --- a/tests/components_to_test/bert.py +++ b/tests/components_to_test/bert.py @@ -13,7 +13,7 @@ def get_bert_data_loader( total_samples, sequence_length, device=torch.device('cpu:0'), - is_distrbuted=False, + is_distributed=False, ): train_data = torch.randint( low=0, @@ -24,7 +24,7 @@ def get_bert_data_loader( ) train_label = torch.randint(low=0, high=2, size=(total_samples,), device=device, dtype=torch.long) train_dataset = torch.utils.data.TensorDataset(train_data, train_label) - if is_distrbuted: + if is_distributed: sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) else: sampler = 
SequentialSampler(train_dataset) @@ -52,8 +52,8 @@ def bert_model_builder(checkpoint: bool = False): attention_probs_dropout_prob=0.) print('building BertForSequenceClassification model') - # adapting huggingface BertForSequenceClassification for single unitest calling interface - class ModelAaptor(BertForSequenceClassification): + # adapting huggingface BertForSequenceClassification for single unittest calling interface + class ModelAdaptor(BertForSequenceClassification): def forward(self, input_ids, labels): """ @@ -62,23 +62,23 @@ def forward(self, input_ids, labels): """ return super().forward(input_ids=input_ids, labels=labels)[0] - model = ModelAaptor(config) + model = ModelAdaptor(config) if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"): model.gradient_checkpointing_enable() return model - is_distrbuted = torch.distributed.is_initialized() + is_distributed = torch.distributed.is_initialized() trainloader = get_bert_data_loader(n_class=vocab_size, batch_size=2, total_samples=10000, sequence_length=sequence_length, - is_distrbuted=is_distrbuted) + is_distributed=is_distributed) testloader = get_bert_data_loader(n_class=vocab_size, batch_size=2, total_samples=10000, sequence_length=sequence_length, - is_distrbuted=is_distrbuted) + is_distributed=is_distributed) criterion = None return bert_model_builder, trainloader, testloader, torch.optim.Adam, criterion diff --git a/tests/components_to_test/registry.py b/tests/components_to_test/registry.py index 728ed9eba6ea..edfcaaa7275b 100644 --- a/tests/components_to_test/registry.py +++ b/tests/components_to_test/registry.py @@ -9,10 +9,10 @@ def __init__(self): def register(self, name): assert name not in self._registry - def _regsiter(callable_): + def _register(callable_): self._registry[name] = callable_ - return _regsiter + return _register def get_callable(self, name: str): return self._registry[name] @@ -34,6 +34,6 @@ def __next__(self): non_distributed_component_funcs = Registry() -model_paralle_component_funcs = Registry() +model_parallel_component_funcs = Registry() -__all__ = ['non_distributed_component_funcs', 'model_paralle_component_funcs'] +__all__ = ['non_distributed_component_funcs', 'model_parallel_component_funcs'] diff --git a/tests/test_utils/test_activation_checkpointing.py b/tests/test_utils/test_activation_checkpointing.py index 59a8acd4b210..2930552cc4e7 100644 --- a/tests/test_utils/test_activation_checkpointing.py +++ b/tests/test_utils/test_activation_checkpointing.py @@ -51,7 +51,7 @@ def test_activation_checkpointing(cpu_offload, use_reentrant): # other tests might affect this test reset_seeds() - # We put initilization here to avoid change cuda rng state below + # We put initialization here to avoid change cuda rng state below inputs = torch.rand(2, 2, requires_grad=True, device='cuda') weight = torch.rand(2, 4, requires_grad=True, device='cuda') diff --git a/tests/test_utils/test_checkpoint_io/test_load.py b/tests/test_utils/test_checkpoint_io/test_load.py index b1a741515728..2949c9f0752d 100644 --- a/tests/test_utils/test_checkpoint_io/test_load.py +++ b/tests/test_utils/test_checkpoint_io/test_load.py @@ -23,7 +23,7 @@ def check_model_state_dict(a: Dict[str, Tensor], b: Dict[str, Tensor]) -> None: assert torch.equal(v, b[k]) -def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) -> None: +def check_optim_state_dict(a: dict, b: dict, ignore_param_groups: bool = False) -> None: assert set(a['state'].keys()) == set(b['state'].keys()) for k, state in 
a['state'].items(): b_state = b['state'][k] @@ -32,7 +32,7 @@ def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) assert torch.equal(v1, v2) else: assert v1 == v2 - if not ignore_param_gruops: + if not ignore_param_groups: assert a['param_groups'] == b['param_groups'] @@ -129,23 +129,23 @@ def launch_dist(fn, world_size: int): def save_dist(dir_name: str, zero: bool): - model, optmizer = prepare_model_optim(shard=True, zero=zero) - reset_model_optim(model, optmizer) + model, optimizer = prepare_model_optim(shard=True, zero=zero) + reset_model_optim(model, optimizer) world_size = dist.get_world_size() rank = dist.get_rank() - save(dir_name, model, optmizer, dist_meta=get_dist_metas(world_size, zero)[rank]) + save(dir_name, model, optimizer, dist_meta=get_dist_metas(world_size, zero)[rank]) def load_and_check_dist(dir_name: str): world_size = dist.get_world_size() - model, optmizer = prepare_model_optim(shard=True) - reset_model_optim(model, optmizer) + model, optimizer = prepare_model_optim(shard=True) + reset_model_optim(model, optimizer) model_state_dict = deepcopy(model.state_dict()) - optimizer_state_dict = deepcopy(optmizer.state_dict()) - reset_model_optim(model, optmizer, 1) - load(dir_name, model, optmizer, get_redist_meta(world_size), get_dist_metas(world_size)) + optimizer_state_dict = deepcopy(optimizer.state_dict()) + reset_model_optim(model, optimizer, 1) + load(dir_name, model, optimizer, get_redist_meta(world_size), get_dist_metas(world_size)) check_model_state_dict(model_state_dict, model.state_dict()) - check_optim_state_dict(optimizer_state_dict, optmizer.state_dict()) + check_optim_state_dict(optimizer_state_dict, optimizer.state_dict()) @pytest.mark.dist diff --git a/tests/test_utils/test_checkpoint_io/test_merge.py b/tests/test_utils/test_checkpoint_io/test_merge.py index 255c74adf0a2..07d4597f8391 100644 --- a/tests/test_utils/test_checkpoint_io/test_merge.py +++ b/tests/test_utils/test_checkpoint_io/test_merge.py @@ -68,7 +68,7 @@ def run_dist(rank, world_size, port, test_fn): def run_save_dist(dir_name: str, zero: bool): - model, optmizer = prepare_model_optim(shard=True, zero=zero) + model, optimizer = prepare_model_optim(shard=True, zero=zero) rank = dist.get_rank() dp_world_size = dist.get_world_size() // 2 if not zero: @@ -90,7 +90,7 @@ def run_save_dist(dir_name: str, zero: bool): 'fc.bias': ParamDistMeta(rank // 2, dp_world_size, 0, 1, zero_numel=1, zero_orig_shape=[1]) } - save(dir_name, model, optmizer, dist_meta=dist_metas) + save(dir_name, model, optimizer, dist_meta=dist_metas) @pytest.mark.dist diff --git a/tests/test_utils/test_checkpoint_io/test_redist.py b/tests/test_utils/test_checkpoint_io/test_redist.py index 144715bdfcca..fdc849a5ecc0 100644 --- a/tests/test_utils/test_checkpoint_io/test_redist.py +++ b/tests/test_utils/test_checkpoint_io/test_redist.py @@ -125,9 +125,9 @@ def run_dist(rank, world_size, port, test_fn): def run_save_dist(dir_name: str, zero: bool): - model, optmizer = prepare_model_optim(shard=True, zero=zero) + model, optimizer = prepare_model_optim(shard=True, zero=zero) rank = dist.get_rank() - save(dir_name, model, optmizer, dist_meta=get_dist_metas(4, zero)[rank]) + save(dir_name, model, optimizer, dist_meta=get_dist_metas(4, zero)[rank]) @pytest.mark.dist diff --git a/tests/test_utils/test_checkpoint_io/test_save.py b/tests/test_utils/test_checkpoint_io/test_save.py index e35e566f6ff8..2abdd95a6481 100644 --- a/tests/test_utils/test_checkpoint_io/test_save.py +++ 
b/tests/test_utils/test_checkpoint_io/test_save.py @@ -28,7 +28,7 @@ def check_model_state_dict(a: Dict[str, Tensor], b: Dict[str, Tensor]) -> None: assert torch.equal(v, b[k]) -def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) -> None: +def check_optim_state_dict(a: dict, b: dict, ignore_param_groups: bool = False) -> None: assert set(a['state'].keys()) == set(b['state'].keys()) for k, state in a['state'].items(): b_state = b['state'][k] @@ -37,7 +37,7 @@ def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) assert torch.equal(v1, v2) else: assert v1 == v2 - if not ignore_param_gruops: + if not ignore_param_groups: assert a['param_groups'] == b['param_groups'] @@ -113,12 +113,12 @@ def run_dist(rank, world_size, port, test_fn): def run_save_dist(dir_name): - model, optmizer = prepare_model_optim() + model, optimizer = prepare_model_optim() dist_metas = { 'fc.weight': ParamDistMeta(dist.get_rank(), dist.get_world_size(), 0, 1), 'fc.bias': ParamDistMeta(dist.get_rank(), dist.get_world_size(), 0, 1) } - save(dir_name, model, optmizer, dist_meta=dist_metas) + save(dir_name, model, optimizer, dist_meta=dist_metas) @pytest.mark.dist diff --git a/tests/test_utils/test_lazy_init/utils.py b/tests/test_utils/test_lazy_init/utils.py index a8aeb4c8930c..0b5f15ca5445 100644 --- a/tests/test_utils/test_lazy_init/utils.py +++ b/tests/test_utils/test_lazy_init/utils.py @@ -18,7 +18,7 @@ def set_seed(seed: int) -> None: torch.manual_seed(seed) -def assert_model_eqaual(m1: torch.nn.Module, m2: torch.nn.Module) -> None: +def assert_model_equal(m1: torch.nn.Module, m2: torch.nn.Module) -> None: s1 = m1.state_dict() s2 = m2.state_dict() @@ -63,7 +63,7 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False, with ctx: deferred_model = model_fn() deferred_model = ctx.materialize(deferred_model, verbose=verbose) - assert_model_eqaual(model, deferred_model) + assert_model_equal(model, deferred_model) if check_forward: assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn) if verbose: From 899aa86368d9c0d9b3eda8b4186c78018cd56761 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Thu, 11 May 2023 11:10:28 +0800 Subject: [PATCH 19/20] [CI] fix typo with tests components (#3695) * fix spelling error with examples/community/ * fix spelling error with tests/ From 1f73609adb358aa1a004f6e5e4c0928a13eb93be Mon Sep 17 00:00:00 2001 From: digger-yu Date: Thu, 11 May 2023 16:30:58 +0800 Subject: [PATCH 20/20] [CI] fix typo with tests/ etc. (#3727) * fix spelling error with examples/community/ * fix spelling error with tests/
date:2023.5.10 --- tests/components_to_test/albert.py | 2 +- tests/test_booster/test_accelerator.py | 6 +++--- tests/test_booster/test_plugin/test_dp_plugin_base.py | 2 +- .../test_cifar_with_data_pipeline_tensor.py | 4 ++-- .../test_cifar_with_data_pipeline_tensor_v2.py | 4 ++-- .../test_codegen/test_activation_checkpoint_codegen.py | 4 ++-- .../test_nested_activation_checkpoint_codegen.py | 4 ++-- tests/test_fx/test_codegen/test_offload_codegen.py | 6 +++--- tests/test_layers/test_sequence/test_sequence.py | 2 +- tests/test_moe/test_kernel.py | 4 ++-- tests/test_tensor/model/test_model.py | 2 +- tests/test_trainer/test_pipeline/test_p2p.py | 2 +- tests/test_zero/test_gemini/test_chunkv2.py | 6 +++--- 13 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/components_to_test/albert.py b/tests/components_to_test/albert.py index 52b2275ec4f8..8924eb2fbc92 100644 --- a/tests/components_to_test/albert.py +++ b/tests/components_to_test/albert.py @@ -27,7 +27,7 @@ def bert_model_builder(checkpoint: bool = False): attention_probs_dropout_prob=0.) print('building AlbertForSequenceClassification model') - # adapting huggingface BertForSequenceClassification for single unitest calling interface + # adapting huggingface BertForSequenceClassification for single unittest calling interface class ModelAdaptor(AlbertForSequenceClassification): def forward(self, input_ids, labels): diff --git a/tests/test_booster/test_accelerator.py b/tests/test_booster/test_accelerator.py index 895c494d0c17..6f3f66ed41b8 100644 --- a/tests/test_booster/test_accelerator.py +++ b/tests/test_booster/test_accelerator.py @@ -7,8 +7,8 @@ @clear_cache_before_run() @parameterize('device', ['cpu', 'cuda']) def test_accelerator(device): - acceleartor = Accelerator(device) + accelerator = Accelerator(device) model = nn.Linear(8, 8) - model = acceleartor.configure_model(model) + model = accelerator.configure_model(model) assert next(model.parameters()).device.type == device - del model, acceleartor + del model, accelerator diff --git a/tests/test_booster/test_plugin/test_dp_plugin_base.py b/tests/test_booster/test_plugin/test_dp_plugin_base.py index 61aeded12203..689b334cae50 100644 --- a/tests/test_booster/test_plugin/test_dp_plugin_base.py +++ b/tests/test_booster/test_plugin/test_dp_plugin_base.py @@ -56,7 +56,7 @@ def no_sync(self, model: nn.Module) -> Iterator[None]: def check_dataloader_sharding(): plugin = DPPluginWrapper() - # create a custom dasetset with 0 to 10 + # create a custom dataset with 0 to 10 dataset = TensorDataset(torch.arange(0, 10)) train_dataloader = plugin.prepare_dataloader(dataset, batch_size=2) diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py index 4d63592f12b0..4992acbd7cc2 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py +++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py @@ -48,7 +48,7 @@ def run_trainer(rank, world_size, port): pipelinable.policy = "uniform" model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE)) - # craete dataloaders + # create dataloaders root = Path(os.environ['DATA']) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4, pad_if_needed=True), @@ -68,7 +68,7 @@ def run_trainer(rank, world_size, port): # create lr scheduler lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, 
total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS) - # intiailize + # initialize engine, train_dataloader, *_ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor_v2.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor_v2.py index 67d2ba5f5d98..62bbb8f50391 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor_v2.py +++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor_v2.py @@ -50,7 +50,7 @@ def run_trainer(rank, world_size, port): pipelinable.policy = "uniform" model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE)) - # craete dataloaders + # create dataloaders root = Path(os.environ['DATA']) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4, pad_if_needed=True), @@ -70,7 +70,7 @@ def run_trainer(rank, world_size, port): # create lr scheduler lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS) - # intiailize + # initialize engine, train_dataloader, *_ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, diff --git a/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py b/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py index ab483f7e47a3..bcac2ec426d9 100644 --- a/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py +++ b/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py @@ -64,7 +64,7 @@ def forward(self, x, y): def _run_act_ckpt_codegen(rank, world_size, port): - # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model and run forward @@ -122,7 +122,7 @@ def test_act_ckpt_codegen(): def _run_act_ckpt_python_code_torch11(rank, world_size, port): - # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model and run forward diff --git a/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py b/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py index 9064023d4f68..5b327807a57b 100644 --- a/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py +++ b/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py @@ -32,7 +32,7 @@ def forward(self, x): def _run_act_ckpt_codegen(rank, world_size, port): - # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model and run forward @@ -89,7 +89,7 @@ def test_act_ckpt_codegen(): def _run_act_ckpt_python_code_torch11(rank, world_size, port): - # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly colossalai.launch(config={},
rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model and run forward diff --git a/tests/test_fx/test_codegen/test_offload_codegen.py b/tests/test_fx/test_codegen/test_offload_codegen.py index 96e88eb92b33..c217b96586fe 100644 --- a/tests/test_fx/test_codegen/test_offload_codegen.py +++ b/tests/test_fx/test_codegen/test_offload_codegen.py @@ -56,7 +56,7 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.T fx_out = gm(data) assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output" - # test barckward + # test backward loss0 = non_fx_out.sum() loss0.backward() loss1 = fx_out.sum() @@ -65,7 +65,7 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.T def _run_offload_codegen(rank, world_size, port): - # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model and input @@ -120,7 +120,7 @@ def test_act_ckpt_codegen(): def _run_offload_codegen_torch11(rank, world_size, port): - # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model and input diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_layers/test_sequence/test_sequence.py index aac192d7eff0..60f2d55f43af 100644 --- a/tests/test_layers/test_sequence/test_sequence.py +++ b/tests/test_layers/test_sequence/test_sequence.py @@ -45,7 +45,7 @@ def check_ring_qk(rank, world_size): ring_qk = colossalai.nn.layer.parallel_sequence.RingQK.apply sub_a = ring_qk(sub_q, sub_k, batch_size, num_heads, sub_seq_length) - # check master and distributed attetion scores + # check master and distributed attention scores sub_master_a = a[:, rank * sub_seq_length:(rank + 1) * sub_seq_length] assert torch.allclose(sub_a, sub_master_a, rtol=1e-5, atol=1e-2) diff --git a/tests/test_moe/test_kernel.py b/tests/test_moe/test_kernel.py index ad9a172b72aa..39603c158731 100644 --- a/tests/test_moe/test_kernel.py +++ b/tests/test_moe/test_kernel.py @@ -41,7 +41,7 @@ def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.f if data_type == torch.float16: layer = layer.half() - # use matrix multiplication instead of COL_MOE_KERNL in MOE dispatch and combine + # use matrix multiplication instead of COL_MOE_KERNEL in MOE dispatch and combine layer.use_kernel = False old_out, _ = layer(tokens) ech = old_out.shape @@ -57,7 +57,7 @@ def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.f layer.gate_weight.grad.zero_() layer.use_kernel = True - new_out, _ = layer(tokens) # get ouputs through colossal kernel + new_out, _ = layer(tokens) # get outputs through colossal kernel if data_type == torch.float32: check_equal(old_out, new_out) diff --git a/tests/test_tensor/model/test_model.py b/tests/test_tensor/model/test_model.py index 79d70e53c5cb..288bd20e3844 100644 --- a/tests/test_tensor/model/test_model.py +++ b/tests/test_tensor/model/test_model.py @@ -329,6 +329,6 @@ def test_pretrain_load(world_size): if __name__ == '__main__': # test_model_parameters() - # test_colo_optgimizer() + # test_colo_optimizer()
test_model(4) # test_pretrain_load(4) diff --git a/tests/test_trainer/test_pipeline/test_p2p.py b/tests/test_trainer/test_pipeline/test_p2p.py index cb7a193d2bfa..8ad366133d18 100644 --- a/tests/test_trainer/test_pipeline/test_p2p.py +++ b/tests/test_trainer/test_pipeline/test_p2p.py @@ -90,7 +90,7 @@ def run_check(rank, world_size, port): prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE) next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE) logger.info('Rank {0}: prev rank {1}, next rank {2}'.format(rank, prev_rank, next_rank)) - logger.info('Distributed environment is initialzied.') + logger.info('Distributed environment is initialized.') check_comm(world_size, rank, prev_rank, next_rank, logger) gpc.destroy() diff --git a/tests/test_zero/test_gemini/test_chunkv2.py b/tests/test_zero/test_gemini/test_chunkv2.py index 16764aa6b0b1..1cb31b260a99 100644 --- a/tests/test_zero/test_gemini/test_chunkv2.py +++ b/tests/test_zero/test_gemini/test_chunkv2.py @@ -23,7 +23,7 @@ def add_param(param_list, param_cp_list, *args, **kwargs): param_cp_list.append(param.clone()) -def check_euqal(param, param_cp): +def check_equal(param, param_cp): if param.device != param_cp.device: temp = param.data.to(param_cp.device) else: @@ -57,7 +57,7 @@ def exam_chunk_basic(init_device, keep_gathered, pin_memory): my_chunk.append_tensor(param) assert my_chunk.utilized_size == 597 for param, param_cp in zip(param_list, param_cp_list): - check_euqal(param, param_cp) + check_equal(param, param_cp) my_chunk.close_chunk() if keep_gathered is False: @@ -77,7 +77,7 @@ def exam_chunk_basic(init_device, keep_gathered, pin_memory): my_chunk.access_chunk() assert my_chunk.device_type == 'cuda' for param, param_cp in zip(param_list, param_cp_list): - check_euqal(param, param_cp) + check_equal(param, param_cp) assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4 my_chunk.tensor_trans_state(param_list[0], TensorState.COMPUTE)
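For context on the shared test infrastructure these renames touch: the `Registry` in tests/components_to_test/registry.py (patched above to rename `_regsiter` to `_register`) is a small decorator-based lookup table that the component files such as beit.py register themselves into. The sketch below is illustrative only; the `components` instance name and the `get_training_components` body are hypothetical stand-ins, not code from these patches.

# Minimal sketch of the decorator-based registry pattern (assumption: mirrors
# the patched registry.py; the instance and component names are hypothetical).
class Registry:

    def __init__(self):
        self._registry = {}

    def register(self, name):
        # refuse duplicate registrations, as the patched file does
        assert name not in self._registry

        def _register(callable_):
            self._registry[name] = callable_

        return _register

    def get_callable(self, name: str):
        return self._registry[name]


components = Registry()


@components.register(name='beit')
def get_training_components():
    # the real components return (model_builder, trainloader, testloader,
    # optimizer_class, criterion); a placeholder stands in here
    return 'beit components'


# Entries are retrieved by name. Note that _register returns None, so the
# decorated module-level name is rebound to None and the function remains
# reachable only through the registry:
assert get_training_components is None
assert components.get_callable('beit')() == 'beit components'

One design consequence worth noting: because the inner `_register` does not return `callable_`, registered component functions are meant to be consumed exclusively via `get_callable(name)`, which is exactly how the tests above look them up.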