From 7c9f2ed6dd3dd27a099295ac22be8f3d1508c010 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 25 May 2023 13:09:42 +0800
Subject: [PATCH 01/52] [dtensor] polish sharding spec docstring (#3838)

* [dtensor] polish sharding spec docstring

* [dtensor] polish sharding spec example docstring
---
 colossalai/tensor/d_tensor/sharding_spec.py | 31 +++++++++++----------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py
index 2ea0c4db89fd..b927f6dfbe27 100644
--- a/colossalai/tensor/d_tensor/sharding_spec.py
+++ b/colossalai/tensor/d_tensor/sharding_spec.py
@@ -116,21 +116,21 @@ def build_difference_2d_dict(self):
 
     def dim_diff(self, other):
         '''
-        The difference between two _DimSpec.
+        The difference between two DimSpec.
 
         Argument:
-            other(_DimSpec): the dim spec to compare with.
+            other(DimSpec): the dim spec to compare with.
 
         Return:
             difference(int): the difference between two _DimSpec.
 
         Example:
-            dim_spec = _DimSpec([0])
-            other_dim_spec = _DimSpec([0, 1])
+            ```python
+            dim_spec = DimSpec([0])
+            other_dim_spec = DimSpec([0, 1])
             print(dim_spec.difference(other_dim_spec))
-
-        Output:
-            5
+            # output: 5
+            ```
         '''
         difference = self.difference_dict[(str(self), str(other))]
         return difference
@@ -142,9 +142,13 @@ class ShardingSpec:
     [R, R, S0, S1], which means
 
     Argument:
-        dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
-            and the value of the key describe which logical axis will be sharded in that dimension.
-        sharding_sequence(List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
+        dim_size (int): The number of dimensions of the tensor to be sharded.
+        dim_partition_dict (Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
+            and the value of the key describes which logical axis will be sharded in that dimension. Defaults to None.
+            E.g. {0: [0, 1]} means the first dimension of the tensor will be sharded along logical axes 0 and 1.
+        sharding_sequence (List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
+            Generally, users should specify either dim_partition_dict or sharding_sequence.
+            If both are given, users must ensure that they are consistent with each other. Defaults to None.
     '''
 
     def __init__(self,
@@ -208,6 +212,7 @@ def spec_diff(self, other):
         pair of sharding sequence.
 
         Example:
+            ```python
             dim_partition_dict = {0: [0, 1]}
             # DistSpec:
             #     shard_sequence: S01,R,R
@@ -219,10 +224,8 @@ def spec_diff(self, other):
             #     device_mesh_shape: (4, 4)
             sharding_spec_to_compare = ShardingSpec(device_mesh, entire_shape, dim_partition_dict_to_compare)
             print(sharding_spec.sharding_sequence_difference(sharding_spec_to_compare))
-
-        Output:
-            25
-
+            # output: 25
+            ```
         Argument:
             other(ShardingSpec): The ShardingSpec to compared with.
 

From 46503c35dd9342f943308ee451b62751f36bc961 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Thu, 1 Jun 2023 14:30:51 +0800
Subject: [PATCH 02/52] Modify torch version requirement to adapt torch 2.0

---
 colossalai/cli/launcher/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py
index 6411b4302e95..4bb749f9d293 100644
--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
@@ -154,7 +154,7 @@ def _arg_dict_to_list(arg_dict):
         extra_launch_args = dict()
 
     torch_version = version.parse(torch.__version__)
-    assert torch_version.major == 1
+    assert torch_version.major >= 1
 
     if torch_version.minor < 9:
         cmd = [

From 60ec33bb183e410ace44435d45673d64fea080db Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Fri, 2 Jun 2023 16:50:51 +0800
Subject: [PATCH 03/52] Add a new example of Dreambooth training using the
 booster API

---
 .../tutorial/new_api/dreambooth/README.md     | 113 +++
 .../tutorial/new_api/dreambooth/colossalai.sh |  17 +
 .../new_api/dreambooth/requirements.txt       |   7 +
 .../tutorial/new_api/dreambooth/test_ci.sh    |  23 +
 .../dreambooth/train_dreambooth_colossalai.py | 690 ++++++++++++++++++
 5 files changed, 850 insertions(+)
 create mode 100644 examples/tutorial/new_api/dreambooth/README.md
 create mode 100755 examples/tutorial/new_api/dreambooth/colossalai.sh
 create mode 100644 examples/tutorial/new_api/dreambooth/requirements.txt
 create mode 100644 examples/tutorial/new_api/dreambooth/test_ci.sh
 create mode 100644 examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py

diff --git a/examples/tutorial/new_api/dreambooth/README.md b/examples/tutorial/new_api/dreambooth/README.md
new file mode 100644
index 000000000000..bd7e7707ac78
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/README.md
@@ -0,0 +1,113 @@
+# [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) by [colossalai](https://github.com/hpcaitech/ColossalAI.git)
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
+The `train_dreambooth_colossalai.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+By accommodating model data in CPU and GPU and moving the data to the computing device when necessary, [Gemini](https://www.colossalai.org/docs/advanced_tutorials/meet_gemini), the Heterogeneous Memory Manager of [Colossal-AI](https://github.com/hpcaitech/ColossalAI), can break through the GPU memory wall by using GPU and CPU memory (composed of CPU DRAM or NVMe SSD memory) together at the same time. Moreover, the model scale can be further improved by combining heterogeneous training with the other parallel approaches, such as data parallel, tensor parallel and pipeline parallel.
+
+## Installation
+
+To begin with, make sure your system has a CUDA version suitable for this training session, which is CUDA 11.6-11.8. You may also want to make sure that the versions of the installed modules are suitable for the whole environment. Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+### Install [colossalai](https://github.com/hpcaitech/ColossalAI.git)
+
+```bash
+pip install colossalai
+```
+
+**From source**
+
+```bash
+git clone https://github.com/hpcaitech/ColossalAI.git
+python setup.py install
+```
+
+## Dataset for Teyvat BLIP captions
+Dataset used to train [Teyvat characters text to image model](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion).
+
+BLIP generated captions for character images from [genshin-impact fandom wiki](https://genshin-impact.fandom.com/wiki/Character#Playable_Characters) and [biligame wiki for genshin impact](https://wiki.biligame.com/ys/%E8%A7%92%E8%89%B2).
+
+For each row the dataset contains `image` and `text` keys. `image` is a varying size PIL png, and `text` is the accompanying text caption. Only a train split is provided.
+
+The `text` includes the tags `Teyvat`, `Name`, `Element`, `Weapon`, `Region`, `Model type`, and `Description`; the `Description` is captioned with the [pre-trained BLIP model](https://github.com/salesforce/BLIP).
+
+## New API
+We have modified our previous implementation of Dreambooth with our new Booster API, which offers a more flexible and efficient way to train your model. The new API is more user-friendly and easy to use. You can find the new API in `train_dreambooth_colossalai.py`. 
+We also offer a shell script `test_ci.sh` for you to go through all our plugins for the booster.
+For more information about the booster API you can refer to https://colossalai.org/docs/basics/booster_api/.
+
+## Training
+
+We provide the script `colossalai.sh` to run the training task with colossalai. For instance, the training script for the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model can be modified into:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=400 \
+  --placement="cuda"
+```
+- `MODEL_NAME` refers to the model you are training.
+- `INSTANCE_DIR` refers to personalized path to instance images, you might need to insert information here.
+- `OUTPUT_DIR` refers to local path to save the trained model, you might need to find a path with enough space.
+- `resolution` refers to the corresponding resolution number of your target model. Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.
+- `placement` refers to the training strategy supported by Colossal-AI. 'cuda' refers to loading all the parameters into CUDA memory. On the other hand, 'cpu' refers to the 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal-AI. Note that the training script itself defaults to 'cpu' when `--placement` is not given.
+
+### Training with prior-preservation loss
+
+Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
+
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time. The general script can then be modified as follows.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=800 \
+  --placement="cuda"
+```
+
+
+
+## Invitation to open-source contribution
+Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!
+
+You may contact us or participate in the following ways:
+1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
+2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub following the guidelines in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
+3. Join the Colossal-AI community on
+[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
+and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
+4. Send your official proposal to email contact@hpcaitech.com
+
+Thanks so much to all of our amazing contributors!
diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh
new file mode 100755
index 000000000000..7cf8b3a1307e
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/colossalai.sh
@@ -0,0 +1,17 @@
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1 
+DIFFUSERS_OFFLINE=1
+
+torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path="Your Pretrained Model Path"  \
+  --instance_data_dir="Your Input Pics Path" \
+  --output_dir="path-to-save-model" \
+  --instance_prompt="your_prompt" \
+  --resolution=512 \
+  --plugin="gemini" \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --placement="cuda" \
diff --git a/examples/tutorial/new_api/dreambooth/requirements.txt b/examples/tutorial/new_api/dreambooth/requirements.txt
new file mode 100644
index 000000000000..1ec828c630ef
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/requirements.txt
@@ -0,0 +1,7 @@
+diffusers>=0.5.0
+accelerate
+torchvision
+transformers>=4.21.0
+ftfy
+tensorboard
+modelcards
diff --git a/examples/tutorial/new_api/dreambooth/test_ci.sh b/examples/tutorial/new_api/dreambooth/test_ci.sh
new file mode 100644
index 000000000000..68862c46cfe9
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/test_ci.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -xe
+pip install -r requirements.txt
+
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
+DIFFUSERS_OFFLINE=1
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
+  torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path="Your Pretrained Model Path"  \
+  --instance_data_dir="Your Input Pics Path" \
+  --output_dir="path-to-save-model" \
+  --instance_prompt="your prompt" \
+  --resolution=512 \
+  --plugin=$plugin \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --placement="cuda"
+done
diff --git a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
new file mode 100644
index 000000000000..9da7cacb8aaf
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
@@ -0,0 +1,690 @@
+import argparse
+import hashlib
+import math
+import os
+from pathlib import Path
+from typing import Optional
+import shutil
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import colossalai
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
+from colossalai.zero.gemini import get_static_torch_model
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+
+disable_existing_loggers()
+logger = get_dist_logger()
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--externel_unet_path",
+        type=str,
+        default=None,
+        required=False,
+        help="Path to the externel unet model.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default="a photo of sks dog",
+        required=False,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=("Minimal class images for prior preservation loss. If there are not enough images already present in"
+              " class_data_dir, additional images will be sampled with class_prompt."),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=("The resolution for input images, all the images in the train/validation dataset will be resized to this"
+              " resolution"),
+    )
+    parser.add_argument(
+        "--placement",
+        type=str,
+        default="cpu",
+        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=("Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+              " cropped. The images will be resized to the resolution first before cropping."),
+    )
+    parser.add_argument("--train_batch_size",
+                        type=int,
+                        default=4,
+                        help="Batch size (per device) for the training dataloader.")
+    parser.add_argument("--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images.")
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=('The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+              ' "constant", "constant_with_warmup"]'),
+    )
+    parser.add_argument("--lr_warmup_steps",
+                        type=int,
+                        default=500,
+                        help="Number of steps for the warmup in the lr scheduler.")
+    parser.add_argument("--use_8bit_adam",
+                        action="store_true",
+                        help="Whether or not to use 8-bit Adam from bitsandbytes.")
+
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=("[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+              " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        if args.class_data_dir is not None:
+            logger.warning("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            logger.warning("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and tokenizes the prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose([
+            transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ])
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+        example["instance_prompt_ids"] = self.tokenizer(
+            self.instance_prompt,
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt_ids"] = self.tokenizer(
+                self.class_prompt,
+                padding="do_not_pad",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids
+
+        return example
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
+
+
+def main(args):
+    if args.seed is None:
+        colossalai.launch_from_torch(config={})
+    else:
+        colossalai.launch_from_torch(config={}, seed=args.seed)
+
+    local_rank = gpc.get_local_rank(ParallelMode.DATA)
+    world_size = gpc.get_world_size(ParallelMode.DATA)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
+            pipeline = DiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            pipeline.to(get_current_device())
+
+            for example in tqdm(
+                    sample_dataloader,
+                    desc="Generating class images",
+                    disable=not local_rank == 0,
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+
+    # Handle the repository creation
+    if local_rank == 0:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            create_repo(repo_name, exist_ok=True, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        logger.info(f"Loading tokenizer from {args.tokenizer_name}", ranks=[0])
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.tokenizer_name,
+            revision=args.revision,
+            use_fast=False,
+        )
+    elif args.pretrained_model_name_or_path:
+        logger.info("Loading tokenizer from pretrained model", ranks=[0])
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+        # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
+
+    # Load models and create wrapper for stable diffusion
+
+    logger.info(f"Loading text_encoder from {args.pretrained_model_name_or_path}", ranks=[0])
+
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+
+    logger.info(f"Loading AutoencoderKL from {args.pretrained_model_name_or_path}", ranks=[0])
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+    )
+
+
+    if args.externel_unet_path is None:
+        logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
+        unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path,
+                                                subfolder="unet",
+                                                revision=args.revision,
+                                                low_cpu_mem_usage=False)
+    else:
+        logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0])
+        unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path,
+                                                revision=args.revision,
+                                                low_cpu_mem_usage=False)
+
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * args.train_batch_size * world_size
+
+    # Use Booster API to use Gemini/Zero with ColossalAI
+
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # config optimizer for colossalai zero
+    optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
+
+    # load noise_scheduler
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    # prepare dataset
+    logger.info(f"Prepare dataset from {args.instance_data_dir}", ranks=[0])
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad(
+            {
+                "input_ids": input_ids
+            },
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+        }
+        return batch
+
+    train_dataloader = torch.utils.data.DataLoader(train_dataset,
+                                                   batch_size=args.train_batch_size,
+                                                   shuffle=True,
+                                                   collate_fn=collate_fn,
+                                                   num_workers=1)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    vae.to(get_current_device(), dtype=weight_dtype)
+    text_encoder.to(get_current_device(), dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler)
+
+    # Train!
+    total_batch_size = args.train_batch_size * world_size
+
+    logger.info("***** Running training *****", ranks=[0])
+    logger.info(f"  Num examples = {len(train_dataset)}", ranks=[0])
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}", ranks=[0])
+    logger.info(f"  Num Epochs = {args.num_train_epochs}", ranks=[0])
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}", ranks=[0])
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0])
+    logger.info(f"  Total optimization steps = {args.max_train_steps}", ranks=[0])
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not local_rank == 0)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    torch.cuda.synchronize()
+    for epoch in range(args.num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            torch.cuda.reset_peak_memory_stats()
+            # Move batch to gpu
+            for key, value in batch.items():
+                batch[key] = value.to(get_current_device(), non_blocking=True)
+
+            # Convert images to latent space
+            optimizer.zero_grad()
+
+            latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+            latents = latents * 0.18215
+
+            # Sample noise that we'll add to the latents
+            noise = torch.randn_like(latents)
+            bsz = latents.shape[0]
+            # Sample a random timestep for each image
+            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+            timesteps = timesteps.long()
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+            # Predict the noise residual
+            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+            # Get the target for loss depending on the prediction type
+            if noise_scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif noise_scheduler.config.prediction_type == "v_prediction":
+                target = noise_scheduler.get_velocity(latents, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+            if args.with_prior_preservation:
+                # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                target, target_prior = torch.chunk(target, 2, dim=0)
+
+                # Compute instance loss
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
+
+                # Compute prior loss
+                prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                # Add the prior loss to the instance loss.
+                loss = loss + args.prior_loss_weight * prior_loss
+            else:
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
+            optimizer.backward(loss)
+
+            optimizer.step()
+            lr_scheduler.step()
+            logger.info(f"max GPU_mem cost is {torch.cuda.max_memory_allocated()/2**20} MB", ranks=[0])
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            progress_bar.update(1)
+            global_step += 1
+            logs = {
+                "loss": loss.detach().item(),
+                "lr": optimizer.param_groups[0]["lr"],
+            }    # lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step % args.save_steps == 0:
+                torch.cuda.synchronize()
+                if local_rank == 0:
+                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                    booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
+                    if not os.path.exists(os.path.join(save_path, "config.json")):
+                        shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path)
+                    logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])
+            if global_step >= args.max_train_steps:
+                break
+    torch.cuda.synchronize()
+
+    booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin"))
+    logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}")
+    if local_rank == 0:
+        if not os.path.exists(os.path.join(args.output_dir, "config.json")):
+            shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir)
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

From 42e3232bc045aa7ea2fb690625d8baf588b80ed1 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Fri, 2 Jun 2023 17:00:57 +0800
Subject: [PATCH 04/52] roll back

---
 colossalai/cli/launcher/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py
index 4bb749f9d293..6411b4302e95 100644
--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
@@ -154,7 +154,7 @@ def _arg_dict_to_list(arg_dict):
         extra_launch_args = dict()
 
     torch_version = version.parse(torch.__version__)
-    assert torch_version.major >= 1
+    assert torch_version.major == 1
 
     if torch_version.minor < 9:
         cmd = [

From 25447d44079de7be9083d07834d75b74f5ce8680 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Mon, 5 Jun 2023 11:47:07 +0800
Subject: [PATCH 05/52] modify path

---
 examples/tutorial/new_api/dreambooth/colossalai.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh
index 7cf8b3a1307e..2745c563aa73 100755
--- a/examples/tutorial/new_api/dreambooth/colossalai.sh
+++ b/examples/tutorial/new_api/dreambooth/colossalai.sh
@@ -3,10 +3,10 @@ TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
 torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path="Your Pretrained Model Path"  \
-  --instance_data_dir="Your Input Pics Path" \
-  --output_dir="path-to-save-model" \
-  --instance_prompt="your_prompt" \
+  --pretrained_model_name_or_path="Path_to_your_model"  \
+  --instance_data_dir="Path_to_your_training_image" \
+  --output_dir="Path_to_your_save_dir" \
+  --instance_prompt="keli" \
   --resolution=512 \
   --plugin="gemini" \
   --train_batch_size=1 \

From ec9bbc0094f25f69c8bfa8f9653537c40bf91e36 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Tue, 6 Jun 2023 11:32:31 +0800
Subject: [PATCH 06/52] [devops] improving testmon cache (#3902)

* [devops] improving testmon cache

* [devops] fix branch name with slash

* [devops] fix branch name with slash

* [devops] fix edit action

* [devops] fix edit action

* [devops] fix edit action

* [devops] fix edit action

* [devops] fix edit action

* [devops] fix edit action

* [devops] update readme
---
 .github/workflows/README.md       |  10 ++-
 .github/workflows/build_on_pr.yml | 118 ++++++++++++++++++++++++++++--
 2 files changed, 122 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index f40f4cc86d1b..3fad7e36f14c 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -43,10 +43,18 @@ I will provide the details of each workflow below.
 
 | Workflow Name          | File name                  | Description                                                                                                                                       |
 | ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Build on PR`          | `build_on_pr.yml`          | This workflow is triggered when a PR changes essential files. It will run all the unit tests in the repository with 4 GPUs. |
+| `Build on PR`          | `build_on_pr.yml`          | This workflow is triggered when a PR changes essential files or when a branch is created/deleted. It will run all the unit tests in the repository with 4 GPUs. |
 | `Build on Schedule`    | `build_on_schedule.yml`    | This workflow will run the unit tests everyday with 8 GPUs. The result is sent to Lark.                                                           |
 | `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results when `Build` is done.                                                           |
 
+To reduce the average time of the unit test on PR, `Build on PR` workflow manages testmon cache.
+
+1. When creating a new branch, it copies `cache/main/.testmondata*` to `cache/<branch>/`.
+2. When creating a new PR or changing the base branch of a PR, it copies `cache/<base_ref>/.testmondata*` to `cache/_pull/<pr_number>/`.
+3. When running unit tests for each PR, it restores testmon cache from `cache/_pull/<pr_number>/`. After the test, it stores the cache back to `cache/_pull/<pr_number>/`.
+4. When a PR is closed, if it's merged, it copies `cache/_pull/<pr_number>/.testmondata*` to `cache/<base_ref>/`. Otherwise, it just removes `cache/_pull/<pr_number>`.
+5. When a branch is deleted, it removes `cache/<ref>`.
+
 ### Example Test
 
 | Workflow Name              | File name                       | Description                                                                    |
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index a5a17d176c9d..b5f293107310 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -2,7 +2,7 @@ name: Build on PR
 
 on:
   pull_request:
-    types: [synchronize, opened, reopened]
+    types: [synchronize, opened, reopened, ready_for_review, closed, edited]
     branches:
       - "main"
       - "develop"
@@ -18,11 +18,63 @@ on:
       - "!tests/**.md" # ignore doc change
       - "pytest.ini" # test config change
       - "setup.py" # install command change
+  create:
+  delete:
 
 jobs:
+  prepare_cache:
+    name: Prepare testmon cache
+    if: |
+      github.event_name == 'create' &&
+      github.event.ref_type == 'branch' &&
+      github.event.repository.full_name == 'hpcaitech/ColossalAI'
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --rm
+    timeout-minutes: 5
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Copy testmon cache
+        run: | # branch name may contain slash, we need to replace it with space
+          export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
+          if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ] && [ ! -z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ]; then # skip (exit 0) when the main cache is missing or empty
+            cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
+          fi
+        env:
+          MAIN_BRANCH: ${{ github.event.master_branch }}
+
+  prepare_cache_for_pr:
+    name: Prepare testmon cache for PR
+    if: |
+      github.event_name == 'pull_request' &&
+      (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
+      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --rm
+    timeout-minutes: 5
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Copy testmon cache
+        run: | # branch name may contain slash, we need to replace it with space
+          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
+          if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then # skip (exit 0) when the base cache is missing or empty
+            mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
+          fi
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }} # cache is keyed by PR number, matching the restore/store/remove steps
+
   detect:
     name: Detect file change
     if: |
+      github.event_name == 'pull_request' &&
+      (github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'ready_for_review') &&
       github.event.pull_request.draft == false &&
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     outputs:
@@ -135,9 +187,11 @@ jobs:
 
       - name: Restore Testmon Cache
         run: |
-          if [ -d /github/home/testmon_cache ]; then
-            [ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata* /__w/ColossalAI/ColossalAI/
+          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
+            [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
           fi
+        env:
+          PR_NUMBER: ${{ github.event.number }}
 
       - name: Execute Unit Testing
         run: |
@@ -149,8 +203,10 @@ jobs:
 
       - name: Store Testmon Cache
         run: |
-          [ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache
-          cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/
+          mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
+          cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
+        env:
+          PR_NUMBER: ${{ github.event.number }}
 
       - name: Collate artifact
         env:
@@ -188,3 +244,55 @@ jobs:
         with:
           name: report
           path: report/
+
+  store_cache:
+    name: Store testmon cache for PR
+    if: |
+      github.event_name == 'pull_request' &&
+      github.event.action == 'closed' &&
+      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --rm
+    timeout-minutes: 5
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Store testmon cache if possible
+        if: github.event.pull_request.merged == true
+        run: | # branch name may contain slash, we need to replace it with space
+          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
+          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then # skip (exit 0) when the PR cache is missing or empty
+            mkdir -p "/github/home/testmon_cache/${BASE}" && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
+          fi
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Remove testmon cache
+        if: github.event.pull_request.merged != true
+        run: |
+          rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+  remove_cache:
+    name: Remove testmon cache
+    if: |
+      github.event_name == 'delete' &&
+      github.event.ref_type == 'branch' &&
+      github.event.repository.full_name == 'hpcaitech/ColossalAI'
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --rm
+    timeout-minutes: 5
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Remove testmon cache
+        run: | # branch name may contain slash, we need to replace it with space
+          export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
+          rm -rf "/github/home/testmon_cache/${BASE}"

From c1535ccbba2688682708b0203cce97b01d7750ef Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <56809903+Fridge003@users.noreply.github.com>
Date: Tue, 6 Jun 2023 13:36:11 +0800
Subject: [PATCH 07/52] [doc] fix docs about booster api usage (#3898)

---
 colossalai/booster/booster.py                   | 4 ++--
 docs/source/en/features/zero_with_chunk.md      | 4 ++--
 docs/source/zh-Hans/features/zero_with_chunk.md | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 61d912157449..4a42e204982f 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -25,11 +25,11 @@ class Booster:
     Examples:
         ```python
         colossalai.launch(...)
-        plugin = GeminiPlugin(stage=3, ...)
+        plugin = GeminiPlugin(...)
         booster = Booster(precision='fp16', plugin=plugin)
 
         model = GPT2()
-        optimizer = Adam(model.parameters())
+        optimizer = HybridAdam(model.parameters())
         dataloader = Dataloader(Dataset)
         lr_scheduler = LinearWarmupScheduler()
         criterion = GPTLMLoss()
diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md
index d6f6f611a64c..1b27d64b6897 100644
--- a/docs/source/en/features/zero_with_chunk.md
+++ b/docs/source/en/features/zero_with_chunk.md
@@ -195,7 +195,7 @@ def get_data(batch_size, seq_len, vocab_size):
 Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss:
 
 ```python
-from torch.optim import Adam
+from colossalai.nn.optimizer import HybridAdam
 
 from colossalai.booster import Booster
 from colossalai.zero import ColoInitContext
@@ -211,7 +211,7 @@ def main():
 
     # build criterion
     criterion = GPTLMLoss()
-    optimizer = Adam(model.parameters(), lr=0.001)
+    optimizer = HybridAdam(model.parameters(), lr=0.001)
 
     torch.manual_seed(123)
     default_pg = ProcessGroup(tp_degree=args.tp_degree)
diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md
index 9030464ddf9a..9fe5601bbd1b 100644
--- a/docs/source/zh-Hans/features/zero_with_chunk.md
+++ b/docs/source/zh-Hans/features/zero_with_chunk.md
@@ -197,7 +197,7 @@ def get_data(batch_size, seq_len, vocab_size):
 最后，使用booster注入 Gemini + ZeRO DDP 特性, 并定义训练循环。由于我们在这个例子中对GPT进行预训练，因此只使用了一个简单的语言模型损失函数：
 
 ```python
-from torch.optim import Adam
+from colossalai.nn.optimizer import HybridAdam
 
 from colossalai.booster import Booster
 from colossalai.zero import ColoInitContext
@@ -213,7 +213,7 @@ def main():
 
     # build criterion
     criterion = GPTLMLoss()
-    optimizer = Adam(model.parameters(), lr=0.001)
+    optimizer = HybridAdam(model.parameters(), lr=0.001)
 
     torch.manual_seed(123)
     default_pg = ProcessGroup(tp_degree=args.tp_degree)

From 0e484e620134e3c284216c5c493e8813318cfbdb Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Tue, 6 Jun 2023 14:07:36 +0800
Subject: [PATCH 08/52] [nfc]fix typo colossalai/pipeline tensor nn (#3899)

* fix typo colossalai/autochunk auto_parallel amp

* fix typo colossalai/auto_parallel nn utils etc.

* fix typo colossalai/auto_parallel autochunk fx/passes  etc.

* fix typo docs/

* change placememt_policy to placement_policy in docs/ and examples/

* fix typo colossalai/ applications/

* fix typo colossalai/cli fx kernel

* fix typo colossalai/nn

* revert change warmuped

* fix typo colossalai/pipeline tensor nn
---
 colossalai/nn/optimizer/cpu_adam.py           |  2 +-
 colossalai/nn/optimizer/hybrid_adam.py        |  2 +-
 colossalai/pipeline/pipelinable.py            |  8 ++++----
 colossalai/pipeline/rpc/_pipeline_base.py     | 10 +++++-----
 colossalai/pipeline/rpc/_pipeline_schedule.py |  6 +++---
 colossalai/pipeline/utils.py                  |  2 +-
 colossalai/tensor/d_tensor/comm_spec.py       |  2 +-
 colossalai/tensor/d_tensor/sharding_spec.py   |  4 ++--
 colossalai/tensor/param_op_hook.py            |  2 +-
 colossalai/tensor/process_group.py            |  2 +-
 colossalai/tensor/shape_consistency.py        |  6 +++---
 colossalai/tensor/sharding_spec.py            |  6 +++---
 colossalai/tensor/utils.py                    |  2 +-
 13 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 1ec8783c53d3..3a6d37103398 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -13,7 +13,7 @@
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.
 
-    Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
       * Parameters on CPU and gradients on CPU is allowed.
       * Parameters on GPU and gradients on GPU is allowed.
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 526071b06f95..84903ac36832 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -14,7 +14,7 @@
 class HybridAdam(CPUAdam):
     """Implements Adam algorithm.
 
-    Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
       * Parameters on CPU and gradients on CPU is allowed.
       * Parameters on GPU and gradients on GPU is allowed.
diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/pipeline/pipelinable.py
index 9731530a6e15..79913987b7cc 100644
--- a/colossalai/pipeline/pipelinable.py
+++ b/colossalai/pipeline/pipelinable.py
@@ -83,7 +83,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
         for k, v in kwargs.items():
             if isinstance(v, torch.nn.Module):
                 v = self._layer_spec_dict[id(v)]
-            # (lyl)TODO: analyse ColoTensor as well
+            # (lyl)TODO: analyze ColoTensor as well
             modified_kwargs[k] = v
 
         # keep track of the module children
@@ -117,7 +117,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
     def to_layer_list(self, exec_seq=None):
         """
         Create a layer spec list and func list with execution sequence given by user.
-        If exec_seq is None, we will take the module initizing order as execution order.
+        If exec_seq is None, we will take the module initializing order as execution order.
         """
 
         self._exec_seq = exec_seq
@@ -177,7 +177,7 @@ def to_layer_list(self, exec_seq=None):
 
     def partition(self, num_chunks, pipeline_size, rank):
         """
-        Partitioned model will be built respect to partion policy.
+        Partitioned model will be built respect to partition policy.
         The real module instance will be built in this method.
         """
         if isinstance(self._policy, str):
@@ -193,7 +193,7 @@ def partition(self, num_chunks, pipeline_size, rank):
                 self.customized_parts = customized_partition(self._exec_seq)
                 assert len(self.customized_parts) == gpc.get_world_size(
                     ParallelMode.PIPELINE
-                ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partions is {len(self.customized_parts)}'
+                ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}'
                 parts = self.customized_parts[rank]
             else:
                 raise ValueError("A string partition policy should be one of ['uniform', 'balanced', 'customized'].")
diff --git a/colossalai/pipeline/rpc/_pipeline_base.py b/colossalai/pipeline/rpc/_pipeline_base.py
index 2d7e25c82e7b..9e549df58214 100644
--- a/colossalai/pipeline/rpc/_pipeline_base.py
+++ b/colossalai/pipeline/rpc/_pipeline_base.py
@@ -123,7 +123,7 @@ def __init__(self,
         self.device = device
         self._initialize_outstanding_range()
 
-        # variable and const for context managment
+        # variable and const for context management
         self.outstanding = 0
         self.forward_times = 0
         self.backward_times = 0
@@ -226,7 +226,7 @@ def sync_global_worker_rrefs(self, pp_rank_to_worker_rref: Dict[int, PyRRef]) ->
         self.pp_rank_to_worker_rref = pp_rank_to_worker_rref
 
         # for some schedule need the other worker's info to initialise partition (like Chimera)
-        # construction of partition is executed after the registion of pp_rank_to_worker_rref
+        # construction of partition is executed after the registration of pp_rank_to_worker_rref
         self._initialize_partition()
 
     # res_use works for lifecycle counter,
@@ -418,7 +418,7 @@ def subscribe_producer(self, microbatch_id: int, forward_only: bool):
                 # On current PP middleware design for DAG, get_output_by_key used by _subscribe_producer
                 # can only be executed once for every producer-consumer stage pair, which is necessary
                 # to count the lifecycle of work_item. So, keeping the _subscribe_producer in the same
-                # lock of work_item queue operation gurantees the consistency of lifecycle counter.
+                # lock of work_item queue operation guarantees the consistency of lifecycle counter.
                 work_item_from_producer = self._subscribe_producer(microbatch_id, forward_only)
                 self.work_list[key] = work_item_from_producer
                 self.work_list_condition_lock.notify_all()
@@ -460,7 +460,7 @@ def subscribe_consumer(self, microbatch_id: int):
                 # On current PP middleware design for DAG, get_output_by_key used by subscribe_consumer
                 # can only be executed once for every producer-consumer stage pair, which is necessary
                 # to count the lifecycle of work_item. So, keeping the subscribe_consumer in the same
-                # lock of work_item queue operation gurantees the consistency of lifecycle counter.
+                # lock of work_item queue operation guarantees the consistency of lifecycle counter.
                 work_item_from_consumer = self._subscribe_consumer(microbatch_id)
                 self.work_list[key] = work_item_from_consumer
                 self.work_list_condition_lock.notify_all()
@@ -508,7 +508,7 @@ def _get_producer_consumer(self) -> None:
         assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
         assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"
 
-        # should be aranged in order, the order of the input of current forward
+        # should be arranged in order, the order of the input of current forward
         self.producer_stage_ids = self.get_producer_stage_ids()
         self.consumer_stage_ids = self.get_consumer_stage_ids()
 
diff --git a/colossalai/pipeline/rpc/_pipeline_schedule.py b/colossalai/pipeline/rpc/_pipeline_schedule.py
index 0d572231d378..6eda8f3b34b7 100644
--- a/colossalai/pipeline/rpc/_pipeline_schedule.py
+++ b/colossalai/pipeline/rpc/_pipeline_schedule.py
@@ -123,7 +123,7 @@ def _get_producer_consumer(self) -> None:
         assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
         assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"
 
-        # should be aranged in order, the order of the input of current forward
+        # should be arranged in order, the order of the input of current forward
         self.producer_stage_ids = []
         self.consumer_stage_ids = []
 
@@ -174,7 +174,7 @@ def _initialize_partition(self):
         else:
             # if it is down pipeline, create partition by origin method
             co_up_pp_worker_rref = self.pp_rank_to_worker_rref[pp_rank - stage_num]
-            # get the coresponding model state dict and wait for its init
+            # get the corresponding model state dict and wait for its init
             state_dict = co_up_pp_worker_rref.rpc_sync().get_partition_state_dict()
             super()._initialize_partition()
             self.module_partition.load_state_dict(state_dict)
@@ -228,7 +228,7 @@ def _hook_before_step(self):
         stage_num = self.actual_stage_num
         co_pp_rank = (pp_rank + stage_num) % (2 * stage_num)
 
-        # if currrent pp_rank is not the first to do step
+        # if current pp_rank is not the first to do step
         # wait its previous pp_rank finish step
         grads = self.get_parameter_gradients()
 
diff --git a/colossalai/pipeline/utils.py b/colossalai/pipeline/utils.py
index df7226644a7a..ac8a3ad7d1db 100644
--- a/colossalai/pipeline/utils.py
+++ b/colossalai/pipeline/utils.py
@@ -113,7 +113,7 @@ def _binary_search(weights, num):
 
 def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
     assert num_items % num_chunks == 0, \
-        "Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
+        "Layer length should be divided by the number of chunks, otherwise parameter method is recommended"
 
     logger = get_dist_logger()
     parts = [[] for _ in range(pipeline_parallel_size)]
diff --git a/colossalai/tensor/d_tensor/comm_spec.py b/colossalai/tensor/d_tensor/comm_spec.py
index 765d8ec1b01a..159125fa16db 100644
--- a/colossalai/tensor/d_tensor/comm_spec.py
+++ b/colossalai/tensor/d_tensor/comm_spec.py
@@ -28,7 +28,7 @@ class CommSpec:
     to determine the buffer shape, and logical_process_axis
 
     Argument:
-        comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec.
+        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
         process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
         gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
         shard_dim(int, Optional): The shard_dim of the tensor will be sharded.
diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py
index 2ea0c4db89fd..565012b58a03 100644
--- a/colossalai/tensor/d_tensor/sharding_spec.py
+++ b/colossalai/tensor/d_tensor/sharding_spec.py
@@ -41,7 +41,7 @@ def __repr__(self):
 
     def _convert_str_to_shard_list(self, str_spec):
         '''
-        Conver str_spec into shard_list.
+        Convert str_spec into shard_list.
 
         Argument:
             str_spec(str): dim spec in str type.
@@ -58,7 +58,7 @@ def _convert_str_to_shard_list(self, str_spec):
 
     def build_difference_2d_dict(self):
         '''
-        Build a difference maping for 2D device mesh case. It will be used to
+        Build a difference mapping for 2D device mesh case. It will be used to
         compute the difference between DimSpec pairs.
         '''
 
diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py
index 9c2e0d4adbf1..8ed8176d996a 100644
--- a/colossalai/tensor/param_op_hook.py
+++ b/colossalai/tensor/param_op_hook.py
@@ -164,7 +164,7 @@ def _get_grad_args(*args):
     for obj in args:
         if _is_grad_tensor(obj):
             return args, None
-    # otherwise, the first arguement should be a tuple of grad tensors
+    # otherwise, the first argument should be a tuple of grad tensors
     # if there is no grad tensor, the backward of PreFwdPostBwd can't be triggered
     arg_zero = args[0]
     if not isinstance(arg_zero, tuple):
diff --git a/colossalai/tensor/process_group.py b/colossalai/tensor/process_group.py
index f108bdc247f5..8d2e9a616d76 100644
--- a/colossalai/tensor/process_group.py
+++ b/colossalai/tensor/process_group.py
@@ -130,7 +130,7 @@ def set_cpu_groups(self):
     @property
     def has_cpu_groups(self) -> bool:
         """has_cpu_groups
-        If cpu groups have been initailized.
+        If cpu groups have been initialized.
 
         Returns:
             bool: cpu process groups have been initialized or not.
diff --git a/colossalai/tensor/shape_consistency.py b/colossalai/tensor/shape_consistency.py
index 0a840006f086..5bec552d69d5 100644
--- a/colossalai/tensor/shape_consistency.py
+++ b/colossalai/tensor/shape_consistency.py
@@ -252,7 +252,7 @@ def get_all_all_to_all_spec(self, source_spec: ShardingSpec,
     def get_all_shard_spec(self, source_spec: ShardingSpec, orig_cost_dict):
         '''
         Get all valid sharding specs from source_spec with single shard operation, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
         For the sharding operation, we just care about legal sharding dimensions.
 
         Argument:
@@ -386,7 +386,7 @@ def get_all_mix_gather_spec(self, source_spec: ShardingSpec,
     def get_all_one_step_transform_spec(self, source_spec: ShardingSpec, orig_cost_dict) -> Dict[ShardingSpec, float]:
         '''
         Get all valid sharding specs from source_spec with one step transform, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
         Note:
             all-gather will eliminate a sharding dimension, all-to-all will keep sharding dimension same as before,
             and shard will add a sharding dimension. Therefore, the result of above operations are mutual exclusive,
@@ -577,7 +577,7 @@ def shape_consistency(self, source_spec: ShardingSpec,
         Step3:
             Repeat above steps until the source spec transform to target spec.
 
-        During finding the transform path, commucation cost will be accumulated, and it
+        During finding the transform path, communication cost will be accumulated, and it
         will be finally used in auto parallel solver.
 
         Additionally, to avoid repeating the path search in runtime, we cached all solved path
diff --git a/colossalai/tensor/sharding_spec.py b/colossalai/tensor/sharding_spec.py
index bed320130ccd..406ad49097b5 100644
--- a/colossalai/tensor/sharding_spec.py
+++ b/colossalai/tensor/sharding_spec.py
@@ -45,7 +45,7 @@ def __repr__(self):
 
     def _convert_str_to_shard_list(self, str_spec):
         '''
-        Conver str_spec into shard_list.
+        Convert str_spec into shard_list.
 
         Argument:
             str_spec(str): dim spec in str type.
@@ -62,7 +62,7 @@ def _convert_str_to_shard_list(self, str_spec):
 
     def build_difference_2d_dict(self):
         '''
-        Build a difference maping for 2D device mesh case. It will be used to
+        Build a difference mapping for 2D device mesh case. It will be used to
         compute the difference between DimSpec pairs.
         '''
 
@@ -166,7 +166,7 @@ class ShardingSpec:
         device_mesh(DeviceMesh): A logical view of a physical mesh.
         entire_shape(torch.Size): The entire shape of tensor before sharded.
         dim_partition_dict(Dict[int, List[int]]， optional): The key is the dimension of tensor to be sharded,
-            and the value of the key decribe which logical axis will be sharded in that dimension.
+            and the value of the key describe which logical axis will be sharded in that dimension.
         sharding_sequence(List[_DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
     '''
 
diff --git a/colossalai/tensor/utils.py b/colossalai/tensor/utils.py
index 6e30f97fef03..e7d51d099e02 100644
--- a/colossalai/tensor/utils.py
+++ b/colossalai/tensor/utils.py
@@ -77,7 +77,7 @@ def shard_simulator(target_pair, legal_sharding_dims):
 
     Argument:
         target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-        and the second element decribes which logical axis will be sharded in that dimension.
+        and the second element describes which logical axis will be sharded in that dimension.
     '''
     _, shard_list = target_pair
     shard_list_list = []

From 176010f2898cd4353313fc909bf4d2f5a65860a1 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Tue, 6 Jun 2023 14:08:22 +0800
Subject: [PATCH 09/52] update performance evaluation

---
 .../tutorial/new_api/dreambooth/README.md     | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/examples/tutorial/new_api/dreambooth/README.md b/examples/tutorial/new_api/dreambooth/README.md
index bd7e7707ac78..8e1fdbbc8c1f 100644
--- a/examples/tutorial/new_api/dreambooth/README.md
+++ b/examples/tutorial/new_api/dreambooth/README.md
@@ -40,6 +40,9 @@ We have modified our previous implementation of Dreambooth with our new Booster
 We have also offer a shell script `test_ci.sh` for you to go through all our plugins for the booster.
 For more information about the booster API you can refer to https://colossalai.org/docs/basics/booster_api/.
 
+
+
+
 ## Training
 
 We provide the script `colossalai.sh` to run the training task with colossalai. For instance, the script of training process for [stable-diffusion-v1-4] model can be modified into:
@@ -97,7 +100,22 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
   --placement="cuda"
 ```
 
-
+## Performance
+
+|    Strategy    | #GPU | Batch Size | GPU RAM(GB) | speedup |
+|:--------------:|:----:|:----------:|:-----------:|:-------:|
+|  Traditional   |  1   |     16     |     oom     |    \    |
+|  Traditional   |  1   |     8      |    61.81    |    1    |
+|   torch_ddp    |  4   |     16     |     oom     |    \    |
+|   torch_ddp    |  4   |     8      |    41.97    |  0.97   |
+|     gemini     |  4   |     16     |    53.29    |    \    |
+|     gemini     |  4   |     8      |    29.36    |  2.00   |
+| low_level_zero |  4   |     16     |    52.80    |    \    |
+| low_level_zero |  4   |     8      |    28.87    |  2.02   |
+
+The evaluation is performed on 4 Nvidia A100 GPUs with 80GB memory each, with GPU 0 & 1, 2 & 3 connected with NVLink.
+We finetuned the [stable-diffusion-v1-4](https://huggingface.co/stabilityai/stable-diffusion-v1-4) model with 512x512 resolution on the [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset and compared 
+the memory cost and the throughput for the plugins.
 
 ## Invitation to open-source contribution
 Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!

From b56c7f428379843a29f690d237b9796747ecf339 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Tue, 6 Jun 2023 14:09:27 +0800
Subject: [PATCH 10/52] update shell file

---
 examples/tutorial/new_api/dreambooth/colossalai.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh
index 2745c563aa73..77dfb1cbd05a 100755
--- a/examples/tutorial/new_api/dreambooth/colossalai.sh
+++ b/examples/tutorial/new_api/dreambooth/colossalai.sh
@@ -6,7 +6,7 @@ torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
   --pretrained_model_name_or_path="Path_to_your_model"  \
   --instance_data_dir="Path_to_your_training_image" \
   --output_dir="Path_to_your_save_dir" \
-  --instance_prompt="keli" \
+  --instance_prompt="your prompt" \
   --resolution=512 \
   --plugin="gemini" \
   --train_batch_size=1 \

From 1c1f71cbd2718feee7e6dbb472053664e26f1c8e Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Tue, 6 Jun 2023 14:51:11 +0800
Subject: [PATCH 11/52] fixing insecure hash function

---
 .../tutorial/new_api/dreambooth/train_dreambooth_colossalai.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
index 9da7cacb8aaf..5436e7d6b739 100644
--- a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
@@ -397,7 +397,7 @@ def main(args):
                 images = pipeline(example["prompt"]).images
 
                 for i, image in enumerate(images):
-                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    hash_image = hashlib.sha256(image.tobytes()).hexdigest()
                     image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                     image.save(image_filename)
 

From b29e1f07224298aea35aab7ee83284beac28e0d8 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Tue, 6 Jun 2023 15:50:03 +0800
Subject: [PATCH 12/52] change directory

---
 examples/images/dreambooth/README.md          |  23 +
 examples/images/dreambooth/colossalai.sh      |  21 +-
 examples/images/dreambooth/test_ci.sh         |  23 +
 .../dreambooth/train_dreambooth_colossalai.py |  93 ++-
 .../train_dreambooth_colossalai_lora.py       | 126 ++--
 .../tutorial/new_api/dreambooth/README.md     | 131 ----
 .../tutorial/new_api/dreambooth/colossalai.sh |  17 -
 .../new_api/dreambooth/requirements.txt       |   7 -
 .../tutorial/new_api/dreambooth/test_ci.sh    |  23 -
 .../dreambooth/train_dreambooth_colossalai.py | 690 ------------------
 10 files changed, 183 insertions(+), 971 deletions(-)
 delete mode 100644 examples/tutorial/new_api/dreambooth/README.md
 delete mode 100755 examples/tutorial/new_api/dreambooth/colossalai.sh
 delete mode 100644 examples/tutorial/new_api/dreambooth/requirements.txt
 delete mode 100644 examples/tutorial/new_api/dreambooth/test_ci.sh
 delete mode 100644 examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py

diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md
index 7c117d841e24..bfd865a6dfa9 100644
--- a/examples/images/dreambooth/README.md
+++ b/examples/images/dreambooth/README.md
@@ -92,6 +92,29 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
   --placement="cuda"
 ```
 
+## New API
+We have modified our previous implementation of Dreambooth with our new Booster API, which offers a more flexible and efficient way to train your model. The new API is more user-friendly and easy to use. You can find the new API in `train_dreambooth_colossalai.py`. 
+We also offer a shell script `test_ci.sh` that goes through all of our booster plugins.
+For more information about the booster API you can refer to https://colossalai.org/docs/basics/booster_api/.
+
+## Performance
+
+|    Strategy    | #GPU | Batch Size | GPU RAM(GB) | speedup |
+|:--------------:|:----:|:----------:|:-----------:|:-------:|
+|  Traditional   |  1   |     16     |     oom     |    \    |
+|  Traditional   |  1   |     8      |    61.81    |    1    |
+|   torch_ddp    |  4   |     16     |     oom     |    \    |
+|   torch_ddp    |  4   |     8      |    41.97    |  0.97   |
+|     gemini     |  4   |     16     |    53.29    |    \    |
+|     gemini     |  4   |     8      |    29.36    |  2.00   |
+| low_level_zero |  4   |     16     |    52.80    |    \    |
+| low_level_zero |  4   |     8      |    28.87    |  2.02   |
+
+The evaluation is performed on 4 Nvidia A100 GPUs with 80GB memory each, with GPU 0 & 1, 2 & 3 connected with NVLink.
+We fine-tuned the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model with 512x512 resolution on the [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset and compared
+the memory cost and the throughput for the plugins.
+
+
 ## Inference
 
 Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. `--instance_prompt="a photo of sks dog" ` in the above example) in your prompt.
diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh
index 227d8b8bdb04..cfb00412aced 100755
--- a/examples/images/dreambooth/colossalai.sh
+++ b/examples/images/dreambooth/colossalai.sh
@@ -1,20 +1,15 @@
-export MODEL_NAME= <Your Pretrained Model Path> 
-export INSTANCE_DIR= <Your Input Pics Path>
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-HF_DATASETS_OFFLINE=1 
-TRANSFORMERS_OFFLINE=1 
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
-torchrun --nproc_per_node 2 --master_port=25641 train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of a dog" \
+torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path="Path_to_your_model"  \
+  --instance_data_dir="Path_to_your_training_image" \
+  --output_dir="Path_to_your_save_dir" \
+  --instance_prompt="your prompt" \
   --resolution=512 \
+  --plugin="gemini" \
   --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
   --learning_rate=5e-6 \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index e69de29bb2d1..68862c46cfe9 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -xe
+pip install -r requirements.txt
+
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
+DIFFUSERS_OFFLINE=1
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
+  torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path="Your Pretrained Model Path"  \
+  --instance_data_dir="Your Input Pics Path" \
+  --output_dir="path-to-save-model" \
+  --instance_prompt="your prompt" \
+  --resolution=512 \
+  --plugin=$plugin \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --placement="cuda"
+done
diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index d07febea0a84..5436e7d6b739 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -4,6 +4,7 @@
 import os
 from pathlib import Path
 from typing import Optional
+import shutil
 
 import torch
 import torch.nn.functional as F
@@ -21,9 +22,12 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext, GeminiAdamOptimizer
+from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini import get_static_torch_model
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 
 disable_existing_loggers()
 logger = get_dist_logger()
@@ -58,6 +62,13 @@ def parse_args(input_args=None):
         required=True,
         help="Path to pretrained model or model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--externel_unet_path",
+        type=str,
+        default=None,
+        required=False,
+        help="Path to the externel unet model.",
+    )
     parser.add_argument(
         "--revision",
         type=str,
@@ -193,6 +204,12 @@ def parse_args(input_args=None):
         default=None,
         help="The name of the repository to keep in sync with the local `output_dir`.",
     )
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
+                        help="plugin to use")
     parser.add_argument(
         "--logging_dir",
         type=str,
@@ -339,18 +356,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
         return f"{organization}/{model_id}"
 
 
-# Gemini + ZeRO DDP
-def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"):
-    from colossalai.nn.parallel import GeminiDDP
-
-    model = GeminiDDP(model,
-                      device=get_current_device(),
-                      placement_policy=placement_policy,
-                      pin_memory=True,
-                      search_range_mb=64)
-    return model
-
-
 def main(args):
     if args.seed is None:
         colossalai.launch_from_torch(config={})
@@ -392,7 +397,7 @@ def main(args):
                 images = pipeline(example["prompt"]).images
 
                 for i, image in enumerate(images):
-                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    hash_image = hashlib.sha256(image.tobytes()).hexdigest()
                     image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                     image.save(image_filename)
 
@@ -452,12 +457,18 @@ def main(args):
         revision=args.revision,
     )
 
-    logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
-    with ColoInitContext(device=get_current_device()):
+
+    if args.externel_unet_path is None:
+        logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
         unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path,
-                                                    subfolder="unet",
-                                                    revision=args.revision,
-                                                    low_cpu_mem_usage=False)
+                                                subfolder="unet",
+                                                revision=args.revision,
+                                                low_cpu_mem_usage=False)
+    else:
+        logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0])
+        unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path,
+                                                revision=args.revision,
+                                                low_cpu_mem_usage=False)
 
     vae.requires_grad_(False)
     text_encoder.requires_grad_(False)
@@ -468,10 +479,22 @@ def main(args):
     if args.scale_lr:
         args.learning_rate = args.learning_rate * args.train_batch_size * world_size
 
-    unet = gemini_zero_dpp(unet, args.placement)
+    # Use Booster API to use Gemini/Zero with ColossalAI
+
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
 
     # config optimizer for colossalai zero
-    optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
+    optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
 
     # load noise_scheduler
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
@@ -554,6 +577,8 @@ def collate_fn(examples):
     # Afterwards we recalculate our number of training epochs
     args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
+    unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler)
+
     # Train!
     total_batch_size = args.train_batch_size * world_size
 
@@ -642,36 +667,24 @@ def collate_fn(examples):
 
             if global_step % args.save_steps == 0:
                 torch.cuda.synchronize()
-                torch_unet = get_static_torch_model(unet)
                 if local_rank == 0:
-                    pipeline = DiffusionPipeline.from_pretrained(
-                        args.pretrained_model_name_or_path,
-                        unet=torch_unet,
-                        revision=args.revision,
-                    )
                     save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                    pipeline.save_pretrained(save_path)
+                    booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
+                    if not os.path.exists(os.path.join(save_path, "config.json")):
+                        shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path)
                     logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])
             if global_step >= args.max_train_steps:
                 break
-
     torch.cuda.synchronize()
-    unet = get_static_torch_model(unet)
 
+    booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin"))
+    logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}")
     if local_rank == 0:
-        pipeline = DiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path,
-            unet=unet,
-            revision=args.revision,
-        )
-
-        pipeline.save_pretrained(args.output_dir)
-        logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0])
-
+        if not os.path.exists(os.path.join(args.output_dir, "config.json")):
+            shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir)
         if args.push_to_hub:
             repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
 
-
 if __name__ == "__main__":
     args = parse_args()
     main(args)
diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
index 6715b473a567..64cdd2a31734 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
@@ -4,6 +4,7 @@
 import os
 from pathlib import Path
 from typing import Optional
+import shutil
 
 import torch
 import torch.nn.functional as F
@@ -23,9 +24,12 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
 from colossalai.zero import ColoInitContext, GeminiAdamOptimizer
 from colossalai.zero.gemini import get_static_torch_model
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 
 disable_existing_loggers()
 logger = get_dist_logger()
@@ -60,6 +64,13 @@ def parse_args(input_args=None):
         required=True,
         help="Path to pretrained model or model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--externel_unet_path",
+        type=str,
+        default=None,
+        required=False,
+        help="Path to the externel unet model.",
+    )
     parser.add_argument(
         "--revision",
         type=str,
@@ -195,6 +206,12 @@ def parse_args(input_args=None):
         default=None,
         help="The name of the repository to keep in sync with the local `output_dir`.",
     )
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
+                        help="plugin to use")
     parser.add_argument(
         "--logging_dir",
         type=str,
@@ -341,18 +358,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
         return f"{organization}/{model_id}"
 
 
-# Gemini + ZeRO DDP
-def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"):
-    from colossalai.nn.parallel import GeminiDDP
-
-    model = GeminiDDP(model,
-                      device=get_current_device(),
-                      placement_policy=placement_policy,
-                      pin_memory=True,
-                      search_range_mb=64)
-    return model
-
-
 def main(args):
     if args.seed is None:
         colossalai.launch_from_torch(config={})
@@ -394,7 +399,7 @@ def main(args):
                 images = pipeline(example["prompt"]).images
 
                 for i, image in enumerate(images):
-                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    hash_image = hashlib.sha256(image.tobytes()).hexdigest()
                     image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                     image.save(image_filename)
 
@@ -454,32 +459,42 @@ def main(args):
         revision=args.revision,
     )
 
-    logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
-    with ColoInitContext(device=get_current_device()):
+
+    # Load the UNet from --externel_unet_path when one is given; otherwise use
+    # the "unet" subfolder of the base pretrained pipeline.
+    if args.externel_unet_path is None:
+        logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
         unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path,
                                                     subfolder="unet",
                                                     revision=args.revision,
                                                     low_cpu_mem_usage=False)
-        unet.requires_grad_(False)
-
-        # Set correct lora layers
-        lora_attn_procs = {}
-        for name in unet.attn_processors.keys():
-            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
-            if name.startswith("mid_block"):
-                hidden_size = unet.config.block_out_channels[-1]
-            elif name.startswith("up_blocks"):
-                block_id = int(name[len("up_blocks.")])
-                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
-            elif name.startswith("down_blocks"):
-                block_id = int(name[len("down_blocks.")])
-                hidden_size = unet.config.block_out_channels[block_id]
-
-            lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size,
-                                                           cross_attention_dim=cross_attention_dim)
-
-        unet.set_attn_processor(lora_attn_procs)
-        lora_layers = AttnProcsLayers(unet.attn_processors)
+    else:
+        logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0])
+        unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path,
+                                                    revision=args.revision,
+                                                    low_cpu_mem_usage=False)
+    # Freeze the loaded UNet weights; the LoRA attention processors attached
+    # below are created afterwards and are not affected by this call.
+    unet.requires_grad_(False)
+
+    # Set correct lora layers
+    lora_attn_procs = {}
+    for name in unet.attn_processors.keys():
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = unet.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = unet.config.block_out_channels[block_id]
+
+        lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size,
+                                                       cross_attention_dim=cross_attention_dim)
+
+    unet.set_attn_processor(lora_attn_procs)
+    lora_layers = AttnProcsLayers(unet.attn_processors)
 
     vae.requires_grad_(False)
     text_encoder.requires_grad_(False)
@@ -490,10 +505,22 @@ def main(args):
     if args.scale_lr:
         args.learning_rate = args.learning_rate * args.train_batch_size * world_size
 
-    unet = gemini_zero_dpp(unet, args.placement)
+    # Use Booster API to use Gemini/Zero with ColossalAI
+
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
 
     # config optimizer for colossalai zero
-    optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
+    optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
 
     # load noise_scheduler
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
@@ -576,6 +603,8 @@ def collate_fn(examples):
     # Afterwards we recalculate our number of training epochs
     args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
+    unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler)
+
     # Train!
     total_batch_size = args.train_batch_size * world_size
 
@@ -664,27 +693,24 @@ def collate_fn(examples):
 
             if global_step % args.save_steps == 0:
                 torch.cuda.synchronize()
-                torch_unet = get_static_torch_model(unet)
                 if local_rank == 0:
                     save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                    torch_unet = torch_unet.to(torch.float32)
-                    torch_unet.save_attn_procs(save_path)
+                    booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
+                    if not os.path.exists(os.path.join(save_path, "config.json")):
+                        shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path)
                     logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])
             if global_step >= args.max_train_steps:
                 break
-
     torch.cuda.synchronize()
-    torch_unet = get_static_torch_model(unet)
 
+    booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin"))
+    logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}")
     if local_rank == 0:
-        torch_unet = torch_unet.to(torch.float32)
-        torch_unet.save_attn_procs(save_path)
-        logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0])
-
+        if not os.path.exists(os.path.join(args.output_dir, "config.json")):
+            shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir)
         if args.push_to_hub:
             repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
 
-
 if __name__ == "__main__":
     args = parse_args()
     main(args)
diff --git a/examples/tutorial/new_api/dreambooth/README.md b/examples/tutorial/new_api/dreambooth/README.md
deleted file mode 100644
index 8e1fdbbc8c1f..000000000000
--- a/examples/tutorial/new_api/dreambooth/README.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) by [colossalai](https://github.com/hpcaitech/ColossalAI.git)
-
-[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject.
-The `train_dreambooth_colossalai.py` script shows how to implement the training procedure and adapt it for stable diffusion.
-
-By accommodating model data in CPU and GPU and moving the data to the computing device when necessary, [Gemini](https://www.colossalai.org/docs/advanced_tutorials/meet_gemini), the Heterogeneous Memory Manager of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) can breakthrough the GPU memory wall by using GPU and CPU memory (composed of CPU DRAM or nvme SSD memory) together at the same time. Moreover, the model scale can be further improved by combining heterogeneous training with the other parallel approaches, such as data parallel, tensor parallel and pipeline parallel.
-
-## Installation
-
-To begin with, make sure your operating system has the cuda version suitable for this exciting training session, which is cuda11.6-11.8. Notice that you may want to make sure the module versions suitable for the whole environment. Before running the scripts, make sure to install the library's training dependencies:
-
-```bash
-pip install -r requirements.txt
-```
-
-### Install [colossalai](https://github.com/hpcaitech/ColossalAI.git)
-
-```bash
-pip install colossalai
-```
-
-**From source**
-
-```bash
-git clone https://github.com/hpcaitech/ColossalAI.git
-python setup.py install
-```
-
-## Dataset for Teyvat BLIP captions
-Dataset used to train [Teyvat characters text to image model](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion).
-
-BLIP generated captions for characters images from [genshin-impact fandom wiki](https://genshin-impact.fandom.com/wiki/Character#Playable_Characters)and [biligame wiki for genshin impact](https://wiki.biligame.com/ys/%E8%A7%92%E8%89%B2).
-
-For each row the dataset contains `image` and `text` keys. `image` is a varying size PIL png, and `text` is the accompanying text caption. Only a train split is provided.
-
-The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Model type`, and `Description`, the `Description` is captioned with the [pre-trained BLIP model](https://github.com/salesforce/BLIP).
-
-## New API
-We have modified our previous implementation of Dreambooth with our new Booster API, which offers a more flexible and efficient way to train your model. The new API is more user-friendly and easy to use. You can find the new API in `train_dreambooth_colossalai.py`. 
-We have also offer a shell script `test_ci.sh` for you to go through all our plugins for the booster.
-For more information about the booster API you can refer to https://colossalai.org/docs/basics/booster_api/.
-
-
-
-
-## Training
-
-We provide the script `colossalai.sh` to run the training task with colossalai. For instance, the script of training process for [stable-diffusion-v1-4] model can be modified into:
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export OUTPUT_DIR="path-to-save-model"
-
-torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=400 \
-  --placement="cuda"
-```
-- `MODEL_NAME` refers to the model you are training.
-- `INSTANCE_DIR` refers to personalized path to instance images, you might need to insert information here.
-- `OUTPUT_DIR` refers to local path to save the trained model, you might need to find a path with enough space.
-- `resolution` refers to the corresponding resolution number of your target model. Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.
-- `placement`  refers to the training strategy supported by Colossal AI, default = 'cuda', which refers to loading all the parameters into cuda memory. On the other hand, 'cpu' refers to 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI.
-
-### Training with prior-preservation loss
-
-Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
-
-According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time. The general script can be then modified as the following.
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=800 \
-  --placement="cuda"
-```
-
-## Performance
-
-|    Strategy    | #GPU | Batch Size | GPU RAM(GB) | speedup |
-|:--------------:|:----:|:----------:|:-----------:|:-------:|
-|  Traditional   |  1   |     16     |     oom     |    \    |
-|  Traditional   |  1   |     8      |    61.81    |    1    |
-|   torch_ddp    |  4   |     16     |     oom     |    \    |
-|   torch_ddp    |  4   |     8      |    41.97    |  0.97   |
-|     gemini     |  4   |     16     |    53.29    |    \    |
-|     gemini     |  4   |     8      |    29.36    |  2.00   |
-| low_level_zero |  4   |     16     |    52.80    |    \    |
-| low_level_zero |  4   |     8      |    28.87    |  2.02   |
-
-The evaluation is performed on 4 Nvidia A100 GPUs with 80GB memory each, with GPU 0 & 1, 2 & 3 connected with NVLink.
-We finetuned the [stable-diffusion-v1-4](https://huggingface.co/stabilityai/stable-diffusion-v1-4) model with 512x512 resolution on the [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset and compared 
-the memory cost and the throughput for the plugins.
-
-## Invitation to open-source contribution
-Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!
-
-You may contact us or participate in the following ways:
-1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
-2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub follow the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
-3. Join the Colossal-AI community on
-[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
-and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
-4. Send your official proposal to email contact@hpcaitech.com
-
-Thanks so much to all of our amazing contributors!
diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh
deleted file mode 100755
index 77dfb1cbd05a..000000000000
--- a/examples/tutorial/new_api/dreambooth/colossalai.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-HF_DATASETS_OFFLINE=1
-TRANSFORMERS_OFFLINE=1 
-DIFFUSERS_OFFLINE=1
-
-torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path="Path_to_your_model"  \
-  --instance_data_dir="Path_to_your_training_image" \
-  --output_dir="Path_to_your_save_dir" \
-  --instance_prompt="your prompt" \
-  --resolution=512 \
-  --plugin="gemini" \
-  --train_batch_size=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --placement="cuda" \
diff --git a/examples/tutorial/new_api/dreambooth/requirements.txt b/examples/tutorial/new_api/dreambooth/requirements.txt
deleted file mode 100644
index 1ec828c630ef..000000000000
--- a/examples/tutorial/new_api/dreambooth/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-diffusers>==0.5.0
-accelerate
-torchvision
-transformers>=4.21.0
-ftfy
-tensorboard
-modelcards
diff --git a/examples/tutorial/new_api/dreambooth/test_ci.sh b/examples/tutorial/new_api/dreambooth/test_ci.sh
deleted file mode 100644
index 68862c46cfe9..000000000000
--- a/examples/tutorial/new_api/dreambooth/test_ci.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-set -xe
-pip install -r requirements.txt
-
-HF_DATASETS_OFFLINE=1
-TRANSFORMERS_OFFLINE=1
-DIFFUSERS_OFFLINE=1
-
-for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
-  torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path="Your Pretrained Model Path"  \
-  --instance_data_dir="Your Input Pics Path" \
-  --output_dir="path-to-save-model" \
-  --instance_prompt="your prompt" \
-  --resolution=512 \
-  --plugin=$plugin \
-  --train_batch_size=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --placement="cuda"
-done
diff --git a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
deleted file mode 100644
index 5436e7d6b739..000000000000
--- a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
+++ /dev/null
@@ -1,690 +0,0 @@
-import argparse
-import hashlib
-import math
-import os
-from pathlib import Path
-from typing import Optional
-import shutil
-
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
-from diffusers.optimization import get_scheduler
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
-from PIL import Image
-from torch.utils.data import Dataset
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
-
-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
-from colossalai.zero.gemini import get_static_torch_model
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
-
-disable_existing_loggers()
-logger = get_dist_logger()
-
-
-def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str):
-    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=args.revision,
-    )
-    model_class = text_encoder_config.architectures[0]
-
-    if model_class == "CLIPTextModel":
-        from transformers import CLIPTextModel
-
-        return CLIPTextModel
-    elif model_class == "RobertaSeriesModelWithTransformation":
-        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
-
-        return RobertaSeriesModelWithTransformation
-    else:
-        raise ValueError(f"{model_class} is not supported.")
-
-
-def parse_args(input_args=None):
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
-    parser.add_argument(
-        "--pretrained_model_name_or_path",
-        type=str,
-        default=None,
-        required=True,
-        help="Path to pretrained model or model identifier from huggingface.co/models.",
-    )
-    parser.add_argument(
-        "--externel_unet_path",
-        type=str,
-        default=None,
-        required=False,
-        help="Path to the externel unet model.",
-    )
-    parser.add_argument(
-        "--revision",
-        type=str,
-        default=None,
-        required=False,
-        help="Revision of pretrained model identifier from huggingface.co/models.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default=None,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--instance_data_dir",
-        type=str,
-        default=None,
-        required=True,
-        help="A folder containing the training data of instance images.",
-    )
-    parser.add_argument(
-        "--class_data_dir",
-        type=str,
-        default=None,
-        required=False,
-        help="A folder containing the training data of class images.",
-    )
-    parser.add_argument(
-        "--instance_prompt",
-        type=str,
-        default="a photo of sks dog",
-        required=False,
-        help="The prompt with identifier specifying the instance",
-    )
-    parser.add_argument(
-        "--class_prompt",
-        type=str,
-        default=None,
-        help="The prompt to specify images in the same class as provided instance images.",
-    )
-    parser.add_argument(
-        "--with_prior_preservation",
-        default=False,
-        action="store_true",
-        help="Flag to add prior preservation loss.",
-    )
-    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
-    parser.add_argument(
-        "--num_class_images",
-        type=int,
-        default=100,
-        help=("Minimal class images for prior preservation loss. If there are not enough images already present in"
-              " class_data_dir, additional images will be sampled with class_prompt."),
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="text-inversion-model",
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
-    parser.add_argument(
-        "--resolution",
-        type=int,
-        default=512,
-        help=("The resolution for input images, all the images in the train/validation dataset will be resized to this"
-              " resolution"),
-    )
-    parser.add_argument(
-        "--placement",
-        type=str,
-        default="cpu",
-        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
-    )
-    parser.add_argument(
-        "--center_crop",
-        default=False,
-        action="store_true",
-        help=("Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-              " cropped. The images will be resized to the resolution first before cropping."),
-    )
-    parser.add_argument("--train_batch_size",
-                        type=int,
-                        default=4,
-                        help="Batch size (per device) for the training dataloader.")
-    parser.add_argument("--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images.")
-    parser.add_argument("--num_train_epochs", type=int, default=1)
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=None,
-        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
-    )
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--gradient_checkpointing",
-        action="store_true",
-        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-6,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument(
-        "--scale_lr",
-        action="store_true",
-        default=False,
-        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
-    )
-    parser.add_argument(
-        "--lr_scheduler",
-        type=str,
-        default="constant",
-        help=('The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
-              ' "constant", "constant_with_warmup"]'),
-    )
-    parser.add_argument("--lr_warmup_steps",
-                        type=int,
-                        default=500,
-                        help="Number of steps for the warmup in the lr scheduler.")
-    parser.add_argument("--use_8bit_adam",
-                        action="store_true",
-                        help="Whether or not to use 8-bit Adam from bitsandbytes.")
-
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
-    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
-    parser.add_argument(
-        "--hub_model_id",
-        type=str,
-        default=None,
-        help="The name of the repository to keep in sync with the local `output_dir`.",
-    )
-    parser.add_argument('-p',
-                        '--plugin',
-                        type=str,
-                        default='torch_ddp',
-                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
-                        help="plugin to use")
-    parser.add_argument(
-        "--logging_dir",
-        type=str,
-        default="logs",
-        help=("[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
-              " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."),
-    )
-    parser.add_argument(
-        "--mixed_precision",
-        type=str,
-        default=None,
-        choices=["no", "fp16", "bf16"],
-        help=(
-            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
-            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
-            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."),
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-
-    if input_args is not None:
-        args = parser.parse_args(input_args)
-    else:
-        args = parser.parse_args()
-
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
-    if args.with_prior_preservation:
-        if args.class_data_dir is None:
-            raise ValueError("You must specify a data directory for class images.")
-        if args.class_prompt is None:
-            raise ValueError("You must specify prompt for class images.")
-    else:
-        if args.class_data_dir is not None:
-            logger.warning("You need not use --class_data_dir without --with_prior_preservation.")
-        if args.class_prompt is not None:
-            logger.warning("You need not use --class_prompt without --with_prior_preservation.")
-
-    return args
-
-
-class DreamBoothDataset(Dataset):
-    """
-    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
-    It pre-processes the images and the tokenizes prompts.
-    """
-
-    def __init__(
-        self,
-        instance_data_root,
-        instance_prompt,
-        tokenizer,
-        class_data_root=None,
-        class_prompt=None,
-        size=512,
-        center_crop=False,
-    ):
-        self.size = size
-        self.center_crop = center_crop
-        self.tokenizer = tokenizer
-
-        self.instance_data_root = Path(instance_data_root)
-        if not self.instance_data_root.exists():
-            raise ValueError("Instance images root doesn't exists.")
-
-        self.instance_images_path = list(Path(instance_data_root).iterdir())
-        self.num_instance_images = len(self.instance_images_path)
-        self.instance_prompt = instance_prompt
-        self._length = self.num_instance_images
-
-        if class_data_root is not None:
-            self.class_data_root = Path(class_data_root)
-            self.class_data_root.mkdir(parents=True, exist_ok=True)
-            self.class_images_path = list(self.class_data_root.iterdir())
-            self.num_class_images = len(self.class_images_path)
-            self._length = max(self.num_class_images, self.num_instance_images)
-            self.class_prompt = class_prompt
-        else:
-            self.class_data_root = None
-
-        self.image_transforms = transforms.Compose([
-            transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
-            transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
-            transforms.ToTensor(),
-            transforms.Normalize([0.5], [0.5]),
-        ])
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, index):
-        example = {}
-        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
-        if not instance_image.mode == "RGB":
-            instance_image = instance_image.convert("RGB")
-        example["instance_images"] = self.image_transforms(instance_image)
-        example["instance_prompt_ids"] = self.tokenizer(
-            self.instance_prompt,
-            padding="do_not_pad",
-            truncation=True,
-            max_length=self.tokenizer.model_max_length,
-        ).input_ids
-
-        if self.class_data_root:
-            class_image = Image.open(self.class_images_path[index % self.num_class_images])
-            if not class_image.mode == "RGB":
-                class_image = class_image.convert("RGB")
-            example["class_images"] = self.image_transforms(class_image)
-            example["class_prompt_ids"] = self.tokenizer(
-                self.class_prompt,
-                padding="do_not_pad",
-                truncation=True,
-                max_length=self.tokenizer.model_max_length,
-            ).input_ids
-
-        return example
-
-
-class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
-
-    def __init__(self, prompt, num_samples):
-        self.prompt = prompt
-        self.num_samples = num_samples
-
-    def __len__(self):
-        return self.num_samples
-
-    def __getitem__(self, index):
-        example = {}
-        example["prompt"] = self.prompt
-        example["index"] = index
-        return example
-
-
-def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
-    if token is None:
-        token = HfFolder.get_token()
-    if organization is None:
-        username = whoami(token)["name"]
-        return f"{username}/{model_id}"
-    else:
-        return f"{organization}/{model_id}"
-
-
-def main(args):
-    if args.seed is None:
-        colossalai.launch_from_torch(config={})
-    else:
-        colossalai.launch_from_torch(config={}, seed=args.seed)
-
-    local_rank = gpc.get_local_rank(ParallelMode.DATA)
-    world_size = gpc.get_world_size(ParallelMode.DATA)
-
-    if args.with_prior_preservation:
-        class_images_dir = Path(args.class_data_dir)
-        if not class_images_dir.exists():
-            class_images_dir.mkdir(parents=True)
-        cur_class_images = len(list(class_images_dir.iterdir()))
-
-        if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
-            pipeline = DiffusionPipeline.from_pretrained(
-                args.pretrained_model_name_or_path,
-                torch_dtype=torch_dtype,
-                safety_checker=None,
-                revision=args.revision,
-            )
-            pipeline.set_progress_bar_config(disable=True)
-
-            num_new_images = args.num_class_images - cur_class_images
-            logger.info(f"Number of class images to sample: {num_new_images}.")
-
-            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
-            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
-
-            pipeline.to(get_current_device())
-
-            for example in tqdm(
-                    sample_dataloader,
-                    desc="Generating class images",
-                    disable=not local_rank == 0,
-            ):
-                images = pipeline(example["prompt"]).images
-
-                for i, image in enumerate(images):
-                    hash_image = hashlib.sha256(image.tobytes()).hexdigest()
-                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
-                    image.save(image_filename)
-
-            del pipeline
-
-    # Handle the repository creation
-    if local_rank == 0:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-    # Load the tokenizer
-    if args.tokenizer_name:
-        logger.info(f"Loading tokenizer from {args.tokenizer_name}", ranks=[0])
-        tokenizer = AutoTokenizer.from_pretrained(
-            args.tokenizer_name,
-            revision=args.revision,
-            use_fast=False,
-        )
-    elif args.pretrained_model_name_or_path:
-        logger.info("Loading tokenizer from pretrained model", ranks=[0])
-        tokenizer = AutoTokenizer.from_pretrained(
-            args.pretrained_model_name_or_path,
-            subfolder="tokenizer",
-            revision=args.revision,
-            use_fast=False,
-        )
-        # import correct text encoder class
-    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path)
-
-    # Load models and create wrapper for stable diffusion
-
-    logger.info(f"Loading text_encoder from {args.pretrained_model_name_or_path}", ranks=[0])
-
-    text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=args.revision,
-    )
-
-    logger.info(f"Loading AutoencoderKL from {args.pretrained_model_name_or_path}", ranks=[0])
-    vae = AutoencoderKL.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="vae",
-        revision=args.revision,
-    )
-
-
-    if args.externel_unet_path is None:
-        logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
-        unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path,
-                                                subfolder="unet",
-                                                revision=args.revision,
-                                                low_cpu_mem_usage=False)
-    else:
-        logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0])
-        unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path,
-                                                revision=args.revision,
-                                                low_cpu_mem_usage=False)
-
-    vae.requires_grad_(False)
-    text_encoder.requires_grad_(False)
-
-    if args.gradient_checkpointing:
-        unet.enable_gradient_checkpointing()
-
-    if args.scale_lr:
-        args.learning_rate = args.learning_rate * args.train_batch_size * world_size
-
-    # Use Booster API to use Gemini/Zero with ColossalAI
-
-    booster_kwargs = {}
-    if args.plugin == 'torch_ddp_fp16':
-        booster_kwargs['mixed_precision'] = 'fp16'
-    if args.plugin.startswith('torch_ddp'):
-        plugin = TorchDDPPlugin()
-    elif args.plugin == 'gemini':
-        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5)
-    elif args.plugin == 'low_level_zero':
-        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
-
-    booster = Booster(plugin=plugin, **booster_kwargs)
-
-    # config optimizer for colossalai zero
-    optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
-
-    # load noise_scheduler
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-
-    # prepare dataset
-    logger.info(f"Prepare dataset from {args.instance_data_dir}", ranks=[0])
-    train_dataset = DreamBoothDataset(
-        instance_data_root=args.instance_data_dir,
-        instance_prompt=args.instance_prompt,
-        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
-        class_prompt=args.class_prompt,
-        tokenizer=tokenizer,
-        size=args.resolution,
-        center_crop=args.center_crop,
-    )
-
-    def collate_fn(examples):
-        input_ids = [example["instance_prompt_ids"] for example in examples]
-        pixel_values = [example["instance_images"] for example in examples]
-
-        # Concat class and instance examples for prior preservation.
-        # We do this to avoid doing two forward passes.
-        if args.with_prior_preservation:
-            input_ids += [example["class_prompt_ids"] for example in examples]
-            pixel_values += [example["class_images"] for example in examples]
-
-        pixel_values = torch.stack(pixel_values)
-        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
-
-        input_ids = tokenizer.pad(
-            {
-                "input_ids": input_ids
-            },
-            padding="max_length",
-            max_length=tokenizer.model_max_length,
-            return_tensors="pt",
-        ).input_ids
-
-        batch = {
-            "input_ids": input_ids,
-            "pixel_values": pixel_values,
-        }
-        return batch
-
-    train_dataloader = torch.utils.data.DataLoader(train_dataset,
-                                                   batch_size=args.train_batch_size,
-                                                   shuffle=True,
-                                                   collate_fn=collate_fn,
-                                                   num_workers=1)
-
-    # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        overrode_max_train_steps = True
-
-    lr_scheduler = get_scheduler(
-        args.lr_scheduler,
-        optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps,
-        num_training_steps=args.max_train_steps,
-    )
-    weight_dtype = torch.float32
-    if args.mixed_precision == "fp16":
-        weight_dtype = torch.float16
-    elif args.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
-
-    # Move text_encode and vae to gpu.
-    # For mixed precision training we cast the text_encoder and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
-    vae.to(get_current_device(), dtype=weight_dtype)
-    text_encoder.to(get_current_device(), dtype=weight_dtype)
-
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
-    if overrode_max_train_steps:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    # Afterwards we recalculate our number of training epochs
-    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
-    unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler)
-
-    # Train!
-    total_batch_size = args.train_batch_size * world_size
-
-    logger.info("***** Running training *****", ranks=[0])
-    logger.info(f"  Num examples = {len(train_dataset)}", ranks=[0])
-    logger.info(f"  Num batches each epoch = {len(train_dataloader)}", ranks=[0])
-    logger.info(f"  Num Epochs = {args.num_train_epochs}", ranks=[0])
-    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}", ranks=[0])
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0])
-    logger.info(f"  Total optimization steps = {args.max_train_steps}", ranks=[0])
-
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not local_rank == 0)
-    progress_bar.set_description("Steps")
-    global_step = 0
-
-    torch.cuda.synchronize()
-    for epoch in range(args.num_train_epochs):
-        unet.train()
-        for step, batch in enumerate(train_dataloader):
-            torch.cuda.reset_peak_memory_stats()
-            # Move batch to gpu
-            for key, value in batch.items():
-                batch[key] = value.to(get_current_device(), non_blocking=True)
-
-            # Convert images to latent space
-            optimizer.zero_grad()
-
-            latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
-            latents = latents * 0.18215
-
-            # Sample noise that we'll add to the latents
-            noise = torch.randn_like(latents)
-            bsz = latents.shape[0]
-            # Sample a random timestep for each image
-            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
-            timesteps = timesteps.long()
-
-            # Add noise to the latents according to the noise magnitude at each timestep
-            # (this is the forward diffusion process)
-            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-
-            # Get the text embedding for conditioning
-            encoder_hidden_states = text_encoder(batch["input_ids"])[0]
-
-            # Predict the noise residual
-            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
-            # Get the target for loss depending on the prediction type
-            if noise_scheduler.config.prediction_type == "epsilon":
-                target = noise
-            elif noise_scheduler.config.prediction_type == "v_prediction":
-                target = noise_scheduler.get_velocity(latents, noise, timesteps)
-            else:
-                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
-            if args.with_prior_preservation:
-                # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
-                model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
-                target, target_prior = torch.chunk(target, 2, dim=0)
-
-                # Compute instance loss
-                loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
-
-                # Compute prior loss
-                prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
-
-                # Add the prior loss to the instance loss.
-                loss = loss + args.prior_loss_weight * prior_loss
-            else:
-                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
-
-            optimizer.backward(loss)
-
-            optimizer.step()
-            lr_scheduler.step()
-            logger.info(f"max GPU_mem cost is {torch.cuda.max_memory_allocated()/2**20} MB", ranks=[0])
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            progress_bar.update(1)
-            global_step += 1
-            logs = {
-                "loss": loss.detach().item(),
-                "lr": optimizer.param_groups[0]["lr"],
-            }    # lr_scheduler.get_last_lr()[0]}
-            progress_bar.set_postfix(**logs)
-
-            if global_step % args.save_steps == 0:
-                torch.cuda.synchronize()
-                if local_rank == 0:
-                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                    booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
-                    if not os.path.exists(os.path.join(save_path, "config.json")):
-                        shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path)
-                    logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])
-            if global_step >= args.max_train_steps:
-                break
-    torch.cuda.synchronize()
-
-    booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin"))
-    logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}")
-    if local_rank == 0:
-        if not os.path.exists(os.path.join(args.output_dir, "config.json")):
-            shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir)
-        if args.push_to_hub:
-            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)

From d3379f0be7e30854ee2353924d735642f4909aab Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Tue, 6 Jun 2023 16:07:34 +0800
Subject: [PATCH 13/52] fixed model saving bugs

---
 examples/images/dreambooth/train_dreambooth_colossalai.py     | 4 ++--
 .../images/dreambooth/train_dreambooth_colossalai_lora.py     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index 5436e7d6b739..eae52b5ecd7e 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -667,9 +667,9 @@ def collate_fn(examples):
 
             if global_step % args.save_steps == 0:
                 torch.cuda.synchronize()
+                save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
                 if local_rank == 0:
-                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                    booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
                     if not os.path.exists(os.path.join(save_path, "config.json")):
                         shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path)
                     logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])
diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
index 64cdd2a31734..dce65ff514b7 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
@@ -693,9 +693,9 @@ def collate_fn(examples):
 
             if global_step % args.save_steps == 0:
                 torch.cuda.synchronize()
+                save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
                 if local_rank == 0:
-                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                    booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin"))
                     if not os.path.exists(os.path.join(save_path, "config.json")):
                         shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path)
                     logger.info(f"Saving model checkpoint to {save_path}", ranks=[0])

From 79c9f776a9ea42991df54d11e2c4b3ac4a7eeea9 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Tue, 6 Jun 2023 16:20:45 +0800
Subject: [PATCH 14/52] fixed port

---
 examples/images/dreambooth/test_ci.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index 68862c46cfe9..0209c547a08f 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -7,7 +7,7 @@ TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
 for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
-  torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+  torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
   --pretrained_model_name_or_path="Your Pretrained Model Path"  \
   --instance_data_dir="Your Input Pics Path" \
   --output_dir="path-to-save-model" \

From b4437e88c319874269b022c68e177f95d45b607b Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Tue, 6 Jun 2023 16:21:38 +0800
Subject: [PATCH 15/52] fixed port

---
 examples/images/dreambooth/colossalai.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh
index cfb00412aced..54ebac39b925 100755
--- a/examples/images/dreambooth/colossalai.sh
+++ b/examples/images/dreambooth/colossalai.sh
@@ -2,7 +2,7 @@ HF_DATASETS_OFFLINE=1
 TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
-torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
   --pretrained_model_name_or_path="Path_to_your_model"  \
   --instance_data_dir="Path_to_your_training_image" \
   --output_dir="Path_to_your_save_dir" \

From 41fb7236aa32c307e83b0b9cc50ce2a6da279343 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Tue, 6 Jun 2023 18:58:58 +0800
Subject: [PATCH 16/52] [devops] hotfix CI about testmon cache (#3910)

* [devops] hotfix CI about testmon cache

* [devops] fix testmon cache on pr
---
 .github/workflows/build_on_pr.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index b5f293107310..a2807859b591 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -65,10 +65,10 @@ jobs:
         run: | # branch name may contain slash, we need to replace it with space
           export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
           if [ -d "/github/home/testmon_cache/${BASE}" ]; then
-            [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER}
+            [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER}
           fi
         env:
-          PR_NUMBER: ${{ github.event.pull_request.head.ref }}
+          PR_NUMBER: ${{ github.event.number }}
 
   detect:
     name: Detect file change

From b5f0566363687aaa91767bb7069af874bedfb7e8 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 7 Jun 2023 10:41:16 +0800
Subject: [PATCH 17/52] [chat] add distributed PPO trainer (#3740)

* Detached ppo (#9)

* run the base

* working on dist ppo

* sync

* detached trainer

* update detached trainer. no maker update function

* facing init problem

* 1 maker 1 trainer detached run. but no model update

* facing cuda problem

* fix save functions

* verified maker update

* nothing

* add ignore

* analyze loss issue

* remove some debug codes

* facing 2m1t stuck issue

* 2m1t verified

* do not use torchrun

* working on 2m2t

* working on 2m2t

* initialize strategy in ray actor env

* facing actor's init order issue

* facing ddp model update issue (need to unwrap ddp)

* unwrap ddp actor

* checking 1m2t stuck problem

* nothing

* set timeout for trainer choosing. It solves the stuck problem!

* delete some debug output

* rename to sync with upstream

* rename to sync with upstream

* coati rename

* nothing

* I am going to detach the replay buffer from trainer and make it a Ray Actor. Two benefits: 1. support TP trainer. 2. asynchronous buffer operations

* experience_maker_holder performs target-revolving _send_experience() instead of length comparison.

* move code to ray subfolder

* working on pipeline inference

* apply comments

* working on pipeline strategy. in progress.

* remove pipeline code. clean this branch

* update remote parameters by state_dict. no test

* nothing

* state_dict sharding transfer

* merge debug branch

* gemini _unwrap_model fix

* simplify code

* simplify code & fix LoRALinear AttributeError

* critic unwrapped state_dict

---------

Co-authored-by: csric <richcsr256@gmail.com>

* [chat] add performance evaluator and fix bugs (#10)

* [chat] add performance evaluator for ray

* [chat] refactor debug arg

* [chat] support hf config

* [chat] fix generation

* [chat] add 1mmt dummy example

* [chat] fix gemini ckpt

* split experience to send (#11)

Co-authored-by: csric <richcsr256@gmail.com>

* [chat] refactor trainer and maker (#12)

* [chat] refactor experience maker holder

* [chat] refactor model init

* [chat] refactor trainer args

* [chat] refactor model init

* [chat] refactor trainer

* [chat] refactor experience sending logic and training loop args (#13)

* [chat] refactor experience send logic

* [chat] refactor trainer

* [chat] refactor trainer

* [chat] refactor experience maker

* [chat] refactor pbar

* [chat] refactor example folder (#14)

* [chat] support quant (#15)

* [chat] add quant

* [chat] add quant example

* prompt example (#16)

* prompt example

* prompt load csv data

* remove legacy try

---------

Co-authored-by: csric <richcsr256@gmail.com>

* [chat] add mmmt dummy example and refactor experience sending (#17)

* [chat] add mmmt dummy example

* [chat] refactor naive strategy

* [chat] fix stuck problem

* [chat] fix naive strategy

* [chat] optimize experience maker sending logic

* [chat] refactor sending assignment

* [chat] refactor performance evaluator (#18)

* Prompt Example & requires_grad state_dict & sharding state_dict (#19)

* prompt example

* prompt load csv data

* remove legacy try

* maker models require_grad set to False

* working on zero redundancy update

* mmmt_prompt example; naive strategy requires_grad state_dict & sharding; maker model requires_no_grad.

* remove legacy examples

* remove legacy examples

* remove replay buffer tp state. bad design

---------

Co-authored-by: csric <richcsr256@gmail.com>

* state_dict sending adapts to new unwrap function (#20)

* prompt example

* prompt load csv data

* remove legacy try

* maker models require_grad set to False

* working on zero redundancy update

* mmmt_prompt example; naive strategy requires_grad state_dict & sharding; maker model requires_no_grad.

* remove legacy examples

* remove legacy examples

* remove replay buffer tp state. bad design

* opt benchmark

* better script

* nothing

* [chat] strategy refactor unwrap model

* [chat] strategy refactor save model

* [chat] add docstr

* [chat] refactor trainer save model

* [chat] fix strategy typing

* [chat] refactor trainer save model

* [chat] update readme

* [chat] fix unit test

* working on lora reconstruction

* state_dict sending adapts to new unwrap function

* remove comments

---------

Co-authored-by: csric <richcsr256@gmail.com>
Co-authored-by: ver217 <lhx0217@gmail.com>

* [chat-ray] add readme (#21)

* add readme

* transparent graph

* add note background

---------

Co-authored-by: csric <richcsr256@gmail.com>

* [chat] get images from url (#22)

* Refactor/chat ray (#23)

* [chat] lora add todo

* [chat] remove unused pipeline strategy

* [chat] refactor example structure

* [chat] setup ci for ray

* [chat-ray] Support LoRA trainer. LoRA weights reconstruction. (#24)

* lora support prototype

* lora support

* 1mmt lora & remove useless code

---------

Co-authored-by: csric <richcsr256@gmail.com>

* [chat] fix test ci for ray

* [chat] fix test ci requirements for ray

* [chat] fix ray runtime env

* [chat] fix ray runtime env

* [chat] fix example ci docker args

* [chat] add debug info in trainer

* [chat] add nccl debug info

* [chat] skip ray test

* [doc] fix typo

---------

Co-authored-by: csric <59389055+CsRic@users.noreply.github.com>
Co-authored-by: csric <richcsr256@gmail.com>
---
 .github/workflows/run_chatgpt_examples.yml    |   2 +-
 .../Chat/benchmarks/ray/1mmt_dummy.py         | 178 +++++++++++
 .../Chat/benchmarks/ray/mmmt_dummy.py         | 189 ++++++++++++
 applications/Chat/coati/models/lora.py        |   8 +-
 applications/Chat/coati/quant/__init__.py     |   7 +
 .../Chat/coati/quant/llama_gptq/__init__.py   |   5 +
 .../Chat/coati/quant/llama_gptq/loader.py     |  26 ++
 .../coati/quant/llama_gptq/model_utils.py     |  13 +
 .../Chat/coati/quant/llama_gptq/quant.py      | 283 ++++++++++++++++++
 applications/Chat/coati/quant/utils.py        |  28 ++
 applications/Chat/coati/ray/README.md         | 160 ++++++++++
 applications/Chat/coati/ray/__init__.py       |   2 -
 .../Chat/coati/ray/callbacks/__init__.py      |   9 +
 applications/Chat/coati/ray/callbacks/base.py |  66 ++++
 .../ray/callbacks/performance_evaluator.py    | 212 +++++++++++++
 .../ray/{src => }/detached_replay_buffer.py   |  65 ++--
 .../Chat/coati/ray/detached_trainer_base.py   | 179 +++++++++++
 .../ray/{src => }/detached_trainer_ppo.py     | 198 ++++++------
 applications/Chat/coati/ray/example/1m1t.py   | 153 ----------
 applications/Chat/coati/ray/example/1m1t.sh   |  23 --
 applications/Chat/coati/ray/example/1m2t.py   | 186 ------------
 applications/Chat/coati/ray/example/1m2t.sh   |  23 --
 applications/Chat/coati/ray/example/2m1t.py   | 140 ---------
 applications/Chat/coati/ray/example/2m1t.sh   |  23 --
 applications/Chat/coati/ray/example/2m2t.py   | 209 -------------
 applications/Chat/coati/ray/example/2m2t.sh   |  23 --
 .../Chat/coati/ray/experience_maker_holder.py | 271 +++++++++++++++++
 .../Chat/coati/ray/lora_constructor.py        | 122 ++++++++
 applications/Chat/coati/ray/src/__init__.py   |   0
 .../coati/ray/src/detached_trainer_base.py    | 121 --------
 .../coati/ray/src/experience_maker_holder.py  | 172 -----------
 .../Chat/coati/ray/src/pipeline_strategy.py   | 105 -------
 applications/Chat/coati/ray/src/utils.py      |  48 ---
 applications/Chat/coati/ray/utils.py          | 152 ++++++++++
 .../Chat/coati/trainer/strategies/base.py     |   4 +
 .../coati/trainer/strategies/colossalai.py    |  12 +
 .../Chat/coati/trainer/strategies/ddp.py      |  13 +-
 .../Chat/coati/trainer/strategies/naive.py    |  62 +++-
 .../Chat/coati/trainer/strategies/sampler.py  |   1 +
 applications/Chat/examples/ray/1mmt_prompt.py | 175 +++++++++++
 applications/Chat/examples/ray/mmmt_prompt.py | 189 ++++++++++++
 .../Chat/examples/ray/requirements.txt        |   1 +
 applications/Chat/examples/ray/test_ci.sh     |  12 +
 applications/Chat/examples/test_ci.sh         |   3 +
 44 files changed, 2495 insertions(+), 1378 deletions(-)
 create mode 100644 applications/Chat/benchmarks/ray/1mmt_dummy.py
 create mode 100644 applications/Chat/benchmarks/ray/mmmt_dummy.py
 create mode 100644 applications/Chat/coati/quant/__init__.py
 create mode 100644 applications/Chat/coati/quant/llama_gptq/__init__.py
 create mode 100644 applications/Chat/coati/quant/llama_gptq/loader.py
 create mode 100644 applications/Chat/coati/quant/llama_gptq/model_utils.py
 create mode 100644 applications/Chat/coati/quant/llama_gptq/quant.py
 create mode 100644 applications/Chat/coati/quant/utils.py
 create mode 100644 applications/Chat/coati/ray/README.md
 create mode 100644 applications/Chat/coati/ray/callbacks/__init__.py
 create mode 100644 applications/Chat/coati/ray/callbacks/base.py
 create mode 100644 applications/Chat/coati/ray/callbacks/performance_evaluator.py
 rename applications/Chat/coati/ray/{src => }/detached_replay_buffer.py (62%)
 create mode 100644 applications/Chat/coati/ray/detached_trainer_base.py
 rename applications/Chat/coati/ray/{src => }/detached_trainer_ppo.py (55%)
 delete mode 100644 applications/Chat/coati/ray/example/1m1t.py
 delete mode 100644 applications/Chat/coati/ray/example/1m1t.sh
 delete mode 100644 applications/Chat/coati/ray/example/1m2t.py
 delete mode 100644 applications/Chat/coati/ray/example/1m2t.sh
 delete mode 100644 applications/Chat/coati/ray/example/2m1t.py
 delete mode 100644 applications/Chat/coati/ray/example/2m1t.sh
 delete mode 100644 applications/Chat/coati/ray/example/2m2t.py
 delete mode 100644 applications/Chat/coati/ray/example/2m2t.sh
 create mode 100644 applications/Chat/coati/ray/experience_maker_holder.py
 create mode 100644 applications/Chat/coati/ray/lora_constructor.py
 delete mode 100644 applications/Chat/coati/ray/src/__init__.py
 delete mode 100644 applications/Chat/coati/ray/src/detached_trainer_base.py
 delete mode 100644 applications/Chat/coati/ray/src/experience_maker_holder.py
 delete mode 100644 applications/Chat/coati/ray/src/pipeline_strategy.py
 delete mode 100644 applications/Chat/coati/ray/src/utils.py
 create mode 100644 applications/Chat/coati/ray/utils.py
 create mode 100644 applications/Chat/examples/ray/1mmt_prompt.py
 create mode 100644 applications/Chat/examples/ray/mmmt_prompt.py
 create mode 100644 applications/Chat/examples/ray/requirements.txt
 create mode 100755 applications/Chat/examples/ray/test_ci.sh

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 9d9d3a007851..129bf7ed3270 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat
+      options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
     timeout-minutes: 30
     defaults:
       run:
diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py
new file mode 100644
index 000000000000..9e8f36cefc4f
--- /dev/null
+++ b/applications/Chat/benchmarks/ray/1mmt_dummy.py
@@ -0,0 +1,178 @@
+import argparse
+import os
+import socket
+from functools import partial
+
+import ray
+import torch
+from coati.quant import llama_load_quant, low_resource_init
+from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
+from coati.ray.experience_maker_holder import ExperienceMakerHolder
+from coati.ray.utils import (
+    get_actor_from_args,
+    get_critic_from_args,
+    get_receivers_per_sender,
+    get_reward_model_from_args,
+    get_strategy_from_args,
+)
+from torch.utils.data import DataLoader
+from transformers import AutoConfig, AutoTokenizer
+from transformers.modeling_utils import no_init_weights
+
+
+def get_free_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(('', 0))
+        return s.getsockname()[1]
+
+
+def get_local_ip():
+    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+        s.connect(('8.8.8.8', 80))
+        return s.getsockname()[0]
+
+
+def main(args):
+    master_addr = str(get_local_ip())
+    # trainer_env_info
+    trainer_port = str(get_free_port())
+    env_info_trainers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(args.num_trainers),
+        'master_port': trainer_port,
+        'master_addr': master_addr
+    } for rank in range(args.num_trainers)]
+
+    # maker_env_info
+    maker_port = str(get_free_port())
+    env_info_maker = {
+        'local_rank': '0',
+        'rank': '0',
+        'world_size': '1',
+        'master_port': maker_port,
+        'master_addr': master_addr
+    }
+
+    # configure tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    def model_fn():
+        actor_cfg = AutoConfig.from_pretrained(args.pretrain)
+        critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
+        actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
+        critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
+        reward_model = get_reward_model_from_args(args.critic_model,
+                                                  config=critic_cfg).requires_grad_(False).half().cuda()
+        if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+            # quantize initial model
+            with low_resource_init(), no_init_weights():
+                initial_model = get_actor_from_args(args.model, config=actor_cfg)
+            initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
+                                                   args.quant_group_size).cuda().requires_grad_(False)
+        else:
+            initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
+        return actor, critic, reward_model, initial_model
+
+    # configure Experience Maker
+    experience_holder_ref = ExperienceMakerHolder.options(name="maker0", num_gpus=1, max_concurrency=2).remote(
+        detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)],
+        strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
+        model_fn=model_fn,
+        env_info=env_info_maker,
+        kl_coef=0.1,
+        debug=args.debug,
+    # sync_models_from_trainers=True,
+    # generation kwargs:
+        max_length=512,
+        do_sample=True,
+        temperature=1.0,
+        top_k=50,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        eval_performance=True,
+        use_cache=True,
+    )
+
+    def trainer_model_fn():
+        actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda()
+        critic = get_critic_from_args(args.critic_model,
+                                      config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda()
+        return actor, critic
+
+    # configure Trainer
+    trainer_refs = [
+        DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
+            experience_maker_holder_name_list=[
+                f'maker{x}' for x in get_receivers_per_sender(i, args.num_trainers, 1, allow_idle_sender=True)
+            ],
+            strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
+            model_fn=trainer_model_fn,
+            env_info=env_info_trainer,
+            train_batch_size=args.train_batch_size,
+            buffer_limit=16,
+            eval_performance=True,
+            debug=args.debug,
+        ) for i, env_info_trainer in enumerate(env_info_trainers)
+    ]
+
+    dataset_size = args.experience_batch_size * 4
+
+    def data_gen_fn():
+        input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device())
+        attn_mask = torch.ones_like(input_ids)
+        return {'input_ids': input_ids, 'attention_mask': attn_mask}
+
+    def build_dataloader(size):
+        dataset = [data_gen_fn() for _ in range(size)]
+        dataloader = DataLoader(dataset, batch_size=args.experience_batch_size)
+        return dataloader
+
+    # uncomment this function if sync_models_from_trainers is True
+    # ray.get([
+    #     trainer_ref.sync_models_to_remote_makers.remote()
+    #     for trainer_ref in trainer_refs
+    # ])
+
+    wait_tasks = []
+
+    wait_tasks.append(
+        experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size),
+                                                 num_steps=args.experience_steps))
+
+    total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size)
+    for trainer_ref in trainer_refs:
+        wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
+
+    ray.get(wait_tasks)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num_trainers', type=int, default=1)
+    parser.add_argument('--trainer_strategy',
+                        choices=[
+                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'colossalai_zero2_cpu'
+                        ],
+                        default='naive')
+    parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--critic_pretrain', type=str, default=None)
+    parser.add_argument('--experience_steps', type=int, default=4)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    parser.add_argument('--train_epochs', type=int, default=1)
+    parser.add_argument('--update_steps', type=int, default=2)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+
+    parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
+    parser.add_argument('--quant_bits', type=int, default=4)
+    parser.add_argument('--quant_group_size', type=int, default=128)
+    parser.add_argument('--debug', action='store_true')
+    args = parser.parse_args()
+    ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
+    main(args)
diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py
new file mode 100644
index 000000000000..46a0062893b8
--- /dev/null
+++ b/applications/Chat/benchmarks/ray/mmmt_dummy.py
@@ -0,0 +1,189 @@
+import argparse
+import os
+import socket
+from functools import partial
+
+import ray
+import torch
+from coati.quant import llama_load_quant, low_resource_init
+from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
+from coati.ray.experience_maker_holder import ExperienceMakerHolder
+from coati.ray.utils import (
+    get_actor_from_args,
+    get_critic_from_args,
+    get_receivers_per_sender,
+    get_reward_model_from_args,
+    get_strategy_from_args,
+)
+from torch.utils.data import DataLoader
+from transformers import AutoConfig, AutoTokenizer
+from transformers.modeling_utils import no_init_weights
+
+
+def get_free_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(('', 0))
+        return s.getsockname()[1]
+
+
+def get_local_ip():
+    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+        s.connect(('8.8.8.8', 80))
+        return s.getsockname()[0]
+
+
+def main(args):
+    master_addr = str(get_local_ip())
+    # trainer_env_info
+    trainer_port = str(get_free_port())
+    env_info_trainers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(args.num_trainers),
+        'master_port': trainer_port,
+        'master_addr': master_addr
+    } for rank in range(args.num_trainers)]
+
+    # maker_env_info
+    maker_port = str(get_free_port())
+    env_info_makers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(args.num_makers),
+        'master_port': maker_port,
+        'master_addr': master_addr
+    } for rank in range(args.num_makers)]
+
+    # configure tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    def model_fn():
+        actor_cfg = AutoConfig.from_pretrained(args.pretrain)
+        critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
+        actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
+        critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
+        reward_model = get_reward_model_from_args(args.critic_model,
+                                                  config=critic_cfg).requires_grad_(False).half().cuda()
+        if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+            # quantize initial model
+            with low_resource_init(), no_init_weights():
+                initial_model = get_actor_from_args(args.model, config=actor_cfg)
+            initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
+                                                   args.quant_group_size).cuda().requires_grad_(False)
+        else:
+            initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
+        return actor, critic, reward_model, initial_model
+
+    # configure Experience Maker
+    experience_holder_refs = [
+        ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote(
+            detached_trainer_name_list=[
+                f'trainer{x}'
+                for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False)
+            ],
+            strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
+            model_fn=model_fn,
+            env_info=env_info_maker,
+            kl_coef=0.1,
+            debug=args.debug,
+    # sync_models_from_trainers=True,
+    # generation kwargs:
+            max_length=512,
+            do_sample=True,
+            temperature=1.0,
+            top_k=50,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            eval_performance=True,
+            use_cache=True,
+        )
+        for i, env_info_maker in enumerate(env_info_makers)
+    ]
+
+    def trainer_model_fn():
+        actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda()
+        critic = get_critic_from_args(args.critic_model,
+                                      config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda()
+        return actor, critic
+
+    # configure Trainer
+    trainer_refs = [
+        DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
+            experience_maker_holder_name_list=[
+                f"maker{x}"
+                for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True)
+            ],
+            strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
+            model_fn=trainer_model_fn,
+            env_info=env_info_trainer,
+            train_batch_size=args.train_batch_size,
+            buffer_limit=16,
+            eval_performance=True,
+            debug=args.debug,
+        )
+        for i, env_info_trainer in enumerate(env_info_trainers)
+    ]
+
+    dataset_size = args.experience_batch_size * 4
+
+    def data_gen_fn():
+        input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device())
+        attn_mask = torch.ones_like(input_ids)
+        return {'input_ids': input_ids, 'attention_mask': attn_mask}
+
+    def build_dataloader(size):
+        dataset = [data_gen_fn() for _ in range(size)]
+        dataloader = DataLoader(dataset, batch_size=args.experience_batch_size)
+        return dataloader
+
+    # uncomment this function if sync_models_from_trainers is True
+    # ray.get([
+    #     trainer_ref.sync_models_to_remote_makers.remote()
+    #     for trainer_ref in trainer_refs
+    # ])
+
+    wait_tasks = []
+
+    for experience_holder_ref in experience_holder_refs:
+        wait_tasks.append(
+            experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size),
+                                                     num_steps=args.experience_steps))
+
+    total_steps = args.experience_batch_size * args.experience_steps * \
+        args.num_makers // (args.num_trainers * args.train_batch_size)
+    for trainer_ref in trainer_refs:
+        wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
+
+    ray.get(wait_tasks)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num_makers', type=int, default=1)
+    parser.add_argument('--num_trainers', type=int, default=1)
+    parser.add_argument('--trainer_strategy',
+                        choices=[
+                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'colossalai_zero2_cpu'
+                        ],
+                        default='naive')
+    parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--critic_pretrain', type=str, default=None)
+    parser.add_argument('--experience_steps', type=int, default=4)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    parser.add_argument('--train_epochs', type=int, default=1)
+    parser.add_argument('--update_steps', type=int, default=2)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+
+    parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
+    parser.add_argument('--quant_bits', type=int, default=4)
+    parser.add_argument('--quant_group_size', type=int, default=128)
+    parser.add_argument('--debug', action='store_true')
+    args = parser.parse_args()
+    ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
+    main(args)
diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index 0533a60dc532..2a9059e6901e 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -61,7 +61,13 @@ def T(w):
         if self.merge_weights and self.merged:
             # Make sure that the weights are not merged
             if self.r > 0:
-                self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
+                if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"):
+                    # FIXME(csric): temporary fix
+                    self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features)))
+                    self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r)))
+                    self.reset_parameters()
+                else:
+                    self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
             self.merged = False
 
     def eval(self):
diff --git a/applications/Chat/coati/quant/__init__.py b/applications/Chat/coati/quant/__init__.py
new file mode 100644
index 000000000000..a65a78d07bb8
--- /dev/null
+++ b/applications/Chat/coati/quant/__init__.py
@@ -0,0 +1,7 @@
+from .llama_gptq import load_quant as llama_load_quant
+from .utils import low_resource_init
+
+__all__ = [
+    'llama_load_quant',
+    'low_resource_init',
+]
diff --git a/applications/Chat/coati/quant/llama_gptq/__init__.py b/applications/Chat/coati/quant/llama_gptq/__init__.py
new file mode 100644
index 000000000000..51c8d6316290
--- /dev/null
+++ b/applications/Chat/coati/quant/llama_gptq/__init__.py
@@ -0,0 +1,5 @@
+from .loader import load_quant
+
+__all__ = [
+    'load_quant',
+]
diff --git a/applications/Chat/coati/quant/llama_gptq/loader.py b/applications/Chat/coati/quant/llama_gptq/loader.py
new file mode 100644
index 000000000000..5353dc8a2ea3
--- /dev/null
+++ b/applications/Chat/coati/quant/llama_gptq/loader.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from .model_utils import find_layers
+from .quant import make_quant
+
+
+def load_quant(model: nn.Module, checkpoint: str, wbits: int, groupsize: int):
+    """Swap ``model``'s layers (all but ``lm_head``) for quantized ones and load ``checkpoint`` into it."""
+    model = model.eval()
+
+    # ignore lm head
+    layers = find_layers(model)
+    for name in ['lm_head']:
+        if name in layers:
+            del layers[name]
+
+    make_quant(model, layers, wbits, groupsize)
+
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+
+    return model
diff --git a/applications/Chat/coati/quant/llama_gptq/model_utils.py b/applications/Chat/coati/quant/llama_gptq/model_utils.py
new file mode 100644
index 000000000000..62db171abb52
--- /dev/null
+++ b/applications/Chat/coati/quant/llama_gptq/model_utils.py
@@ -0,0 +1,13 @@
+# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/modelutils.py
+
+import torch
+import torch.nn as nn
+
+
+def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+    if type(module) in layers:
+        return {name: module}
+    res = {}
+    for name1, child in module.named_children():
+        res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
+    return res
diff --git a/applications/Chat/coati/quant/llama_gptq/quant.py b/applications/Chat/coati/quant/llama_gptq/quant.py
new file mode 100644
index 000000000000..f7d5b7ce4bd8
--- /dev/null
+++ b/applications/Chat/coati/quant/llama_gptq/quant.py
@@ -0,0 +1,283 @@
+# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/quant.py
+
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+def quantize(x, scale, zero, maxq):
+    q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
+    return scale * (q - zero)
+
+
+class Quantizer(nn.Module):
+
+    def __init__(self, shape=1):
+        super(Quantizer, self).__init__()
+        self.register_buffer('maxq', torch.tensor(0))
+        self.register_buffer('scale', torch.zeros(shape))
+        self.register_buffer('zero', torch.zeros(shape))
+
+    def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8):
+        self.maxq = torch.tensor(2**bits - 1)
+        self.perchannel = perchannel
+        self.sym = sym
+        self.mse = mse
+        self.norm = norm
+        self.grid = grid
+        self.maxshrink = maxshrink
+
+    def find_params(self, x, weight=False):
+        dev = x.device
+        self.maxq = self.maxq.to(dev)
+
+        shape = x.shape
+        if self.perchannel:
+            if weight:
+                x = x.flatten(1)
+            else:
+                if len(shape) == 4:
+                    x = x.permute([1, 0, 2, 3])
+                    x = x.flatten(1)
+                if len(shape) == 3:
+                    x = x.reshape((-1, shape[-1])).t()
+                if len(shape) == 2:
+                    x = x.t()
+        else:
+            x = x.flatten().unsqueeze(0)
+
+        tmp = torch.zeros(x.shape[0], device=dev)
+        xmin = torch.minimum(x.min(1)[0], tmp)
+        xmax = torch.maximum(x.max(1)[0], tmp)
+
+        if self.sym:
+            xmax = torch.maximum(torch.abs(xmin), xmax)
+            tmp = xmin < 0
+            if torch.any(tmp):
+                xmin[tmp] = -xmax[tmp]
+        tmp = (xmin == 0) & (xmax == 0)
+        xmin[tmp] = -1
+        xmax[tmp] = +1
+
+        self.scale = (xmax - xmin) / self.maxq
+        if self.sym:
+            self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
+        else:
+            self.zero = torch.round(-xmin / self.scale)
+
+        if self.mse:
+            best = torch.full([x.shape[0]], float('inf'), device=dev)
+            for i in range(int(self.maxshrink * self.grid)):
+                p = 1 - i / self.grid
+                xmin1 = p * xmin
+                xmax1 = p * xmax
+                scale1 = (xmax1 - xmin1) / self.maxq
+                zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
+                q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
+                q -= x
+                q.abs_()
+                q.pow_(self.norm)
+                err = torch.sum(q, 1)
+                tmp = err < best
+                if torch.any(tmp):
+                    best[tmp] = err[tmp]
+                    self.scale[tmp] = scale1[tmp]
+                    self.zero[tmp] = zero1[tmp]
+        if not self.perchannel:
+            if weight:
+                tmp = shape[0]
+            else:
+                tmp = shape[1] if len(shape) != 3 else shape[2]
+            self.scale = self.scale.repeat(tmp)
+            self.zero = self.zero.repeat(tmp)
+
+        if weight:
+            shape = [-1] + [1] * (len(shape) - 1)
+            self.scale = self.scale.reshape(shape)
+            self.zero = self.zero.reshape(shape)
+            return
+        if len(shape) == 4:
+            self.scale = self.scale.reshape((1, -1, 1, 1))
+            self.zero = self.zero.reshape((1, -1, 1, 1))
+        if len(shape) == 3:
+            self.scale = self.scale.reshape((1, 1, -1))
+            self.zero = self.zero.reshape((1, 1, -1))
+        if len(shape) == 2:
+            self.scale = self.scale.unsqueeze(0)
+            self.zero = self.zero.unsqueeze(0)
+
+    def quantize(self, x):
+        if self.ready():
+            return quantize(x, self.scale, self.zero, self.maxq)
+        return x
+
+    def enabled(self):
+        return self.maxq > 0
+
+    def ready(self):
+        return torch.all(self.scale != 0)
+
+
+try:
+    import quant_cuda
+except ImportError:    # only a missing extension is expected here; any other failure should surface
+    print('CUDA extension not installed.')
+
+# Assumes layer is perfectly divisible into 256 * 256 blocks
+
+
+class QuantLinear(nn.Module):
+
+    def __init__(self, bits, groupsize, infeatures, outfeatures):
+        """Create zero-filled packed buffers; ``groupsize == -1`` means a single group of ``infeatures``."""
+        super().__init__()
+        if bits not in [2, 3, 4, 8]:
+            raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        self.infeatures = infeatures
+        self.outfeatures = outfeatures
+        self.bits = bits
+        # must be >= 32 *and* a power of 2; testing `< 32` first keeps log2() away from values <= 0
+        if groupsize != -1 and (groupsize < 32 or groupsize != int(math.pow(2, int(math.log2(groupsize))))):
+            raise NotImplementedError("groupsize supports powers of 2 greater than 32. (e.g. : 32,64,128,etc)")
+        self.groupsize = groupsize = infeatures if groupsize == -1 else groupsize
+        n_groups = math.ceil(infeatures / groupsize)
+        self.register_buffer('qzeros', torch.zeros((n_groups, outfeatures // 256 * (bits * 8)), dtype=torch.int))
+        self.register_buffer('scales', torch.zeros((n_groups, outfeatures)))
+        self.register_buffer('bias', torch.zeros(outfeatures))
+        self.register_buffer('qweight', torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int))
+        self._initialized_quant_state = False
+
+    def pack(self, linear, scales, zeros):
+        scales = scales.t().contiguous()
+        zeros = zeros.t().contiguous()
+        scale_zeros = zeros * scales
+        self.scales = scales.clone()
+        if linear.bias is not None:
+            self.bias = linear.bias.clone()
+
+        intweight = []
+        for idx in range(self.infeatures):
+            g_idx = idx // self.groupsize
+            intweight.append(
+                torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:,
+                                                                                                                  None])
+        intweight = torch.cat(intweight, dim=1)
+        intweight = intweight.t().contiguous()
+        intweight = intweight.numpy().astype(np.uint32)
+        qweight = np.zeros((intweight.shape[0] // 256 * (self.bits * 8), intweight.shape[1]), dtype=np.uint32)
+        i = 0
+        row = 0
+        while row < qweight.shape[0]:
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qweight[row] |= intweight[j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                row += 1
+            elif self.bits == 3:
+                for j in range(i, i + 10):
+                    qweight[row] |= intweight[j] << (3 * (j - i))
+                i += 10
+                qweight[row] |= intweight[i] << 30
+                row += 1
+                qweight[row] |= (intweight[i] >> 2) & 1
+                i += 1
+                for j in range(i, i + 10):
+                    qweight[row] |= intweight[j] << (3 * (j - i) + 1)
+                i += 10
+                qweight[row] |= intweight[i] << 31
+                row += 1
+                qweight[row] |= (intweight[i] >> 1) & 0x3
+                i += 1
+                for j in range(i, i + 10):
+                    qweight[row] |= intweight[j] << (3 * (j - i) + 2)
+                i += 10
+                row += 1
+            else:
+                raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+
+        qweight = qweight.astype(np.int32)
+        self.qweight = torch.from_numpy(qweight)
+
+        zeros -= 1
+        zeros = zeros.numpy().astype(np.uint32)
+        qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (self.bits * 8)), dtype=np.uint32)
+        i = 0
+        col = 0
+        while col < qzeros.shape[1]:
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                col += 1
+            elif self.bits == 3:
+                for j in range(i, i + 10):
+                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
+                i += 10
+                qzeros[:, col] |= zeros[:, i] << 30
+                col += 1
+                qzeros[:, col] |= (zeros[:, i] >> 2) & 1
+                i += 1
+                for j in range(i, i + 10):
+                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
+                i += 10
+                qzeros[:, col] |= zeros[:, i] << 31
+                col += 1
+                qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
+                i += 1
+                for j in range(i, i + 10):
+                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
+                i += 10
+                col += 1
+            else:
+                raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+
+        qzeros = qzeros.astype(np.int32)
+        self.qzeros = torch.from_numpy(qzeros)
+
+    def forward(self, x):    # NOTE(review): relies on `quant_cuda`, whose module-level import may have failed
+        intermediate_dtype = torch.float32    # x is cast to this before the kernel call; y is cast back after
+
+        if not self._initialized_quant_state:    # NOTE(review): never set to True in this class -> runs every call
+            # Do we even have a bias? Check for at least one non-zero element.
+            if self.bias is not None and bool(torch.any(self.bias != 0)):
+                # Then make sure it's the right type.
+                self.bias.data = self.bias.data.to(intermediate_dtype)
+            else:
+                self.bias = None    # a missing or all-zero bias is dropped
+
+        outshape = list(x.shape)
+        outshape[-1] = self.outfeatures
+        x = x.reshape(-1, x.shape[-1])    # collapse all leading dims into one
+        if self.bias is None:
+            y = torch.zeros(x.shape[0], outshape[-1], dtype=intermediate_dtype, device=x.device)
+        else:
+            y = self.bias.clone().repeat(x.shape[0], 1)    # one copy of the bias per input row
+
+        output_dtype = x.dtype
+        x = x.to(intermediate_dtype)
+        if self.bits == 2:    # kernels presumably accumulate into y in place (return value unused) -- TODO confirm
+            quant_cuda.vecquant2matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        elif self.bits == 3:
+            quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        elif self.bits == 4:
+            quant_cuda.vecquant4matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        elif self.bits == 8:
+            quant_cuda.vecquant8matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        else:
+            raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        y = y.to(output_dtype)
+        return y.reshape(outshape)    # restore the leading dims, last dim is now outfeatures
+
+
+def make_quant(module, names, bits, groupsize, name=''):
+    if isinstance(module, QuantLinear):
+        return
+    for attr in dir(module):
+        tmp = getattr(module, attr)
+        name1 = name + '.' + attr if name != '' else attr
+        if name1 in names:
+            setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features))
+    for name1, child in module.named_children():
+        make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)
diff --git a/applications/Chat/coati/quant/utils.py b/applications/Chat/coati/quant/utils.py
new file mode 100644
index 000000000000..01b8cff0add1
--- /dev/null
+++ b/applications/Chat/coati/quant/utils.py
@@ -0,0 +1,28 @@
+from contextlib import contextmanager
+
+import torch
+
+
+def _noop(*args, **kwargs):
+    """Accept any arguments, do nothing and return ``None``."""
+
+
+@contextmanager
+def low_resource_init():
+    """Temporarily turn ``kaiming_uniform_``, ``uniform_`` and ``normal_`` of ``torch.nn.init`` into no-ops and
+    make ``torch.half`` the default dtype. The previous initializers and default dtype are restored on exit,
+    even when the managed block raises.
+    """
+    init = torch.nn.init
+    patched = ('kaiming_uniform_', 'uniform_', 'normal_')
+    saved = {fn_name: getattr(init, fn_name) for fn_name in patched}
+    saved_dtype = torch.get_default_dtype()
+    try:
+        for fn_name in patched:
+            setattr(init, fn_name, _noop)
+        torch.set_default_dtype(torch.half)
+        yield
+    finally:
+        for fn_name in patched:
+            setattr(init, fn_name, saved[fn_name])
+        torch.set_default_dtype(saved_dtype)
diff --git a/applications/Chat/coati/ray/README.md b/applications/Chat/coati/ray/README.md
new file mode 100644
index 000000000000..228155a6855b
--- /dev/null
+++ b/applications/Chat/coati/ray/README.md
@@ -0,0 +1,160 @@
+# Distributed PPO Training on Stage 3
+
+## Detach Experience Makers and Trainers
+
+We can completely separate the trainers and makers.
+
+<p align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/basic_structure.png?raw=true" width=600/>
+</p>
+
+- The experience maker performs inference, produces experience, and remotely delivers it to the trainer (1).
+- The trainer consumes experience to train models, and periodically transmits new model parameters to the maker (2.1, 2.2).
+- Using an experience buffer to overlap transmission and computing.
+
+In this manner, each node will work continuously without model idle time, and different optimization strategies can be applied for inference and training to meet the needs of speed or storage. It is also helpful for scalability.
+
+`DetachedPPOTrainer` and `ExperienceMakerHolder` are Ray Actors (distinguished from Actor Model), representing Trainer and Experience Maker on the graph above, respectively.
+
+[More about Ray Core](https://docs.ray.io/en/latest/ray-core/walkthrough.html)
+
+## Usage
+
+See examples at `ColossalAI/applications/Chat/examples/ray`
+
+### Setup Makers
+
+- define makers' environment variables :
+
+    ```python
+    env_info_makers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(num_makers),
+        'master_port': maker_port,
+        'master_addr': master_addr
+    } for rank in range(num_makers)]
+
+    ```
+- define maker models :
+    ```python
+    def model_fn():
+        actor = get_actor_from_args(...)
+        critic = get_critic_from_args(...)
+        reward_model = get_reward_model_from_args(...)
+        initial_model = get_actor_from_args(...)
+        return actor, critic, reward_model, initial_model
+
+    ```
+- set experience_holder_refs :
+
+    ```python
+    experience_holder_refs = [
+        ExperienceMakerHolder.options(
+            name=f"maker_{i}",
+            num_gpus=1,
+            max_concurrency=2
+        ).remote(
+            detached_trainer_name_list=[f"trainer_{x}" for x in target_trainers(...)],
+            model_fn=model_fn,
+            ...)
+        for i, env_info_maker in enumerate(env_info_makers)
+    ]
+    ```
+    The names in the `detached_trainer_name_list` refer to the target trainers that the maker should send experience to.
+    We set a trainer's name the same as a maker, by `.options(name="str")`. See below.
+
+### Setup Trainers
+
+- define trainers' environment variables :
+    ```python
+    env_info_trainers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(num_trainers),
+        'master_port': trainer_port,
+        'master_addr': master_addr
+    } for rank in range(num_trainers)]
+    ```
+- define trainer models :
+
+    ```python
+    def trainer_model_fn():
+        actor = get_actor_from_args(...)
+        critic = get_critic_from_args(...)
+        return actor, critic
+    ```
+- set trainer_refs :
+    ```python
+    trainer_refs = [
+        DetachedPPOTrainer.options(
+            name=f"trainer_{i}",
+            num_gpus=1,
+            max_concurrency=2
+        ).remote(
+            experience_maker_holder_name_list=[f"maker_{x}" for x in target_makers(...)],
+            model_fn=trainer_model_fn,
+            ...)
+        for i, env_info_trainer in enumerate(env_info_trainers)
+    ]
+    ```
+    The names in `experience_maker_holder_name_list` refer to the target makers that the trainer should send updated models to.
+    By setting  `detached_trainer_name_list` and `experience_maker_holder_name_list`, we can customize the transmission graph.
+
+### Launch Jobs
+- define data_loader :
+    ```python
+    def data_loader_fn():
+        return torch.utils.data.DataLoader(dataset=dataset)
+
+    ```
+- launch makers :
+    ```python
+    wait_tasks = []
+    for experience_holder_ref in experience_holder_refs:
+        wait_tasks.append(
+            experience_holder_ref.workingloop.remote(data_loader_fn(),
+                                                     num_steps=experience_steps))
+
+    ```
+
+- launch trainers :
+    ```python
+    for trainer_ref in trainer_refs:
+        wait_tasks.append(trainer_ref.fit.remote(total_steps, update_steps, train_epochs))
+    ```
+
+- wait for done :
+    ```python
+    ray.get(wait_tasks)
+    ```
+
+## Flexible Structure
+
+We can deploy different strategies to makers and trainers. Here are some example configurations.
+
+### 2 Makers 1 Trainer
+<p align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/2m1t.png?raw=true" width=600/>
+</p>
+
+### 2 Makers 2 Trainers
+<p align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/2m2t.png?raw=true" width=600/>
+</p>
+
+### Maker Inference Quantization
+<p align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/2m2t_quantize.png?raw=true" width=600/>
+</p>
+
+### Tensor Parallel
+
+<p align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/chat/tp_ddp_hybrid.png?raw=true" width=600/>
+</p>
+
+## TODO
+
+- [ ] Support LoRA
+- [ ] Support TP & PP
diff --git a/applications/Chat/coati/ray/__init__.py b/applications/Chat/coati/ray/__init__.py
index 5802c05bc03f..e69de29bb2d1 100644
--- a/applications/Chat/coati/ray/__init__.py
+++ b/applications/Chat/coati/ray/__init__.py
@@ -1,2 +0,0 @@
-from .src.detached_replay_buffer import DetachedReplayBuffer
-from .src.detached_trainer_ppo import DetachedPPOTrainer
diff --git a/applications/Chat/coati/ray/callbacks/__init__.py b/applications/Chat/coati/ray/callbacks/__init__.py
new file mode 100644
index 000000000000..5f5e488f383e
--- /dev/null
+++ b/applications/Chat/coati/ray/callbacks/__init__.py
@@ -0,0 +1,9 @@
+from .base import MakerCallback, TrainerCallback
+from .performance_evaluator import ExperienceMakerPerformanceEvaluator, TrainerPerformanceEvaluator
+
+__all__ = [
+    "TrainerCallback",
+    "MakerCallback",
+    "ExperienceMakerPerformanceEvaluator",
+    "TrainerPerformanceEvaluator",
+]
diff --git a/applications/Chat/coati/ray/callbacks/base.py b/applications/Chat/coati/ray/callbacks/base.py
new file mode 100644
index 000000000000..3306150a41ff
--- /dev/null
+++ b/applications/Chat/coati/ray/callbacks/base.py
@@ -0,0 +1,66 @@
+from abc import ABC
+
+from coati.experience_maker import Experience
+
+
+class TrainerCallback(ABC):
+    """
+        Base callback class for trainers: every hook is a no-op until overridden (timings as in DetachedTrainer).
+    """
+
+    def on_fit_start(self) -> None:
+        """Runs once when ``fit`` begins, before the first episode."""
+
+    def on_fit_end(self) -> None:
+        """Runs once when ``fit`` finishes, after the last episode."""
+
+    def on_episode_start(self, episode: int) -> None:
+        """Runs at the start of episode ``episode`` (0-based index)."""
+
+    def on_episode_end(self, episode: int) -> None:
+        """Runs at the end of episode ``episode``, after the remote makers were updated."""
+
+    def on_epoch_start(self, epoch: int) -> None:
+        """Runs before training epoch ``epoch`` (0-based) of the current episode."""
+
+    def on_epoch_end(self, epoch: int) -> None:
+        """Runs after training epoch ``epoch`` of the current episode."""
+
+    def on_batch_start(self) -> None:
+        """Runs right before each training step."""
+
+    def on_batch_end(self, metrics: dict, experience: Experience) -> None:
+        """Runs right after each training step, with the step's metrics and the experience it consumed."""
+
+    def on_update_start(self) -> None:
+        """Runs before the trainer updates the remote makers."""
+
+    def on_update_end(self) -> None:
+        """Runs after the trainer updated the remote makers."""
+
+
+class MakerCallback(ABC):
+    """Base callback class for experience makers. Every hook is a no-op until a subclass overrides it."""
+    def on_loop_start(self) -> None:
+        pass
+
+    def on_loop_end(self) -> None:
+        pass
+
+    def on_make_experience_start(self) -> None:
+        pass
+
+    def on_make_experience_end(self, experience: Experience) -> None:
+        pass
+
+    def on_send_start(self) -> None:
+        pass
+
+    def on_send_end(self) -> None:
+        pass
+
+    def on_batch_start(self) -> None:
+        pass
+
+    def on_batch_end(self) -> None:
+        pass
diff --git a/applications/Chat/coati/ray/callbacks/performance_evaluator.py b/applications/Chat/coati/ray/callbacks/performance_evaluator.py
new file mode 100644
index 000000000000..cd3517609e7a
--- /dev/null
+++ b/applications/Chat/coati/ray/callbacks/performance_evaluator.py
@@ -0,0 +1,212 @@
+from time import time
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from coati.experience_maker import Experience
+
+from .base import MakerCallback, TrainerCallback
+
+
+def get_world_size() -> int:
+    """Return the size of the default process group, or 1 when ``torch.distributed`` is not initialized."""
+    initialized = dist.is_initialized()
+    return dist.get_world_size() if initialized else 1
+
+
+def print_rank_0(*args, **kwargs) -> None:
+    if not dist.is_initialized() or dist.get_rank() == 0:    # no process group at all, or this process is rank 0
+        print(*args, **kwargs)    # arguments are forwarded to the builtin print unchanged
+
+
+@torch.no_grad()
+def all_reduce_mean(x: float, world_size: int) -> float:
+    """Return ``x`` averaged over ranks: all-reduce (default op) on the current CUDA device, then divide."""
+    if world_size == 1:
+        return x    # single process: nothing to average with
+    reduced = torch.tensor([x], device=torch.cuda.current_device())
+    dist.all_reduce(reduced)
+    return (reduced / world_size).item()
+
+
+class Timer:
+    """Accumulates the time spent between ``start()`` and ``end()`` calls until ``reset()`` clears it."""
+    def __init__(self) -> None:
+        self.start_time: Optional[float] = None    # timestamp of the latest start(); None before the first one
+        self.duration: float = 0.    # total time accumulated by end() since creation or the last reset()
+
+    def start(self) -> None:
+        self.start_time = time()
+
+    def end(self) -> None:
+        self.duration += time() - self.start_time    # NOTE: raises TypeError when start() was never called
+
+    def reset(self) -> None:
+        self.duration = 0.    # start_time is left as it is
+
+
+class ExperienceMakerPerformanceEvaluator(MakerCallback):
+    """Maker callback that times experience making and sending, estimates the FLOPs spent, and prints a summary."""
+    def __init__(self, actor_num_params: int, critic_num_params: int, initial_model_num_params: int,
+                 reward_model_num_params: int) -> None:
+        super().__init__()
+        self.world_size = get_world_size()
+        self.actor_num_params = actor_num_params
+        self.critic_num_params = critic_num_params
+        self.initial_model_num_params = initial_model_num_params
+        self.reward_model_num_params = reward_model_num_params
+
+        self.batch_timer = Timer()
+        self.send_timer = Timer()
+        self.make_experience_timer = Timer()
+        self.total_samples: int = 0
+        self.make_experience_flop: int = 0
+
+        print_rank_0(
+            f'ExperienceMaker actor: {actor_num_params/1024**3:.2f}B, critic: {critic_num_params/1024**3:.2f}B, initial model: {initial_model_num_params/1024**3:.2f}B, reward model: {reward_model_num_params/1024**3:.2f}B, world size: {self.world_size}'
+        )
+
+    def on_make_experience_start(self) -> None:
+        self.make_experience_timer.start()
+
+    def on_make_experience_end(self, experience: Experience) -> None:
+        self.make_experience_timer.end()
+        batch_size, seq_len = experience.sequences.shape
+        self.total_samples += batch_size
+
+        # actor generate: total_seq_len == sum(input_len + i for i in range(num_actions)), one term per new token
+        num_actions = experience.action_mask.size(1)
+        input_len = seq_len - num_actions
+        total_seq_len = (input_len + seq_len - 1) * num_actions / 2
+        self.make_experience_flop += self.actor_num_params * batch_size * total_seq_len * 2
+        # actor forward over the full sequences (every forward is counted as params * tokens * 2)
+        self.make_experience_flop += self.actor_num_params * batch_size * seq_len * 2
+        # critic forward
+        self.make_experience_flop += self.critic_num_params * batch_size * seq_len * 2
+        # initial model forward
+        self.make_experience_flop += self.initial_model_num_params * batch_size * seq_len * 2
+        # reward model forward
+        self.make_experience_flop += self.reward_model_num_params * batch_size * seq_len * 2
+
+    def on_send_start(self) -> None:
+        self.send_timer.start()
+
+    def on_send_end(self) -> None:
+        self.send_timer.end()
+
+    def on_batch_start(self) -> None:
+        self.batch_timer.start()
+
+    def on_batch_end(self) -> None:
+        self.batch_timer.end()
+
+    def on_loop_end(self) -> None:
+        """Print the throughput / TFLOPS / time-per-sample summary on rank 0; skipped when no sample was made."""
+        avg_make_experience_duration = all_reduce_mean(self.make_experience_timer.duration, self.world_size)
+        avg_overall_duration = all_reduce_mean(self.batch_timer.duration, self.world_size)
+        avg_send_duration = all_reduce_mean(self.send_timer.duration, self.world_size)
+        if self.total_samples == 0:    # checked after the collectives so that every rank still takes part in them
+            print_rank_0('No samples are collected, skip experience maker performance evaluation')
+            return
+
+        avg_throughput = self.total_samples * self.world_size / (avg_overall_duration + 1e-12)
+        avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12)
+        avg_time_per_sample = (avg_overall_duration + 1e-12) / (self.total_samples * self.world_size)
+        avg_make_experience_time_per_sample = (avg_make_experience_duration + 1e-12) / \
+            (self.total_samples * self.world_size)
+        avg_send_time_per_sample = (avg_send_duration + 1e-12) / (self.total_samples * self.world_size)
+        print_rank_0(
+            'Making Experience Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' +
+            f'TFLOPS per GPU: {avg_make_experience_tflops:.3f}\n' +
+            f'Sample time (overall): {avg_time_per_sample:.3f} s\n' +
+            f'Sample time (make experience): {avg_make_experience_time_per_sample:.3f} s, {avg_make_experience_time_per_sample/avg_time_per_sample*100:.2f}%\n' +
+            f'Sample time (send): {avg_send_time_per_sample:.3f} s, {avg_send_time_per_sample/avg_time_per_sample*100:.2f}%\n'
+        )
+
+
+class TrainerPerformanceEvaluator(TrainerCallback):
+    """Trainer callback that times episodes, training steps and model updates, and prints a summary at fit end."""
+    def __init__(self,
+                 actor_num_params: int,
+                 critic_num_params: int,
+                 enable_grad_checkpoint: bool = False,
+                 ignore_first_episodes: int = 1) -> None:
+        super().__init__()
+        self.world_size = get_world_size()
+        self.actor_num_params = actor_num_params
+        self.critic_num_params = critic_num_params
+        self.enable_grad_checkpoint = enable_grad_checkpoint
+        self.ignore_first_episodes = ignore_first_episodes
+        self.ignore_this_episode = False    # refreshed at every episode start; gates all hooks below
+
+        self.episode_timer = Timer()
+        self.batch_timer = Timer()
+        self.update_timer = Timer()
+        self.total_samples: int = 0
+        self.learn_flop: int = 0
+
+        print_rank_0(
+            f'Trainer actor: {self.actor_num_params/1024**3:.2f}B, critic: {self.critic_num_params/1024**3:.2f}B, world size: {self.world_size}'
+        )
+
+    def on_episode_start(self, episodes: int) -> None:
+        self.ignore_this_episode = episodes < self.ignore_first_episodes    # skip the first N episodes entirely
+        if self.ignore_this_episode:
+            return
+        self.episode_timer.start()
+
+    def on_episode_end(self, episodes: int) -> None:
+        if self.ignore_this_episode:
+            return
+        self.episode_timer.end()
+
+    def on_batch_start(self) -> None:
+        if self.ignore_this_episode:
+            return
+        self.batch_timer.start()
+
+    def on_batch_end(self, metrics: dict, experience: Experience) -> None:
+        if self.ignore_this_episode:
+            return
+        self.batch_timer.end()
+
+        batch_size, seq_len = experience.sequences.shape
+
+        self.total_samples += batch_size
+
+        # actor forward-backward, 3 means forward(1) + backward(2); one more unit with gradient checkpointing
+        self.learn_flop += self.actor_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
+        # critic forward-backward, counted the same way as the actor
+        self.learn_flop += self.critic_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
+
+    def on_update_start(self) -> None:
+        if self.ignore_this_episode:
+            return
+        self.update_timer.start()
+
+    def on_update_end(self) -> None:
+        if self.ignore_this_episode:
+            return
+        self.update_timer.end()
+
+    def on_fit_end(self) -> None:
+        if self.total_samples == 0:    # nothing measured (e.g. every episode was ignored): avoid dividing by zero
+            print_rank_0('No samples are collected, skip trainer performance evaluation')
+            return
+        avg_train_duration = all_reduce_mean(self.batch_timer.duration, self.world_size)
+        avg_update_duration = all_reduce_mean(self.update_timer.duration, self.world_size)
+        avg_episode_duration = all_reduce_mean(self.episode_timer.duration, self.world_size)
+
+        avg_throughput = self.total_samples * self.world_size / (avg_episode_duration + 1e-12)
+        avg_learn_tflops = self.learn_flop / 1e12 / (avg_train_duration + 1e-12)
+        avg_time_per_sample = (avg_episode_duration + 1e-12) / (self.total_samples * self.world_size)
+        avg_train_time_per_sample = (avg_train_duration + 1e-12) / (self.total_samples * self.world_size)
+        avg_update_time_per_sample = (avg_update_duration + 1e-12) / (self.total_samples * self.world_size)
+
+        print_rank_0(
+            'Learning Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' +
+            f'TFLOPS per GPU: {avg_learn_tflops:.3f}\n' + f'Sample time (overall): {avg_time_per_sample:.3f} s\n' +
+            f'Sample time (train): {avg_train_time_per_sample:.3f} s, {avg_train_time_per_sample/avg_time_per_sample*100:.2f}%\n'
+            +
+            f'Sample time (update): {avg_update_time_per_sample:.3f} s, {avg_update_time_per_sample/avg_time_per_sample*100:.2f}%\n'
+        )
diff --git a/applications/Chat/coati/ray/src/detached_replay_buffer.py b/applications/Chat/coati/ray/detached_replay_buffer.py
similarity index 62%
rename from applications/Chat/coati/ray/src/detached_replay_buffer.py
rename to applications/Chat/coati/ray/detached_replay_buffer.py
index 18c8db388e88..2f765281178a 100644
--- a/applications/Chat/coati/ray/src/detached_replay_buffer.py
+++ b/applications/Chat/coati/ray/detached_replay_buffer.py
@@ -1,22 +1,24 @@
-import torch
+import asyncio
+import copy
 import random
-from typing import List, Any
-# from torch.multiprocessing import Queue
-from ray.util.queue import Queue
+from threading import Lock
+from typing import Any, List
+
 import ray
-import asyncio
+import torch
 from coati.experience_maker.base import Experience
-from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
 from coati.replay_buffer import ReplayBuffer
-from threading import Lock
-import copy
+from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
+# from torch.multiprocessing import Queue
+from ray.util.queue import Queue
+
 
 class DetachedReplayBuffer:
     '''
-        Detached replay buffer. Share Experience across workers on the same node. 
-        Therefore a trainer node is expected to have only one instance. 
+        Detached replay buffer. Share Experience across workers on the same node.
+        Therefore a trainer node is expected to have only one instance.
         It is ExperienceMakerHolder's duty to call append(exp) method, remotely.
-    
+
     Args:
         sample_batch_size: Batch size when sampling. Exp won't enqueue until they formed a batch.
         tp_world_size: Number of workers in the same tp group
@@ -24,31 +26,25 @@ class DetachedReplayBuffer:
         cpu_offload: Whether to offload experience to cpu when sampling. Defaults to True.
     '''
 
-    def __init__(self, sample_batch_size: int, tp_world_size: int = 1, limit : int = 0, cpu_offload: bool = True) -> None:
-        self.cpu_offload = cpu_offload
+    def __init__(self, sample_batch_size: int, limit: int = 0) -> None:
         self.sample_batch_size = sample_batch_size
         self.limit = limit
-        self.items = Queue(self.limit, actor_options={"num_cpus":1})
-        self.batch_collector : List[BufferItem] = []
+        self.items = Queue(self.limit, actor_options={"num_cpus": 1})
+        self.batch_collector: List[BufferItem] = []
 
+    @torch.no_grad()
+    def append(self, experience: Experience) -> None:
         '''
-        Workers in the same tp group share this buffer and need same sample for one step.
-            Therefore a held_sample should be returned tp_world_size times before it could be dropped.
-            worker_state records whether a worker got the held_sample
+        Expected to be called remotely.
         '''
-        self.tp_world_size = tp_world_size
-        self.worker_state = [False] * self.tp_world_size
-        self.held_sample = None
-        self._worker_state_lock = Lock()
+        items = split_experience_batch(experience)
+        self.extend(items)
 
     @torch.no_grad()
-    def append(self, experience: Experience) -> None:
+    def extend(self, items: List[BufferItem]) -> None:
         '''
         Expected to be called remotely.
         '''
-        if self.cpu_offload:
-            experience.to_device(torch.device('cpu'))
-        items = split_experience_batch(experience)
         self.batch_collector.extend(items)
         while len(self.batch_collector) >= self.sample_batch_size:
             items = self.batch_collector[:self.sample_batch_size]
@@ -62,19 +58,10 @@ def clear(self) -> None:
         self.items = Queue(self.limit)
         self.worker_state = [False] * self.tp_world_size
         self.batch_collector = []
-     
+
     @torch.no_grad()
-    def sample(self, worker_rank = 0, to_device = "cpu") -> Experience:
-        self._worker_state_lock.acquire()
-        if not any(self.worker_state):
-            self.held_sample = self._sample_and_erase()
-        self.worker_state[worker_rank] = True
-        if all(self.worker_state):
-            self.worker_state = [False] * self.tp_world_size
-            ret = self.held_sample
-        else:
-            ret = copy.deepcopy(self.held_sample)
-        self._worker_state_lock.release()
+    def sample(self, worker_rank=0, to_device="cpu") -> Experience:
+        ret = self._sample_and_erase()
         ret.to_device(to_device)
         return ret
 
@@ -85,4 +72,4 @@ def _sample_and_erase(self) -> Experience:
 
     def get_length(self) -> int:
         ret = self.items.qsize()
-        return ret
\ No newline at end of file
+        return ret
diff --git a/applications/Chat/coati/ray/detached_trainer_base.py b/applications/Chat/coati/ray/detached_trainer_base.py
new file mode 100644
index 000000000000..ac2d35e9da19
--- /dev/null
+++ b/applications/Chat/coati/ray/detached_trainer_base.py
@@ -0,0 +1,179 @@
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+
+import ray
+import torch
+from coati.experience_maker import Experience
+from coati.replay_buffer.utils import BufferItem
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from .callbacks import TrainerCallback
+from .detached_replay_buffer import DetachedReplayBuffer
+from .utils import is_rank_0
+
+
+class DetachedTrainer(ABC):
+    '''
+        Base class for detached rlhf trainers.
+        'detach' means that the experience maker is detached compared to a normal Trainer.
+        Please set name attribute during init:
+            >>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote()
+            So an ExperienceMakerHolder can reach the detached_replay_buffer by Actor's name.
+    Args:
+        experience_maker_holder_name_list (List[str]): names of the maker actors, resolved with ray.get_actor
+        train_batch_size (int, defaults to 8): sample batch size of the internal DetachedReplayBuffer
+        buffer_limit (int, defaults to 0): limit passed to the internal DetachedReplayBuffer
+        dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
+        callbacks (List[TrainerCallback], optional): the callbacks to call during training process
+        debug (bool, defaults to False): whether to print progress messages
+    '''
+
+    def __init__(self,
+                 experience_maker_holder_name_list: List[str],
+                 train_batch_size: int = 8,
+                 buffer_limit: int = 0,
+                 dataloader_pin_memory: bool = True,
+                 callbacks: Optional[List[TrainerCallback]] = None,
+                 debug: bool = False) -> None:
+        super().__init__()
+        self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit)
+        self.dataloader_pin_memory = dataloader_pin_memory
+        self.callbacks = callbacks if callbacks is not None else []    # never share one default list
+        self.target_holder_name_list = experience_maker_holder_name_list
+        self.target_holder_list = []
+        self._is_target_holder_initialized = False
+        self._debug = debug
+
+    def update_target_holder_list(self):
+        # as the length of target_holder_list may be zero, we need to check it by a bool flag
+        if not self._is_target_holder_initialized:
+            for name in self.target_holder_name_list:
+                self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
+            self._is_target_holder_initialized = True
+
+    @abstractmethod
+    def _update_remote_makers(self, fully_update: bool = False, **kwargs):
+        pass
+
+    def sync_models_to_remote_makers(self, **kwargs):
+        self._update_remote_makers(fully_update=True, **kwargs)
+
+    @abstractmethod
+    def training_step(self, experience: Experience) -> Dict[str, Any]:
+        pass
+
+    def _learn(self, update_steps: int, train_epochs: int) -> None:
+        data = []
+        # warmup: the first epoch pulls `update_steps` batches from the replay buffer and caches them in `data`
+        pbar = tqdm(range(update_steps), desc=f'Train epoch [1/{train_epochs}]', disable=not is_rank_0())
+        self._on_epoch_start(0)
+        self._learn_epoch(pbar, data)
+        self._on_epoch_end(0)
+        # item is already a batch, so batch_size=1 and collate_fn only unwraps the single-element list
+        dataloader = DataLoader(data,
+                                batch_size=1,
+                                shuffle=True,
+                                pin_memory=self.dataloader_pin_memory,
+                                collate_fn=lambda x: x[0])
+        for epoch in range(1, train_epochs):
+            pbar = tqdm(dataloader, desc=f'Train epoch [{epoch + 1}/{train_epochs}]', disable=not is_rank_0())
+            self._on_epoch_start(epoch)
+            self._learn_epoch(pbar, data)
+            self._on_epoch_end(epoch)
+
+    def _learn_epoch(self, pbar: tqdm, data: List[Experience]) -> None:
+        is_warmup = len(data) == 0    # first epoch: `data` is still empty and gets filled below
+        for x in pbar:
+            if self._debug:
+                print("[trainer] training step")
+            # sample a batch and then train to avoid waiting
+            experience = x if not is_warmup else self._buffer_sample()
+            experience.to_device(torch.cuda.current_device())
+            self._on_batch_start()
+            metrics = self.training_step(experience)
+            self._on_batch_end(metrics, experience)
+
+            if self._debug:
+                print("[trainer] step over")
+            experience.to_device("cpu")    # move back to CPU before the batch is cached or dropped
+            if is_warmup:
+                data.append(experience)
+            pbar.set_postfix(metrics)
+
+    def fit(self, total_steps: int, update_steps: int, train_epochs: int = 1) -> None:
+        self._on_fit_start()
+        for i in tqdm(range(total_steps // update_steps), desc='Trainer', disable=not is_rank_0()):
+            self._on_episode_start(i)
+            self._learn(update_steps, train_epochs)
+            self._on_update_start()
+            self._update_remote_makers()
+            self._on_update_end()
+            self._on_episode_end(i)
+        self._on_fit_end()
+
+    @ray.method(concurrency_group="buffer_length")
+    def buffer_get_length(self):
+        # called by ExperienceMakerHolder
+        if self._debug:
+            print("[trainer]                telling length")
+        return self.detached_replay_buffer.get_length()
+
+    @ray.method(concurrency_group="buffer_append")
+    def buffer_append(self, experience: Experience):
+        # called by ExperienceMakerHolder
+        if self._debug:
+            print("[trainer]               receiving exp.")
+        self.detached_replay_buffer.append(experience)
+
+    @ray.method(concurrency_group="buffer_append")
+    def buffer_extend(self, items: List[BufferItem]):
+        # called by ExperienceMakerHolder
+        if self._debug:
+            print("[trainer]               receiving exp.")
+        self.detached_replay_buffer.extend(items)
+
+    @ray.method(concurrency_group="buffer_sample")
+    def _buffer_sample(self):
+        return self.detached_replay_buffer.sample()
+
+    def _on_fit_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_fit_start()
+
+    def _on_fit_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_fit_end()
+
+    def _on_episode_start(self, episode: int) -> None:
+        for callback in self.callbacks:
+            callback.on_episode_start(episode)
+
+    def _on_episode_end(self, episode: int) -> None:
+        for callback in self.callbacks:
+            callback.on_episode_end(episode)
+
+    def _on_epoch_start(self, epoch: int) -> None:
+        for callback in self.callbacks:
+            callback.on_epoch_start(epoch)
+
+    def _on_epoch_end(self, epoch: int) -> None:
+        for callback in self.callbacks:
+            callback.on_epoch_end(epoch)
+
+    def _on_batch_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_start()
+
+    def _on_batch_end(self, metrics: dict, experience: Experience) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_end(metrics, experience)
+
+    def _on_update_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_update_start()
+
+    def _on_update_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_update_end()
diff --git a/applications/Chat/coati/ray/src/detached_trainer_ppo.py b/applications/Chat/coati/ray/detached_trainer_ppo.py
similarity index 55%
rename from applications/Chat/coati/ray/src/detached_trainer_ppo.py
rename to applications/Chat/coati/ray/detached_trainer_ppo.py
index 838e82d07f4a..5f0032716f93 100644
--- a/applications/Chat/coati/ray/src/detached_trainer_ppo.py
+++ b/applications/Chat/coati/ray/detached_trainer_ppo.py
@@ -1,24 +1,38 @@
-from typing import Any, Callable, Dict, List, Optional
-import torch
-from torch.optim import Adam
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
+import ray
+import torch
 from coati.experience_maker import Experience, NaiveExperienceMaker
 from coati.models.base import Actor, Critic
-from coati.models.generation_utils import update_model_kwargs_fn
 from coati.models.loss import PolicyLoss, ValueLoss
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy
 from coati.trainer.callbacks import Callback
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy
+from torch.optim import Adam
 
 from colossalai.nn.optimizer import HybridAdam
 
-import ray
-
-
-from .utils import is_rank_0, get_cuda_actor_critic_from_args, get_strategy_from_args, set_dist_env
+from .callbacks import TrainerCallback, TrainerPerformanceEvaluator
 from .detached_trainer_base import DetachedTrainer
-
-
-@ray.remote(concurrency_groups={"buffer_length": 1, "buffer_append":1, "buffer_sample":1,"model_io": 1, "compute": 1})
+from .lora_constructor import LoRAConstructor
+from .utils import (
+    get_actor_from_args,
+    get_critic_from_args,
+    get_model_numel,
+    get_rank,
+    get_strategy_from_args,
+    is_rank_0,
+    set_dist_env,
+    state_dict_to,
+)
+
+
+@ray.remote(concurrency_groups={
+    "buffer_length": 1,
+    "buffer_append": 1,
+    "buffer_sample": 1,
+    "model_io": 1,
+    "compute": 1
+})
 class DetachedPPOTrainer(DetachedTrainer):
     '''
         Detached Trainer for PPO algorithm
@@ -40,86 +54,102 @@ class DetachedPPOTrainer(DetachedTrainer):
         generate_kwargs (dict, optional): the kwargs to use while model generating
     '''
 
-    def __init__(self,
-                 experience_maker_holder_name_list: List[str],
-                 strategy: str,
-                 model: str,
-                 env_info: Dict[str, str] = None,
-                 pretrained: str = None,
-                 lora_rank: int = 0,
-                 train_batch_size: int = 8,
-                 buffer_limit: int = 0,
-                 buffer_cpu_offload: bool = True,
-                 eps_clip: float = 0.2,
-                 value_clip: float = 0.4,
-                 experience_batch_size: int = 8,
-                 max_epochs: int = 10,
-                 dataloader_pin_memory: bool = True,
-                 callbacks: List[Callback] = [],
-                 **generate_kwargs) -> None:
+    def __init__(
+        self,
+        experience_maker_holder_name_list: List[str],
+        strategy_fn: Callable[[], Strategy],
+        model_fn: Callable[[], Tuple[Actor, Critic]],
+        env_info: Dict[str, str] = None,
+        train_batch_size: int = 8,
+        buffer_limit: int = 0,
+        eps_clip: float = 0.2,
+        value_clip: float = 0.4,
+        dataloader_pin_memory: bool = True,
+        callbacks: List[TrainerCallback] = [],
+        eval_performance: bool = False,
+        debug: bool = False,
+        update_lora_weights: bool = False,
+    ) -> None:
         # set environment variables
         if env_info:
             set_dist_env(env_info=env_info)
         # configure strategy
-        self.strategy = get_strategy_from_args(strategy)
+        self.strategy = strategy_fn()
         # configure models, loss and optimizers
         with self.strategy.model_init_context():
-            self.actor, self.critic = get_cuda_actor_critic_from_args(model, pretrained, lora_rank)
+            self.actor, self.critic = model_fn()
 
-        if strategy != 'colossalai_gemini':
-            self.actor.to(torch.float16).to(torch.cuda.current_device())
-            self.critic.to(torch.float16).to(torch.cuda.current_device())
+        if eval_performance:
+            actor_numel = get_model_numel(self.actor)
+            critic_numel = get_model_numel(self.critic)
+            evaluator = TrainerPerformanceEvaluator(actor_numel, critic_numel)
+            callbacks = callbacks + [evaluator]
 
-        if strategy.startswith('colossalai'):
-            self.actor_optim = HybridAdam(self.actor.parameters(), lr=5e-6)
-            self.critic_optim = HybridAdam(self.critic.parameters(), lr=5e-6)
+        if isinstance(self.strategy, ColossalAIStrategy):
+            self.actor_optim = HybridAdam(self.actor.parameters(), lr=1e-7)
+            self.critic_optim = HybridAdam(self.critic.parameters(), lr=1e-7)
         else:
-            self.actor_optim = Adam(self.actor.parameters(), lr=5e-6)
-            self.critic_optim = Adam(self.critic.parameters(), lr=5e-6)
+            self.actor_optim = Adam(self.actor.parameters(), lr=1e-7)
+            self.critic_optim = Adam(self.critic.parameters(), lr=1e-7)
 
         (self.actor, self.actor_optim), (self.critic, self.critic_optim) = \
             self.strategy.prepare((self.actor, self.actor_optim), (self.critic, self.critic_optim))
-        generate_kwargs = _set_default_generate_kwargs(self.strategy, generate_kwargs, self.actor)
 
+        # configure trainer
         self.actor_loss_fn = PolicyLoss(eps_clip)
         self.critic_loss_fn = ValueLoss(value_clip)
 
         super().__init__(experience_maker_holder_name_list,
                          train_batch_size=train_batch_size,
                          buffer_limit=buffer_limit,
-                         buffer_cpu_offload=buffer_cpu_offload,
-                         experience_batch_size=experience_batch_size,
-                         max_epochs=max_epochs,
                          dataloader_pin_memory=dataloader_pin_memory,
                          callbacks=callbacks,
-                         **generate_kwargs)
+                         debug=debug)
+        if self._debug:
+            print(f'[trainer{get_rank()}] will send state dict to {experience_maker_holder_name_list}')
+
+        self._update_lora_weights = update_lora_weights
 
     @ray.method(concurrency_group="model_io")
-    def _update_remote_makers(self):
+    @torch.no_grad()
+    def _update_remote_makers(self, fully_update: bool = False, **config):
         # TODO: balance duties
-        if is_rank_0():
-            self.update_target_holder_list(self.target_holder_name_list)
+        if not fully_update:
+            config['requires_grad_only'] = True
+        self.update_target_holder_list()
+        # mark start, ensure order
+        tasks = []
+        for target_holder in self.target_holder_list:
+            tasks.append(target_holder.update_experience_maker.remote(chunk_start=True, fully_update=fully_update))
+        ray.get(tasks)
+        # sending loop
+        tasks = []
+
+        for state_dict_shard in self._get_model_state_dict_shard(self.actor, fully_update=fully_update, **config):
             for target_holder in self.target_holder_list:
-                # TODO: reduce malloc
-                with torch.no_grad():
-                    ray.get(target_holder.update_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic()))
-                    
-    @ray.method(concurrency_group="model_io")
-    def initialize_remote_makers(self):
-        # TODO: balance duties
-        if is_rank_0():
-            self.update_target_holder_list(self.target_holder_name_list)
+                tasks.append(
+                    target_holder.update_experience_maker.remote(
+                        new_actor_state_dict=state_dict_shard,
+                        new_actor_lora_config_dict=self._get_model_lora_config_dict(self.actor),
+                        fully_update=fully_update))
+        # sending loop
+        for state_dict_shard in self._get_model_state_dict_shard(self.critic, fully_update=fully_update, **config):
             for target_holder in self.target_holder_list:
-                # TODO: reduce malloc
-                with torch.no_grad():
-                    ray.get(target_holder.initialize_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic()))
+                tasks.append(
+                    target_holder.update_experience_maker.remote(
+                        new_critic_state_dict=state_dict_shard,
+                        new_critic_lora_config_dict=self._get_model_lora_config_dict(self.critic),
+                        fully_update=fully_update))
+        ray.get(tasks)
+        # mark end
+        for target_holder in self.target_holder_list:
+            target_holder.update_experience_maker.remote(chunk_end=True, fully_update=fully_update)
 
     @ray.method(concurrency_group="compute")
     def training_step(self, experience: Experience) -> Dict[str, float]:
         self.actor.train()
         self.critic.train()
 
-        experience.to_device(torch.cuda.current_device())
         num_actions = experience.action_mask.size(1)
         action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask)
         actor_loss = self.actor_loss_fn(action_log_probs,
@@ -155,38 +185,16 @@ def strategy_save_actor_optim(self, path: str, only_rank0: bool = False) -> None
     def strategy_save_critic_optim(self, path: str, only_rank0: bool = False) -> None:
         self.strategy.save_optimizer(self.critic_optim, path, only_rank0)
 
-    def _get_unwrapped_actor(self):
-        if False:
-            pass
-        elif isinstance(self.strategy, ColossalAIStrategy):
-            ret = Actor(self.strategy._unwrap_model(self.actor))
-            return ret
-        elif isinstance(self.strategy, DDPStrategy):
-            return Actor(self.strategy._unwrap_actor(self.actor))
-        elif isinstance(self.strategy, NaiveStrategy):
-            return self.actor
-
-    def _get_unwrapped_critic(self):
-        if False:
-            pass
-        elif isinstance(self.strategy, ColossalAIStrategy):
-            ret = self.strategy._unwrap_model(self.critic)
-            return ret
-        elif isinstance(self.strategy, DDPStrategy):
-            return self.critic.module
-        elif isinstance(self.strategy, NaiveStrategy):
-            return self.critic
-
-
-def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> None:
-    origin_model = strategy._unwrap_actor(actor)
-    new_kwargs = {**generate_kwargs}
-    # use huggingface models method directly
-    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
-        new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
-
-    if 'update_model_kwargs_fn' not in generate_kwargs:
-        new_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn
-
-    return new_kwargs
-   
\ No newline at end of file
+    def _get_model_state_dict_shard(self, model: torch.nn.Module, fully_update=False, **config):
+        for state_dict in self.strategy.get_model_state_dict_shard(model, **config):
+            if not self._update_lora_weights or fully_update:
+                yield state_dict_to(state_dict)
+            else:
+                state_dict_lora, _ = LoRAConstructor.filter_state_dict_lora(state_dict)
+                yield state_dict_to(state_dict_lora)
+
+    def _get_model_lora_config_dict(self, model: torch.nn.Module):
+        if not self._update_lora_weights:
+            return None
+        unwrapped_model = self.strategy.unwrap_model(model)
+        return LoRAConstructor.extract_lora_config(unwrapped_model)
diff --git a/applications/Chat/coati/ray/example/1m1t.py b/applications/Chat/coati/ray/example/1m1t.py
deleted file mode 100644
index a6527370505b..000000000000
--- a/applications/Chat/coati/ray/example/1m1t.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import argparse
-from copy import deepcopy
-
-import pandas as pd
-import torch
-from coati.trainer import PPOTrainer
-
-
-from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
-from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
-
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
-from coati.experience_maker import NaiveExperienceMaker
-from torch.optim import Adam
-from transformers import AutoTokenizer, BloomTokenizerFast
-from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-
-from colossalai.nn.optimizer import HybridAdam
-
-import ray
-import os
-import socket
-
-def get_free_port():
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(('', 0))
-        return s.getsockname()[1]
-
-
-def get_local_ip():
-    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
-        s.connect(('8.8.8.8', 80))
-        return s.getsockname()[0]
-    
-def main(args):
-    master_addr = str(get_local_ip())
-    # trainer_env_info
-    trainer_port = str(get_free_port())
-    env_info_trainer = {'local_rank' : '0',
-                          'rank' : '0',
-                          'world_size' : '1',
-                          'master_port' : trainer_port,
-                          'master_addr' : master_addr}
-    
-    # maker_env_info
-    maker_port = str(get_free_port())
-    env_info_maker = {'local_rank' : '0',
-                        'rank' : '0',
-                        'world_size' : '1',
-                        'master_port' : maker_port,
-                        'master_addr' : master_addr}
-
-    # configure tokenizer
-    if args.model == 'gpt2':
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'bloom':
-        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'opt':
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-    else:
-        raise ValueError(f'Unsupported model "{args.model}"')
-
-    # configure Trainer
-    trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote(
-        experience_maker_holder_name_list=["maker1"],
-        strategy=args.trainer_strategy,
-        model=args.model,
-        env_info = env_info_trainer,
-        pretrained=args.pretrain,
-        lora_rank=args.lora_rank,
-        train_batch_size=args.train_batch_size,
-        buffer_limit=16,
-        experience_batch_size=args.experience_batch_size,
-        max_epochs=args.max_epochs,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-
-    # configure Experience Maker
-    experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
-        detached_trainer_name_list=["trainer1"],
-        strategy=args.maker_strategy,
-        env_info = env_info_maker,
-        experience_batch_size=args.experience_batch_size,
-        kl_coef=0.1,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-
-    # trainer send its actor and critic to experience holders.
-    ray.get(trainer_ref.initialize_remote_makers.remote())
-
-    # configure sampler
-    dataset = pd.read_csv(args.prompt_path)['prompt']
-
-    def tokenize_fn(texts):
-        # MUST padding to max length to ensure inputs of all ranks have the same length
-        # Different length may lead to hang when using gemini, as different generation steps
-        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
-        return {k: v.cuda() for k, v in batch.items()}
-
-    trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
-    num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3 # +3 for fault tolerance
-    maker_done_ref = experience_holder_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
-    
-    ray.get([trainer_done_ref, maker_done_ref])
-
-    # save model checkpoint after fitting
-    trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
-    # save optimizer checkpoint on all ranks
-    if args.need_optim_ckpt:
-        trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                                                     only_rank0=False)
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('prompt_path')
-    parser.add_argument('--trainer_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--maker_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
-    parser.add_argument('--pretrain', type=str, default=None)
-    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
-    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
-    parser.add_argument('--num_episodes', type=int, default=10)
-    parser.add_argument('--max_timesteps', type=int, default=10)
-    parser.add_argument('--update_timesteps', type=int, default=10)
-    parser.add_argument('--max_epochs', type=int, default=5)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--experience_batch_size', type=int, default=8)
-    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
-    parser.add_argument('--debug', action='store_true')
-    args = parser.parse_args()
-    ray.init(namespace=os.environ["RAY_NAMESPACE"])
-    main(args)
diff --git a/applications/Chat/coati/ray/example/1m1t.sh b/applications/Chat/coati/ray/example/1m1t.sh
deleted file mode 100644
index f7c5054c800e..000000000000
--- a/applications/Chat/coati/ray/example/1m1t.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
-        | tail -n +2 \
-        | nl -v 0 \
-        | tee /dev/tty \
-        | sort -g -k 2 \
-        | awk '{print $1}' \
-        | head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-
-export RAY_NAMESPACE="admin"
-
-python 1m1t.py "/path/to/prompts.csv" \
-    --trainer_strategy colossalai_zero2 --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \
-    --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
-    --max_epochs 10   --debug
diff --git a/applications/Chat/coati/ray/example/1m2t.py b/applications/Chat/coati/ray/example/1m2t.py
deleted file mode 100644
index 3883c364a8e0..000000000000
--- a/applications/Chat/coati/ray/example/1m2t.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import argparse
-from copy import deepcopy
-
-import pandas as pd
-import torch
-from coati.trainer import PPOTrainer
-
-
-from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
-from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
-
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
-from coati.experience_maker import NaiveExperienceMaker
-from torch.optim import Adam
-from transformers import AutoTokenizer, BloomTokenizerFast
-from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-
-from colossalai.nn.optimizer import HybridAdam
-
-import ray
-import os
-import socket
-
-
-def get_free_port():
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(('', 0))
-        return s.getsockname()[1]
-
-
-def get_local_ip():
-    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
-        s.connect(('8.8.8.8', 80))
-        return s.getsockname()[0]
-
-def main(args):
-    master_addr = str(get_local_ip())
-    # trainer_env_info
-    trainer_port = str(get_free_port())
-    env_info_trainer_1 = {'local_rank' : '0',
-                          'rank' : '0',
-                          'world_size' : '2',
-                          'master_port' : trainer_port,
-                          'master_addr' : master_addr}
-    env_info_trainer_2 = {'local_rank' : '0',
-                          'rank' : '1',
-                          'world_size' : '2',
-                          'master_port' : trainer_port,
-                          'master_addr' : master_addr}
-    # maker_env_info
-    maker_port = str(get_free_port())
-    env_info_maker_1 = {'local_rank' : '0',
-                        'rank' : '0',
-                        'world_size' : '2',
-                        'master_port' : maker_port,
-                        'master_addr' : master_addr}
-    print([env_info_trainer_1, 
-           env_info_trainer_2,
-           env_info_maker_1])
-    ray.init(dashboard_port = 1145)
-    # configure tokenizer
-    if args.model == 'gpt2':
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'bloom':
-        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'opt':
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-    else:
-        raise ValueError(f'Unsupported model "{args.model}"')
-
-    # configure Trainer
-    trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
-        experience_maker_holder_name_list=["maker1"],
-        strategy=args.trainer_strategy,
-        model=args.model,
-        env_info=env_info_trainer_1,
-        pretrained=args.pretrain,
-        lora_rank=args.lora_rank,
-        train_batch_size=args.train_batch_size,
-        buffer_limit=16,
-        experience_batch_size=args.experience_batch_size,
-        max_epochs=args.max_epochs,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-
-    trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
-        experience_maker_holder_name_list=["maker1"],
-        strategy=args.trainer_strategy,
-        model=args.model,
-        env_info=env_info_trainer_2,
-        pretrained=args.pretrain,
-        lora_rank=args.lora_rank,
-        train_batch_size=args.train_batch_size,
-        buffer_limit=16,
-        experience_batch_size=args.experience_batch_size,
-        max_epochs=args.max_epochs,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug= args.debug,
-    )
-
-    # configure Experience Maker
-    experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
-        detached_trainer_name_list=["trainer1", "trainer2"],
-        strategy=args.maker_strategy,
-        env_info=env_info_maker_1,
-        experience_batch_size=args.experience_batch_size,
-        kl_coef=0.1,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-    
-    # trainer send its actor and critic to experience holders.
-    # TODO: balance duty
-    ray.get(trainer_1_ref.initialize_remote_makers.remote())
-
-    # configure sampler
-    dataset = pd.read_csv(args.prompt_path)['prompt']
-    
-    def tokenize_fn(texts):
-        # MUST padding to max length to ensure inputs of all ranks have the same length
-        # Different length may lead to hang when using gemini, as different generation steps
-        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
-        return {k: v.cuda() for k, v in batch.items()}
-
-    trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
-    trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
-    num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs * 2 + 3  # +3 for fault tolerance
-    maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
-    
-    ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref])
-    # save model checkpoint after fitting
-    trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
-    trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
-    # save optimizer checkpoint on all ranks
-    if args.need_optim_ckpt:
-        trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                                                 only_rank0=False)
-        trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                                                 only_rank0=False)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('prompt_path')
-    parser.add_argument('--trainer_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--maker_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
-    parser.add_argument('--pretrain', type=str, default=None)
-    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
-    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
-    parser.add_argument('--num_episodes', type=int, default=10)
-    parser.add_argument('--max_timesteps', type=int, default=10)
-    parser.add_argument('--update_timesteps', type=int, default=10)
-    parser.add_argument('--max_epochs', type=int, default=5)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--experience_batch_size', type=int, default=8)
-    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
-    parser.add_argument('--debug', action='store_true')
-    args = parser.parse_args()
-    main(args)
diff --git a/applications/Chat/coati/ray/example/1m2t.sh b/applications/Chat/coati/ray/example/1m2t.sh
deleted file mode 100644
index 669f4141026c..000000000000
--- a/applications/Chat/coati/ray/example/1m2t.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
-        | tail -n +2 \
-        | nl -v 0 \
-        | tee /dev/tty \
-        | sort -g -k 2 \
-        | awk '{print $1}' \
-        | head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-
-export RAY_NAMESPACE="admin"
-
-python 1m2t.py "/path/to/prompts.csv" --model gpt2 \
-    --maker_strategy naive --trainer_strategy ddp --lora_rank 2 \
-    --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
-    --max_epochs 10  #--debug 
\ No newline at end of file
diff --git a/applications/Chat/coati/ray/example/2m1t.py b/applications/Chat/coati/ray/example/2m1t.py
deleted file mode 100644
index b655de1ab1fa..000000000000
--- a/applications/Chat/coati/ray/example/2m1t.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import argparse
-from copy import deepcopy
-
-import pandas as pd
-import torch
-from coati.trainer import PPOTrainer
-
-
-from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
-from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
-
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
-from coati.experience_maker import NaiveExperienceMaker
-from torch.optim import Adam
-from transformers import AutoTokenizer, BloomTokenizerFast
-from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-
-from colossalai.nn.optimizer import HybridAdam
-
-import ray
-import os
-import socket
-
-
-def main(args):
-    # configure tokenizer
-    if args.model == 'gpt2':
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'bloom':
-        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'opt':
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-    else:
-        raise ValueError(f'Unsupported model "{args.model}"')
-
-    # configure Trainer
-    trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote(
-        experience_maker_holder_name_list=["maker1", "maker2"],
-        strategy=args.trainer_strategy,
-        model=args.model,
-        pretrained=args.pretrain,
-        lora_rank=args.lora_rank,
-        train_batch_size=args.train_batch_size,
-        buffer_limit=16,
-        experience_batch_size=args.experience_batch_size,
-        max_epochs=args.max_epochs,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-
-    # configure Experience Maker
-    experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
-        detached_trainer_name_list=["trainer1"],
-        strategy=args.maker_strategy,
-        experience_batch_size=args.experience_batch_size,
-        kl_coef=0.1,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-    
-    experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", num_gpus=1, max_concurrency=2).remote(
-        detached_trainer_name_list=["trainer1"],
-        strategy=args.maker_strategy,
-        experience_batch_size=args.experience_batch_size,
-        kl_coef=0.1,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-
-    # trainer send its actor and critic to experience holders.
-    ray.get(trainer_ref.initialize_remote_makers.remote())
-
-    # configure sampler
-    dataset = pd.read_csv(args.prompt_path)['prompt']
-
-    def tokenize_fn(texts):
-        # MUST padding to max length to ensure inputs of all ranks have the same length
-        # Different length may lead to hang when using gemini, as different generation steps
-        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
-        return {k: v.cuda() for k, v in batch.items()}
-
-    trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
-    num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs // 2 + 3 # +3 for fault tolerance
-    maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
-    maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
-    
-    ray.get([trainer_done_ref, maker_1_done_ref, maker_2_done_ref])
-
-    # save model checkpoint after fitting
-    trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
-    # save optimizer checkpoint on all ranks
-    if args.need_optim_ckpt:
-        trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                                                     only_rank0=False)
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('prompt_path')
-    parser.add_argument('--trainer_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--maker_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
-    parser.add_argument('--pretrain', type=str, default=None)
-    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
-    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
-    parser.add_argument('--num_episodes', type=int, default=10)
-    parser.add_argument('--max_timesteps', type=int, default=10)
-    parser.add_argument('--update_timesteps', type=int, default=10)
-    parser.add_argument('--max_epochs', type=int, default=5)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--experience_batch_size', type=int, default=8)
-    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
-    parser.add_argument('--debug', action='store_true')
-    args = parser.parse_args()
-    ray.init(namespace=os.environ["RAY_NAMESPACE"])
-    main(args)
diff --git a/applications/Chat/coati/ray/example/2m1t.sh b/applications/Chat/coati/ray/example/2m1t.sh
deleted file mode 100644
index a207d4118d60..000000000000
--- a/applications/Chat/coati/ray/example/2m1t.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
-        | tail -n +2 \
-        | nl -v 0 \
-        | tee /dev/tty \
-        | sort -g -k 2 \
-        | awk '{print $1}' \
-        | head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 3
-
-export RAY_NAMESPACE="admin"
-
-python 2m1t.py "/path/to/prompts.csv" \
-    --trainer_strategy naive --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \
-    --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
-    --max_epochs 10  # --debug
diff --git a/applications/Chat/coati/ray/example/2m2t.py b/applications/Chat/coati/ray/example/2m2t.py
deleted file mode 100644
index 435c71915fc2..000000000000
--- a/applications/Chat/coati/ray/example/2m2t.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import argparse
-from copy import deepcopy
-
-import pandas as pd
-import torch
-from coati.trainer import PPOTrainer
-
-
-from coati.ray.src.experience_maker_holder import ExperienceMakerHolder
-from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer
-
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
-from coati.experience_maker import NaiveExperienceMaker
-from torch.optim import Adam
-from transformers import AutoTokenizer, BloomTokenizerFast
-from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-
-from colossalai.nn.optimizer import HybridAdam
-
-import ray
-import os
-import socket
-
-
-def get_free_port():
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(('', 0))
-        return s.getsockname()[1]
-
-
-def get_local_ip():
-    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
-        s.connect(('8.8.8.8', 80))
-        return s.getsockname()[0]
-
-def main(args):
-    master_addr = str(get_local_ip())
-    # trainer_env_info
-    trainer_port = str(get_free_port())
-    env_info_trainer_1 = {'local_rank' : '0',
-                          'rank' : '0',
-                          'world_size' : '2',
-                          'master_port' : trainer_port,
-                          'master_addr' : master_addr}
-    env_info_trainer_2 = {'local_rank' : '0',
-                          'rank' : '1',
-                          'world_size' : '2',
-                          'master_port' : trainer_port,
-                          'master_addr' : master_addr}
-    # maker_env_info
-    maker_port = str(get_free_port())
-    env_info_maker_1 = {'local_rank' : '0',
-                        'rank' : '0',
-                        'world_size' : '2',
-                        'master_port' : maker_port,
-                        'master_addr' : master_addr}
-    env_info_maker_2 = {'local_rank' : '0',
-                        'rank' : '1',
-                        'world_size' : '2',
-                        'master_port': maker_port,
-                        'master_addr' : master_addr}
-    print([env_info_trainer_1, 
-           env_info_trainer_2,
-           env_info_maker_1,
-           env_info_maker_2])
-    ray.init()
-    # configure tokenizer
-    if args.model == 'gpt2':
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'bloom':
-        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'opt':
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-    else:
-        raise ValueError(f'Unsupported model "{args.model}"')
-    
-    # configure Trainer
-    trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
-        experience_maker_holder_name_list=["maker1", "maker2"],
-        strategy=args.trainer_strategy,
-        model=args.model,
-        env_info=env_info_trainer_1,
-        pretrained=args.pretrain,
-        lora_rank=args.lora_rank,
-        train_batch_size=args.train_batch_size,
-        buffer_limit=16,
-        experience_batch_size=args.experience_batch_size,
-        max_epochs=args.max_epochs,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-
-    trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
-        experience_maker_holder_name_list=["maker1", "maker2"],
-        strategy=args.trainer_strategy,
-        model=args.model,
-        env_info=env_info_trainer_2,
-        pretrained=args.pretrain,
-        lora_rank=args.lora_rank,
-        train_batch_size=args.train_batch_size,
-        buffer_limit=16,
-        experience_batch_size=args.experience_batch_size,
-        max_epochs=args.max_epochs,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-
-    # configure Experience Maker
-    experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
-        detached_trainer_name_list=["trainer1", "trainer2"],
-        strategy=args.maker_strategy,
-        env_info=env_info_maker_1,
-        experience_batch_size=args.experience_batch_size,
-        kl_coef=0.1,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-    
-    experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote(
-        detached_trainer_name_list=["trainer1", "trainer2"],
-        strategy=args.maker_strategy,
-        env_info=env_info_maker_2,
-        experience_batch_size=args.experience_batch_size,
-        kl_coef=0.1,
-        #kwargs:
-        max_length=128,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        debug=args.debug,
-    )
-    
-    # trainer send its actor and critic to experience holders.
-    # TODO: balance duty
-    ray.get(trainer_1_ref.initialize_remote_makers.remote())
-
-    # configure sampler
-    dataset = pd.read_csv(args.prompt_path)['prompt']
-    
-    def tokenize_fn(texts):
-        # MUST padding to max length to ensure inputs of all ranks have the same length
-        # Different length may lead to hang when using gemini, as different generation steps
-        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
-        return {k: v.cuda() for k, v in batch.items()}
-
-    trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
-    trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps)
-    num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3  # +3 for fault tolerance
-    maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
-    maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker)
-    
-    ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref, maker_2_done_ref])
-    # save model checkpoint after fitting
-    trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
-    trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True)
-    # save optimizer checkpoint on all ranks
-    if args.need_optim_ckpt:
-        trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                                                 only_rank0=False)
-        trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                                                 only_rank0=False)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('prompt_path')
-    parser.add_argument('--trainer_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--maker_strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
-    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
-    parser.add_argument('--pretrain', type=str, default=None)
-    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
-    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
-    parser.add_argument('--num_episodes', type=int, default=10)
-    parser.add_argument('--max_timesteps', type=int, default=10)
-    parser.add_argument('--update_timesteps', type=int, default=10)
-    parser.add_argument('--max_epochs', type=int, default=5)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--experience_batch_size', type=int, default=8)
-    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
-    parser.add_argument('--debug', action='store_true')
-    args = parser.parse_args()
-    main(args)
diff --git a/applications/Chat/coati/ray/example/2m2t.sh b/applications/Chat/coati/ray/example/2m2t.sh
deleted file mode 100644
index fb4024766c54..000000000000
--- a/applications/Chat/coati/ray/example/2m2t.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
-        | tail -n +2 \
-        | nl -v 0 \
-        | tee /dev/tty \
-        | sort -g -k 2 \
-        | awk '{print $1}' \
-        | head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-
-export RAY_NAMESPACE="admin"
-
-python 2m2t.py "path/to/prompts.csv" \
-    --maker_strategy naive --trainer_strategy colossalai_zero2 --lora_rank 2 \
-    --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
-    --max_epochs 10  --debug
\ No newline at end of file
diff --git a/applications/Chat/coati/ray/experience_maker_holder.py b/applications/Chat/coati/ray/experience_maker_holder.py
new file mode 100644
index 000000000000..8551ef1eacef
--- /dev/null
+++ b/applications/Chat/coati/ray/experience_maker_holder.py
@@ -0,0 +1,271 @@
+import os
+import time
+import tracemalloc
+from copy import deepcopy
+from threading import Lock
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+import ray
+import torch
+import torch.nn as nn
+from coati.experience_maker import Experience, ExperienceMaker, NaiveExperienceMaker
+from coati.models.base import Actor, Critic, RewardModel
+from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
+from coati.trainer.callbacks import Callback
+from coati.trainer.strategies import Strategy
+from coati.trainer.strategies.sampler import DistributedSampler
+from ray.exceptions import GetTimeoutError
+from torch import Tensor
+from tqdm import tqdm
+
+from .callbacks import ExperienceMakerPerformanceEvaluator, MakerCallback
+from .utils import (get_model_numel, 
+                    get_rank, 
+                    get_world_size, 
+                    is_rank_0, 
+                    set_dist_env,
+                    state_dict_to)
+from .lora_constructor import LoRAConstructor
+
+@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1})
+class ExperienceMakerHolder:
+    '''
+    Args:
+        detached_trainer_name_list: str list to get ray actor handles
+        strategy_fn: a function that returns the Strategy used by this maker
+        kl_coef: the coefficient of kl divergence loss
+        sync_models_from_trainers: whether to sync models from trainers. If True, you must call sync_models_to_remote_makers() in trainers to sync models.
+    '''
+
+    def __init__(
+            self,
+            detached_trainer_name_list: List[str],
+            strategy_fn: Callable[[], Strategy],
+    # a function that returns (actor, critic, reward_model, initial_model)
+            model_fn: Callable[[], Tuple[Actor, Critic, RewardModel, Actor]],
+            env_info: Dict[str, str] = None,
+            sync_models_from_trainers: bool = False,
+            buffer_cpu_offload: bool = True,
+            kl_coef: float = 0.1,
+            callbacks: List[MakerCallback] = [],
+            eval_performance: bool = False,
+            debug: bool = False,
+            update_lora_weights: bool = False,
+            **generate_kwargs):
+        # set environment variables
+        if env_info:
+            set_dist_env(env_info=env_info)
+        self.target_trainer_list = []
+        assert len(detached_trainer_name_list) > 0
+        self._detached_trainer_name_list = detached_trainer_name_list
+        self.strategy = strategy_fn()
+        self.buffer_cpu_offload = buffer_cpu_offload
+        self.kl_coef = kl_coef
+        # init models
+        with self.strategy.model_init_context():
+            actor, critic, reward_model, initial_model = model_fn()
+        self.generate_kwargs = _set_default_generate_kwargs(generate_kwargs, actor)
+        if eval_performance:
+            actor_numel = get_model_numel(actor)
+            critic_numel = get_model_numel(critic)
+            initial_model_numel = get_model_numel(initial_model)
+            reward_model_numel = get_model_numel(reward_model)
+            evaluator = ExperienceMakerPerformanceEvaluator(actor_numel, critic_numel, initial_model_numel,
+                                                            reward_model_numel)
+            callbacks = callbacks + [evaluator]
+
+        actor, critic, reward_model, initial_model = self.strategy.prepare(actor, critic, reward_model, initial_model)
+        self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef)
+        self.callbacks = callbacks
+
+        self._model_visit_lock = Lock()
+
+        self._is_fully_initialized = not sync_models_from_trainers
+
+        self._debug = debug
+        self._update_lora_weights = update_lora_weights
+        if self._update_lora_weights:
+            self.actor_lora_constructor = LoRAConstructor()
+            self.critic_lora_constructor = LoRAConstructor()
+
+        self.target_auto_balance = False
+
+        self._target_idx = 0
+
+        if self._debug:
+            print(f'[maker{get_rank()}] will send items to {self._detached_trainer_name_list}')
+            if not self._is_fully_initialized:
+                print(f'[maker{get_rank()}] Waiting for INIT')
+
+    def _get_ready(self):
+        while not self._fully_initialized():
+            time.sleep(1.0)
+
+    def _fully_initialized(self):
+        return self._is_fully_initialized
+
+    def _init_target_trainer_list(self):
+        if len(self.target_trainer_list) > 0:
+            return
+        for name in self._detached_trainer_name_list:
+            self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
+
+    # copy from ../trainer/base.py
+    @ray.method(concurrency_group="compute")
+    def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience:
+        if isinstance(inputs, Tensor):
+            return self.experience_maker.make_experience(inputs, **self.generate_kwargs)
+        elif isinstance(inputs, dict):
+            return self.experience_maker.make_experience(**inputs, **self.generate_kwargs)
+        else:
+            raise ValueError(f'Unsupported input type "{type(inputs)}"')
+
+    @ray.method(concurrency_group="experience_io")
+    def _send_items(self, experience: Experience) -> None:
+        self._init_target_trainer_list()
+        items = split_experience_batch(experience)
+        items_per_trainer = [[] for _ in range(len(self.target_trainer_list))]
+        for item in items:
+            items_per_trainer[self._target_idx].append(item)
+            self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list)
+        for i, target_trainer in enumerate(self.target_trainer_list):
+            if len(items_per_trainer[i]) > 0:
+                target_trainer.buffer_extend.remote(items_per_trainer[i])
+
+    def _inference_step(self, batch) -> None:
+        self._on_batch_start()
+        with self._model_visit_lock:
+            self._on_make_experience_start()
+            experience = self._make_experience(batch)
+            self._on_make_experience_end(experience)
+        self._on_send_start()
+        if self.buffer_cpu_offload:
+            experience.to_device('cpu')
+        self._send_items(experience)
+        self._on_send_end()
+        self._on_batch_end()
+
+    def workingloop(self, dataloader_fn: Callable[[], Iterable], num_epochs: int = 1, num_steps: int = 0):
+        """Working loop of the experience maker.
+
+        Args:
+            dataloader_fn (Callable[[], Iterable]): A function that returns a dataloader.
+            num_epochs (int, optional): Iterate the dataloader for number of epochs. Defaults to 1.
+            num_steps (int, optional): Iterate the dataloader for this number of steps. If this value > 0, num_epochs will be ignored. Defaults to 0.
+        """
+        self._get_ready()
+        self._on_loop_start()
+        dataloader = dataloader_fn()
+        if num_steps > 0:
+            # ignore num epochs
+            it = iter(dataloader)
+            for _ in tqdm(range(num_steps), desc='ExperienceMaker', disable=not is_rank_0()):
+                try:
+                    batch = next(it)
+                except StopIteration:
+                    it = iter(dataloader)
+                    batch = next(it)
+                self._inference_step(batch)
+        else:
+            with tqdm(total=num_epochs * len(dataloader), desc='ExperienceMaker', disable=not is_rank_0()) as pbar:
+                for _ in range(num_epochs):
+                    for batch in dataloader:
+                        self._inference_step(batch)
+                        pbar.update()
+        self._on_loop_end()
+
+    @ray.method(concurrency_group="model_io")
+    def update_experience_maker(self,
+                                new_actor_state_dict: Dict[str, Any] = None,
+                                new_actor_lora_config_dict: Dict[str, Any] = None,
+                                new_critic_state_dict: Dict[str, Any] = None,
+                                new_critic_lora_config_dict: Dict[str, Any] = None,
+                                fully_update: bool = False,
+                                chunk_start: bool = None,
+                                chunk_end: bool = None):
+        '''
+            called by trainer
+            chunk_start: Set True at the first call, before sending state_dict calls.
+            chunk_end: Set True at the last call, after sending state_dict calls.
+            fully_update: Set True if you want to sync models when initializing
+
+            TODO: load_state_dict integrate with model-sharding strategy
+        '''
+        _watch_memory = self._debug
+        if chunk_start:
+            if self._debug:
+                print("[maker] UPDATE ")
+            if _watch_memory:
+                tracemalloc.start()
+            self._model_visit_lock.acquire()
+
+        with torch.no_grad():
+            if new_actor_state_dict is not None:
+                if not self._update_lora_weights or fully_update:
+                    self.experience_maker.actor.model.load_state_dict(new_actor_state_dict, strict=False)
+                else:
+                    new_actor_state_dict = state_dict_to(new_actor_state_dict, device=torch.cuda.current_device())
+                    state_dict_increasae = self.actor_lora_constructor.reconstruct_increase(new_actor_state_dict, new_actor_lora_config_dict)
+                    self.actor_lora_constructor.load_state_dict_increase(self.experience_maker.actor.model, state_dict_increasae)
+            if new_critic_state_dict is not None:
+                if not self._update_lora_weights or fully_update:
+                    self.experience_maker.critic.load_state_dict(new_critic_state_dict, strict=False)
+                else:
+                    new_critic_state_dict = state_dict_to(new_critic_state_dict, device=torch.cuda.current_device())
+                    state_dict_increasae = self.critic_lora_constructor.reconstruct_increase(new_critic_state_dict, new_critic_lora_config_dict)
+                    self.critic_lora_constructor.load_state_dict_increase(self.experience_maker.critic, state_dict_increasae)
+
+        # the lock must be released only after both actor and critic have been updated
+        if chunk_end:
+            self._model_visit_lock.release()
+            if _watch_memory:
+                current, peak = tracemalloc.get_traced_memory()
+                print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
+                tracemalloc.stop()
+            if fully_update:
+                self._is_fully_initialized = True
+
+    def _on_make_experience_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_start()
+
+    def _on_make_experience_end(self, experience: Experience) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_end(experience)
+
+    def _on_loop_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_loop_start()
+
+    def _on_loop_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_loop_end()
+
+    def _on_send_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_send_start()
+
+    def _on_send_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_send_end()
+
+    def _on_batch_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_start()
+
+    def _on_batch_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_end()
+
+
+def _set_default_generate_kwargs(generate_kwargs: dict, actor: Actor) -> None:
+    origin_model = actor.model
+    new_kwargs = {**generate_kwargs}
+    # use huggingface models method directly
+    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
+        new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
+
+    if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(origin_model, '_update_model_kwargs_for_generation'):
+        new_kwargs['update_model_kwargs_fn'] = origin_model._update_model_kwargs_for_generation
+
+    return new_kwargs
diff --git a/applications/Chat/coati/ray/lora_constructor.py b/applications/Chat/coati/ray/lora_constructor.py
new file mode 100644
index 000000000000..599a58248728
--- /dev/null
+++ b/applications/Chat/coati/ray/lora_constructor.py
@@ -0,0 +1,122 @@
+from typing import Any, Callable, Dict, List, Optional
+from collections import OrderedDict
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from loralib.layers import LoRALayer
+from coati.models.lora import LoraLinear
+
+
+@dataclass
+class LoRAConfig:
+    r: int = 0
+    lora_alpha: int = 1
+    lora_dropout: float = 0
+    fan_in_fan_out: bool = False
+
+
+class LoRAConstructor:
+    '''
+    Tools for reconstructing a model from a remote LoRA model.
+    (Transferring only LoRA data costs much less!)
+    Usage:
+        Step 1 (Sender):
+            filter_state_dict_lora()
+            
+        Step 2 (Sender, Optional):
+            extract_lora_config()
+            
+        Step 3 (Sender):
+            send state_dict_lora and lora_config_dict
+            
+        Step 4 (Receiver):
+            reconstruct_increase()
+            
+        Step 5 (Receiver):
+            load_state_dict_increase()
+            
+    '''
+
+    def __init__(self):
+        self.lora_config_dict = None
+
+    def register_lora_config(self, lora_config_dict: Dict[str, Any]):
+        self.lora_config_dict = lora_config_dict
+
+    def reconstruct_increase(self, state_dict_lora: Dict[str, Any], lora_config_dict: Dict[str, Any]):
+        '''
+            xxx.lora_A, xxx.lora_B -->> xxx.weight
+            Warning: the xxx.weight here is the increment actually.
+        '''
+        if lora_config_dict is not None:
+            self.register_lora_config(lora_config_dict)
+
+        state_dict_increasae = OrderedDict()
+        config_iter = iter(self.lora_config_dict.items())
+        lora_A, lora_B, layer_prefix = None, None, None
+        for k, v in state_dict_lora.items():
+            if k.rpartition('.')[-1] == 'lora_A':
+                lora_A = v
+                layer_prefix = k.rpartition('.')[0]
+            elif k.rpartition('.')[-1] == 'lora_B':
+                assert layer_prefix == k.rpartition('.')[0], "unmatched (lora_A, lora_B) pair"
+                layer_prefix_2, config = next(config_iter)
+                assert layer_prefix_2 == layer_prefix, "unmatched (state_dict, config_dict) pair"
+                lora_B = v
+                weight_data_increase = self._compute(lora_A, lora_B, config)
+                state_dict_increasae[layer_prefix + '.weight'] = weight_data_increase
+                lora_A, lora_B, layer_prefix = None, None, None
+            else:
+                raise ValueError('unexpected key')
+        return state_dict_increasae
+
+    def _compute(self, lora_A, lora_B, config=LoRAConfig()):
+        def T(w):
+            return w.T if config.fan_in_fan_out else w
+        if config.r > 0:
+            scaling = config.lora_alpha / config.r
+            weight_data_increase = T(lora_B @ lora_A) * scaling
+            return weight_data_increase
+        return 0
+
+    def load_state_dict_increase(self, model: nn.Module, state_dict_increasae: Dict[str, Any]):
+        '''
+        The final reconstruction step
+        '''
+        # naive approach
+        model.load_state_dict({k: v + model.state_dict()[k] for k, v in state_dict_increasae.items()}, strict=False)
+
+    @staticmethod
+    def filter_state_dict_lora(state_dict: Dict[str, Any], keep_non_lora=False):
+        '''
+        if keep_non_lora, also return non_lora state_dict
+        '''
+        state_dict_lora = OrderedDict()
+        state_dict_non_lora = OrderedDict()
+        for k, v in state_dict.items():
+            if 'lora_A' in k or 'lora_B' in k:
+                state_dict_lora[k] = v
+            elif keep_non_lora:
+                state_dict_non_lora[k] = v
+        if keep_non_lora:
+            return state_dict_lora, state_dict_non_lora
+        else:
+            return state_dict_lora, None
+
+    @staticmethod
+    def extract_lora_config(model: nn.Module) -> Dict[str, LoRAConfig]:
+        '''
+        extract LoraLinear model.
+        return OrderedDict(): name -> LoRAConfig
+        '''
+        lora_config_dict = OrderedDict()
+
+        for name, child in model.named_modules():
+            if isinstance(child, LoraLinear):
+                lora_config_dict[name] = LoRAConfig(r=child.r,
+                                                    lora_alpha=child.lora_alpha,
+                                                    lora_dropout=child.lora_dropout,
+                                                    fan_in_fan_out=child.fan_in_fan_out)
+
+        return lora_config_dict
diff --git a/applications/Chat/coati/ray/src/__init__.py b/applications/Chat/coati/ray/src/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/applications/Chat/coati/ray/src/detached_trainer_base.py b/applications/Chat/coati/ray/src/detached_trainer_base.py
deleted file mode 100644
index f1ed1ec71499..000000000000
--- a/applications/Chat/coati/ray/src/detached_trainer_base.py
+++ /dev/null
@@ -1,121 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Optional, Union
-from tqdm import tqdm
-from coati.trainer.callbacks import Callback
-from coati.experience_maker import Experience
-import ray
-import os
-
-from .detached_replay_buffer import DetachedReplayBuffer
-from .utils import is_rank_0
-
-class DetachedTrainer(ABC):
-    '''
-        Base class for detached rlhf trainers. 
-        'detach' means that the experience maker is detached compared to a normal Trainer.
-        Please set name attribute during init:
-            >>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote()
-            So an ExperienceMakerHolder can reach the detached_replay_buffer by Actor's name.
-    Args:
-        detached_strategy (DetachedStrategy): the strategy to use for training
-        detached_replay_buffer_ref (ObjectRef[DetachedReplayBuffer]): the replay buffer to use for training
-        experience_batch_size (int, defaults to 8): the batch size to use for experience generation
-        max_epochs (int, defaults to 1): the number of epochs of training process
-        data_loader_pin_memory (bool, defaults to True): whether to pin memory for data loader
-        callbacks (List[Callback], defaults to []): the callbacks to call during training process
-        generate_kwargs (dict, optional): the kwargs to use while model generating
-    '''
-
-    def __init__(self,
-                 experience_maker_holder_name_list: List[str],
-                 train_batch_size: int = 8,
-                 buffer_limit: int = 0,
-                 buffer_cpu_offload: bool = True,
-                 experience_batch_size: int = 8,
-                 max_epochs: int = 1,
-                 dataloader_pin_memory: bool = True,
-                 callbacks: List[Callback] = [],
-                 **generate_kwargs) -> None:
-        super().__init__()
-        self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit, cpu_offload=buffer_cpu_offload)
-        self.experience_batch_size = experience_batch_size
-        self.max_epochs = max_epochs
-        self.dataloader_pin_memory = dataloader_pin_memory
-        self.callbacks = callbacks
-        self.generate_kwargs = generate_kwargs
-        self.target_holder_name_list = experience_maker_holder_name_list
-        self.target_holder_list = []
-
-    def update_target_holder_list(self, experience_maker_holder_name_list):
-        self.target_holder_name_list = experience_maker_holder_name_list
-        self.target_holder_list = []
-        for name in self.target_holder_name_list:
-            self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
-
-    @abstractmethod
-    def _update_remote_makers(self):
-        pass
-
-    @abstractmethod
-    def training_step(self, experience: Experience) -> Dict[str, Any]:
-        pass
-
-    def _learn(self):
-        pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0())
-        for _ in pbar:
-            if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-                print("[trainer] sampling exp")
-            experience = self._buffer_sample()
-            if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-                print("[trainer] training step")
-            metrics = self.training_step(experience)
-            if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-                print("[trainer] step over")
-            pbar.set_postfix(metrics)
-
-    def fit(self, num_episodes: int = 50000, max_timesteps: int = 500, update_timesteps: int = 5000) -> None:
-        self._on_fit_start()
-        for episode in range(num_episodes):
-            self._on_episode_start(episode)
-            for timestep in tqdm(range(max_timesteps // update_timesteps),
-                                 desc=f'Episode [{episode+1}/{num_episodes}]',
-                                 disable=not is_rank_0()):
-                self._learn()
-                self._update_remote_makers()
-            self._on_episode_end(episode)
-        self._on_fit_end()
-
-    @ray.method(concurrency_group="buffer_length")
-    def buffer_get_length(self):
-        # called by ExperienceMakerHolder
-        if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-            print("[trainer]                telling length")
-        return self.detached_replay_buffer.get_length()
-
-    @ray.method(concurrency_group="buffer_append")
-    def buffer_append(self, experience: Experience):
-        # called by ExperienceMakerHolder
-        if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-            # print(f"[trainer] receiving exp. Current buffer length: {self.detached_replay_buffer.get_length()}")
-            print(f"[trainer]               receiving exp.")
-        self.detached_replay_buffer.append(experience)
-
-    @ray.method(concurrency_group="buffer_sample")
-    def _buffer_sample(self):
-        return self.detached_replay_buffer.sample()
-
-    def _on_fit_start(self) -> None:
-        for callback in self.callbacks:
-            callback.on_fit_start()
-
-    def _on_fit_end(self) -> None:
-        for callback in self.callbacks:
-            callback.on_fit_end()
-
-    def _on_episode_start(self, episode: int) -> None:
-        for callback in self.callbacks:
-            callback.on_episode_start(episode)
-
-    def _on_episode_end(self, episode: int) -> None:
-        for callback in self.callbacks:
-            callback.on_episode_end(episode)
diff --git a/applications/Chat/coati/ray/src/experience_maker_holder.py b/applications/Chat/coati/ray/src/experience_maker_holder.py
deleted file mode 100644
index 0ae4e3125b70..000000000000
--- a/applications/Chat/coati/ray/src/experience_maker_holder.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import torch
-from typing import Any, Callable, Dict, List, Optional, Union
-import ray
-from ray.exceptions import GetTimeoutError
-from torch import Tensor
-import torch.nn as nn
-from coati.models.base import Actor, Critic, RewardModel
-from coati.trainer.strategies.sampler import DistributedSampler
-from coati.trainer.strategies import Strategy
-from coati.experience_maker import NaiveExperienceMaker, Experience, ExperienceMaker
-
-from copy import deepcopy
-from threading import Lock
-import time
-import os
-
-
-from .utils import is_rank_0, get_strategy_from_args, set_dist_env
-
-
-@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1})
-class ExperienceMakerHolder:
-    '''
-    Args:
-        detached_trainer_name_list: str list to get ray actor handles
-        strategy: 
-        experience_batch_size: batch size of generated experience
-        kl_coef: the coefficient of kl divergence loss
-    '''
-
-    def __init__(self,
-                 detached_trainer_name_list: List[str],
-                 strategy: str,
-                 env_info: Dict[str, str] = None,
-                 experience_batch_size: int = 8,
-                 kl_coef: float = 0.1,
-                 **generate_kwargs):
-        # set environment variables
-        if env_info:
-            set_dist_env(env_info=env_info)
-        self.target_trainer_list = []
-        for name in detached_trainer_name_list:
-            self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"]))
-        self.strategy_str = strategy
-        self.strategy = get_strategy_from_args(strategy)
-        self.experience_batch_size = experience_batch_size
-        self.kl_coef = kl_coef
-        self.generate_kwargs = generate_kwargs
-        # Need a trainer to give an actor and a critic via initialize_experience_maker(...)
-        actor, critic, reward_model, initial_model = None, None, None, None
-        self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef)
-        self._model_visit_lock = Lock()
-        self.fully_initialized = False
-        if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-            print('[maker] Waiting for INIT')
-
-    def _get_ready(self):
-        while not self.fully_initialized:
-            time.sleep(1.0)
-
-    def update_target_trainer_list(self, detached_trainer_name_list):
-        self.target_trainer_list = []
-        for name in detached_trainer_name_list:
-            self.target_trainer_list.append(ray.get_actor(name))
-
-    # copy from ../trainer/base.py
-    @ray.method(concurrency_group="compute")
-    def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience:
-        self._get_ready()
-        if isinstance(inputs, Tensor):
-            return self.experience_maker.make_experience(inputs, **self.generate_kwargs)
-        elif isinstance(inputs, dict):
-            return self.experience_maker.make_experience(**inputs, **self.generate_kwargs)
-        else:
-            raise ValueError(f'Unsupported input type "{type(inputs)}"')
-
-    @ray.method(concurrency_group="experience_io")
-    def _send_experience(self, experience):
-        '''
-        ignore it
-
-        # choose a trainer that has the least experience batch in its detached_replay_buffer
-        chosen_trainer = None
-        min_length = None
-        if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-            print("[maker] choosing target trainer")
-        while chosen_trainer is None:
-            for target_trainer in self.target_trainer_list:
-                try:
-                    temp_length = ray.get(target_trainer.buffer_get_length.remote(), timeout=0.1)
-                    if min_length is None:
-                        min_length = temp_length
-                        chosen_trainer = target_trainer
-                    else:
-                        if temp_length < min_length:
-                            min_length = temp_length
-                            chosen_trainer = target_trainer
-                except GetTimeoutError:
-                    pass
-                    
-        if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-            print(f"[maker] sending exp to {chosen_trainer}")
-        chosen_trainer.buffer_append.remote(experience)
-        '''
-        # 
-        if not hasattr(self, "_target_idx"):
-            self._target_idx = 0
-        chosen_trainer = self.target_trainer_list[self._target_idx]
-        if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-            print(f"[maker] sending exp to {chosen_trainer}")
-        chosen_trainer.buffer_append.remote(experience)
-        self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list)
-
-    def workingloop(self, dataset, tokenizer: Optional[Callable[[Any], dict]] = None, times=5000 * 50000):
-        self._get_ready()
-        sampler = self.strategy.setup_sampler(dataset)
-        for _ in range(times):
-            rand_prompts = sampler.sample(self.experience_batch_size)
-            if tokenizer is not None:
-                inputs = tokenizer(rand_prompts)
-            else:
-                inputs = rand_prompts
-            self._model_visit_lock.acquire()
-            experience = self._make_experience(inputs=inputs)
-            self._model_visit_lock.release()
-            self._send_experience(experience=experience)
-
-    @ray.method(concurrency_group="model_io")
-    def initialize_experience_maker(self, init_actor: Actor, init_critic: Critic):
-        '''
-        called by trainer. Only once.
-        '''
-        # TODO: reduce malloc
-        if self.fully_initialized:
-            return
-        if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-            print('[maker] INIT')
-        with torch.no_grad():
-            with self.strategy.model_init_context():
-                actor = init_actor
-                critic = init_critic
-                initial_model = deepcopy(actor)
-                reward_model = RewardModel(deepcopy(critic.model),
-                                           deepcopy(critic.value_head)).to(torch.cuda.current_device())
-            if self.strategy_str != 'colossalai_gemini':
-                actor.to(torch.float16).to(torch.cuda.current_device())
-                critic.to(torch.float16).to(torch.cuda.current_device())
-                initial_model.to(torch.float16).to(torch.cuda.current_device())
-                reward_model.to(torch.float16).to(torch.cuda.current_device())
-
-            self.experience_maker.actor = self.strategy.prepare(actor)
-            self.experience_maker.critic = self.strategy.prepare(critic)
-            self.experience_maker.initial_model = self.strategy.prepare(initial_model)
-            self.experience_maker.reward_model = self.strategy.prepare(reward_model)
-        self.fully_initialized = True
-
-    @ray.method(concurrency_group="model_io")
-    def update_experience_maker(self, new_actor: Actor, new_critic: Critic):
-        '''
-            called by trainer
-        '''
-        # TODO: reduce malloc
-        self._model_visit_lock.acquire()
-        with torch.no_grad():
-            if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True:
-                print("[maker] UPDATE ")
-            if self.strategy_str != 'colossalai_gemini':
-                new_actor.to(torch.float16).to(torch.cuda.current_device())
-                new_critic.to(torch.float16).to(torch.cuda.current_device())
-            self.experience_maker.actor = self.strategy.prepare(new_actor)
-            self.experience_maker.critic = self.strategy.prepare(new_critic)
-        self._model_visit_lock.release()
diff --git a/applications/Chat/coati/ray/src/pipeline_strategy.py b/applications/Chat/coati/ray/src/pipeline_strategy.py
deleted file mode 100644
index 7ecb5d7d86d6..000000000000
--- a/applications/Chat/coati/ray/src/pipeline_strategy.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# WIP
-
-
-from coati.trainer.strategies import Strategy
-from coati.trainer.strategies import NaiveStrategy
-from coati.models.base import Actor, RewardModel, Critic
-
-import numpy as np
-import torch
-from torch._C._distributed_rpc import _is_current_rpc_agent_set
-
-import colossalai
-from colossalai.pipeline.pipeline_process_group import ppg
-from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
-from colossalai.fx import ColoTracer
-from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass
-from colossalai.pipeline.middleware.adaptor import get_fx_topology
-
-
-import os
-from functools import partial
-import random
-
-rpc_is_initialized = _is_current_rpc_agent_set
-
-class PipelineModel(torch.nn.Module):
-    '''
-    Actor has 2 kinds of jobs: forward and generate. 
-        better to just pipeline the inner model
-    '''
-    def __init__(self,
-                 model: torch.nn.Module,
-                 stage_num: int,
-                 num_microbatches: int,
-                 data_kwargs = None,
-                 ):
-        super().__init__()
-        # create partition module
-        def create_partition_module(pp_rank:int, stage_num: int, model, data_kwargs):
-            model.eval()
-            tracer = ColoTracer()
-            meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
-            graph = tracer.trace(root=model, meta_args=meta_args)
-            gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
-            annotated_model = balanced_split_pass(gm, stage_num)
-            top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
-            topo = get_fx_topology(top_module)
-            for submodule in split_submodules:
-                if isinstance(submodule, torch.fx.GraphModule):
-                    setattr(submodule, '_topo', topo)
-            return split_submodules[pp_rank + 1]
-    
-        def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int):
-            partition = create_partition_module(pp_rank, stage_num, model, data_kwargs)
-            return partition
-        self.inference_engine = OneFOneBPipelineEngine(
-            partition_fn=partial(partition, model, data_kwargs),
-            stage_num=stage_num,
-            num_microbatches=num_microbatches,
-            device='cuda',
-        )
-
-    def forward(self,
-                **model_inputs):
-        return self.inference_engine.forward_backward(**model_inputs, forward_only=True)
-
-
-
-class PPStrategy(NaiveStrategy):
-    """
-        Strategy for Pipeline inference (inference only!)
-        
-        master node only
-    """
-    def __init__(
-        self,
-        seed: int = 42
-    ):
-        self.seed = seed
-        super().__init__()
-        
-        
-    def setup_distributed(self) -> None:
-        colossalai.launch_from_torch({}, seed=self.seed)
-        ppg.set_global_info(rank = int(os.environ['RANK']),
-                            world_size=int(os.environ['WORLD_SIZE']),
-                            dp_degree=1,
-                            tp_degree=1,
-                            num_worker_threads=128,
-                            device="cuda")
-        
-    def model_init_context(self):
-        return super().model_init_context()
-    
-    def setup_model(self, model: torch.nn.Module) -> torch.nn.Module:
-        if isinstance(model, Actor) or \
-            isinstance(model, RewardModel) or \
-            isinstance(model, Critic):
-            model.model = PipelineModel(model.model)
-
-    def set_seed(self, seed: int) -> None:
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-
diff --git a/applications/Chat/coati/ray/src/utils.py b/applications/Chat/coati/ray/src/utils.py
deleted file mode 100644
index c750879b6d18..000000000000
--- a/applications/Chat/coati/ray/src/utils.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import torch.distributed as dist
-from typing import Any, Callable, Dict, List, Optional
-from coati.models.bloom import BLOOMActor, BLOOMCritic
-from coati.models.gpt import GPTActor, GPTCritic
-from coati.models.opt import OPTActor, OPTCritic
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
-import torch
-import os
-
-def is_rank_0() -> bool:
-    return not dist.is_initialized() or dist.get_rank() == 0
-
-
-def get_cuda_actor_critic_from_args(model: str, pretrained: str = None, lora_rank=0):
-    if model == 'gpt2':
-        actor = GPTActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
-        critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
-    elif model == 'bloom':
-        actor = BLOOMActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
-        critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
-    elif model == 'opt':
-        actor = OPTActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
-        critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device())
-    else:
-        raise ValueError(f'Unsupported model "{model}"')
-    return actor, critic
-
-
-def get_strategy_from_args(strategy: str):
-    if strategy == 'naive':
-        strategy_ = NaiveStrategy()
-    elif strategy == 'ddp':
-        strategy_ = DDPStrategy()
-    elif strategy == 'colossalai_gemini':
-        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
-    elif strategy == 'colossalai_zero2':
-        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda')
-    else:
-        raise ValueError(f'Unsupported strategy "{strategy}"')
-    return strategy_
-
-
-def set_dist_env(env_info: Dict[str, str]):
-    os.environ["RANK"] = env_info['rank']
-    os.environ["LOCAL_RANK"] = env_info['local_rank']
-    os.environ["WORLD_SIZE"] = env_info['world_size']
-    os.environ['MASTER_PORT'] = env_info['master_port']
-    os.environ['MASTER_ADDR'] = env_info['master_addr']
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
new file mode 100644
index 000000000000..4361ee236771
--- /dev/null
+++ b/applications/Chat/coati/ray/utils.py
@@ -0,0 +1,152 @@
+import os
+from typing import Any, Callable, Dict, List, Optional
+from collections import OrderedDict
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
+from coati.models.gpt import GPTRM, GPTActor, GPTCritic
+from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
+from coati.models.opt import OPTRM, OPTActor, OPTCritic
+from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.utils import prepare_llama_tokenizer_and_embedding
+from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer, RobertaTokenizer
+
+
+def is_rank_0() -> bool:    # True on rank 0, and also whenever torch.distributed is not initialized
+    return not dist.is_initialized() or dist.get_rank() == 0
+
+
+def get_rank() -> int:    # rank in the default process group; 0 when torch.distributed is not initialized
+    return dist.get_rank() if dist.is_initialized() else 0
+
+
+def get_world_size() -> int:    # size of the default process group; 1 when torch.distributed is not initialized
+    return dist.get_world_size() if dist.is_initialized() else 1
+
+
+def get_actor_from_args(model: str, pretrained: str = None, config=None, lora_rank=0):    # build the actor for a model-family name
+    if model == 'gpt2':
+        actor = GPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
+    elif model == 'bloom':
+        actor = BLOOMActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
+    elif model == 'opt':
+        actor = OPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
+    elif model == 'llama':
+        actor = LlamaActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
+    elif model == 'roberta':
+        actor = RoBERTaActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
+    else:
+        raise ValueError(f'Unsupported actor model "{model}"')    # any other family name is rejected
+    return actor    # returned as constructed; no device or dtype move happens here
+
+
+def get_critic_from_args(model: str, pretrained: str = None, config=None, lora_rank=0):    # build the critic for a model-family name
+    if model == 'gpt2':
+        critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
+    elif model == 'bloom':
+        critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
+    elif model == 'opt':
+        critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
+    elif model == 'llama':
+        critic = LlamaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
+    elif model == 'roberta':
+        critic = RoBERTaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
+    else:
+        raise ValueError(f'Unsupported critic model "{model}"')    # names the critic factory, in line with the actor factory's message
+    return critic    # every supported family is built with use_action_mask=True
+
+
+def get_reward_model_from_args(model: str, pretrained: str = None, config=None):    # build the reward model for a model-family name
+    if model == 'gpt2':
+        reward_model = GPTRM(pretrained=pretrained, config=config)
+    elif model == 'bloom':
+        reward_model = BLOOMRM(pretrained=pretrained, config=config)
+    elif model == 'opt':
+        reward_model = OPTRM(pretrained=pretrained, config=config)
+    elif model == 'llama':
+        reward_model = LlamaRM(pretrained=pretrained, config=config)
+    elif model == 'roberta':
+        reward_model = RoBERTaRM(pretrained=pretrained, config=config)
+    else:
+        raise ValueError(f'Unsupported reward model "{model}"')    # any other family name is rejected
+    return reward_model    # unlike the actor/critic factories, no lora_rank is passed
+
+
+def get_strategy_from_args(strategy: str):    # map a strategy name to a newly constructed strategy object
+    if strategy == 'naive':
+        strategy_ = NaiveStrategy()
+    elif strategy == 'ddp':
+        strategy_ = DDPStrategy()
+    elif strategy == 'colossalai_gemini':
+        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+    elif strategy == 'colossalai_zero2':
+        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    elif strategy == 'colossalai_gemini_cpu':    # same as 'colossalai_gemini' except placement_policy='cpu'
+        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
+    elif strategy == 'colossalai_zero2_cpu':    # same as 'colossalai_zero2' except placement_policy='cpu'
+        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cpu')
+    else:
+        raise ValueError(f'Unsupported strategy "{strategy}"')    # any other name is rejected
+    return strategy_
+
+
+def get_tokenizer_from_args(model: str, **kwargs):    # tokenizer for a model-family name; the checkpoint is hard-coded except for llama
+    if model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    elif model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
+    elif model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    elif model == 'llama':
+        pretrain_path = kwargs["pretrain"]    # only this branch reads kwargs; a missing "pretrain" raises KeyError
+        tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
+    elif model == 'roberta':
+        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+    else:
+        raise ValueError(f'Unsupported model "{model}"')
+
+    tokenizer.pad_token = tokenizer.eos_token    # applied to every family: the EOS token doubles as the padding token
+    return tokenizer
+
+
+def set_dist_env(env_info: Dict[str, str]):    # copy launch settings into os.environ; all five keys are required (KeyError otherwise)
+    os.environ["RANK"] = env_info['rank']    # os.environ only accepts str values, so numbers must arrive as strings
+    os.environ["LOCAL_RANK"] = env_info['local_rank']
+    os.environ["WORLD_SIZE"] = env_info['world_size']
+    os.environ['MASTER_PORT'] = env_info['master_port']
+    os.environ['MASTER_ADDR'] = env_info['master_addr']
+
+
+def get_model_numel(model: nn.Module) -> int:    # total element count over model.parameters()
+    numel = sum(p.numel() for p in model.parameters())
+    return numel
+
+
+def get_receivers_per_sender(sender_idx: int, num_senders: int, num_receivers: int, allow_idle_sender: bool) -> list:    # receiver indices for one sender
+    target_receivers = []
+    if num_senders <= num_receivers or allow_idle_sender:
+        # round-robin: receiver i is served by sender i % num_senders, so every receiver has exactly one sender
+        # with allow_idle_sender and more senders than receivers, senders with idx >= num_receivers get an empty list
+        for i in range(num_receivers):
+            if i % num_senders == sender_idx:
+                target_receivers.append(i)
+    else:
+        # more senders than receivers and no idling allowed: each sender sends to exactly one receiver
+        # a receiver may have more than one sender
+        target_receivers.append(sender_idx % num_receivers)
+    return target_receivers
+
+
+def state_dict_to(state_dict: Dict[str, Any],
+                  dtype: torch.dtype = torch.float16,
+                  device: torch.device = torch.device('cpu')):
+    '''
+        Build a new OrderedDict whose values are v.to(dtype=dtype, device=device); the input state_dict is kept intact.
+    '''
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        new_state_dict[k] = v.to(dtype=dtype, device=device)    # assumes every value is a tensor (has .to) - TODO confirm
+    return new_state_dict
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index b1452869179e..bd30422022ae 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -130,3 +130,7 @@ def save_pretrained(self,
                         only_rank0: bool = True,
                         tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
         pass
+
+    @abstractmethod
+    def get_model_state_dict_shard(self, model: nn.Module, **config):    # abstract hook; subclasses are expected to yield state-dict shards
+        pass
\ No newline at end of file
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 8aa302c77eee..88268b677eb2 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -186,3 +186,15 @@ def save_pretrained(self,
         if self.stage == 3:
             raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now')
         super().save_pretrained(model, path, only_rank0, tokenizer)
+
+    def get_model_state_dict_shard(self, model: nn.Module, **config):    # generator: yields the model state dict shard by shard
+        if self.stage != 3:
+            yield from super().get_model_state_dict_shard(model, **config)    # stages other than 3 reuse the parent implementation
+        else:
+            # unwrapped_model = self._unwrap_model(model)
+            # for module in unwrapped_model.modules():
+            #     if isinstance(module, LoraLinear):
+            #         module.merge_weights = True
+            #         module.eval()
+            base_model: ZeroDDP = get_base_model(model)
+            yield from base_model.state_dict_shard(max_shard_size=1024, only_rank_0=False)    # stage 3 does not read **config; shard size fixed at 1024 (unit not shown here)
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 7910b57878f8..a1fecb36373f 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -26,19 +26,8 @@ def __init__(self, seed: int = 42) -> None:
         super().__init__()
 
     def setup_distributed(self) -> None:
-        try:
-            rank = int(os.environ['RANK'])
-            local_rank = int(os.environ['LOCAL_RANK'])
-            world_size = int(os.environ['WORLD_SIZE'])
-            host = os.environ['MASTER_ADDR']
-            port = int(os.environ['MASTER_PORT'])
-        except KeyError as e:
-            raise RuntimeError(
-                f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
-            )
-        dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
+        self._try_init_dist(force=True)    # replaces the inline init; presumably force=True keeps a missing env var fatal - confirm in _try_init_dist
         self.set_seed(self.seed)
-        torch.cuda.set_device(local_rank)
 
     def set_seed(self, seed: int) -> None:
         random.seed(seed)
diff --git a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py
index 4d94026ce932..972deebeaa0d 100644
--- a/applications/Chat/coati/trainer/strategies/naive.py
+++ b/applications/Chat/coati/trainer/strategies/naive.py
@@ -1,10 +1,17 @@
-from typing import Any, Optional
+import os
+import sys
+from collections import OrderedDict
+from typing import Any, Dict, Optional
 
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 import torch.optim as optim
 from coati.models.base import get_base_model
 from coati.replay_buffer import ReplayBuffer
+from coati.models.base import RewardModel
+from coati.models.lora import LoraLinear
+from coati.replay_buffer import ReplayBuffer
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 from transformers.modeling_utils import PreTrainedModel
@@ -13,6 +20,15 @@
 from .base import Strategy
 
 
+# TODO: move this to a util.py (moving it to ray.utils would introduce a circular import)
+def get_grad_required_state_dict(model: nn.Module):    # name -> tensor, restricted to trainable parameters
+    state_dict = OrderedDict()
+    for name, parameter in model.named_parameters():    # named_parameters() only: buffers are never included
+        if parameter.requires_grad:
+            state_dict[name] = parameter.detach()    # detach() drops autograd tracking but still shares storage with the parameter
+    return state_dict
+
+
 class NaiveStrategy(Strategy):
     """
         Strategy for single GPU. No parallelism is used.
@@ -25,7 +41,7 @@ def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
         optimizer.step()
 
     def setup_distributed(self) -> None:
-        pass
+        self._try_init_dist(force=False)    # best effort: with force=False a missing RANK/WORLD_SIZE/... env var (KeyError) is ignored
 
     def setup_model(self, model: nn.Module) -> nn.Module:
         return model
@@ -68,3 +84,45 @@ def save_pretrained(self,
         unwrapped_model.save_pretrained(path)
         if tokenizer is not None:
             tokenizer.save_pretrained(path)
+
+    def get_model_state_dict_shard(self, model: nn.Module, **config):
+        # TODO: implement sharding on naive strategy
+        model = self.unwrap_model(model)
+        if 'requires_grad_only' in config and config['requires_grad_only'] == True:
+            state_dict = get_grad_required_state_dict(model)
+        else:
+            state_dict = model.state_dict()
+
+        if 'shard_size' in config:
+            shard_size = config['shard_size']
+            accumulate_size = 0
+            state_dict_shard = OrderedDict()
+            for name, param in state_dict.items():
+                state_dict_shard[name] = param
+                accumulate_size += param.numel() * param.element_size()
+                if accumulate_size >= shard_size:
+                    accumulate_size = 0
+                    yield state_dict_shard
+                    state_dict_shard = OrderedDict()
+            if accumulate_size > 0:
+                yield state_dict_shard
+        else:
+            yield state_dict
+
+    def _try_init_dist(self, force: bool = False) -> None:
+        try:
+            rank = int(os.environ['RANK'])
+            local_rank = int(os.environ['LOCAL_RANK'])
+            world_size = int(os.environ['WORLD_SIZE'])
+            host = os.environ['MASTER_ADDR']
+            port = int(os.environ['MASTER_PORT'])
+            dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
+            torch.cuda.set_device(local_rank)
+        except KeyError as e:
+            if force:
+                raise RuntimeError(
+                    f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
+                )
+        except Exception as e:
+            if force:
+                raise e
diff --git a/applications/Chat/coati/trainer/strategies/sampler.py b/applications/Chat/coati/trainer/strategies/sampler.py
index d726fa640fa2..65e199dbf029 100644
--- a/applications/Chat/coati/trainer/strategies/sampler.py
+++ b/applications/Chat/coati/trainer/strategies/sampler.py
@@ -27,6 +27,7 @@ def __init__(self, dataset, num_replicas: int, rank: int) -> None:
         assert len(indices) == self.num_samples
         self.indices = indices
 
+
     def sample(self, batch_size: int) -> list:
         sampled_indices = np.random.choice(self.indices, batch_size, replace=False)
         return [self.dataset[idx] for idx in sampled_indices]
diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/Chat/examples/ray/1mmt_prompt.py
new file mode 100644
index 000000000000..afdd6a922cc7
--- /dev/null
+++ b/applications/Chat/examples/ray/1mmt_prompt.py
@@ -0,0 +1,175 @@
+import argparse
+import os
+import socket
+from functools import partial
+
+import pandas as pd
+import ray
+import torch
+from coati.quant import llama_load_quant, low_resource_init
+from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
+from coati.ray.experience_maker_holder import ExperienceMakerHolder
+from coati.ray.utils import (
+    get_actor_from_args,
+    get_critic_from_args,
+    get_reward_model_from_args,
+    get_strategy_from_args,
+    get_tokenizer_from_args,
+)
+from torch.utils.data import DataLoader
+from transformers import AutoConfig
+from transformers.modeling_utils import no_init_weights
+
+
+def get_free_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(('', 0))
+        return s.getsockname()[1]
+
+
+def get_local_ip():
+    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+        s.connect(('8.8.8.8', 80))
+        return s.getsockname()[0]
+
+
+def main(args):
+    master_addr = str(get_local_ip())
+    # trainer_env_info
+    trainer_port = str(get_free_port())
+    env_info_trainers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(args.num_trainers),
+        'master_port': trainer_port,
+        'master_addr': master_addr
+    } for rank in range(args.num_trainers)]
+
+    # maker_env_info
+    maker_port = str(get_free_port())
+    env_info_maker = {
+        'local_rank': '0',
+        'rank': '0',
+        'world_size': '1',
+        'master_port': maker_port,
+        'master_addr': master_addr
+    }
+
+    # configure tokenizer
+    tokenizer = get_tokenizer_from_args(args.model)
+
+    def trainer_model_fn():
+        actor = get_actor_from_args(args.model, args.pretrain).half().cuda()
+        critic = get_critic_from_args(args.model, args.critic_pretrain).half().cuda()
+        return actor, critic
+
+    # configure Trainer
+    trainer_refs = [
+        DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
+            experience_maker_holder_name_list=["maker1"],
+            strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
+            model_fn=trainer_model_fn,
+            env_info=env_info_trainer,
+            train_batch_size=args.train_batch_size,
+            buffer_limit=16,
+            eval_performance=True,
+            debug=args.debug,
+            update_lora_weights=not (args.lora_rank == 0),
+        ) for i, env_info_trainer in enumerate(env_info_trainers)
+    ]
+
+    def model_fn():
+        actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
+        critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
+        reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
+        if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+            # quantize initial model
+            actor_cfg = AutoConfig.from_pretrained(args.pretrain)
+            with low_resource_init(), no_init_weights():
+                initial_model = get_actor_from_args(args.model, config=actor_cfg)
+            initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
+                                                   args.quant_group_size).cuda().requires_grad_(False)
+        else:
+            initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
+        return actor, critic, reward_model, initial_model
+
+    # configure Experience Maker
+    experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
+        detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)],
+        strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
+        model_fn=model_fn,
+        env_info=env_info_maker,
+        experience_batch_size=args.experience_batch_size,
+        kl_coef=0.1,
+        debug=args.debug,
+        update_lora_weights=not (args.lora_rank == 0),
+    # sync_models_from_trainers=True,
+    # generation kwargs:
+        max_length=512,
+        do_sample=True,
+        temperature=1.0,
+        top_k=50,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        eval_performance=True,
+        use_cache=True,
+    )
+
+    # uncomment this function if sync_models_from_trainers is True
+    # ray.get([
+    #     trainer_ref.sync_models_to_remote_makers.remote()
+    #     for trainer_ref in trainer_refs
+    # ])
+
+    wait_tasks = []
+
+    total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size)
+    for trainer_ref in trainer_refs:
+        wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
+
+    dataset_size = args.experience_batch_size * 4
+
+    def build_dataloader():
+
+        def tokenize_fn(texts):
+            batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
+            return {k: v.cuda() for k, v in batch.items()}
+
+        dataset = pd.read_csv(args.prompt_path)['prompt']
+        dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
+        return dataloader
+
+    wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps))
+
+    ray.get(wait_tasks)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--prompt_path', type=str, default=None)
+    parser.add_argument('--num_trainers', type=int, default=1)
+    parser.add_argument('--trainer_strategy',
+                        choices=[
+                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'colossalai_zero2_cpu'
+                        ],
+                        default='naive')
+    parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--critic_pretrain', type=str, default=None)
+    parser.add_argument('--experience_steps', type=int, default=4)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    parser.add_argument('--train_epochs', type=int, default=1)
+    parser.add_argument('--update_steps', type=int, default=2)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+
+    parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
+    parser.add_argument('--quant_bits', type=int, default=4)
+    parser.add_argument('--quant_group_size', type=int, default=128)
+    parser.add_argument('--debug', action='store_true')
+    args = parser.parse_args()
+    ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
+    main(args)
diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py
new file mode 100644
index 000000000000..fa7b2bd7edfd
--- /dev/null
+++ b/applications/Chat/examples/ray/mmmt_prompt.py
@@ -0,0 +1,189 @@
+import argparse
+import os
+import socket
+from functools import partial
+
+import pandas as pd
+import ray
+import torch
+from coati.quant import llama_load_quant, low_resource_init
+from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
+from coati.ray.experience_maker_holder import ExperienceMakerHolder
+from coati.ray.utils import (
+    get_actor_from_args,
+    get_critic_from_args,
+    get_receivers_per_sender,
+    get_reward_model_from_args,
+    get_strategy_from_args,
+)
+from torch.utils.data import DataLoader
+from transformers import AutoConfig, AutoTokenizer
+from transformers.modeling_utils import no_init_weights
+
+
+def get_free_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(('', 0))
+        return s.getsockname()[1]
+
+
+def get_local_ip():
+    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+        s.connect(('8.8.8.8', 80))
+        return s.getsockname()[0]
+
+
+def main(args):
+    master_addr = str(get_local_ip())
+    # trainer_env_info
+    trainer_port = str(get_free_port())
+    env_info_trainers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(args.num_trainers),
+        'master_port': trainer_port,
+        'master_addr': master_addr
+    } for rank in range(args.num_trainers)]
+
+    # maker_env_info
+    maker_port = str(get_free_port())
+    env_info_makers = [{
+        'local_rank': '0',
+        'rank': str(rank),
+        'world_size': str(args.num_makers),
+        'master_port': maker_port,
+        'master_addr': master_addr
+    } for rank in range(args.num_makers)]
+
+    # configure tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    def model_fn():
+        actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
+        critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
+        reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
+        if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+            # quantize initial model
+            actor_cfg = AutoConfig.from_pretrained(args.pretrain)
+            with low_resource_init(), no_init_weights():
+                initial_model = get_actor_from_args(args.model, config=actor_cfg)
+            initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
+                                                   args.quant_group_size).cuda().requires_grad_(False)
+        else:
+            initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
+        return actor, critic, reward_model, initial_model
+
+    # configure Experience Maker
+    experience_holder_refs = [
+        ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote(
+            detached_trainer_name_list=[
+                f'trainer{x}'
+                for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False)
+            ],
+            strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
+            model_fn=model_fn,
+            env_info=env_info_maker,
+            kl_coef=0.1,
+            debug=args.debug,
+            update_lora_weights=not (args.lora_rank == 0),
+    # sync_models_from_trainers=True,
+    # generation kwargs:
+            max_length=512,
+            do_sample=True,
+            temperature=1.0,
+            top_k=50,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            eval_performance=True,
+            use_cache=True,
+        )
+        for i, env_info_maker in enumerate(env_info_makers)
+    ]
+
+    def trainer_model_fn():
+        actor = get_actor_from_args(args.model, args.pretrain, lora_rank=args.lora_rank).half().cuda()
+        critic = get_critic_from_args(args.model, args.critic_pretrain, lora_rank=args.lora_rank).half().cuda()
+        return actor, critic
+
+    # configure Trainer
+    trainer_refs = [
+        DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
+            experience_maker_holder_name_list=[
+                f"maker{x}"
+                for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True)
+            ],
+            strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
+            model_fn=trainer_model_fn,
+            env_info=env_info_trainer,
+            train_batch_size=args.train_batch_size,
+            buffer_limit=16,
+            eval_performance=True,
+            debug=args.debug,
+            update_lora_weights=not (args.lora_rank == 0),
+        )
+        for i, env_info_trainer in enumerate(env_info_trainers)
+    ]
+
+    dataset_size = args.experience_batch_size * 4
+
+    def build_dataloader():
+
+        def tokenize_fn(texts):
+            batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
+            return {k: v.cuda() for k, v in batch.items()}
+
+        dataset = pd.read_csv(args.prompt_path)['prompt']
+        dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
+        return dataloader
+
+    # uncomment this function if sync_models_from_trainers is True
+    # ray.get([
+    #     trainer_ref.sync_models_to_remote_makers.remote()
+    #     for trainer_ref in trainer_refs
+    # ])
+
+    wait_tasks = []
+
+    for experience_holder_ref in experience_holder_refs:
+        wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps))
+
+    total_steps = args.experience_batch_size * args.experience_steps * \
+        args.num_makers // (args.num_trainers * args.train_batch_size)
+    for trainer_ref in trainer_refs:
+        wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
+
+    ray.get(wait_tasks)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--prompt_path', type=str, default=None)
+    parser.add_argument('--num_makers', type=int, default=1)
+    parser.add_argument('--num_trainers', type=int, default=1)
+    parser.add_argument('--trainer_strategy',
+                        choices=[
+                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'colossalai_zero2_cpu'
+                        ],
+                        default='naive')
+    parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--critic_pretrain', type=str, default=None)
+    parser.add_argument('--experience_steps', type=int, default=4)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    parser.add_argument('--train_epochs', type=int, default=1)
+    parser.add_argument('--update_steps', type=int, default=2)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+
+    parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
+    parser.add_argument('--quant_bits', type=int, default=4)
+    parser.add_argument('--quant_group_size', type=int, default=128)
+    parser.add_argument('--debug', action='store_true')
+    args = parser.parse_args()
+
+    ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
+    main(args)
diff --git a/applications/Chat/examples/ray/requirements.txt b/applications/Chat/examples/ray/requirements.txt
new file mode 100644
index 000000000000..e0275631807f
--- /dev/null
+++ b/applications/Chat/examples/ray/requirements.txt
@@ -0,0 +1 @@
+ray
diff --git a/applications/Chat/examples/ray/test_ci.sh b/applications/Chat/examples/ray/test_ci.sh
new file mode 100755
index 000000000000..895f7de0fea9
--- /dev/null
+++ b/applications/Chat/examples/ray/test_ci.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -xe
+BASE=$(realpath $(dirname $0))
+
+export RAY_NAMESPACE=admin
+export DATA=/data/scratch/chatgpt/prompts.csv
+
+# install requirements
+pip install -r ${BASE}/requirements.txt
+
+python ${BASE}/mmmt_prompt.py --prompt_path $DATA --num_makers 2 --num_trainers 2 --trainer_strategy colossalai_gemini --model opt --critic_model opt --pretrain facebook/opt-350m --critic_pretrain facebook/opt-125m --experience_batch_size 4 --train_batch_size 2
diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh
index 2b049163c801..2fa6c6052f8d 100755
--- a/applications/Chat/examples/test_ci.sh
+++ b/applications/Chat/examples/test_ci.sh
@@ -124,3 +124,6 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_datas
 rm -rf ${BASE}/rm_ckpt_gpt.pt
 
 rm -rf ${BASE}/actor_checkpoint_prompts.pt
+
+# 3080 doesn't support P2P, skip this test
+# cd ${BASE}/ray && bash test_ci.sh && cd ${BASE}

From 4fc8bc68ac707302ad7d47706778f42a4d5031bf Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Wed, 7 Jun 2023 11:02:19 +0800
Subject: [PATCH 18/52] modify file path

---
 examples/images/dreambooth/colossalai.sh | 8 ++++----
 examples/images/dreambooth/dreambooth.sh | 6 +++---
 examples/images/dreambooth/test_ci.sh    | 8 ++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh
index 54ebac39b925..3b15ad887b0a 100755
--- a/examples/images/dreambooth/colossalai.sh
+++ b/examples/images/dreambooth/colossalai.sh
@@ -3,10 +3,10 @@ TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
 torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path="Path_to_your_model"  \
-  --instance_data_dir="Path_to_your_training_image" \
-  --output_dir="Path_to_your_save_dir" \
-  --instance_prompt="your prompt" \
+  --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4"  \
+  --instance_data_dir="/data/dreambooth/Teyvat/data" \
+  --output_dir="./weight_output" \
+  --instance_prompt="a picture of a dog" \
   --resolution=512 \
   --plugin="gemini" \
   --train_batch_size=1 \
diff --git a/examples/images/dreambooth/dreambooth.sh b/examples/images/dreambooth/dreambooth.sh
index e063bc8279c5..f6b8f5e1b87e 100644
--- a/examples/images/dreambooth/dreambooth.sh
+++ b/examples/images/dreambooth/dreambooth.sh
@@ -1,7 +1,7 @@
 python train_dreambooth.py \
-    --pretrained_model_name_or_path= ## Your Model Path  \
-    --instance_data_dir=  ## Your Training Input Pics Path \
-    --output_dir="path-to-save-model" \
+    --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \
+    --instance_data_dir="/data/dreambooth/Teyvat/data" \
+    --output_dir="./weight_output" \
     --instance_prompt="a photo of a dog" \
     --resolution=512 \
     --train_batch_size=1 \
diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index 0209c547a08f..c0b0c2b3d016 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -8,10 +8,10 @@ DIFFUSERS_OFFLINE=1
 
 for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
   torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path="Your Pretrained Model Path"  \
-  --instance_data_dir="Your Input Pics Path" \
-  --output_dir="path-to-save-model" \
-  --instance_prompt="your prompt" \
+  --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4"  \
+  --instance_data_dir="/data/dreambooth/Teyvat/data" \
+  --output_dir="./weight_output" \
+  --instance_prompt="a picture of a dog" \
   --resolution=512 \
   --plugin=$plugin \
   --train_batch_size=1 \

From 9c88b6cbd1597d5f429b61a04a7219b9e0d14a1b Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 7 Jun 2023 11:10:12 +0800
Subject: [PATCH 19/52] [lazy] fix compatibility problem on torch 1.13 (#3911)

---
 colossalai/lazy/lazy_init.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/lazy/lazy_init.py b/colossalai/lazy/lazy_init.py
index c1fda3c53865..76f550dc4392 100644
--- a/colossalai/lazy/lazy_init.py
+++ b/colossalai/lazy/lazy_init.py
@@ -37,7 +37,7 @@
 # If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset)
 # without autograd tracking the change, remove the .data / .detach() call and wrap the change in a `with torch.no_grad():` block.
 # These ops cannot be unwrapped using .data
-_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__']
+_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__', 'numel', 'size', 'dim']
 
 _LEGACY_TENSOR_CONSTRUCTOR = {
     'FloatTensor': torch.float,

From c25d421f3e2e4599d44f88a07ba7621c3991548c Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 7 Jun 2023 12:39:12 +0800
Subject: [PATCH 20/52] [devops] hotfix testmon cache clean logic (#3917)

---
 .github/workflows/build_on_pr.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index a2807859b591..8b2253e57cfb 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -271,7 +271,6 @@ jobs:
           PR_NUMBER: ${{ github.event.pull_request.number }}
 
       - name: Remove testmon cache
-        if: github.event.pull_request.merged != true
         run: |
           rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
         env:

From 5e2132dcff0fae0866aa3ce4b1aecbc767ed189b Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 7 Jun 2023 15:37:37 +0800
Subject: [PATCH 21/52] [workflow] added docker latest tag for release (#3920)

---
 .github/workflows/release_docker_after_publish.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/release_docker_after_publish.yml b/.github/workflows/release_docker_after_publish.yml
index 22698ca192ed..6c8df9730b0d 100644
--- a/.github/workflows/release_docker_after_publish.yml
+++ b/.github/workflows/release_docker_after_publish.yml
@@ -23,8 +23,11 @@ jobs:
         run: |
           version=$(cat version.txt)
           tag=hpcaitech/colossalai:$version
+          latest=hpcaitech/colossalai:latest
           docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 --build-arg VERSION=v${version} -t $tag ./docker
+          docker tag $tag $latest
           echo "tag=${tag}" >> $GITHUB_OUTPUT
+          echo "latest=${latest}" >> $GITHUB_OUTPUT
 
       - name: Log in to Docker Hub
         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
@@ -36,6 +39,7 @@ jobs:
         id: docker-push
         run: |
           docker push ${{ steps.build.outputs.tag }}
+          docker push ${{ steps.build.outputs.latest }}
 
   notify:
     name: Notify Lark via webhook

From a55fb00c18dbc36941d2e850312015e0930837a2 Mon Sep 17 00:00:00 2001
From: wukong1992 <wukong1992@users.noreply.github.com>
Date: Wed, 7 Jun 2023 15:51:00 +0800
Subject: [PATCH 22/52] [booster] update bert example, using booster api
 (#3885)

---
 examples/language/bert/README.md          |  34 +++
 examples/language/bert/benchmark.py       | 174 ++++++++++++
 examples/language/bert/benchmark.sh       |   9 +
 examples/language/bert/benchmark_utils.py | 146 ++++++++++
 examples/language/bert/data.py            | 127 +++++++++
 examples/language/bert/finetune.py        | 220 ++++++++++++++
 examples/language/bert/requirements.txt   |   9 +
 examples/language/bert/run_gemini.sh      |  22 --
 examples/language/bert/test_ci.sh         |  10 +-
 examples/language/bert/train_bert_demo.py | 331 ----------------------
 10 files changed, 727 insertions(+), 355 deletions(-)
 create mode 100644 examples/language/bert/README.md
 create mode 100644 examples/language/bert/benchmark.py
 create mode 100755 examples/language/bert/benchmark.sh
 create mode 100644 examples/language/bert/benchmark_utils.py
 create mode 100644 examples/language/bert/data.py
 create mode 100644 examples/language/bert/finetune.py
 create mode 100644 examples/language/bert/requirements.txt
 delete mode 100644 examples/language/bert/run_gemini.sh
 mode change 100644 => 100755 examples/language/bert/test_ci.sh
 delete mode 100644 examples/language/bert/train_bert_demo.py

diff --git a/examples/language/bert/README.md b/examples/language/bert/README.md
new file mode 100644
index 000000000000..c845a5c50387
--- /dev/null
+++ b/examples/language/bert/README.md
@@ -0,0 +1,34 @@
+## Overview
+
+This directory includes two parts: using the Booster API to finetune Hugging Face Bert and AlBert models, and benchmarking Bert and AlBert models with different Booster plugins.
+
+## Finetune
+```
+bash test_ci.sh
+```
+
+## Benchmark
+```
+bash benchmark.sh
+```
+
+The benchmark currently reports these metrics: maximum CUDA memory usage, throughput, and the number of model parameters. If you have custom metrics, you can add them to `benchmark_utils.py`.
+
+## Results
+
+### Bert
+
+|       | max cuda mem | throughput(sample/s) | params |
+| :-----| -----------: | :--------: | :----: |
+| ddp | 21.44 GB | 3.0 | 82M |
+| ddp_fp16 | 16.26 GB | 11.3 | 82M |
+| gemini | 11.0 GB | 12.9 | 82M |
+| low_level_zero | 11.29 GB | 14.7 | 82M |
+
+### AlBert
+|       | max cuda mem | throughput(sample/s) | params |
+| :-----| -----------: | :--------: | :----: |
+| ddp | OOM |  | |
+| ddp_fp16 | OOM |  | |
+| gemini | 69.39 GB | 1.3 | 208M |
+| low_level_zero | 56.89 GB | 1.4 | 208M |
\ No newline at end of file
diff --git a/examples/language/bert/benchmark.py b/examples/language/bert/benchmark.py
new file mode 100644
index 000000000000..ae8b2269a534
--- /dev/null
+++ b/examples/language/bert/benchmark.py
@@ -0,0 +1,174 @@
+import argparse
+
+import torch
+from benchmark_utils import benchmark
+from torch.utils.data import DataLoader, Dataset
+from transformers import (
+    AlbertConfig,
+    AlbertForSequenceClassification,
+    BertConfig,
+    BertForSequenceClassification,
+    get_linear_schedule_with_warmup,
+)
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.nn.optimizer import HybridAdam
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 3
+BATCH_SIZE = 32
+LEARNING_RATE = 2.4e-5
+WEIGHT_DECAY = 0.01
+WARMUP_FRACTION = 0.1
+SEQ_LEN = 512
+VOCAB_SIZE = 1000
+NUM_LABELS = 10
+DATASET_LEN = 1000
+
+
+class RandintDataset(Dataset):
+
+    def __init__(self, dataset_length: int, sequence_length: int, vocab_size: int, n_class: int):
+
+        self._sequence_length = sequence_length
+        self._vocab_size = vocab_size
+        self._n_class = n_class
+        self._dataset_length = dataset_length
+        self._datas = torch.randint(
+            low=0,
+            high=self._vocab_size,
+            size=(self._dataset_length, self._sequence_length,),
+            dtype=torch.long,
+        )
+        self._labels = torch.randint(low=0, high=self._n_class, size=(self._dataset_length, 1), dtype=torch.long) 
+
+    def __len__(self):
+        return self._dataset_length
+
+    def __getitem__(self, idx):
+        return self._datas[idx], self._labels[idx]
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run")
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default="bert",
+        help="bert or albert",
+    )
+
+    args = parser.parse_args()
+
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={}, seed=42)
+    coordinator = DistCoordinator()
+
+    # local_batch_size = BATCH_SIZE // coordinator.world_size
+    lr = LEARNING_RATE * coordinator.world_size
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+
+    train_dataset = RandintDataset(dataset_length=DATASET_LEN,
+                                   sequence_length=SEQ_LEN,
+                                   vocab_size=VOCAB_SIZE,
+                                   n_class=NUM_LABELS)
+    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
+
+    # ====================================
+    # Prepare model, optimizer
+    # ====================================
+    # bert pretrained model
+
+    if args.model_type == "bert":
+        cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
+        model = BertForSequenceClassification(cfg)
+    elif args.model_type == "albert":
+        cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
+        model = AlbertForSequenceClassification(cfg)
+    else:
+        raise RuntimeError
+
+    # optimizer
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": WEIGHT_DECAY,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+
+    optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8)
+
+    # lr scheduler
+    total_steps = len(train_dataloader) * NUM_EPOCHS
+    num_warmup_steps = int(WARMUP_FRACTION * total_steps)
+    lr_scheduler = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=total_steps,
+    )
+
+    # criterion
+    criterion = lambda inputs: inputs[0]
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Benchmark model
+    # ==============================
+
+    results = benchmark(model,
+                        booster,
+                        optimizer,
+                        lr_scheduler,
+                        train_dataloader,
+                        criterion=criterion,
+                        epoch_num=NUM_EPOCHS)
+
+    coordinator.print_on_master(results)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/language/bert/benchmark.sh b/examples/language/bert/benchmark.sh
new file mode 100755
index 000000000000..9453d1373f2f
--- /dev/null
+++ b/examples/language/bert/benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -xe
+
+pip install -r requirements.txt
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do    # benchmark both models under every plugin
+   torchrun --standalone --nproc_per_node 2 benchmark.py --plugin "$plugin" --model_type "bert"
+   torchrun --standalone --nproc_per_node 2 benchmark.py --plugin "$plugin" --model_type "albert"
+done
diff --git a/examples/language/bert/benchmark_utils.py b/examples/language/bert/benchmark_utils.py
new file mode 100644
index 000000000000..886017a41826
--- /dev/null
+++ b/examples/language/bert/benchmark_utils.py
@@ -0,0 +1,146 @@
+import inspect
+from logging import getLogger
+from time import time
+from typing import Callable
+
+import torch
+import yaml
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+
+logger = getLogger("colossalai-booster-benchmark")
+_INVALID = float("nan")    # sentinel for a measurement that could not be taken; test it with _is_valid()
+
+
+def format_num(num: int, bytes=False):
+    """Scale a count (or a byte size when ``bytes=True``) to a readable string, e.g. 1253656 => '1.20 MB'."""
+    factor, suffix = (1024, "B") if bytes else (1000, "")
+    for unit in ["", " K", " M", " G", " T"]:
+        if num < factor:
+            return f"{num:.2f}{unit}{suffix}"
+        num /= factor
+    return f"{num:.2f} P{suffix}"    # largest unit: always return a string instead of falling through to None
+
+
+def _is_valid(val) -> bool:
+    return val == val    # NaN (the _INVALID sentinel) is the one float that does not equal itself
+
+
+def get_call_arg_names(module_or_fn):
+    if not isinstance(module_or_fn, torch.nn.Module):
+        return inspect.getfullargspec(module_or_fn).args    # plain callable: all positional argument names
+    return inspect.getfullargspec(module_or_fn.forward).args[1:]    # module: names of forward() minus the leading one
+
+
+def measure_params(model):
+    """Return the number of trainable parameters, or NaN (``_INVALID``) when they cannot be counted."""
+    try:
+        trainable_sizes = (p.numel() for p in model.parameters() if p.requires_grad)
+        return sum(trainable_sizes)
+    except AttributeError as e:
+        logger.error(f"Unable to measure model params due to error: {e}")
+
+    return _INVALID
+
+
+def warm_up(model, booster, dataloader, criterion, optimizer, lr_scheduler, num_runs=10):
+    """Run a few training steps on ``dataloader`` ahead of the measured benchmark loop.
+
+    Every step moves ``data[0]`` / ``data[1]`` to the current CUDA device, calls
+    ``model(inputs, labels=labels)``, back-propagates ``criterion(outputs)`` through
+    ``booster`` and steps both the optimizer and the LR scheduler.
+
+    Exactly ``num_runs`` batches are consumed (fewer if ``dataloader`` is shorter).
+    """
+    for i, data in enumerate(dataloader):
+        if i >= num_runs:    # was `i > num_runs`, which executed one step more than requested
+            break
+        inputs, labels = data[0].cuda(), data[1].cuda()
+        outputs = model(inputs, labels=labels)
+        loss = criterion(outputs)
+        booster.backward(loss, optimizer)
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+
+
+def fmt(d: dict) -> str:
+    return yaml.dump(d)    # YAML text of `d`, used for the benchmark log messages
+
+
+def benchmark(
+    model: torch.nn.Module,
+    booster: Booster,
+    optimizer: torch.optim.Optimizer,
+    lr_scheduler: LRScheduler,
+    dataloader: DataLoader,
+    criterion: Callable = None,
+    warm_up_fn=warm_up,
+    epoch_num: int = 3,
+    batch_size: int = 32,
+    warm_up_steps: int = 3,
+):
+    """Warm up, then train for ``epoch_num`` epochs and report parameters, memory and throughput.
+
+    ``criterion`` is applied to the model outputs, so it must be supplied despite its ``None``
+    default. ``batch_size`` only labels the result keys. Throughput is
+    ``epoch_num * len(dataloader) * world_size / elapsed_seconds`` (steps per second summed over
+    ranks). The clock is read after ``torch.cuda.synchronize`` because CUDA work is asynchronous;
+    otherwise it could stop while queued kernels are still running.
+    """
+    results = {}
+    model_device = torch.cuda.current_device()
+
+    # Warm up
+    warm_up_fn(model, booster, dataloader, criterion, optimizer, lr_scheduler, num_runs=warm_up_steps)
+    # Measure params
+    params = measure_params(model)
+    if _is_valid(params):
+        results["params"] = format_num(params)
+        logger.info(f"Model parameters: {params} ({format_num(params)})")
+
+    # Measure Allocated Memory and Throughput
+    memory = {}
+    throughput = {}
+    torch.cuda.reset_peak_memory_stats(device=model_device)
+    pre_mem = torch.cuda.memory_allocated(device=model_device)
+    torch.cuda.synchronize(device=model_device)    # drain warm-up work before starting the clock
+    start_time = time()
+
+    for epoch in range(epoch_num):
+        with tqdm(dataloader, desc=f'Epoch [{epoch + 1}/{epoch_num}]',
+                  disable=not DistCoordinator().is_master()) as pbar:
+            for data in pbar:
+                inputs, labels = data[0].cuda(), data[1].cuda()
+                outputs = model(inputs, labels=labels)
+                loss = criterion(outputs)
+                booster.backward(loss, optimizer)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+    torch.cuda.synchronize(device=model_device)    # wait for the last queued kernels before stopping the clock
+    end_time = time()
+
+    all_sample = epoch_num * len(dataloader)
+
+    post_mem = torch.cuda.memory_allocated(device=model_device)
+    max_mem = torch.cuda.max_memory_allocated(device=model_device)
+
+    memory[f"batch_size_{batch_size}"] = {
+        "cuda_pre_training_bytes": format_num(pre_mem, bytes=True),
+        "cuda_max_training_bytes": format_num(max_mem, bytes=True),
+        "cuda_post_training_bytes": format_num(post_mem, bytes=True),
+    }
+    logger.info(fmt({f"Memory results (batch_size={batch_size})": memory[f"batch_size_{batch_size}"]}))
+
+    throughput[f"batch_size_{batch_size}"] = {"throughput:": "{:.1f}".format(all_sample * DistCoordinator().world_size / (end_time - start_time))}
+    logger.info(fmt({f"Throughput results (batch_size={batch_size})": throughput[f"batch_size_{batch_size}"]}))
+
+    results["throughput"] = throughput
+    results["memory"] = memory
+
+    return results
diff --git a/examples/language/bert/data.py b/examples/language/bert/data.py
new file mode 100644
index 000000000000..981cedcca8c2
--- /dev/null
+++ b/examples/language/bert/data.py
@@ -0,0 +1,127 @@
+import datasets
+from transformers import AutoTokenizer, PreTrainedTokenizer
+
+from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
+
+
+class GLUEDataBuilder:
+    """Loads a GLUE task with ``datasets``, tokenizes it and builds dataloaders through the given plugin."""
+    task_text_field_map = {    # GLUE task name -> dataset column(s) holding the text to tokenize
+        "cola": ["sentence"],
+        "sst2": ["sentence"],
+        "mrpc": ["sentence1", "sentence2"],
+        "qqp": ["question1", "question2"],
+        "stsb": ["sentence1", "sentence2"],
+        "mnli": ["premise", "hypothesis"],
+        "qnli": ["question", "sentence"],
+        "rte": ["sentence1", "sentence2"],
+        "wnli": ["sentence1", "sentence2"],
+        "ax": ["premise", "hypothesis"],
+    }
+
+    glue_task_num_labels = {    # GLUE task name -> value exposed as ``self.num_labels``
+        "cola": 2,
+        "sst2": 2,
+        "mrpc": 2,
+        "qqp": 2,
+        "stsb": 1,
+        "mnli": 3,
+        "qnli": 2,
+        "rte": 2,
+        "wnli": 2,
+        "ax": 3,
+    }
+
+    loader_columns = [    # columns kept when a split is formatted as torch tensors (see ``setup``)
+        "datasets_idx",
+        "input_ids",
+        "token_type_ids",
+        "attention_mask",
+        "start_positions",
+        "end_positions",
+        "labels",
+    ]
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        plugin: DPPluginBase,
+        task_name: str = "mrpc",
+        max_seq_length: int = 128,
+        train_batch_size: int = 32,
+        eval_batch_size: int = 32,
+        **kwargs,    # accepted but never read in this class
+    ):
+        super().__init__()
+        self.model_name_or_path = model_name_or_path
+        self.task_name = task_name
+        self.max_seq_length = max_seq_length
+        self.train_batch_size = train_batch_size
+        self.eval_batch_size = eval_batch_size
+        self.plugin = plugin
+        # Resolve per-task metadata, load the tokenizer, then eagerly load and tokenize the dataset.
+        self.text_fields = self.task_text_field_map[task_name]
+        self.num_labels = self.glue_task_num_labels[task_name]
+        self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)
+        self.setup()
+
+    def setup(self):
+        self.dataset = datasets.load_dataset("glue", self.task_name)
+        # Tokenize every split, drop the raw "label" column and expose only ``loader_columns`` as torch tensors.
+        for split in self.dataset.keys():
+            self.dataset[split] = self.dataset[split].map(
+                self.convert_to_features,
+                batched=True,
+                remove_columns=["label"],
+            )
+            self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]
+            self.dataset[split].set_format(type="torch", columns=self.columns)
+        # Splits whose name contains "validation" are treated as the evaluation splits.
+        self.eval_splits = [x for x in self.dataset.keys() if "validation" in x]
+
+    def prepare_data(self):    # results are discarded; presumably warms the download cache - TODO confirm
+        datasets.load_dataset("glue", self.task_name)
+        AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)
+
+    def train_dataloader(self):    # shuffled loader over the "train" split that drops the last incomplete batch
+        return self.plugin.prepare_dataloader(self.dataset["train"],
+                                              batch_size=self.train_batch_size,
+                                              shuffle=True,
+                                              drop_last=True)
+
+    def val_dataloader(self):    # one loader, or a list with one per eval split; None when there is no eval split
+        if len(self.eval_splits) == 1:
+            return self.plugin.prepare_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size)
+        elif len(self.eval_splits) > 1:
+            return [
+                self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size)
+                for x in self.eval_splits
+            ]
+
+    def test_dataloader(self):    # single eval split -> "test" loader; several -> loaders over ``eval_splits``
+        if len(self.eval_splits) == 1:
+            return self.plugin.prepare_dataloader(self.dataset["test"], batch_size=self.eval_batch_size)
+        elif len(self.eval_splits) > 1:
+            return [
+                self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size)
+                for x in self.eval_splits
+            ]
+
+    def convert_to_features(self, example_batch):
+        """Tokenize a batch of examples (padded/truncated to ``max_seq_length``) and copy ``label`` to ``labels``."""
+        # Either encode single sentence or sentence pairs
+        if len(self.text_fields) > 1:
+            texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
+        else:
+            texts_or_text_pairs = example_batch[self.text_fields[0]]
+
+        # Tokenize the text/text pairs
+        features = self.tokenizer.batch_encode_plus(texts_or_text_pairs,
+                                                    max_length=self.max_seq_length,
+                                                    padding='max_length',
+                                                    truncation=True)
+
+        # Rename label to labels to make it easier to pass to model forward
+        features["labels"] = example_batch["label"]
+
+        return features
diff --git a/examples/language/bert/finetune.py b/examples/language/bert/finetune.py
new file mode 100644
index 000000000000..b209ffde85a4
--- /dev/null
+++ b/examples/language/bert/finetune.py
@@ -0,0 +1,220 @@
+import argparse
+from typing import List, Union
+
+import evaluate
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from data import GLUEDataBuilder
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (
+    AlbertForSequenceClassification,
+    AutoConfig,
+    BertForSequenceClassification,
+    get_linear_schedule_with_warmup,
+)
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 3
+BATCH_SIZE = 32    # passed as both the train and eval batch size to GLUEDataBuilder
+LEARNING_RATE = 2.4e-5    # base LR; main() multiplies it by the world size
+WEIGHT_DECAY = 0.01    # applied to every parameter except biases and LayerNorm weights
+WARMUP_FRACTION = 0.1    # fraction of the total training steps used as LR warm-up steps
+
+
+def move_to_cuda(batch):
+    return {name: value.cuda() for name, value in batch.items()}    # new dict; every value goes through .cuda()
+
+
+@torch.no_grad()
+def evaluate_model(model: nn.Module, test_dataloader: Union[DataLoader, List[DataLoader]], num_labels: int, task_name: str,
+             eval_splits: List[str], coordinator: DistCoordinator):
+    """Evaluate ``model`` on one loader (or one per entry of ``eval_splits``) with the GLUE metric of ``task_name``."""
+    metric = evaluate.load("glue", task_name, process_id=coordinator.rank, num_process=coordinator.world_size)
+    model.eval()
+
+    def evaluate_subset(dataloader: DataLoader):
+        accum_loss = torch.zeros(1, device=get_current_device())
+        for batch in dataloader:
+            batch = move_to_cuda(batch)
+            outputs = model(**batch)
+            val_loss, logits = outputs[:2]
+            accum_loss.add_(val_loss)
+
+            if num_labels > 1:
+                preds = torch.argmax(logits, axis=1)
+            elif num_labels == 1:
+                preds = logits.squeeze()
+
+            labels = batch["labels"]
+            metric.add_batch(predictions=preds, references=labels)
+        # compute() yields the result dict on the main process only; other ranks get None
+        results = metric.compute()
+        dist.all_reduce(accum_loss.div_(len(dataloader)))
+        if coordinator.is_master() and results is not None:
+            results['loss'] = accum_loss.item() / coordinator.world_size
+        return results
+
+    if isinstance(test_dataloader, DataLoader):
+        return evaluate_subset(test_dataloader)
+    else:
+        assert len(test_dataloader) == len(eval_splits)
+        final_results = {}
+        for split, sub_loader in zip(eval_splits, test_dataloader):
+            results = evaluate_subset(sub_loader)
+            final_results.update({f'{k}_{split}': v for k, v in (results or {}).items()})    # None on non-main ranks
+        return final_results
+
+
+def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, lr_scheduler, train_dataloader: DataLoader,
+                booster: Booster, coordinator: DistCoordinator):
+    """Train ``model`` for one pass over ``train_dataloader``; the progress bar is shown on the master rank only."""
+    model.train()
+    show_bar = coordinator.is_master()
+    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not show_bar) as progress:
+        for raw_batch in progress:
+            # forward pass on the GPU copy of the batch; element 0 of the outputs is used as the loss
+            loss = model(**move_to_cuda(raw_batch))[0]
+
+            # backward through the booster, then parameter update, gradient reset and LR schedule step
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            optimizer.zero_grad()
+            lr_scheduler.step()
+
+            # show the current loss on the progress bar
+            progress.set_postfix({'loss': loss.item()})
+
+
+def main():
+    """Fine-tune BERT or ALBERT on a GLUE task with the selected booster plugin, then evaluate and print the metrics."""
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run")
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default="bert",
+        help="bert or albert",
+    )
+    parser.add_argument('--target_f1', type=float, default=None, help="target f1 score. Raise exception if not reached")
+    args = parser.parse_args()
+
+    if args.model_type == 'bert':
+        model_name = "bert-base-uncased"
+    elif args.model_type == 'albert':
+        model_name = "albert-xxlarge-v2"
+    else:
+        raise RuntimeError(f"unsupported model_type {args.model_type!r}, expected 'bert' or 'albert'")
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={}, seed=42)
+    coordinator = DistCoordinator()
+
+    lr = LEARNING_RATE * coordinator.world_size    # base learning rate scaled by the number of processes
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+    data_builder = GLUEDataBuilder(model_name,
+                                   plugin,
+                                   args.task,
+                                   train_batch_size=BATCH_SIZE,
+                                   eval_batch_size=BATCH_SIZE)
+    train_dataloader = data_builder.train_dataloader()
+    test_dataloader = data_builder.test_dataloader()
+
+    # ====================================
+    # Prepare model, optimizer
+    # ====================================
+    # bert pretrained model
+
+    cfg = AutoConfig.from_pretrained(model_name, num_labels=data_builder.num_labels)
+    if model_name == "bert-base-uncased":
+        model = BertForSequenceClassification.from_pretrained(model_name, config=cfg)
+    elif model_name == "albert-xxlarge-v2":
+        model = AlbertForSequenceClassification.from_pretrained(model_name, config=cfg)
+    else:
+        raise RuntimeError(f"unsupported pretrained model {model_name!r}")
+
+    # optimizer
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": WEIGHT_DECAY,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+
+    optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8)
+
+    # lr scheduler
+    total_steps = len(train_dataloader) * NUM_EPOCHS
+    num_warmup_steps = int(WARMUP_FRACTION * total_steps)
+    lr_scheduler = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=total_steps,
+    )
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Train model
+    # ==============================
+    for epoch in range(NUM_EPOCHS):
+        train_epoch(epoch, model, optimizer, lr_scheduler, train_dataloader, booster, coordinator)
+
+    results = evaluate_model(model, test_dataloader, data_builder.num_labels, args.task, data_builder.eval_splits,
+                       coordinator)
+
+    if coordinator.is_master():
+        print(results)
+        if args.target_f1 is not None and 'f1' in results:
+            assert results['f1'] >= args.target_f1, f'f1 score {results["f1"]} is lower than target {args.target_f1}'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/language/bert/requirements.txt b/examples/language/bert/requirements.txt
new file mode 100644
index 000000000000..377422c260ad
--- /dev/null
+++ b/examples/language/bert/requirements.txt
@@ -0,0 +1,9 @@
+colossalai
+evaluate
+datasets
+torch
+tqdm
+transformers
+scipy
+scikit-learn
+ptflops
diff --git a/examples/language/bert/run_gemini.sh b/examples/language/bert/run_gemini.sh
deleted file mode 100644
index d791334e8c97..000000000000
--- a/examples/language/bert/run_gemini.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-set -x
-# distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]
-export DISTPLAN=${DISTPLAN:-"CAI_Gemini"}
-
-# The following options only valid when DISTPLAN="colossalai"
-export GPUNUM=${GPUNUM:-1}
-export PLACEMENT=${PLACEMENT:-"cpu"}
-export BATCH_SIZE=${BATCH_SIZE:-16}
-
-# bert | albert
-export MODEL_TYPE=${MODEL_TYPE:-"bert"}
-export TRAIN_STEP=${TRAIN_STEP:-10}
-
-mkdir -p gemini_logs
-
-env CUDA_LAUNCH_BLOCKING=1 torchrun --standalone --nproc_per_node=${GPUNUM} ./train_bert_demo.py \
---model_type=${MODEL_TYPE} \
---batch_size=${BATCH_SIZE} \
---placement=${PLACEMENT} \
---distplan=${DISTPLAN} \
---train_step=${TRAIN_STEP} \
-2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_${PLACEMENT}.log
diff --git a/examples/language/bert/test_ci.sh b/examples/language/bert/test_ci.sh
old mode 100644
new mode 100755
index 42c63fec50c0..7fc6daabb2f3
--- a/examples/language/bert/test_ci.sh
+++ b/examples/language/bert/test_ci.sh
@@ -1,2 +1,8 @@
-set -x
-env GPUNUM=1 bash run_gemini.sh
+#!/bin/bash
+set -xe
+
+pip install -r requirements.txt
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do    # finetune.py asserts the F1 target on the master rank
+   torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin "$plugin" --model_type "bert"
+done
diff --git a/examples/language/bert/train_bert_demo.py b/examples/language/bert/train_bert_demo.py
deleted file mode 100644
index 9a0278b2c711..000000000000
--- a/examples/language/bert/train_bert_demo.py
+++ /dev/null
@@ -1,331 +0,0 @@
-import os
-from functools import partial
-from time import time
-
-import psutil
-import torch
-from packaging import version
-from torch import nn
-from torch.nn.parallel import DistributedDataParallel as DDP
-from transformers import AlbertConfig, AlbertForSequenceClassification, BertConfig, BertForSequenceClassification
-
-import colossalai
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
-from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper
-
-CAI_VERSION = colossalai.__version__
-
-
-def get_tflops(model_numel, batch_size, seq_len, step_time):
-    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
-
-
-def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir):
-    from contextlib import nullcontext
-
-    from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler
-    if enable_flag:
-        return profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-                       schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps),
-                       on_trace_ready=tensorboard_trace_handler(save_dir),
-                       record_shapes=True,
-                       profile_memory=True)
-    else:
-
-        class DummyProfiler:
-
-            def __init__(self):
-                self.step_number = 0
-
-            def step(self):
-                self.step_number += 1
-
-        return nullcontext(DummyProfiler())
-
-
-def get_time_stamp():
-    import time
-    cur_time = time.strftime("%d-%H:%M", time.localtime())
-    return cur_time
-
-
-def get_bert_data(batch_size: int, sequence_length: int, vacob_size: int, n_class: int, device: torch.device):
-    input = torch.randint(
-        low=0,
-        high=vacob_size,
-        size=(batch_size, sequence_length),
-        device=device,
-        dtype=torch.long,
-    )
-    label = torch.randint(low=0, high=n_class, size=(batch_size,), device=device, dtype=torch.long)
-    return input, label
-
-
-def parse_args():
-    parser = colossalai.get_default_parser()
-    parser.add_argument(
-        "--distplan",
-        type=str,
-        default='CAI_Gemini',
-        help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
-    )
-    parser.add_argument(
-        "--placement",
-        type=str,
-        default='cpu',
-        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=8,
-        help="batch size per DP group of training.",
-    )
-    parser.add_argument(
-        "--model_type",
-        type=str,
-        default="bert",
-        help="bert or albert",
-    )
-    parser.add_argument(
-        "--train_step",
-        type=int,
-        default=10,
-        help="training iterations for test",
-    )
-
-    args = parser.parse_args()
-    return args
-
-
-SEQ_LEN = 512
-VOCAB_SIZE = 1000
-NUM_LABELS = 10
-
-
-# Parameter Sharding Strategies for Tensor Parallelism
-def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
-    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
-    param.set_tensor_spec(*spec)
-
-
-def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
-    split_param_single_dim_tp1d(0, param, pg)
-
-
-def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
-    split_param_single_dim_tp1d(-1, param, pg)
-
-
-def get_cpu_mem():
-    return psutil.Process().memory_info().rss / 1024**2
-
-
-def get_gpu_mem():
-    return torch.cuda.memory_allocated() / 1024**2
-
-
-def get_mem_info(prefix=''):
-    return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB'
-
-
-def get_model_size(model: nn.Module):
-    total_numel = 0
-    for module in model.modules():
-        for p in module.parameters(recurse=False):
-            total_numel += p.numel()
-    return total_numel
-
-
-def model_builder(args):
-    if args.model_type == "bert":
-        cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
-        return BertForSequenceClassification(cfg)
-    elif args.model_type == "albert":
-        cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
-        return AlbertForSequenceClassification(cfg)
-    else:
-        raise RuntimeError
-
-
-def model_size_formatter(numel: int) -> str:
-    GB_SIZE = 10**9
-    MB_SIZE = 10**6
-    KB_SIZE = 10**3
-    if numel >= GB_SIZE:
-        return f'{numel / GB_SIZE:.1f}B'
-    elif numel >= MB_SIZE:
-        return f'{numel / MB_SIZE:.1f}M'
-    elif numel >= KB_SIZE:
-        return f'{numel / KB_SIZE:.1f}K'
-    else:
-        return str(numel)
-
-
-def set_cpu_maximum_parallelism():
-    conf_str = torch.__config__.parallel_info()
-    inter_str = conf_str.split("hardware_concurrency() : ")[1]
-    max_concurrency = inter_str.split('\n')[0]
-    os.environ["OMP_NUM_THREADS"] = max_concurrency
-    print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.")
-
-
-def main():
-    # version check
-    # this example is supposed to work for versions greater than 0.2.0
-    assert version.parse(CAI_VERSION) >= version.parse("0.2.0")
-
-    set_cpu_maximum_parallelism()
-    args = parse_args()
-
-    # if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
-    if args.distplan not in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]:
-        raise TypeError(f"{args.distplan} is error")
-
-    # batch size per DP degree
-    BATCH_SIZE = args.batch_size
-
-    NUM_STEPS = args.train_step
-
-    WARMUP_STEPS = 1
-    assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
-    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median"
-    PROF_FLAG = False    # The flag of profiling, False by default
-
-    disable_existing_loggers()
-    colossalai.launch_from_torch(config={})
-
-    logger = get_dist_logger()
-    logger.info(f" {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])
-
-    torch.manual_seed(123)
-    if args.distplan.startswith("CAI"):
-        # all param must use the same process group.
-        world_size = torch.distributed.get_world_size()
-
-        # build a base-bert model
-        with ColoInitContext(device=get_current_device(), dtype=torch.half):
-            model = model_builder(args)
-            # model = BertForSequenceClassification(BertConfig(vocal_size =  VOCAB_SIZE))
-
-        # asign running configurations
-        gemini_config = None
-        if args.distplan.startswith("CAI_ZeRO"):
-            optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
-        elif args.distplan == "CAI_Gemini":
-            gemini_config = dict(strict_ddp_mode=True,
-                                 device=get_current_device(),
-                                 placement_policy=args.placement,
-                                 pin_memory=True,
-                                 hidden_dim=model.config.hidden_size,
-                                 search_range_mb=128)
-            optim_config = dict(gpu_margin_mem_ratio=0.)
-        else:
-            raise RuntimeError
-
-        # build a highly optimized gpu/cpu optimizer
-        optimizer = HybridAdam(model.parameters(), lr=1e-3)
-
-        if args.distplan == "CAI_ZeRO1":
-            zero_stage = 1
-        elif args.distplan == "CAI_ZeRO2":
-            zero_stage = 2
-        elif args.distplan == "CAI_Gemini":
-            zero_stage = 3
-        else:
-            raise RuntimeError
-
-        # wrap your model and optimizer
-        model = zero_model_wrapper(model, zero_stage, gemini_config)
-        optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)
-
-        logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
-    elif args.distplan.startswith("Pytorch"):
-        model = model_builder(args).cuda()
-        model = DDP(model)
-        if args.distplan.endswith("DDP"):
-            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
-        elif args.distplan.endswith("ZeRO"):
-            from torch.distributed.optim import ZeroRedundancyOptimizer
-            optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3)
-    else:
-        raise RuntimeError
-
-    # model is shared after TP
-    numel = get_model_size(model)
-    logger.info(f"the size of testing model size is {model_size_formatter(numel)}.")
-    logger.info(get_mem_info(prefix='After init model, '), ranks=[0])
-
-    # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
-    # = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree)
-    # = batch_per_DP_group * numel * seq_len * 8
-    get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN)
-
-    torch.cuda.synchronize()
-    model.train()
-    tflops_list = []
-
-    def train_step():
-        # we just use randomly generated data here
-        input_ids, labels = get_bert_data(BATCH_SIZE,
-                                          SEQ_LEN,
-                                          VOCAB_SIZE,
-                                          NUM_LABELS,
-                                          device=torch.cuda.current_device())
-        optimizer.zero_grad()
-
-        start = time()
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-        torch.cuda.synchronize()
-        fwd_end = time()
-        fwd_time = fwd_end - start
-        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0])
-
-        if args.distplan.startswith("CAI"):
-            optimizer.backward(loss)
-        elif args.distplan.startswith("Pytorch"):
-            loss.backward()
-        else:
-            raise RuntimeError
-
-        torch.cuda.synchronize()
-        bwd_end = time()
-        bwd_time = bwd_end - fwd_end
-        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Backward '), ranks=[0])
-
-        optimizer.step()
-        torch.cuda.synchronize()
-        optim_time = time() - bwd_end
-        step_time = time() - start
-        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Optimizer step '), ranks=[0])
-
-        step_tflops = get_tflops_func(step_time)
-        logger.info(
-            f"[{n + 1}/{NUM_STEPS}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}, FWD time: {fwd_time:.3f}s, BWD time: {bwd_time:.3f}s, OPTIM time: {optim_time:.3f}s",
-            ranks=[0],
-        )
-        if n >= WARMUP_STEPS:
-            tflops_list.append(step_tflops)
-
-    demo_profiler = get_profile_context(PROF_FLAG,
-                                        WARMUP_STEPS,
-                                        NUM_STEPS - WARMUP_STEPS,
-                                        save_dir=f"profile/{get_time_stamp()}-demo")
-
-    with demo_profiler as prof:
-        for n in range(NUM_STEPS):
-            train_step()
-            prof.step()
-
-    tflops_list.sort()
-    median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS
-    logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
-    torch.cuda.synchronize()
-
-
-if __name__ == '__main__':
-    main()

From b306cecf28dba07dbdec5f5e1aca1087a11071fb Mon Sep 17 00:00:00 2001
From: Liu Ziming <38985202+MaruyamaAya@users.noreply.github.com>
Date: Wed, 7 Jun 2023 16:05:00 +0800
Subject: [PATCH 23/52] [example] Modify palm example with the new booster API
 (#3913)

* Modify torch version requirement to adapt torch 2.0

* modify palm example using new booster API

* roll back

* fix port

* polish

* polish
---
 examples/language/palm/README.md  |  3 ++
 examples/language/palm/run.sh     |  8 +++--
 examples/language/palm/test_ci.sh |  2 +-
 examples/language/palm/train.py   | 50 +++++++++++++++----------------
 4 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/examples/language/palm/README.md b/examples/language/palm/README.md
index 486bf240f89c..3ff3939d63d4 100644
--- a/examples/language/palm/README.md
+++ b/examples/language/palm/README.md
@@ -43,6 +43,9 @@ palm = PaLM(
 )
 ```
 
+## New API
+We have modified our previous implementation of PaLM to use our new Booster API, which offers a more flexible and efficient way to train your model. The new API is more user-friendly and easier to use. You can find its usage in `train.py`. We also offer a shell script, `test_ci.sh`, that lets you go through all of the booster plugins. For more information about the Booster API, you can refer to https://colossalai.org/docs/basics/booster_api/.
+
 ## Test on Enwik8
 
 ```bash
diff --git a/examples/language/palm/run.sh b/examples/language/palm/run.sh
index 7a533509e009..2a846e81a9a7 100644
--- a/examples/language/palm/run.sh
+++ b/examples/language/palm/run.sh
@@ -3,9 +3,11 @@ export DISTPAN="colossalai"
 
 # The following options only valid when DISTPAN="colossalai"
 export TPDEGREE=1
-export GPUNUM=1
+export GPUNUM=4
 export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
-export BATCH_SIZE=4
+export BATCH_SIZE=1
 
-env OMP_NUM_THREADS=12 torchrun  --standalone --nproc_per_node=${GPUNUM}  --master_port 29501  train.py  --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
+env OMP_NUM_THREADS=12 torchrun  --standalone --nproc_per_node=${GPUNUM}  --master_port 29501  train.py  \
+--dummy_data=True --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --plugin='gemini' \
+--placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh
index f21095578077..4de6a44e5bf7 100644
--- a/examples/language/palm/test_ci.sh
+++ b/examples/language/palm/test_ci.sh
@@ -4,6 +4,6 @@ for BATCH_SIZE in 2
 do
 for GPUNUM in 1 4
 do
-env OMP_NUM_THREADS=12 torchrun  --standalone --nproc_per_node=${GPUNUM}  --master_port 29501  train.py --dummy_data=True --batch_size=${BATCH_SIZE}  2>&1 | tee run.log
+env OMP_NUM_THREADS=12 torchrun  --standalone --nproc_per_node=${GPUNUM}  --standalone  train.py --dummy_data=True --batch_size=${BATCH_SIZE}  --plugin='gemini' 2>&1 | tee run.log
 done
 done
diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py
index b16da1c7744a..62062e8bd272 100644
--- a/examples/language/palm/train.py
+++ b/examples/language/palm/train.py
@@ -9,6 +9,8 @@
 import torch.optim as optim
 import tqdm
 from packaging import version
+
+from colossalai.nn import HybridAdam
 from palm_pytorch import PaLM
 from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper
 from torch.utils.data import DataLoader, Dataset
@@ -18,6 +20,8 @@
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import MultiTimer, get_current_device
 from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, ZeroDDP
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 
 # constants
 
@@ -58,6 +62,12 @@ def parse_args():
         help=
         "Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
     )
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
+                        help="plugin to use")
     parser.add_argument(
         "--batch_size",
         type=int,
@@ -101,28 +111,6 @@ def get_model_size(model: nn.Module):
     return total_numel
 
 
-# Gemini + ZeRO DDP
-def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"):
-    cai_version = colossalai.__version__
-    if version.parse(cai_version) > version.parse("0.1.10"):
-        from colossalai.nn.parallel import GeminiDDP
-        model = GeminiDDP(model,
-                          device=get_current_device(),
-                          placement_policy=placement_policy,
-                          pin_memory=True,
-                          search_range_mb=32)
-    elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
-        from colossalai.gemini import ChunkManager, GeminiManager
-        chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)
-        gemini_manager = GeminiManager(placement_policy, chunk_manager)
-        chunk_manager = ChunkManager(chunk_size,
-                                     pg,
-                                     enable_distributed_storage=True,
-                                     init_device=GeminiManager.get_default_device(placement_policy))
-        model = ZeroDDP(model, gemini_manager)
-    else:
-        raise NotImplemented(f"CAI version {cai_version} is not supported")
-    return model
 
 
 # Parameter Sharding Strategies for Tensor Parallelism
@@ -218,6 +206,18 @@ def __len__(self):
 if args.distplan == "colossalai":
     # instantiate GPT-like decoder model
 
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy=args.placement, strict_ddp_mode=True, initial_scale=2 ** 5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
+    logger.info(f"plugin: {plugin}")
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
     default_pg = ProcessGroup(tp_degree=args.tp_degree)
     default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
     ctx = ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg)
@@ -228,12 +228,12 @@ def __len__(self):
 
     pg = default_pg
     tensor_parallelize(model, pg)
-    model = gemini_zero_dpp(model, pg, args.placement)
 
     # optimizer
 
-    #optimizer = GeminiAdamOptimizer(model, lr=1e-7, initial_scale=2**5)
-    optimizer = GeminiAdamOptimizer(model, lr=LEARNING_RATE, initial_scale=2**5)
+    optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE, initial_scale=2**5)
+    model, optimizer, _, _, _ = booster.boost(model, optimizer)
+
 else:
     model = PaLM(num_tokens=256, dim=512, depth=8)
     model = AutoregressiveWrapper(model, max_seq_len=2048)

From a9d1cadc49bd0a37208d8d7f321f16fd37c41471 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Wed, 7 Jun 2023 16:08:37 +0800
Subject: [PATCH 24/52] fix typo with colossalai/trainer utils zero (#3908)

---
 colossalai/trainer/_trainer.py                            | 8 ++++----
 colossalai/utils/data_sampler/data_parallel_sampler.py    | 2 +-
 colossalai/utils/model/utils.py                           | 2 +-
 colossalai/utils/profiler/legacy/comm_profiler.py         | 8 ++++----
 colossalai/utils/profiler/legacy/pcie_profiler.py         | 6 +++---
 colossalai/utils/profiler/legacy/prof_utils.py            | 4 ++--
 colossalai/utils/rank_recorder/README.md                  | 4 ++--
 colossalai/utils/rank_recorder/rank_recorder.py           | 4 ++--
 colossalai/zero/gemini/chunk/chunk.py                     | 2 +-
 colossalai/zero/gemini/chunk/manager.py                   | 2 +-
 .../zero/gemini/memory_tracer/chunk_memstats_collector.py | 2 +-
 colossalai/zero/gemini/memory_tracer/memory_monitor.py    | 4 ++--
 colossalai/zero/gemini/utils.py                           | 2 +-
 colossalai/zero/legacy/gemini/ophooks/utils.py            | 4 ++--
 colossalai/zero/legacy/gemini/tensor_utils.py             | 2 +-
 15 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/colossalai/trainer/_trainer.py b/colossalai/trainer/_trainer.py
index 60bbc4eeee32..bfe1c403fd48 100644
--- a/colossalai/trainer/_trainer.py
+++ b/colossalai/trainer/_trainer.py
@@ -31,9 +31,9 @@ class Trainer:
         >>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
         >>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
         >>> # Beginning training progress
-        >>> timier = ...
+        >>> timer = ...
         >>> logger = ...
-        >>> trainer = Trainer(engine=engine, logger=logger, timer=timier)
+        >>> trainer = Trainer(engine=engine, logger=logger, timer=timer)
         >>> # add hooks you would like to use here.
         >>> hook_list = []
         >>> trainer.fit(
@@ -56,7 +56,7 @@ def __init__(
         timer: MultiTimer = None,
         logger: DistributedLogger = None,
     ):
-        # training-ralated params
+        # training-related params
         self._engine = engine
         self._max_epochs = 0
         self._cur_epoch = 0
@@ -118,7 +118,7 @@ def _set_current_step(self, epoch: int):
         self._cur_step = epoch * self._steps_per_epoch
 
     def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
-        """Call timer funciton with a given timer name.
+        """Call timer function with a given timer name.
 
         Args:
             action (str): Function to be called on timer.
diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py
index 945dc54b397a..2318e07a7f8d 100644
--- a/colossalai/utils/data_sampler/data_parallel_sampler.py
+++ b/colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
-# adpated from torch.utils.data.DistributedSampler
+# adapted from torch.utils.data.DistributedSampler
 
 import math
 import random
diff --git a/colossalai/utils/model/utils.py b/colossalai/utils/model/utils.py
index f49607376439..21bc530934d3 100644
--- a/colossalai/utils/model/utils.py
+++ b/colossalai/utils/model/utils.py
@@ -70,7 +70,7 @@ def _init_subclass(cls, **kwargs):
             cls.__init__ = preprocess_after(cls.__init__)
 
         # Replace .__init__() for all existing subclasses of torch.nn.Module
-        # Excution self._post_init_method after the default init function.
+        # Execute self._post_init_method after the default init function.
         substitute_init_recursively(torch.nn.modules.module.Module, _enable_class, set())
 
         # holding on to the current __init__subclass__ for exit
diff --git a/colossalai/utils/profiler/legacy/comm_profiler.py b/colossalai/utils/profiler/legacy/comm_profiler.py
index a4f5729c97ec..334f0113ee90 100644
--- a/colossalai/utils/profiler/legacy/comm_profiler.py
+++ b/colossalai/utils/profiler/legacy/comm_profiler.py
@@ -111,7 +111,7 @@ def append(s: str = None):
             res.append(sep)
 
         if self.warn_flag:
-            append("Warnning: there exists multiple communication operations in the same time. As a result, "
+            append("Warning: there exists multiple communication operations in the same time. As a result, "
                    "the profiling result is not accurate.")
 
         if self.total_cuda_time == 0:
@@ -123,12 +123,12 @@ def append(s: str = None):
         append("total number of calls: {}".format(self.total_count))
         append("All events:")
 
-        seperation = '-' * 74
+        separation = '-' * 74
         row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2
 
-        append(seperation)
+        append(separation)
         append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls'))
-        append(seperation)
+        append(separation)
 
         show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time)
         for location, event in show_list:
diff --git a/colossalai/utils/profiler/legacy/pcie_profiler.py b/colossalai/utils/profiler/legacy/pcie_profiler.py
index 526222941ef9..8f812f5cfc7b 100644
--- a/colossalai/utils/profiler/legacy/pcie_profiler.py
+++ b/colossalai/utils/profiler/legacy/pcie_profiler.py
@@ -130,12 +130,12 @@ def append(s: str = None):
 
         append("Possible data transmission events in PCIE:")
 
-        seperation = '-' * 62
+        separation = '-' * 62
         row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2
 
-        append(seperation)
+        append(separation)
         append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls'))
-        append(seperation)
+        append(separation)
 
         show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time)
         for location, event in show_list:
diff --git a/colossalai/utils/profiler/legacy/prof_utils.py b/colossalai/utils/profiler/legacy/prof_utils.py
index 87ad644a7ecc..2f7eee827651 100644
--- a/colossalai/utils/profiler/legacy/prof_utils.py
+++ b/colossalai/utils/profiler/legacy/prof_utils.py
@@ -32,9 +32,9 @@ def _format_memory(nbytes):
         return str(nbytes) + ' B'
 
 
-def _format_bandwidth(volme: float or int, time_us: int):
+def _format_bandwidth(volume: float or int, time_us: int):
     sec_div_mb = (1000.0 / 1024.0)**2
-    mb_per_sec = volme / time_us * sec_div_mb
+    mb_per_sec = volume / time_us * sec_div_mb
 
     if mb_per_sec >= 1024.0:
         return '{:.3f} GB/s'.format(mb_per_sec / 1024.0)
diff --git a/colossalai/utils/rank_recorder/README.md b/colossalai/utils/rank_recorder/README.md
index e30a925d2a92..da8a6039d543 100644
--- a/colossalai/utils/rank_recorder/README.md
+++ b/colossalai/utils/rank_recorder/README.md
@@ -1,5 +1,5 @@
 # Rank Recorder
-This is a useful tool to get the records of certain functions in each rank. The records of each rank will dump into a json file after the end of multiple process program. You can parse and visualise the json file easily.
+This is a useful tool to get the records of certain functions in each rank. The records of each rank will dump into a json file after the end of multiple process program. You can parse and visualize the json file easily.
 
 Before using the tool, you should ensure dist.is_initialized() return true before exit of program. 
 
@@ -20,7 +20,7 @@ with recorder(record_name, current_rank) as r:
 ```
 
 ## Example
-This is a demo to display kernel select in cuda and visualise the cost of several procedures in each rank.
+This is a demo to display kernel select in cuda and visualize the cost of several procedures in each rank.
 
 ```python
 import time
diff --git a/colossalai/utils/rank_recorder/rank_recorder.py b/colossalai/utils/rank_recorder/rank_recorder.py
index c088ceeb2e87..40bb7e184a12 100644
--- a/colossalai/utils/rank_recorder/rank_recorder.py
+++ b/colossalai/utils/rank_recorder/rank_recorder.py
@@ -133,7 +133,7 @@ def merge_recode(self):
         with open(self.export_name + '.json', 'w', encoding='utf-8') as f:
             json.dump(recoders, f, ensure_ascii=False)
 
-    def visualise_record(self):
+    def visualize_record(self):
         with open(self.export_name + '.json', 'r', encoding='utf-8') as f:
             records = json.load(f)
         records = dict(records)
@@ -171,7 +171,7 @@ def exit_worker(self):
         if rank == 1:
             # take the base time of rank 0 as standard
             self.merge_recode()
-            self.visualise_record()
+            self.visualize_record()
 
 
 recorder = Recorder()
diff --git a/colossalai/zero/gemini/chunk/chunk.py b/colossalai/zero/gemini/chunk/chunk.py
index a7682eaf62e9..51da9be2b1f8 100644
--- a/colossalai/zero/gemini/chunk/chunk.py
+++ b/colossalai/zero/gemini/chunk/chunk.py
@@ -416,7 +416,7 @@ def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data_slice: torch.Ten
         Copy data slice to the memory space indexed by the input tensor in the chunk.
 
         Args:
-            tensor (torch.Tensor): the tensor used to retrive meta information
+            tensor (torch.Tensor): the tensor used to retrieve meta information
             data_slice (torch.Tensor): the tensor to be copied to the chunk
         """
         # sanity check
diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py
index 77368d06d255..38d34f14863e 100644
--- a/colossalai/zero/gemini/chunk/manager.py
+++ b/colossalai/zero/gemini/chunk/manager.py
@@ -157,7 +157,7 @@ def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data: torch.Tensor) -
         Copy data to the chunk.
 
         Args:
-            tensor (torch.Tensor): the tensor used to retrive meta information
+            tensor (torch.Tensor): the tensor used to retrieve meta information
             data (torch.Tensor): the tensor to be copied to the chunk
         """
         chunk = self.tensor_chunk_map[tensor]
diff --git a/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py b/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py
index f5eb05b4f22a..83903bbf4023 100644
--- a/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py
+++ b/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py
@@ -25,7 +25,7 @@ def __init__(self, chunk_manager: ChunkManager, memstats: Optional[MemStats] = N
     # override
     def record_model_data_volume(self) -> None:
         """
-        record model data volumn on cuda and cpu.
+        record model data volume on cuda and cpu.
         """
         if self._start_flag and not self.use_outside_memstats:
             cuda_mem = self._chunk_manager.total_mem['cuda']
diff --git a/colossalai/zero/gemini/memory_tracer/memory_monitor.py b/colossalai/zero/gemini/memory_tracer/memory_monitor.py
index f8d99dbce7a4..4bb585677d5b 100644
--- a/colossalai/zero/gemini/memory_tracer/memory_monitor.py
+++ b/colossalai/zero/gemini/memory_tracer/memory_monitor.py
@@ -45,7 +45,7 @@ def clear(self):
 
 class AsyncMemoryMonitor(MemoryMonitor):
     """
-    An Async Memory Monitor runing during computing. Sampling memory usage of the current GPU
+    An Async Memory Monitor running during computing. Sampling memory usage of the current GPU
     at interval of `1/(10**power)` sec.
 
     The idea comes from Runtime Memory Tracer of PatrickStar
@@ -67,7 +67,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
         async_mem_monitor.save('log.pkl')
 
     Args:
-        power (int, optional): the power of time interva. Defaults to 10.
+        power (int, optional): the power of time interval. Defaults to 10.
 
     .. _PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
         https://arxiv.org/abs/2108.05818
diff --git a/colossalai/zero/gemini/utils.py b/colossalai/zero/gemini/utils.py
index e52b5b836b0b..6f4a253b504b 100644
--- a/colossalai/zero/gemini/utils.py
+++ b/colossalai/zero/gemini/utils.py
@@ -73,7 +73,7 @@ def get_static_torch_model(zero_ddp_model,
         zero_ddp_model (ZeroDDP): a zero ddp model
         device (torch.device): the device of the final torch model
         dtype (torch.dtype): the dtype of the final torch model
-        only_rank_0 (bool): if True, only rank0 has the coverted torch model
+        only_rank_0 (bool): if True, only rank0 has the converted torch model
 
     Returns:
         torch.nn.Module: a static torch model used for saving checkpoints or numeric checks
diff --git a/colossalai/zero/legacy/gemini/ophooks/utils.py b/colossalai/zero/legacy/gemini/ophooks/utils.py
index 84e8298c1d51..f88ad2b00e9e 100644
--- a/colossalai/zero/legacy/gemini/ophooks/utils.py
+++ b/colossalai/zero/legacy/gemini/ophooks/utils.py
@@ -88,7 +88,7 @@ def register_ophooks_recursively(module: torch.nn.Module,
                                  ophook_list: List[BaseOpHook],
                                  name: str = "",
                                  filter_fn: Optional[Callable] = None):
-    r"""Recursilvely register pre/post hooks for all submodules in the module in FWD and BWD."""
+    r"""Recursively register pre/post hooks for all submodules in the module in FWD and BWD."""
     assert isinstance(module, torch.nn.Module)
     assert isinstance(ophook_list, (list, tuple))
     assert len(ophook_list) > 0, 'expected at least 1 hook in the argument ophook_list but found 0'
@@ -103,7 +103,7 @@ def register_ophooks_recursively(module: torch.nn.Module,
     if len(list(module.parameters(recurse=False))) == 0:
         return
 
-    # return from flitered module
+    # return from filtered module
     if filter_fn is not None and filter_fn(module):
         return
 
diff --git a/colossalai/zero/legacy/gemini/tensor_utils.py b/colossalai/zero/legacy/gemini/tensor_utils.py
index b7f23e0253fd..843e330ee2c6 100644
--- a/colossalai/zero/legacy/gemini/tensor_utils.py
+++ b/colossalai/zero/legacy/gemini/tensor_utils.py
@@ -77,7 +77,7 @@ def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor], t
     move a tensor to the target_device
     Args:
         t (Union[StatefulTensor, torch.Tensor]): the tensor be moved
-        target_device: a traget device, if type is int, it the index of cuda card.
+        target_device: a target device, if type is int, it is the index of cuda card.
     """
     if not isinstance(target_device, torch.device):
         target_device = torch.device(f'cuda:{target_device}')

From c94a33579b7c70d96905ea8b2c3a4baf28451cb0 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Wed, 7 Jun 2023 17:23:01 +0800
Subject: [PATCH 25/52] modify shell for check

---
 examples/images/dreambooth/test_ci.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index c0b0c2b3d016..8ba413a149b5 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -6,8 +6,9 @@ HF_DATASETS_OFFLINE=1
 TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
-for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
-  torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
+#  "torch_ddp" "torch_ddp_fp16"
+for plugin in "low_level_zero" "gemini"; do
+  torchrun --nproc_per_node 8 --standalone train_dreambooth_colossalai.py \
   --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4"  \
   --instance_data_dir="/data/dreambooth/Teyvat/data" \
   --output_dir="./weight_output" \

From 12c90db3f30b6d9013a32eee27ea04ec4d631ddc Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 7 Jun 2023 17:59:58 +0800
Subject: [PATCH 26/52] [doc] add lazy init tutorial (#3922)

* [doc] add lazy init en doc

* [doc] add lazy init zh doc

* [doc] add lazy init doc in sidebar

* [doc] add lazy init doc test

* [doc] fix lazy init doc link
---
 docs/sidebars.json                        |  1 +
 docs/source/en/features/lazy_init.md      | 71 +++++++++++++++++++++++
 docs/source/zh-Hans/features/lazy_init.md | 71 +++++++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 docs/source/en/features/lazy_init.md
 create mode 100644 docs/source/zh-Hans/features/lazy_init.md

diff --git a/docs/sidebars.json b/docs/sidebars.json
index 8be40e4512f9..c3cfbbeef689 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -64,6 +64,7 @@
         },
         "features/pipeline_parallel",
         "features/nvme_offload",
+        "features/lazy_init",
         "features/cluster_utils"
       ]
     },
diff --git a/docs/source/en/features/lazy_init.md b/docs/source/en/features/lazy_init.md
new file mode 100644
index 000000000000..40f5da1cb84d
--- /dev/null
+++ b/docs/source/en/features/lazy_init.md
@@ -0,0 +1,71 @@
+# Lazy initialization
+
+Author: Hongxin Liu
+
+**Prerequisite**
+- [Booster API](../basics/booster_api.md)
+- [Booster Plugins](../basics/booster_plugins.md)
+- [Booster Checkpoint](../basics/booster_checkpoint.md)
+
+**Related discussion**
+- [Lazy initialization of model](https://github.com/hpcaitech/ColossalAI/discussions/3124)
+
+## Introduction
+
+LazyTensor allows a DL framework (PyTorch) to execute operations lazily, by storing all operations related to it and rerunning them when it's required to be materialized.
+
+LazyInit defers model initialization and it's based on LazyTensor.
+
+This is especially useful when we use model parallelism to train large models, in which case the model cannot fit in GPU memory. Through this, we can initialize model tensors using meta tensors and do static analysis to get the shard strategy, and then materialize each tensor and apply the shard strategy. The static analysis can be omitted if the shard strategy is known in advance.
+
+## Usage
+
+You may use lazy initialization when using Gemini, tensor parallelism, pipeline parallelism, and auto-parallelism. In other cases, you may not need to use lazy initialization.
+
+Gemini is compatible with lazy initialization. You can use them together directly.
+
+```python
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.optimizer import HybridAdam
+from torch.nn import Linear
+import colossalai
+
+colossalai.launch_from_torch({})
+
+plugin = GeminiPlugin()
+booster = Booster(plugin=plugin)
+
+with LazyInitContext():
+    model = Linear(10, 10)
+
+optimizer = HybridAdam(model.parameters())
+model, optimizer, *_ = booster.boost(model, optimizer)
+```
+
+Note that using lazy initialization when using Gemini is not necessary but recommended. If you don't use lazy initialization, you may get OOM error when initializing the model. If you use lazy initialization, you can avoid this error.
+
+> ⚠ Lazy initialization support for tensor parallelism, pipeline parallelism, and auto-parallelism is still under development.
+
+### Load from pretrained model
+
+We should not load pretrained weight in `LazyInitContext`. If so, lazy initialization is meaningless, as the checkpoint is loaded and it takes much GPU memory. A recommended way is to initialize model from scratch in `LazyInitContext` and load pretrained weight outside `LazyInitContext` after calling `Booster.boost()`.
+
+<!--- doc-test-ignore-start -->
+```python
+with LazyInitContext():
+    model = GPT2LMHeadModel(config)
+
+optimizer = ...
+lr_scheduler = ...
+dataloader = ...
+model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader)
+
+booster.load_model(model, pretrained_path)
+```
+<!--- doc-test-ignore-end -->
+
+As booster supports both pytorch-fashion checkpoint and huggingface/transformers-fashion pretrained weight, the `pretrained_path` of the above pseudo-code can be either a checkpoint file path or a pretrained weight path. Note that it does not support loading pretrained weights from network. You should download the pretrained weight first and then use a local path.
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 lazy_init.py  -->
diff --git a/docs/source/zh-Hans/features/lazy_init.md b/docs/source/zh-Hans/features/lazy_init.md
new file mode 100644
index 000000000000..9a3cd90caa8d
--- /dev/null
+++ b/docs/source/zh-Hans/features/lazy_init.md
@@ -0,0 +1,71 @@
+# 惰性初始化
+
+作者: Hongxin Liu
+
+**前置教程**
+- [Booster API](../basics/booster_api.md)
+- [Booster 插件](../basics/booster_plugins.md)
+- [Booster Checkpoint](../basics/booster_checkpoint.md)
+
+**相关讨论**
+- [模型的惰性初始化](https://github.com/hpcaitech/ColossalAI/discussions/3124)
+
+## 引言
+
+LazyTensor 允许深度学习框架 (PyTorch) 延迟执行操作，方法是存储与其相关的所有操作并在需要具体化时重新运行它们。
+
+LazyInit 基于 LazyTensor，并支持延迟模型初始化。
+
+这在我们使用模型并行来训练大型模型时特别有用，在这种情况下模型无法容纳在 GPU 内存中。通过这个，我们可以使用 Meta 张量初始化模型张量并进行静态分析以获得分片策略。然后具体化每个张量并应用分片策略。如果事先知道分片策略，则可以省略静态分析。
+
+## 用法
+
+您可以在使用 Gemini、张量并行、流水线并行和自动并行时使用惰性初始化。在其他情况下，您可能不需要使用惰性初始化。
+
+Gemini 与惰性初始化兼容。您可以直接将它们一起使用。
+
+```python
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.optimizer import HybridAdam
+from torch.nn import Linear
+import colossalai
+
+colossalai.launch_from_torch({})
+
+plugin = GeminiPlugin()
+booster = Booster(plugin=plugin)
+
+with LazyInitContext():
+    model = Linear(10, 10)
+
+optimizer = HybridAdam(model.parameters())
+model, optimizer, *_ = booster.boost(model, optimizer)
+```
+
+请注意，在使用 Gemini 时使用惰性初始化不是必需的，但建议使用。如果不使用惰性初始化，在初始化模型时可能会出现 OOM 错误。如果使用惰性初始化，则可以避免此错误。
+
+> ⚠ 对张量并行、流水线并行和自动并行的惰性初始化支持仍在开发中。
+
+### 从预训练模型加载
+
+我们不应该在 `LazyInitContext` 中加载预训练权重。如果这样，惰性初始化就没有意义，因为检查点已加载并且需要大量 GPU 内存。推荐的方法是在 `LazyInitContext` 中初始化模型，并在调用 `Booster.boost()` 后在 `LazyInitContext` 之外加载预训练权重。
+
+<!--- doc-test-ignore-start -->
+```python
+with LazyInitContext():
+    model = GPT2LMHeadModel(config)
+
+optimizer = ...
+lr_scheduler = ...
+dataloader = ...
+model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader)
+
+booster.load_model(model, pretrained_path)
+```
+<!--- doc-test-ignore-end -->
+
+由于 booster 同时支持 pytorch 风格的 checkpoint 和 huggingface/transformers 风格的预训练权重，上述伪代码的 `pretrained_path` 可以是 checkpoint 文件路径或预训练权重路径。请注意，它不支持从网络加载预训练权重。您应该先下载预训练的权重，然后使用本地路径。
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 lazy_init.py  -->

From de0d7df33f9f9349d03150ecedad74610a1e36f6 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Thu, 8 Jun 2023 00:01:29 +0800
Subject: [PATCH 27/52] [nfc] fix typo colossalai/zero (#3923)

---
 colossalai/initialize.py                                 | 2 +-
 colossalai/zero/gemini/memory_tracer/utils.py            | 2 +-
 colossalai/zero/legacy/init_ctx/init_context.py          | 2 +-
 colossalai/zero/legacy/sharded_model/sharded_model_v2.py | 6 +++---
 colossalai/zero/low_level/_utils.py                      | 2 +-
 colossalai/zero/low_level/low_level_optim.py             | 8 ++++----
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 5d3f3e5530cb..dc0df0517508 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -238,7 +238,7 @@ def initialize(model: nn.Module,
     loaded into gpc.config.
 
     Args:
-        model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
+        model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
         optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
             Your optimizer instance.
         criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
diff --git a/colossalai/zero/gemini/memory_tracer/utils.py b/colossalai/zero/gemini/memory_tracer/utils.py
index 6962c058110e..65f6ba775139 100644
--- a/colossalai/zero/gemini/memory_tracer/utils.py
+++ b/colossalai/zero/gemini/memory_tracer/utils.py
@@ -7,7 +7,7 @@ def colo_model_optimizer_usage(optim) -> Tuple[int, int]:
     """Trace the optimizer memory usage
 
     Args:
-        optim (ShardedOptimV2): an instance of ShardedOptimver
+        optim (ShardedOptimV2): an instance of ShardedOptimizer
 
     Returns:
         Tuple[int, int]: cuda/cpu memory usage in Byte
diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/zero/legacy/init_ctx/init_context.py
index a3fa46b38b5a..84e2d2f4f8e1 100644
--- a/colossalai/zero/legacy/init_ctx/init_context.py
+++ b/colossalai/zero/legacy/init_ctx/init_context.py
@@ -46,7 +46,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
     """A context to initialize model.
 
     1. Convert the model to fp16.
-    2. The paramaters of the module are adapted to type ShardedParameter.
+    2. The parameters of the module are adapted to type ShardedParameter.
     3. Shard the param and grad according to flags.
 
     Args:
diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
index be3842beb208..e7064277fb3c 100644
--- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
@@ -69,7 +69,7 @@ class ShardedModelV2(nn.Module):
             If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
             Note that 'auto' policy can only work well when no other processes use CUDA during your training.
             Defaults to 'cuda'.
-        gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0.
+        gradient_predivide_factor (Optional[float], optional): Gradient is divided by this value before reduce-scatter. Defaults to 1.0.
         reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad.
             Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation.
             In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad).
@@ -205,7 +205,7 @@ def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> N
             exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
+            self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
@@ -385,7 +385,7 @@ def _save_grad(self, param: Parameter, grad: torch.Tensor):
             # make parameters point to gradient
 
             assert param.colo_attr.saved_grad.is_null(
-            ), 'Gradien accumulation is not supported when reuse_fp16_shard=True'
+            ), 'Gradient accumulation is not supported when reuse_fp16_shard=True'
 
             param.colo_attr.grad_payload_reset(grad.data)
             # release the memory of param
diff --git a/colossalai/zero/low_level/_utils.py b/colossalai/zero/low_level/_utils.py
index afc98e7a7f54..218f7603bc54 100644
--- a/colossalai/zero/low_level/_utils.py
+++ b/colossalai/zero/low_level/_utils.py
@@ -261,7 +261,7 @@ def sync_param(flat_tensor, tensor_list):
     share the same memory space. This function will update the tensor list so that
     they point to the same value.
 
-    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor lsit
+    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor list
     :param tensor_list: A list of tensors corresponding to the flattened tensor
     :type flat_tensor: torch.Tensor
     :type tensor_list: List[torch.Tensor]
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index d4d03e5b5fcd..ee03c0f0ae15 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -207,8 +207,8 @@ def __init__(
             for param in self._working_param_groups[group_id]:
                 self._param_store.set_param_reduction_state(param, False)
 
-        # intialize communication stream for
-        # communication-compuation overlapping
+        # initialize communication stream for
+        # communication-computation overlapping
         if self._overlap_communication:
             self._comm_stream = torch.cuda.Stream()
 
@@ -269,7 +269,7 @@ def _partition_param_list(self, param_list):
         params_per_rank = [[] for _ in range(self._world_size)]
         numel_per_rank = [0 for _ in range(self._world_size)]
 
-        # partititon the parameters in a greedy fashion
+        # partition the parameters in a greedy fashion
         sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True)
         for param in sorted_params:
             # allocate this parameter to the rank with
@@ -297,7 +297,7 @@ def _attach_reduction_hook(self):
             param_group = self._working_param_groups[group_id]
             for param in param_group:
                 if param.requires_grad:
-                    # determines the reduction destionation rank
+                    # determines the reduction destination rank
                     # this is only valid for stage 2
                     # dst_rank = None means using all-reduce
                     # else using reduce

From 9166988d9b23548b50b154d79e8f194f61f9f6aa Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 8 Jun 2023 09:29:32 +0800
Subject: [PATCH 28/52] [devops] update torch version in compability test
 (#3919)

---
 .compatibility | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.compatibility b/.compatibility
index c8ac4083d2a2..32da32be5521 100644
--- a/.compatibility
+++ b/.compatibility
@@ -1,3 +1,3 @@
 1.12.0-11.3.0
-1.11.0-11.3.0
-1.10.1-11.3.0
+1.13.0-11.6.0
+2.0.0-11.7.0

From eb39154d4082601bf8c39b64317fecd28a526205 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 8 Jun 2023 10:18:17 +0800
Subject: [PATCH 29/52] [dtensor] updated api and doc (#3845)

---
 colossalai/device/README.md                   |  73 +++
 colossalai/device/device_mesh.py              | 444 ++++++++++++------
 colossalai/lazy/lazy_init.py                  |  16 +-
 colossalai/tensor/comm_spec.py                |  89 ++--
 colossalai/tensor/d_tensor/RAEDME.md          | 103 ++++
 colossalai/tensor/d_tensor/__init__.py        |   4 +
 colossalai/tensor/d_tensor/comm_spec.py       |  88 ++--
 colossalai/tensor/d_tensor/d_tensor.py        | 114 ++++-
 colossalai/tensor/d_tensor/layout.py          |  30 +-
 .../tensor/d_tensor/layout_converter.py       |  86 ++--
 tests/test_device/test_device_mesh.py         |  13 +-
 tests/test_device/test_init_logical_pg.py     |  16 +-
 tests/test_lazy/lazy_init_utils.py            |  10 +-
 tests/test_lazy/test_distribute.py            |  28 +-
 .../test_dtensor/test_comm_spec.py            |  33 +-
 .../test_tensor/test_dtensor/test_dtensor.py  |  17 +-
 .../test_dtensor/test_layout_converter.py     |  41 +-
 tests/test_tensor/test_shape_consistency.py   |   7 +-
 tests/test_tensor/test_sharded_linear.py      |   2 +-
 tests/test_tensor/test_sharding_spec.py       |   2 +-
 20 files changed, 793 insertions(+), 423 deletions(-)
 create mode 100644 colossalai/device/README.md
 create mode 100644 colossalai/tensor/d_tensor/RAEDME.md

diff --git a/colossalai/device/README.md b/colossalai/device/README.md
new file mode 100644
index 000000000000..8f835735bef4
--- /dev/null
+++ b/colossalai/device/README.md
@@ -0,0 +1,73 @@
+# 🗄 Device
+
+## 📚 Table of Contents
+
+- [🗄 Device](#-device)
+  - [📚 Table of Contents](#-table-of-contents)
+  - [🔗 Introduction](#-introduction)
+  - [📝 Design](#-design)
+  - [🔨 Usage](#-usage)
+
+## 🔗 Introduction
+
+This module contains the implementation of the abstraction of the device topology. It is used to represent the device topology and manage the distributed information related to the network.
+
+## 📝 Design
+
+
+This module is inspired by the DeviceMesh in the [Alpa project](https://github.com/alpa-projects/alpa) and the device array can be represented as a 1D or 2D mesh. We will be extending the device mesh to support 3D mesh in the future.
+
+
+## 🔨 Usage
+
+- Create a device mesh
+
+```python
+# this is the list of global ranks involved in the device mesh
+# assume we have 4 GPUs and the global ranks for these GPUs are 0, 1, 2, 3
+physical_mesh_id = torch.arange(4)
+mesh_shape = [2, 2]
+device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+```
+
+- View the mesh
+
+
+```python
+# view the mesh shape
+# expect output
+# [2, 2]
+print(device_mesh.shape)
+
+
+# view the logical mesh with global ranks
+# expect output
+# [
+#   [0, 1],
+#   [2, 3]
+# ]
+print(device_mesh.logical_mesh_id)
+
+# view the number of devices in the mesh
+# expect output
+# 4
+print(device_mesh.num_devices)
+
+```
+
+- Initialize the process group
+
+```python
+# initialize process group
+device_mesh.init_logical_process_group()
+
+
+# get the process group for a rank with respect to an axis
+# this is the process group involving global ranks 0 and 2
+print(device_mesh.get_process_group(axis=0, global_rank=0))
+
+# get the ranks in the process group with respect to an axis
+# expect output
+# [0, 2]
+print(device_mesh.get_ranks_in_process_group(axis=0, global_rank=0))
+```
diff --git a/colossalai/device/device_mesh.py b/colossalai/device/device_mesh.py
index 2a5f747fbc23..0490a440153e 100644
--- a/colossalai/device/device_mesh.py
+++ b/colossalai/device/device_mesh.py
@@ -3,11 +3,19 @@
    with some changes. """
 
 import operator
+from dataclasses import dataclass
 from functools import reduce
-from typing import List, Tuple
+from typing import Dict, List, Union
 
 import torch
 import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+
+@dataclass
+class ProcessGroupContainer:
+    process_group: ProcessGroup
+    ranks: List[int]
 
 
 # modified from alpa LogicalDeviceMesh(https://github.com/alpa-projects/alpa/blob/main/alpa/shard_parallel/auto_sharding.py)
@@ -27,9 +35,11 @@ class DeviceMesh:
             during initializing the DeviceMesh instance if the init_process_group set to True.
             Otherwise, users need to call create_process_groups_for_logical_mesh manually to init logical process group.
             (default: False)
-        need_flatten(bool, optional): initialize flatten_device_mesh during initializing the DeviceMesh instance if the need_flatten set to True.
+        device (str): the device for the process groups used by the DeviceMesh instance. (default: 'cuda')
     """
 
+    _DIST_BACKEND = {"cuda": "nccl", "cpu": "gloo"}
+
     def __init__(self,
                  physical_mesh_id: torch.Tensor,
                  mesh_shape: torch.Size = None,
@@ -37,48 +47,140 @@ def __init__(self,
                  mesh_alpha: List[float] = None,
                  mesh_beta: List[float] = None,
                  init_process_group: bool = False,
-                 need_flatten: bool = True):
-        self.physical_mesh_id = physical_mesh_id
+                 device: str = 'cuda'):
+        # ============================
+        # Physical & Logical Mesh IDs
+        # ============================
+        self._physical_mesh_id = physical_mesh_id
+        assert physical_mesh_id.dim() == 1, "physical_mesh_id should be a 1D tensor."
+
+        # logical mesh ids can be obtained via two ways
+        # 1. provide physical mesh id and provide mesh shape
+        # 2. directly supply the logical mesh id
+        assert mesh_shape is None or logical_mesh_id is None, \
+            "Only one of mesh_shape and logical_mesh_id can be specified." \
+            "Logical mesh IDs are obtained from either mesh_shape + phyiscal_mesh_id or directly from the user-supplied logical_mesh_id"
+
         if logical_mesh_id is None:
             self.mesh_shape = mesh_shape
-            self._logical_mesh_id = self.physical_mesh_id.reshape(self.mesh_shape)
+            self._logical_mesh_id = self._physical_mesh_id.reshape(self.mesh_shape)
         else:
             self._logical_mesh_id = logical_mesh_id
             self.mesh_shape = self._logical_mesh_id.shape
 
-        # map global rank into logical rank
-        self.convert_map = {}
-        self._global_rank_to_logical_rank_map(self._logical_mesh_id, [])
+        # ensure two things:
+        # 1. logical and physical mesh IDs should contain the same elements
+        # 2. there are no duplicate IDs in each mesh, e.g. [2, 2] is not allowed
+        assert torch.equal(torch.unique(self._physical_mesh_id), torch.unique(self.logical_mesh_id)), \
+            "physical and logical mesh IDs should contain the same elements, please check if you have consistent physical_mesh_id and logical_mesh_id."
+        assert torch.unique(self._physical_mesh_id).numel() == self._physical_mesh_id.numel(), \
+            "Found duplicate IDs in the phyiscal_mesh_id and this is not allowed, please check your physical_mesh_id again."
+        assert torch.unique(self.logical_mesh_id).numel() == self.logical_mesh_id.numel(), \
+            "Found duplicate IDs in the logical_mesh_id and this is not allowed, please check your logical_mesh_id again."
+
+        # ===============================================
         # coefficient for alpha-beta communication model
+        # alpha is latency and beta is bandwidth
+        # ===============================================
+        # if the values are not provided, we assume they are 1 for simplicity
         if mesh_alpha is None:
             mesh_alpha = [1] * len(self.mesh_shape)
         if mesh_beta is None:
             mesh_beta = [1] * len(self.mesh_shape)
+
         self.mesh_alpha = tuple(mesh_alpha)
         self.mesh_beta = tuple(mesh_beta)
-        self.init_process_group = init_process_group
-        self.need_flatten = need_flatten
-        if self.init_process_group:
-            self.process_groups_dict = self.create_process_groups_for_logical_mesh()
-        if self.need_flatten and self._logical_mesh_id.dim() > 1:
-            self.flatten_device_mesh = self.flatten()
-            # Create a new member `flatten_device_meshes` to distinguish from original flatten methods (Because I'm not sure if there are functions that rely on the self.flatten())
-            # self.flatten_device_meshes = FlattenDeviceMesh(self.physical_mesh_id, self.mesh_shape, self.mesh_alpha,
-            #                                                self.mesh_beta)
+
+        # ensure the alpha and beta have the same shape
+        assert len(self.mesh_alpha) == len(self.mesh_beta), \
+            "mesh_alpha and mesh_beta should have the same length, please check your mesh_alpha and mesh_beta again."
+
+        # =========================
+        # Device for Process Group
+        # =========================
+        self._device = device
+        self._dist_backend = self._DIST_BACKEND[device]
+
+        # =========================
+        # Process Group Management
+        # =========================
+        # the _global_to_local_rank_mapping is structured as follows
+        # {
+        #    <global-rank>: [ <local-rank-on-axis-0>, <local-rank-on-axis-1>, <local-rank-on-axis-2>, ...]
+        # }
+        self._global_to_local_rank_mapping = dict()
+        self._init_global_to_logical_rank_mapping(mapping=self._global_to_local_rank_mapping,
+                                                  tensor=self.logical_mesh_id)
+
+        # create process group
+        self._process_group_dict = {}
+        self._ranks_in_the_process_group = {}
+        self._global_rank_of_current_process = None
+        self._is_initialized = False
+
+        # initialize process group if specified
+        self._init_ranks_in_the_same_group()
+        self._init_process_group = init_process_group
+        if init_process_group:
+            self.init_logical_process_group()
 
     @property
-    def shape(self):
+    def shape(self) -> torch.Size:
+        """
+        Return the shape of the logical mesh.
+        """
         return self.mesh_shape
 
     @property
-    def num_devices(self):
-        return reduce(operator.mul, self.physical_mesh_id.shape, 1)
+    def num_devices(self) -> int:
+        """
+        Return the number of devices contained in the device mesh.
+        """
+        return reduce(operator.mul, self._physical_mesh_id.shape, 1)
 
     @property
-    def logical_mesh_id(self):
+    def logical_mesh_id(self) -> torch.Tensor:
+        """
+        Return the logical mesh id.
+        """
         return self._logical_mesh_id
 
-    def __deepcopy__(self, memo):
+    def get_process_group(self, axis: int, global_rank: int = None) -> ProcessGroup:
+        """
+        Return the process group on the specified axis.
+
+        Args:
+            axis (int): the axis of the process group.
+            global_rank (int, optional): the global rank of the process group. If not specified, the current process is used. (default: None)
+        """
+        if global_rank is None:
+            global_rank = self._global_rank_of_current_process
+        return self._process_group_dict[global_rank][axis]
+
+    def get_process_group_for_all_axes(self, global_rank: int = None) -> Dict[int, ProcessGroup]:
+        """
+        Return the process groups for all axes.
+
+        Args:
+            global_rank (int, optional): the global rank of the process
+        """
+        if global_rank is None:
+            global_rank = self._global_rank_of_current_process
+        return self._process_group_dict[global_rank]
+
+    def get_ranks_in_process_group(self, axis: int, global_rank: int = None) -> List[int]:
+        """
+        Return the ranks in the process group on the specified axis.
+
+        Args:
+            axis (int): the axis of the process group.
+            global_rank (int, optional): the global rank of the process
+        """
+        if global_rank is None:
+            global_rank = self._global_rank_of_current_process
+        return self._ranks_in_the_process_group[global_rank][axis]
+
+    def __deepcopy__(self, memo) -> "DeviceMesh":
         cls = self.__class__
         result = cls.__new__(cls)
         memo[id(self)] = result
@@ -86,111 +188,206 @@ def __deepcopy__(self, memo):
             if k != 'process_groups_dict':
                 setattr(result, k, __import__("copy").deepcopy(v, memo))
             else:
+                # process group cannot be copied
+                # thus, we share them directly
                 setattr(result, k, v)
-
         return result
 
-    def flatten(self):
+    def _init_global_to_logical_rank_mapping(self,
+                                             mapping: Dict,
+                                             tensor: torch.Tensor,
+                                             index_list: List[int] = []) -> Dict[int, List[int]]:
         """
-        Flatten the logical mesh into an effective 1d logical mesh,
-        """
-        flatten_mesh_shape_size = len(self.mesh_shape)
-        flatten_mesh_shape = [self.num_devices]
-        return DeviceMesh(self.physical_mesh_id,
-                          tuple(flatten_mesh_shape),
-                          mesh_alpha=[max(self.mesh_alpha)] * (flatten_mesh_shape_size - 1),
-                          mesh_beta=[max(self.mesh_beta)] * (flatten_mesh_shape_size - 1),
-                          init_process_group=self.init_process_group,
-                          need_flatten=False)
+        Build a global rank to local rank mapping for each process group in different axis in the logical device mesh.
 
-    def _global_rank_to_logical_rank_map(self, tensor, index_list):
-        '''
-        This method is a helper function to build convert_map recursively.
-        '''
+        Args:
+            mapping (Dict): a dictionary that maps the global rank to the local rank in the logical device mesh.
+            tensor (torch.Tensor): the tensor that contains the logical mesh ids.
+            index_list (List[int]): the local ranks on the axes already traversed by the recursion.
+
+        Returns:
+            None. The given mapping is filled in place: each key is a global rank and each value
+                is a list of integers where each integer represents the local rank in the indexed axis.
+        """
         for index, inner_tensor in enumerate(tensor):
+            # index means the local rank in the current axis
+            # inner_tensor refers to the processes with the same local rank
+
             if inner_tensor.numel() == 1:
-                self.convert_map[int(inner_tensor)] = index_list + [index]
+                # if the inner_tensor only has one element, it means that
+                # it already reaches the last axis
+                # we append its local_rank in the last axis to the index_list
+                # and assign to the mapping
+                # the value of the mapping is the local rank at the indexed axis of the device mesh
+                mapping[int(inner_tensor)] = index_list + [index]
             else:
-                self._global_rank_to_logical_rank_map(inner_tensor, index_list + [index])
+                # we recursively go into the function until we reach the last axis
+                # meanwhile, we should add the local rank in the current axis in the index_list
+                self._init_global_to_logical_rank_mapping(mapping, inner_tensor, index_list + [index])
 
-    def create_process_groups_for_logical_mesh(self):
+    def init_logical_process_group(self):
         '''
         This method is used to initialize the logical process groups which will be used in communications
         among logical device mesh.
         Note: if init_process_group set to False, you have to call this method manually. Otherwise,
         the communication related function, such as ShapeConsistencyManager.apply will raise errors.
         '''
-        process_groups_dict = {}
-        check_duplicate_list = []
-        global_rank_flatten_list = self.physical_mesh_id.view(-1).tolist()
+        # sanity check
+        assert dist.is_initialized, "The torch.distributed should be initialized before calling init_logical_process_group"
+        assert not self._is_initialized, "The logical process group has been initialized, do not call init_logical_process_group twice"
+
+        # update the global rank of the current process
+        self._global_rank_of_current_process = dist.get_rank()
+        duplicate_check_list = []
+
+        # flatten the global ranks to 1D list
+        global_rank_flatten_list = self._physical_mesh_id.view(-1).tolist()
+
         for global_rank in global_rank_flatten_list:
-            process_groups = self.global_rank_to_process_groups_with_global_rank(global_rank)
-            for axis, process_group in process_groups.items():
-                if axis not in process_groups_dict:
-                    process_groups_dict[axis] = []
-                if process_group not in check_duplicate_list:
-                    check_duplicate_list.append(process_group)
-                    process_group_handler = dist.new_group(process_group)
-                    process_groups_dict[axis].append((process_group, process_group_handler))
+            # find the other ranks which are in the same process group as global_rank
+            ranks_in_same_group_by_axis = self._collate_global_ranks_in_same_process_group(global_rank)
 
-        return process_groups_dict
+            for axis, ranks_in_same_group in ranks_in_same_group_by_axis.items():
+                # skip duplicated process group creation
+                if ranks_in_same_group in duplicate_check_list:
+                    continue
 
-    def global_rank_to_logical_rank(self, rank):
-        return self.convert_map[rank]
+                # create the process group
+                pg_handler = dist.new_group(ranks=ranks_in_same_group, backend=self._dist_backend)
 
-    def global_rank_to_process_groups_with_logical_rank(self, rank):
-        '''
-        Give a global rank and return all logical process groups of this rank.
-        for example:
-            physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
-            mesh_shape = (4, 4)
-            # [[0, 1, 2, 3],
-            #  [4, 5, 6, 7],
-            #  [8, 9, 10,11],
-            #  [12,13,14,15]]
-            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-            print(device_mesh.global_rank_to_process_groups_with_logical_rank(0))
-        output:
-            # key is axis name
-            # value is a list of logical ranks in same axis with rank 0
-            {0: [[0, 0], [1, 0], [2, 0], [3, 0]], 1: [[0, 0], [0, 1], [0, 2], [0, 3]]}
-        '''
-        process_groups = {}
-        for d in range(self.logical_mesh_id.dim()):
-            for replacer in range(self.logical_mesh_id.shape[d]):
-                if d not in process_groups:
-                    process_groups[d] = []
-                process_group_member = self.convert_map[rank].copy()
-                process_group_member[d] = replacer
-                process_groups[d].append(process_group_member)
-        return process_groups
-
-    def global_rank_to_process_groups_with_global_rank(self, rank):
+                # record this process group handle for every rank in the group
+                for rank in ranks_in_same_group:
+                    if rank not in self._process_group_dict:
+                        self._process_group_dict[rank] = dict()
+                    self._process_group_dict[rank][axis] = pg_handler
+
+        # update the init flag
+        # we only allow init for once
+        self._is_initialized = True
+
+    def _init_ranks_in_the_same_group(self):
+        """
+        This method is used to initialize the _ranks_in_the_process_group dictionary.
+        """
+        # flatten the global ranks to 1D list
+        global_rank_flatten_list = self._physical_mesh_id.view(-1).tolist()
+
+        for global_rank in global_rank_flatten_list:
+            # find the other ranks which are in the same process group as global_rank
+            ranks_in_same_group_by_axis = self._collate_global_ranks_in_same_process_group(global_rank)
+
+            for axis, ranks_in_same_group in ranks_in_same_group_by_axis.items():
+                # create dict for each rank
+                if global_rank not in self._process_group_dict:
+                    self._ranks_in_the_process_group[global_rank] = dict()
+
+                # keep the ranks of this process group for the given rank and axis
+                self._ranks_in_the_process_group[global_rank][axis] = ranks_in_same_group
+
+    def global_rank_to_local_rank(self, rank: int, axis: int = None) -> Union[List[int], int]:
+        """
+        Return the local rank of the given global rank in the logical device mesh.
+
+        Args:
+            rank (int): the global rank in the logical device mesh.
+            axis (int): the axis of the logical device mesh.
+        """
+        local_ranks = self._global_to_local_rank_mapping[rank]
+        if axis:
+            return local_ranks[axis]
+        else:
+            return local_ranks
+
+    def _collate_global_ranks_in_same_process_group(self, global_rank):
         '''
-        Give a global rank and return all process groups of this rank.
-        for example:
-            physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
-            mesh_shape = (4, 4)
-            # [[0, 1, 2, 3],
-            #  [4, 5, 6, 7],
-            #  [8, 9, 10,11],
-            #  [12,13,14,15]]
-            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-            print(device_mesh.global_rank_to_process_groups_with_global_rank(0))
-        output:
-            # key is axis name
-            # value is a list of global ranks in same axis with rank 0
-            {0: [0, 4, 8, 12], 1: [0, 1, 2, 3]}
+        Give a global rank and return all global ranks involved in its associated process group in each axis.
+
+        Example:
+
+        ```python
+        physical_mesh_id = torch.arange(0, 16)
+        mesh_shape = (4, 4)
+
+        # logical mesh will look like
+        # [[0, 1, 2, 3],
+        #  [4, 5, 6, 7],
+        #  [8, 9, 10,11],
+        #  [12,13,14,15]]
+
+        device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+        print(device_mesh._collate_global_ranks_in_same_process_group(0))
+
+        # key is axis name
+        # value is a list of global ranks in same axis with rank 0
+        # output will look like
+        # {
+        #     0: [0, 4, 8, 12],
+        #     1: [0, 1, 2, 3]
+        # }
         '''
-        logical_process_groups = self.global_rank_to_process_groups_with_logical_rank(rank)
-        process_groups = {}
-        for dim, logical_ranks in logical_process_groups.items():
-            process_groups[dim] = []
-            for logical_rank in logical_ranks:
-                for g_rank, l_rank in self.convert_map.items():
-                    if l_rank == logical_rank:
-                        process_groups[dim].append(g_rank)
-        return process_groups
+        # We have init the global rank to local rank by calling _init_global_to_logical_rank_mapping
+        # for self._global_to_local_rank_mapping
+        # the key is the global rank
+        # the value is the list of local ranks corresponding to the global rank with respect of different axes
+        # we can see the list of local ranks as the process coordinates for simplicity
+        # the key and value are all unique, therefore,
+        # we can also use the coordinates to find the global rank
+
+        # =========================================================================
+        # Step 1
+        # find all the process_coordinates for processes in the same process group
+        # as the given global rank
+        # =========================================================================
+
+        # each key is an axis and each value is the list of process coordinates in that axis
+        processes_in_the_same_process_group = {}
+
+        for dim in range(self.logical_mesh_id.dim()):
+            # iterate over the dimension size so that we can include all processes
+            # in the same process group in the given axis
+            # the _local_rank refers to the local rank of the current process
+            for _local_rank in range(self.logical_mesh_id.shape[dim]):
+
+                # if this dimension is not initialized yet,
+                # initialize it with an empty array
+                if dim not in processes_in_the_same_process_group:
+                    processes_in_the_same_process_group[dim] = []
+
+                # get the local rank corresponding to the global rank
+                process_coordinates = self._global_to_local_rank_mapping[global_rank].copy()
+
+                # replace the local rank in the given dimension with the
+                # local rank of the current process iterated
+                process_coordinates[dim] = _local_rank
+                processes_in_the_same_process_group[dim].append(process_coordinates)
+
+        # =================================================================
+        # Step 2
+        # Use local rank combination to find its corresponding global rank
+        # =================================================================
+        # the key of the dict is the axis
+        # the value is the list of global ranks which are in the same process group as the given global rank
+        global_pg_ranks = {}
+        for dim, coordinates_of_all_processes in processes_in_the_same_process_group.items():
+            global_pg_ranks[dim] = []
+            for process_coordinates in coordinates_of_all_processes:
+                # find the global rank by local rank combination
+                for _global_rank, _process_coordinates in self._global_to_local_rank_mapping.items():
+                    if process_coordinates == _process_coordinates:
+                        global_pg_ranks[dim].append(_global_rank)
+        return global_pg_ranks
+
+    def flatten(self):
+        """
+        Flatten the logical mesh into an effective 1d logical mesh.
+        """
+        flatten_mesh_shape_size = len(self.mesh_shape)
+        flatten_mesh_shape = [self.num_devices]
+        return DeviceMesh(self._physical_mesh_id,
+                          tuple(flatten_mesh_shape),
+                          mesh_alpha=[max(self.mesh_alpha)] * (flatten_mesh_shape_size - 1),
+                          mesh_beta=[max(self.mesh_beta)] * (flatten_mesh_shape_size - 1),
+                          init_process_group=self._init_process_group)
 
     def all_gather_cost(self, num_bytes, mesh_dim):
         num_devices = self.logical_mesh_id.shape[mesh_dim]
@@ -212,38 +409,3 @@ def all_to_all_cost(self, num_bytes, mesh_dim):
         penalty_factor = num_devices / 2.0
         return (self.mesh_alpha[mesh_dim] + self.mesh_beta[mesh_dim] *
                 (num_devices - 1) / num_devices / num_devices * num_bytes * penalty_factor + 0.001)
-
-
-class FlattenDeviceMesh(DeviceMesh):
-
-    def __init__(self, physical_mesh_id, mesh_shape, mesh_alpha=None, mesh_beta=None):
-        super().__init__(physical_mesh_id,
-                         mesh_shape,
-                         mesh_alpha,
-                         mesh_beta,
-                         init_process_group=False,
-                         need_flatten=False)
-        # Different from flatten(), mesh_shape leaves unchanged, mesh_alpha and mesh_beta are scalars
-        self.mesh_alpha = max(self.mesh_alpha)
-        self.mesh_beta = min(self.mesh_beta)
-        # Different from original process_groups_dict, rank_list is not stored
-        self.process_number_dict = self.create_process_numbers_for_logical_mesh()
-
-    def create_process_numbers_for_logical_mesh(self):
-        '''
-        Build 1d DeviceMesh in column-major(0) and row-major(1)
-        for example:
-            mesh_shape = (2,4)
-            # [[0, 1, 2, 3],
-            #  [4, 5, 6, 7]]
-            # return {0: [0, 4, 1, 5, 2, 6, 3, 7], 1: [0, 1, 2, 3, 4, 5, 6, 7]}
-        '''
-        num_devices = reduce(operator.mul, self.mesh_shape, 1)
-        process_numbers_dict = {}
-        process_numbers_dict[0] = torch.arange(num_devices).reshape(self.mesh_shape).transpose(1, 0).flatten().tolist()
-        process_numbers_dict[1] = torch.arange(num_devices).reshape(self.mesh_shape).flatten().tolist()
-        return process_numbers_dict
-
-    def mix_gather_cost(self, num_bytes):
-        num_devices = reduce(operator.mul, self.mesh_shape, 1)
-        return (self.mesh_alpha + self.mesh_beta * (num_devices - 1) / num_devices * num_bytes + 0.1)
diff --git a/colossalai/lazy/lazy_init.py b/colossalai/lazy/lazy_init.py
index 76f550dc4392..ca8914362cd6 100644
--- a/colossalai/lazy/lazy_init.py
+++ b/colossalai/lazy/lazy_init.py
@@ -1,5 +1,5 @@
 from types import MethodType
-from typing import Callable, Optional, Union
+from typing import Callable, Dict, Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -8,8 +8,9 @@
 from torch.utils._pytree import tree_map
 
 from colossalai._analyzer._subclasses import MetaTensor
+from colossalai.device.device_mesh import DeviceMesh
 from colossalai.tensor.d_tensor.d_tensor import DTensor
-from colossalai.tensor.d_tensor.layout import Layout
+from colossalai.tensor.d_tensor.sharding_spec import ShardingSpec
 
 # reference: https://pytorch.org/cppdocs/notes/tensor_creation.html
 _NORMAL_FACTORY = [
@@ -172,7 +173,7 @@ def materialize(self) -> torch.Tensor:
         self.clean()
         return _convert_cls(self, target)
 
-    def distribute(self, layout: Layout) -> torch.Tensor:
+    def distribute(self, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> torch.Tensor:
         """Distribute the ``LazyTensor`` to ``torch.Tensor`` by modifying __class__ (inplace), according to the layout.
 
         Args:
@@ -183,7 +184,7 @@ def distribute(self, layout: Layout) -> torch.Tensor:
         """
         target = self._materialize_data()
         self.clean()
-        local_tensor = DTensor(target, layout).local_tensor
+        local_tensor = DTensor(target, device_mesh, sharding_spec).local_tensor
         return _convert_cls(self, local_tensor)
 
     def clean(self) -> None:
@@ -536,7 +537,10 @@ def apply_fn(name: str, p: LazyTensor):
         return _apply_to_lazy_module(module, apply_fn, verbose)
 
     @staticmethod
-    def distribute(module: nn.Module, layout_dict: dict, verbose: bool = False) -> nn.Module:
+    def distribute(module: nn.Module,
+                   device_mesh: DeviceMesh,
+                   sharding_spec_dict: Dict[str, ShardingSpec],
+                   verbose: bool = False) -> nn.Module:
         """Distribute all ``nn.Parameter`` from ``LazyTensor``. This function will modify the module in-place.
 
         Args:
@@ -546,7 +550,7 @@ def distribute(module: nn.Module, layout_dict: dict, verbose: bool = False) -> n
         """
 
         def apply_fn(name: str, p: LazyTensor):
-            p.distribute(layout_dict[name])
+            p.distribute(device_mesh, sharding_spec_dict[name])
 
         return _apply_to_lazy_module(module, apply_fn, verbose)
 
diff --git a/colossalai/tensor/comm_spec.py b/colossalai/tensor/comm_spec.py
index af38d2a502c2..dd873c852936 100644
--- a/colossalai/tensor/comm_spec.py
+++ b/colossalai/tensor/comm_spec.py
@@ -16,69 +16,66 @@ def _all_gather(tensor, comm_spec):
     '''
     Implement all gather operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, process_group in process_groups_list:
-        if dist.get_rank() in rank_list:
-            tensor_list = [
-                torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device)
-                for _ in range(comm_spec.device_mesh.mesh_shape[comm_spec.logical_process_axis])
-            ]
-            # without this contiguous operation, the all gather may get some unexpected results.
-            tensor = tensor.contiguous()
-            dist.all_gather(tensor_list, tensor, group=process_group)
-            output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
-            return output
+    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
+    process_group = process_groups[comm_spec.logical_process_axis]
+
+    tensor_list = [
+        torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device)
+        for _ in range(comm_spec.device_mesh.mesh_shape[comm_spec.logical_process_axis])
+    ]
+    # without this contiguous operation, the all gather may get some unexpected results.
+    tensor = tensor.contiguous()
+    dist.all_gather(tensor_list, tensor, group=process_group)
+    output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
+    return output
 
 
 def _split(tensor, comm_spec):
     '''
     Implement shard operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, _ in process_groups_list:
-        if dist.get_rank() in rank_list:
-            dim = comm_spec.shard_dim
-            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
-            start = length * rank_list.index(dist.get_rank())
-            output = torch.narrow(tensor, dim, start, length).contiguous()
-            return output
+    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
+    process_group = process_groups[comm_spec.logical_process_axis]
+
+    dim = comm_spec.shard_dim
+    length = tensor.shape[comm_spec.shard_dim] // dist.get_world_size(process_group)
+    start = length * dist.get_rank(process_group)
+    output = torch.narrow(tensor, dim, start, length).contiguous()
+    return output
 
 
 def _all_to_all(tensor, comm_spec):
     '''
     Implement all to all operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, process_group in process_groups_list:
-        if dist.get_rank() in rank_list:
-            new_shape = list(tensor.shape)
-            new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // len(rank_list)
-            new_shape = torch.Size(new_shape)
-            output_tensor_list = [
-                torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
-            ]
-            dim = comm_spec.shard_dim
-            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
-            input_tensor_list = [
-                torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(len(rank_list))
-            ]
-            group = process_group
-            dist.all_to_all(output_tensor_list, input_tensor_list, group)
-            output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
-            return output
+    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
+    process_group = process_groups[comm_spec.logical_process_axis]
+    world_size = dist.get_world_size(process_group)
+
+    new_shape = list(tensor.shape)
+    new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // world_size
+    new_shape = torch.Size(new_shape)
+    output_tensor_list = [torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(world_size)]
+    dim = comm_spec.shard_dim
+    length = tensor.shape[comm_spec.shard_dim] // world_size
+    input_tensor_list = [torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(world_size)]
+    group = process_group
+    dist.all_to_all(output_tensor_list, input_tensor_list, group)
+    output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
+    return output
 
 
 def _all_reduce(tensor, comm_spec, async_op=False):
     '''
     Implement all reduce operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, process_group in process_groups_list:
-        if dist.get_rank() in rank_list:
-            if not tensor.is_contiguous():
-                tensor = tensor.contiguous()
-            dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
-            return tensor
+    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
+    process_group = process_groups[comm_spec.logical_process_axis]
+
+    if not tensor.is_contiguous():
+        tensor = tensor.contiguous()
+    dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
+    return tensor
 
 
 def _mix_gather(tensor, comm_spec):
@@ -414,7 +411,7 @@ def __init__(self,
         self.forward_only = forward_only
         if isinstance(self.logical_process_axis, list):
             if not mix_gather:
-                self.device_mesh = self.sharding_spec.device_mesh.flatten_device_mesh
+                self.device_mesh = self.sharding_spec.device_mesh.flatten()
                 self.logical_process_axis = 0
             else:
                 self.device_meshes = self.sharding_spec.device_mesh.flatten_device_meshes
diff --git a/colossalai/tensor/d_tensor/RAEDME.md b/colossalai/tensor/d_tensor/RAEDME.md
new file mode 100644
index 000000000000..95d866388364
--- /dev/null
+++ b/colossalai/tensor/d_tensor/RAEDME.md
@@ -0,0 +1,103 @@
+# 🔢 Distributed Tensor
+
+## 📚 Table of Contents
+
+- [🔢 Distributed Tensor](#-distributed-tensor)
+  - [📚 Table of Contents](#-table-of-contents)
+  - [🔗 Introduction](#-introduction)
+  - [📝 Design](#-design)
+  - [🔨 Usage](#-usage)
+  - [🎈 Progress Log](#-progress-log)
+
+## 🔗 Introduction
+
+Distributed tensor is a type of tensor that is distributed across multiple devices. It is a wrapper of PyTorch tensor, and it is used to support distributed training.
+It can represent the device topology and tensor placement over the devices in the topology. It also provides a set of APIs to manipulate the distributed tensor.
+
+## 📝 Design
+
+Our implementation is inspired by the work [Alpa](https://arxiv.org/abs/2201.12023), which unifies data parallelism and tensor parallelism as intra-op parallelism. It uses the notation `S` to represent a sharded dimension and `R` to represent a replicated dimension. For example, given a 2D matrix, `[S, R]` means that the tensor is sharded along the first dimension and replicated along the second.
+
+Each sharded dimension has a subscript that represents its placement over the devices. Assume we have 4 GPUs arranged in a 2 x 2 mesh, and a 2D matrix like the one below:
+
+
+```text
+    [1,  2,  3,  4 ]
+A = [4,  5,  6,  7 ]
+    [8,  9,  10, 11]
+    [12, 13, 14, 15]
+```
+
+`[S0, R]` would mean that the first dimension is sharded over the rows in the device topology.
+
+```text
+|---------------------|---------------------|
+|                     |                     |
+|  [1,  2,  3,  4 ]   |  [1,  2,  3,  4 ]   |
+|  [4,  5,  6,  7 ]   |  [4,  5,  6,  7 ]   |
+|                     |                     |
+|---------------------|---------------------|
+|                     |                     |
+|  [8,  9,  10, 11]   |  [8,  9,  10, 11]   |
+|  [12, 13, 14, 15]   |  [12, 13, 14, 15]   |
+|                     |                     |
+|---------------------|---------------------|
+```
+
+`[S01, R]` would mean that the first dimension is sharded over both the row and column in the device topology.
+
+```text
+|---------------------|---------------------|
+|                     |                     |
+|  [1,  2,  3,  4 ]   |  [4,  5,  6,  7 ]   |
+|                     |                     |
+|---------------------|---------------------|
+|                     |                     |
+|  [8,  9,  10, 11]   |  [12, 13, 14, 15]   |
+|                     |                     |
+|---------------------|---------------------|
+```
+
+## 🔨 Usage
+
+A sample API usage is given below.
+
+```python
+import torch
+
+import colossalai
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.tensor.d_tensor import DTensor, ShardingSpec
+
+colossalai.launch_from_torch(config={})
+
+# define your device mesh
+# assume you have 4 GPUs
+physical_mesh_id = torch.arange(0, 4).reshape(1, 4)
+mesh_shape = (2, 2)
+device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+
+# define a tensor
+a = torch.rand(16, 32).cuda()
+
+# create sharding spec for the tensor
+# assume the sharding spec is [S0, R]
+dim_partition_dict = {0: [0]}
+sharding_spec = ShardingSpec(a.dim(), dim_partition_dict)
+
+# create a distributed tensor
+d_tensor = DTensor(a, device_mesh, sharding_spec)
+print(d_tensor)
+
+global_tensor = d_tensor.to_global()
+print(global_tensor)
+```
+
+
+## 🎈 Progress Log
+
+- [x] Support layout conversion
+- [x] Support sharding on 2D device mesh
+- [ ] Support sharding on 3D device mesh
+- [ ] Support sharding on 4D device mesh
+- [ ] Support sharding info saving and offline tensor merge (we can save a tensor as a DTensor and gather the shards back into the global tensor based on the sharding info in a single process on CPU, which is useful for loading and saving distributed training checkpoints.)
diff --git a/colossalai/tensor/d_tensor/__init__.py b/colossalai/tensor/d_tensor/__init__.py
index e69de29bb2d1..af77f4f0edfc 100644
--- a/colossalai/tensor/d_tensor/__init__.py
+++ b/colossalai/tensor/d_tensor/__init__.py
@@ -0,0 +1,4 @@
+from .d_tensor import DTensor
+from .sharding_spec import ShardingSpec
+
+__all__ = ['DTensor', 'ShardingSpec']
diff --git a/colossalai/tensor/d_tensor/comm_spec.py b/colossalai/tensor/d_tensor/comm_spec.py
index 159125fa16db..79b2e3ef936a 100644
--- a/colossalai/tensor/d_tensor/comm_spec.py
+++ b/colossalai/tensor/d_tensor/comm_spec.py
@@ -24,12 +24,12 @@ class CommSpec:
     '''
     Communication spec is used to record the communication action. It converts the communication spec
     to real action which will be used in runtime. It contains comm_pattern to determine the
-    communication method, process_groups_dict to determine the process groups, gather_dim and shard_dim
+    communication method, process_group_dict to determine the process groups, gather_dim and shard_dim
     to determine the buffer shape, and logical_process_axis
 
     Argument:
-        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
-        process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
+        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
+        process_group_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
         gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
         shard_dim(int, Optional): The shard_dim of the tensor will be sharded.
         logical_process_axis(Union(int, List[int]), Optional): The mesh_dim to implement the communication action.
@@ -37,7 +37,7 @@ class CommSpec:
 
     def __init__(self,
                  comm_pattern: CollectiveCommPattern,
-                 process_groups_dict: Dict,
+                 process_group_dict: Dict,
                  gather_dim: int = None,
                  shard_dim: int = None,
                  logical_process_axis: int = None):
@@ -45,7 +45,7 @@ def __init__(self,
         self.gather_dim = gather_dim
         self.shard_dim = shard_dim
         self.logical_process_axis = logical_process_axis
-        self.process_groups_dict = process_groups_dict
+        self.process_group_dict = process_group_dict
 
     def __repr__(self):
         res_list = ["CommSpec:("]
@@ -92,68 +92,56 @@ def _all_gather(tensor: torch.Tensor, comm_spec: CommSpec):
     '''
     Implement all gather operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, process_group in process_groups_list:
-        if dist.get_rank() in rank_list:
-            tensor_list = [
-                torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
-            ]
-            # without this contiguous operation, the all gather may get some unexpected results.
-            tensor = tensor.contiguous()
-            dist.all_gather(tensor_list, tensor, group=process_group)
-            output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
-            return output
+    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
+    world_size = dist.get_world_size(process_group)
+    tensor_list = [torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device) for _ in range(world_size)]
+    # without this contiguous operation, the all gather may get some unexpected results.
+    tensor = tensor.contiguous()
+    dist.all_gather(tensor_list, tensor, group=process_group)
+    output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
+    return output
 
 
 def _split(tensor: torch.Tensor, comm_spec: CommSpec):
     '''
     Implement shard operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, _ in process_groups_list:
-        if dist.get_rank() in rank_list:
-            dim = comm_spec.shard_dim
-            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
-            start = length * rank_list.index(dist.get_rank())
-            output = torch.narrow(tensor, dim, start, length).contiguous()
-            return output
+    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
+    dim = comm_spec.shard_dim
+    length = tensor.shape[comm_spec.shard_dim] // dist.get_world_size(process_group)
+    start = length * dist.get_rank(process_group)
+    output = torch.narrow(tensor, dim, start, length).contiguous()
+    return output
 
 
 def _all_to_all(tensor: torch.Tensor, comm_spec: CommSpec):
     '''
     Implement all to all operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, process_group in process_groups_list:
-        if dist.get_rank() in rank_list:
-            new_shape = list(tensor.shape)
-            new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // len(rank_list)
-            new_shape = torch.Size(new_shape)
-            output_tensor_list = [
-                torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
-            ]
-            dim = comm_spec.shard_dim
-            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
-            input_tensor_list = [
-                torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(len(rank_list))
-            ]
-            group = process_group
-            dist.all_to_all(output_tensor_list, input_tensor_list, group)
-            output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
-            return output
+    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
+    world_size = dist.get_world_size(process_group)
+    new_shape = list(tensor.shape)
+    new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // world_size
+    new_shape = torch.Size(new_shape)
+    output_tensor_list = [torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(world_size)]
+    dim = comm_spec.shard_dim
+    length = tensor.shape[comm_spec.shard_dim] // world_size
+    input_tensor_list = [torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(world_size)]
+    group = process_group
+    dist.all_to_all(output_tensor_list, input_tensor_list, group)
+    output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
+    return output
 
 
 def _all_reduce(tensor: torch.Tensor, comm_spec: CommSpec, async_op: bool = False):
     '''
     Implement all reduce operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
-    for rank_list, process_group in process_groups_list:
-        if dist.get_rank() in rank_list:
-            if not tensor.is_contiguous():
-                tensor = tensor.contiguous()
-            dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
-            return tensor
+    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
+    if not tensor.is_contiguous():
+        tensor = tensor.contiguous()
+    dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
+    return tensor
 
 
 class _ReduceGrad(torch.autograd.Function):
@@ -269,7 +257,7 @@ def symbolic(graph, input_):
     def forward(ctx, input_, comm_spec):
         output = _all_to_all(input_, comm_spec)
         comm_spec_for_backward = CommSpec(comm_pattern=comm_spec.comm_pattern,
-                                          process_groups_dict=comm_spec.process_groups_dict,
+                                          process_group_dict=comm_spec.process_group_dict,
                                           gather_dim=comm_spec.shard_dim,
                                           shard_dim=comm_spec.gather_dim,
                                           logical_process_axis=comm_spec.logical_process_axis)
diff --git a/colossalai/tensor/d_tensor/d_tensor.py b/colossalai/tensor/d_tensor/d_tensor.py
index c1fe9d50a048..6bda0f4e579c 100644
--- a/colossalai/tensor/d_tensor/d_tensor.py
+++ b/colossalai/tensor/d_tensor/d_tensor.py
@@ -3,55 +3,119 @@
 import torch
 from torch.utils._pytree import tree_map
 
+from colossalai.device.device_mesh import DeviceMesh
+
 from .layout import Layout
 from .layout_converter import LayoutConverter, to_global
 from .sharding_spec import ShardingSpec
 
+__all__ = ['DTensor', 'distribute_tensor', 'distribute_module', 'construct_default_sharding_spec']
+
 layout_converter = LayoutConverter()
 
 
 class DTensor(torch.Tensor):
+    """
+    DTensor stands for distributed tensor. It is a subclass of `torch.Tensor` and contains meta information
+    about the tensor distribution. The meta information includes the device mesh, the sharding specification,
+    and the entire shape of the tensor.
+
+    During runtime, we will not directly use the DTensor objects for computation. Instead, we will only use the
+    `DTensor.local_tensor` for computation. The `DTensor.local_tensor` is the local tensor in the current rank.
+    In this way, all tensors involved in computation will only be native PyTorch tensors.
+
+    Example:
+        ```python
+        from colossalai.device.device_mesh import DeviceMesh
+
+        # define your device mesh
+        # assume you have 4 GPUs
+        physical_mesh_id = torch.arange(0, 4).reshape(1, 4)
+        mesh_shape = (2, 2)
+        device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+
+        # define a tensor
+        x = torch.rand(16, 32)
+
+        # create sharding spec for the tensor
+        # assume the sharding spec is [S0, R]
+        dim_partition_dict = {
+            0: [0]
+        }
+        sharding_spec = ShardingSpec(x.dim(), dim_partition_dict)
+
+        # create a distributed tensor
+        d_tensor = DTensor(x, device_mesh, sharding_spec)
+        ```
 
-    def __init__(self, local_tensor: torch.Tensor, dist_layout: Layout):
-        self.local_tensor = local_tensor
-        self.data_type = local_tensor.dtype
-        self.entire_shape = local_tensor.shape
+    Args:
+        tensor (`torch.Tensor`): the unsharded tensor.
+        device_mesh (`DeviceMesh`): the device mesh for abstraction of the compute devices.
+        sharding_spec (`ShardingSpec`): the sharding specification which describes how the tensor will be sharded.
+    """
+
+    def __init__(self, tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec):
+        # ensure this tensor is not a DTensor
+        assert not isinstance(tensor, DTensor), 'The input tensor should not be a DTensor.'
+
+        # store meta info
+        self.local_tensor = tensor
+        self.data_type = tensor.dtype
+        self.global_shape = tensor.shape
+
+        # create distributed layout
+        dist_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=self.global_shape)
         self.dist_layout = dist_layout
+
+        # shard the tensor
         self._apply_layout()
 
     @staticmethod
-    def __new__(cls, local_tensor, layout):
-        return torch.Tensor._make_subclass(cls, local_tensor, local_tensor.requires_grad)
+    def __new__(cls, tensor, *args, **kwargs):
+        return torch.Tensor._make_subclass(cls, tensor, tensor.requires_grad)
 
     def __repr__(self):
-        return f"DTensor({self.to_global()}, {self.dist_layout})"
+        return f"DTensor(\n{self.to_global()}\n{self.dist_layout}"
 
     def __str__(self):
         return self.__repr__()
 
-    def layout_convert(self, target_layout):
+    def layout_convert(self, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> None:
         '''
         Convert the layout of the tensor from source_spec to target_spec.
+        This will update the `local_tensor` and `dist_layout` in place.
+
+        Args:
+            device_mesh (DeviceMesh), sharding_spec (ShardingSpec): together define the target layout.
         '''
-        self.local_tensor = layout_converter.apply(self.local_tensor, self.dist_layout, target_layout)
+        target_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=self.global_shape)
+        self.local_tensor = layout_converter.apply(tensor=self.local_tensor,
+                                                   source_layout=self.dist_layout,
+                                                   target_layout=target_layout)
         self.dist_layout = target_layout
 
     def _apply_layout(self):
         '''
         Apply the layout to the local tensor during initializing process.
         '''
+        # layout converter requires a source and target layout
+        # we construct the source layout for an unsharded tensor
+        # and use self.dist_layout as the target layout for the sharded tensor
         source_spec = construct_default_sharding_spec(self.local_tensor)
         source_layout = Layout(device_mesh=self.dist_layout.device_mesh,
-                               device_type=self.dist_layout.device_type,
                                sharding_spec=source_spec,
-                               entire_shape=self.entire_shape)
-        self.local_tensor = layout_converter.apply(self.local_tensor, source_layout, self.dist_layout)
+                               global_shape=self.global_shape)
+        self.local_tensor = layout_converter.apply(tensor=self.local_tensor,
+                                                   source_layout=source_layout,
+                                                   target_layout=self.dist_layout)
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         if kwargs is None:
             kwargs = {}
 
+        # convert all DTensors to native pytorch tensors
+        # so that operations will be conducted on native tensors
         def filter_arg(arg):
             if isinstance(arg, DTensor):
                 return arg.local_tensor
@@ -60,9 +124,9 @@ def filter_arg(arg):
 
         args = tree_map(filter_arg, args)
         kwargs = tree_map(filter_arg, kwargs)
-        # if we want to convert the result into DTensor, we need to infer the layout of result from the layout of input tensors
-        # and op type.
 
+        # NOTE: if we want to convert the result into DTensor, we need to infer the layout of result from the layout of input tensors
+        # and op type.
         return func(*args, **kwargs)
 
     @property
@@ -85,7 +149,6 @@ def to(self, *args, **kwargs):
         '''
         self.local_tensor = self.local_tensor.to(*args, **kwargs)
         self.data_type = self.local_tensor.dtype
-        self.dist_layout.device_type = self.local_tensor.device
         # TODO: update the device mesh process groups or we should just cache
         # both the cpu process groups and the cuda process groups?
         return self
@@ -98,7 +161,7 @@ def to_local(self):
 
     def to_global(self):
         '''
-        Recover the global tensor from the distributed tensor.
+        Recover the global tensor from the distributed tensor by returning a new `torch.Tensor` object.
 
         Note: This function will all_gather the local tensor to the global tensor and it
         will not change the layout of the DTensor. This function is mainly used for debugging or
@@ -107,24 +170,29 @@ def to_global(self):
         return to_global(self.local_tensor, self.dist_layout)
 
 
-def distribute_tensor(local_tensor: torch.Tensor, dist_layout: Layout) -> DTensor:
+def distribute_tensor(tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> DTensor:
     '''
     Distribute the local tensor to the distributed tensor according to the dist_layout specified.
 
     Args:
-        local_tensor: tensor to be distributed.
-        dist_layout: the layout specification of the distributed tensor.
+        tensor (`torch.Tensor`): tensor to be distributed.
+        device_mesh (`DeviceMesh`): the device mesh for abstraction of the compute devices.
+        sharding_spec (`ShardingSpec`): the sharding specification which describes how the tensor will be sharded.
 
     Returns:
         A 'DTensor' object.
     '''
-    return DTensor(local_tensor, dist_layout)
+    return DTensor(tensor, device_mesh, sharding_spec)
 
 
 def distribute_module(module: torch.nn.Module, partition_fn: Optional[callable] = None) -> torch.nn.Module:
     '''
     This function converts all the parameters in the module to DTensor(DParam).
 
+    Args:
+        module (`torch.nn.Module`): the module to be distributed.
+        partition_fn (callable): the partition function which will be used to partition the parameters.
+
     Note: This function is subject to future change as the DParam has not been implemented yet.
     '''
     for name, param in module.named_parameters():
@@ -138,5 +206,11 @@ def distribute_module(module: torch.nn.Module, partition_fn: Optional[callable]
 def construct_default_sharding_spec(tensor: torch.Tensor,) -> ShardingSpec:
     '''
     Construct the default sharding specification for the tensor.
+
+    Args:
+        tensor (`torch.Tensor`): the tensor to be sharded.
+
+    Returns:
+        A `ShardingSpec` object without any sharding specified.
     '''
     return ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={})
diff --git a/colossalai/tensor/d_tensor/layout.py b/colossalai/tensor/d_tensor/layout.py
index ee7ef74a99ae..2946611b4b79 100644
--- a/colossalai/tensor/d_tensor/layout.py
+++ b/colossalai/tensor/d_tensor/layout.py
@@ -11,28 +11,32 @@
 
 
 class Layout:
-    """Layout of a tensor.
+    """
+    Layout of a tensor refers to the tensor placement on the device mesh and how the tensor is sharded over the devices.
 
-    Attributes:
-        device_mesh: the device mesh to store the tensor distributed.
-        device_type: the type of the device mesh, e.g. 'cpu' or 'cuda'.
-        sharding_spec: the sharding specification to describe how the tensor is sharded.
-        entire_shape: the entire shape of the global tensor.
+    Args:
+        device_mesh (`DeviceMesh`): the device mesh to store the tensor distributed.
+        sharding_spec (`ShardingSpec`): the sharding specification to describe how the tensor is sharded.
+        global_shape (`torch.Size`): the entire shape of the global tensor.
     """
 
-    def __init__(self, device_mesh: DeviceMesh, device_type: torch.device, sharding_spec: ShardingSpec,
-                 entire_shape: torch.Size):
+    def __init__(self, device_mesh: DeviceMesh, sharding_spec: ShardingSpec, global_shape: torch.Size):
         self.device_mesh = device_mesh
-        self.device_type = device_type
         self.sharding_spec = sharding_spec
-        self.entire_shape = entire_shape
+        self.global_shape = global_shape
         self._sanity_check()
 
     def __hash__(self) -> int:
         return hash(f'{self.sharding_spec}')
 
-    def get_sharded_shape_per_device(self):
-        sharded_shape = list(self.entire_shape)
+    def get_sharded_shape_per_device(self) -> torch.Size:
+        """
+        Compute the shape of the sharded tensor on each device.
+
+        Returns:
+            `torch.Size`: the shape of the sharded tensor on each device.
+        """
+        sharded_shape = list(self.global_shape)
         for dim, shard_list in self.sharding_spec.dim_partition_dict.items():
             mesh_list = [self.device_mesh.mesh_shape[mesh_dim] for mesh_dim in shard_list]
             shard_partitions = reduce(operator.mul, mesh_list, 1)
@@ -56,7 +60,7 @@ def _sanity_check(self):
 
         # make sure that the sharding for a dimension is divisible by the number of devices
         for dim, shard_list in sharding_spec.dim_partition_dict.items():
-            tensor_dim_size = self.entire_shape[dim]
+            tensor_dim_size = self.global_shape[dim]
             num_devices = 1
 
             for element in shard_list:
diff --git a/colossalai/tensor/d_tensor/layout_converter.py b/colossalai/tensor/d_tensor/layout_converter.py
index cf02aac309f4..6eff92ea6b13 100644
--- a/colossalai/tensor/d_tensor/layout_converter.py
+++ b/colossalai/tensor/d_tensor/layout_converter.py
@@ -3,10 +3,8 @@
 from dataclasses import dataclass
 from typing import Dict, List, Tuple
 
-import numpy as np
 import torch
 
-from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, TrainCycleItem
 from colossalai.context.singleton_meta import SingletonMeta
 from colossalai.tensor.d_tensor.comm_spec import *
 from colossalai.tensor.d_tensor.layout import Layout
@@ -28,13 +26,21 @@ class LayoutConverterOptions:
     pass
 
 
-def to_global(distributed_tensor: torch.Tensor, layout: Layout) -> torch.Tensor:
+def to_global(distributed_tensor: "DTensor", layout: Layout) -> torch.Tensor:
+    """
+    Convert a distributed tensor to the global tensor with the given layout.
+    This function returns a native `torch.Tensor` object.
+
+
+    Args:
+        distributed_tensor (`DTensor`): the distributed tensor to be converted.
+        layout (`Layout`): the layout of `distributed_tensor` before conversion (the source layout).
+    """
     layout_converter = LayoutConverter()
     global_sharding_spec = ShardingSpec(distributed_tensor.dim(), {})
     global_layout = Layout(device_mesh=layout.device_mesh,
-                           device_type=layout.device_type,
                            sharding_spec=global_sharding_spec,
-                           entire_shape=layout.entire_shape)
+                           global_shape=layout.global_shape)
     with torch.no_grad():
         global_tensor = layout_converter.apply(distributed_tensor, layout, global_layout)
     return global_tensor
@@ -49,6 +55,9 @@ def set_layout_converting_options(options: LayoutConverterOptions):
 
 
 class LayoutConverter(metaclass=SingletonMeta):
+    """
+    LayoutConverter is a singleton class which converts the layout of a distributed tensor.
+    """
 
     def __init__(self):
         self._options = None
@@ -91,15 +100,14 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            entire_shape = (4, 4, 4)
+            global_shape = (4, 4, 4)
             dim_partition_dict = {0: [0], 1: [1]}
 
             # [S0,S1,R]
             sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
             layout = Layout(device_mesh=device_mesh,
-                            device_type=torch.device('cuda'),
                             sharding_spec=sharding_spec,
-                            entire_shape=entire_shape)
+                            global_shape=global_shape)
 
             rst_dict = layout_converter.all_gather_transform_layouts(layout)
             for layout, comm_spec in rst_dict.items():
@@ -112,7 +120,12 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
         valid_spec_dict = {}
         comm_pattern = CollectiveCommPattern.GATHER_FWD_SPLIT_BWD
         source_spec = source_layout.sharding_spec
-        process_groups_dict = source_layout.device_mesh.process_groups_dict
+
+        # the key of the dict is the axis
+        # the value is the process group
+        current_rank = source_layout.device_mesh._global_rank_of_current_process
+        process_group_dict = source_layout.device_mesh._process_group_dict[current_rank]
+
         for target_pair in source_spec.dim_partition_dict.items():
             shard_list = all_gather_simulator(target_pair)
             index = target_pair[0]
@@ -130,7 +143,7 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
             logical_process_axis = target_pair[1][-1]
             comm_spec = CommSpec(
                 comm_pattern,
-                process_groups_dict=process_groups_dict,
+                process_group_dict=process_group_dict,
                 gather_dim=gather_dim,
             # shard_dim will be used during backward
                 shard_dim=gather_dim,
@@ -141,8 +154,7 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
                 new_sharding_spec = ShardingSpec(source_spec.dims, dim_partition_dict=new_dim_partition_dict)
                 new_layout = Layout(device_mesh=source_layout.device_mesh,
                                     sharding_spec=new_sharding_spec,
-                                    device_type=source_layout.device_type,
-                                    entire_shape=source_layout.entire_shape)
+                                    global_shape=source_layout.global_shape)
 
                 valid_spec_dict[new_layout] = comm_spec
             except LayoutException:
@@ -167,15 +179,14 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            entire_shape = (4, 4, 4)
+            global_shape = (4, 4, 4)
             dim_partition_dict = {0: [0], 1: [1]}
 
             # [S0,S1,R]
             sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
             layout = Layout(device_mesh=device_mesh,
-                                    device_type=torch.device('cuda'),
                                     sharding_spec=sharding_spec,
-                                    entire_shape=entire_shape)
+                                    global_shape=global_shape)
             rst_dict = layout_converter.all_to_all_transform_layout(layout)
 
             for layout, comm_spec in rst_dict.items():
@@ -188,7 +199,12 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
         '''
         valid_spec_dict = {}
         comm_pattern = CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD
-        process_groups_dict = source_layout.device_mesh.process_groups_dict
+
+        # the key of the dict is the axis
+        # the value is the process group
+        current_rank = source_layout.device_mesh._global_rank_of_current_process
+        process_group_dict = source_layout.device_mesh._process_group_dict[current_rank]
+
         source_spec = source_layout.sharding_spec
         tensor_dims = source_spec.dims
         for f_index in range(tensor_dims - 1):
@@ -229,7 +245,7 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
                     shard_dim = f_index
                     logical_process_axis = b_target_pair[1][-1]
                 comm_spec = CommSpec(comm_pattern,
-                                     process_groups_dict,
+                                     process_group_dict=process_group_dict,
                                      gather_dim=gather_dim,
                                      shard_dim=shard_dim,
                                      logical_process_axis=logical_process_axis)
@@ -252,8 +268,7 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
                     new_sharding_spec = ShardingSpec(source_spec.dims, dim_partition_dict=new_dim_partition_dict)
                     new_layout = Layout(device_mesh=source_layout.device_mesh,
                                         sharding_spec=new_sharding_spec,
-                                        device_type=source_layout.device_type,
-                                        entire_shape=source_layout.entire_shape)
+                                        global_shape=source_layout.global_shape)
                     valid_spec_dict[new_layout] = comm_spec
                 except LayoutException:
                     pass
@@ -278,16 +293,15 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            entire_shape = (4, 4, 4)
+            global_shape = (4, 4, 4)
 
             dim_partition_dict = {0: [0]}
 
             # [S0,R,R]
             sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
             layout = Layout(device_mesh=device_mesh,
-                          device_type=torch.device('cuda'),
                           sharding_spec=sharding_spec,
-                          entire_shape=entire_shape)
+                          global_shape=global_shape)
             rst_dict = layout_converter.shard_transform_layout(layout)
 
             for layout, comm_spec in rst_dict.items():
@@ -301,7 +315,11 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
         valid_spec_dict = {}
         comm_pattern = CollectiveCommPattern.SPLIT_FWD_GATHER_BWD
         source_spec = source_layout.sharding_spec
-        process_groups_dict = source_layout.device_mesh.process_groups_dict
+
+        # the key of the dict is the axis
+        # the value is the process group
+        current_rank = source_layout.device_mesh._global_rank_of_current_process
+        process_group_dict = source_layout.device_mesh._process_group_dict[current_rank]
 
         # legal sharding dims means the mesh_id is still available to use.
         legal_sharding_dims = [i for i in range(len(source_layout.device_mesh.mesh_shape))]
@@ -329,7 +347,7 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
                 shard_dim = index
                 logical_process_axis = shard_list[-1]
                 comm_spec = CommSpec(comm_pattern,
-                                     process_groups_dict,
+                                     process_group_dict=process_group_dict,
                                      gather_dim=shard_dim,
                                      shard_dim=shard_dim,
                                      logical_process_axis=logical_process_axis)
@@ -340,8 +358,7 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
                                                      dim_partition_dict=new_dim_partition_dict)
                     new_layout = Layout(device_mesh=source_layout.device_mesh,
                                         sharding_spec=new_sharding_spec,
-                                        device_type=source_layout.device_type,
-                                        entire_shape=source_layout.entire_shape)
+                                        global_shape=source_layout.global_shape)
                     valid_spec_dict[new_layout] = comm_spec
                 except LayoutException:
                     pass
@@ -399,7 +416,7 @@ def layout_converting(self, source_layout: Layout,
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            entire_shape = (4, 4, 4)
+            global_shape = (4, 4, 4)
 
             dim_partition_source = {1: [0, 1]}
             dim_partition_target = {0: [0, 1]}
@@ -407,16 +424,14 @@ def layout_converting(self, source_layout: Layout,
             # [R,S01,R]
             sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
             source_layout = Layout(device_mesh=device_mesh,
-                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_source,
-                                entire_shape=entire_shape)
+                                global_shape=global_shape)
 
             # [S01,R,R]
             sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
             target_layout = Layout(device_mesh=device_mesh,
-                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_target,
-                                entire_shape=entire_shape)
+                                global_shape=global_shape)
 
             transform_path, comm_action_sequence = layout_converter.layout_converting(source_layout, target_layout)
             transform_path_str = '->'.join([str(layout.sharding_spec.sharding_sequence) for layout in transform_path])
@@ -505,21 +520,19 @@ def apply(self, tensor: torch.Tensor, source_layout: Layout, target_layout: Layo
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            entire_shape = (4, 4, 4)
+            global_shape = (4, 4, 4)
 
             # [S0,R,R]
             sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
             source_layout = Layout(device_mesh=device_mesh,
-                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_source,
-                                entire_shape=entire_shape)
+                                global_shape=global_shape)
 
             # [R,S0,R]
             sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
             target_layout = Layout(device_mesh=device_mesh,
-                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_target,
-                                entire_shape=entire_shape)
+                                global_shape=global_shape)
 
             if rank in (0, 1):
                 sharded_tensor_0 = torch.zeros(2, 1)
@@ -554,3 +567,4 @@ def apply(self, tensor: torch.Tensor, source_layout: Layout, target_layout: Layo
         for comm_spec in comm_action_sequence:
             tensor = comm_spec.covert_spec_to_action(tensor)
         return tensor
+        return tensor
diff --git a/tests/test_device/test_device_mesh.py b/tests/test_device/test_device_mesh.py
index 3be057b3a98b..19d41d23353f 100644
--- a/tests/test_device/test_device_mesh.py
+++ b/tests/test_device/test_device_mesh.py
@@ -1,20 +1,19 @@
-from colossalai.device.device_mesh import DeviceMesh
 import torch
 
+from colossalai.device.device_mesh import DeviceMesh
+
 
 def test_device_mesh():
-    physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
+    physical_mesh_id = torch.arange(0, 16)
     mesh_shape = (4, 4)
     # [[0, 1, 2, 3],
     #  [4, 5, 6, 7],
     #  [8, 9, 10,11],
     #  [12,13,14,15]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    assert device_mesh.convert_map[5] == [1, 1]
-    assert device_mesh.convert_map[11] == [2, 3]
-    assert device_mesh.global_rank_to_process_groups_with_logical_rank(0)[0] == [[0, 0], [1, 0], [2, 0], [3, 0]]
-    assert device_mesh.global_rank_to_process_groups_with_logical_rank(2)[1] == [[0, 0], [0, 1], [0, 2], [0, 3]]
-    assert device_mesh.global_rank_to_process_groups_with_global_rank(2)[1] == [0, 1, 2, 3]
+    assert device_mesh.global_rank_to_local_rank(5) == [1, 1]
+    assert device_mesh.global_rank_to_local_rank(11) == [2, 3]
+    assert device_mesh.get_ranks_in_process_group(axis=1, global_rank=2) == [0, 1, 2, 3]
 
 
 if __name__ == '__main__':
diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py
index 2b7060c4846a..7c6339eff67e 100644
--- a/tests/test_device/test_init_logical_pg.py
+++ b/tests/test_device/test_init_logical_pg.py
@@ -20,16 +20,12 @@ def check_layer(rank, world_size, port):
     # [[0, 1,
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-    logical_pg_dict = {0: [[0, 2], [1, 3]], 1: [[0, 1], [2, 3]]}
-    logical_process_groups = device_mesh.process_groups_dict
-
-    for mesh_dim, pgs in logical_pg_dict.items():
-        for index, pg in enumerate(pgs):
-            if rank in pg:
-                tensor = torch.ones(4).cuda()
-                group = logical_process_groups[mesh_dim][index][1]
-                dist.all_reduce(tensor, op=ReduceOp.SUM, group=group)
-                assert tensor.equal(tensor_to_check)
+
+    for axis in range(len(mesh_shape)):
+        tensor = torch.ones(4).cuda()
+        pg = device_mesh.get_process_group(axis=axis)
+        dist.all_reduce(tensor, op=ReduceOp.SUM, group=pg)
+        assert tensor.equal(tensor_to_check)
 
     gpc.destroy()
 
diff --git a/tests/test_lazy/lazy_init_utils.py b/tests/test_lazy/lazy_init_utils.py
index 85bfd0e27801..2911012fafa8 100644
--- a/tests/test_lazy/lazy_init_utils.py
+++ b/tests/test_lazy/lazy_init_utils.py
@@ -6,7 +6,9 @@
 import torch
 from packaging import version
 
+from colossalai.device.device_mesh import DeviceMesh
 from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor
+from colossalai.tensor.d_tensor.layout import Layout
 from colossalai.tensor.d_tensor.layout_converter import to_global
 from tests.kit.model_zoo.registry import ModelAttribute
 
@@ -81,7 +83,8 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False,
         print(f'{model.__class__.__name__} pass')
 
 
-def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.Module, layout_dict: dict) -> None:
+def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.Module, device_mesh: DeviceMesh,
+                            sharding_spec_dict: dict) -> None:
     state = model.state_dict()
     distributed_state = distributed_model.state_dict()
 
@@ -91,6 +94,7 @@ def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.
         assert n1 == n2
         t1 = t1.cuda()
         t2 = t2.cuda()
-        if n2 in layout_dict:
-            t2 = to_global(t2, layout_dict[n2])
+        if n2 in sharding_spec_dict:
+            layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_dict[n2], global_shape=t1.shape)
+            t2 = to_global(t2, layout)
         assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}'
diff --git a/tests/test_lazy/test_distribute.py b/tests/test_lazy/test_distribute.py
index d515b175a9ea..efa43eab5788 100644
--- a/tests/test_lazy/test_distribute.py
+++ b/tests/test_lazy/test_distribute.py
@@ -26,23 +26,19 @@ def find_shard_dim(shape: torch.Size) -> Optional[int]:
             return dim
 
 
-def make_layout(device_mesh: DeviceMesh, original_tensor: torch.Tensor) -> Layout:
+def make_sharding_spec(original_tensor: torch.Tensor) -> ShardingSpec:
     shard_dim = find_shard_dim(original_tensor.shape)
     dim_partition_dict = {shard_dim: [0]} if shard_dim is not None else {}
     target_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict=dim_partition_dict)
-    layout = Layout(device_mesh=device_mesh,
-                    device_type=torch.device('cuda'),
-                    sharding_spec=target_sharding_spec,
-                    entire_shape=original_tensor.shape)
-    return layout
+    return target_sharding_spec
 
 
 def _get_current_name(prefix: str, name: str) -> str:
     return f'{prefix}.{name}'.lstrip('.')
 
 
-def generate_layout_dict(model: nn.Module, device_mesh: DeviceMesh) -> dict:
-    layout_dict = {}
+def generate_sharding_spec_dict(model: nn.Module) -> dict:
+    sharding_spec_dict = {}
 
     @torch.no_grad()
     def generate_recursively(module: nn.Module, prefix: str = ''):
@@ -53,17 +49,17 @@ def generate_recursively(module: nn.Module, prefix: str = ''):
         # initialize tensors directly attached to the current module
         for name, param in module.named_parameters(recurse=False):
             if isinstance(param, LazyTensor):
-                layout = make_layout(device_mesh, param)
-                layout_dict[_get_current_name(prefix, name)] = layout
+                sharding_spec = make_sharding_spec(param)
+                sharding_spec_dict[_get_current_name(prefix, name)] = sharding_spec
 
         for name, buf in module.named_buffers(recurse=False):
             if isinstance(buf, LazyTensor):
-                layout = make_layout(device_mesh, buf)
-                layout_dict[_get_current_name(prefix, name)] = layout
+                sharding_spec = make_sharding_spec(buf)
+                sharding_spec_dict[_get_current_name(prefix, name)] = sharding_spec
 
     generate_recursively(model)
 
-    return layout_dict
+    return sharding_spec_dict
 
 
 @parameterize('subset', ['torchvision', 'diffusers', 'timm', 'transformers', 'torchaudio', 'deepfm', 'dlrm'])
@@ -85,9 +81,9 @@ def run_dist_lazy_init(subset, seed: int = 42):
         ctx = LazyInitContext()
         with ctx:
             deferred_model = model_fn()
-        layout_dict = generate_layout_dict(deferred_model, device_mesh)
-        ctx.distribute(deferred_model, layout_dict, verbose=True)
-        assert_dist_model_equal(model, deferred_model, layout_dict)
+        sharding_spec_dict = generate_sharding_spec_dict(deferred_model)
+        ctx.distribute(deferred_model, device_mesh, sharding_spec_dict, verbose=True)
+        assert_dist_model_equal(model, deferred_model, device_mesh, sharding_spec_dict)
 
 
 def run_dist(rank, world_size, port) -> None:
diff --git a/tests/test_tensor/test_dtensor/test_comm_spec.py b/tests/test_tensor/test_dtensor/test_comm_spec.py
index d1f5b9299397..0797e01e7e9d 100644
--- a/tests/test_tensor/test_dtensor/test_comm_spec.py
+++ b/tests/test_tensor/test_dtensor/test_comm_spec.py
@@ -125,23 +125,6 @@ def check_all_reduce_bwd(process_groups_dict, rank):
     assert tensor_to_comm.equal(tensor_to_check)
 
 
-def check_all_reduce_in_flatten_device_mesh(process_groups_dict, rank):
-    # tensor to comm
-    tensor_to_comm = torch.ones(2, 2).cuda() * rank
-
-    # reduce through logical process axis 0 at flatten device mesh
-    # tensor to check
-    # tensor([[6., 6.],
-    #         [6., 6.]])
-    tensor_to_check = torch.tensor([[6, 6], [6, 6]], dtype=tensor_to_comm.dtype).cuda()
-
-    # CommSpec:(comm_pattern:all_reduce, logical_process_axis:[0, 1])
-    comm_spec = CommSpec(CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD, process_groups_dict, logical_process_axis=0)
-    tensor_to_comm = comm_spec.covert_spec_to_action(tensor_to_comm)
-
-    assert tensor_to_comm.equal(tensor_to_check)
-
-
 def check_comm(rank, world_size, port):
     disable_existing_loggers()
     launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
@@ -153,24 +136,22 @@ def check_comm(rank, world_size, port):
     # [[0, 1,
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-    process_groups_dict = device_mesh.process_groups_dict
+
+    process_group_dict = device_mesh._process_group_dict[rank]
 
     # test all gather
-    check_all_gather(process_groups_dict, rank)
+    check_all_gather(process_group_dict, rank)
 
     # test shard
-    check_shard(process_groups_dict, rank)
+    check_shard(process_group_dict, rank)
 
     # test all to all
-    check_all_to_all(process_groups_dict, rank)
+    check_all_to_all(process_group_dict, rank)
 
     # test all reduce
-    check_all_reduce_fwd(process_groups_dict, rank)
-    check_all_reduce_bwd(process_groups_dict, rank)
+    check_all_reduce_fwd(process_group_dict, rank)
+    check_all_reduce_bwd(process_group_dict, rank)
 
-    flatten_process_groups_dict = device_mesh.flatten_device_mesh.process_groups_dict
-    # test all reduce in 1D flatten device mesh
-    check_all_reduce_in_flatten_device_mesh(flatten_process_groups_dict, rank)
     gpc.destroy()
 
 
diff --git a/tests/test_tensor/test_dtensor/test_dtensor.py b/tests/test_tensor/test_dtensor/test_dtensor.py
index 3ca369acbf87..50a3bfb15c38 100644
--- a/tests/test_tensor/test_dtensor/test_dtensor.py
+++ b/tests/test_tensor/test_dtensor/test_dtensor.py
@@ -31,13 +31,9 @@ def check_dtensor(rank, world_size, port):
 
     device_mesh = DeviceMesh(torch.Tensor([0, 1, 2, 3]), (2, 2), init_process_group=True)
     target_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict={0: [0]})
-    layout = Layout(device_mesh=device_mesh,
-                    device_type=torch.device('cuda'),
-                    sharding_spec=target_sharding_spec,
-                    entire_shape=original_tensor.shape)
-    d_tensor = DTensor(original_tensor, layout)
+    d_tensor = DTensor(original_tensor, device_mesh, target_sharding_spec)
 
-    assert d_tensor.entire_shape == original_tensor.shape
+    assert d_tensor.global_shape == original_tensor.shape
     assert d_tensor.data_type == original_tensor.dtype
 
     if rank in (0, 1):
@@ -57,12 +53,7 @@ def check_dtensor(rank, world_size, port):
         raise ValueError(f'rank {rank} is not in the device mesh')
 
     new_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict={0: [0, 1]})
-    new_layout = Layout(device_mesh=device_mesh,
-                        device_type=torch.device('cuda'),
-                        sharding_spec=new_sharding_spec,
-                        entire_shape=original_tensor.shape)
-
-    d_tensor.layout_convert(new_layout)
+    d_tensor.layout_convert(device_mesh, new_sharding_spec)
 
     if rank == 0:
         assert d_tensor.local_tensor.equal(original_tensor.narrow(0, 0, 1))
@@ -75,7 +66,7 @@ def check_dtensor(rank, world_size, port):
     else:
         raise ValueError(f'rank {rank} is not in the device mesh')
 
-    dtensor_from_local = distribute_tensor(original_tensor, new_layout)
+    dtensor_from_local = distribute_tensor(original_tensor, device_mesh, new_sharding_spec)
 
     if rank == 0:
         assert dtensor_from_local.local_tensor.equal(original_tensor.narrow(0, 0, 1))
diff --git a/tests/test_tensor/test_dtensor/test_layout_converter.py b/tests/test_tensor/test_dtensor/test_layout_converter.py
index 5f56decb5e5d..6608e4787273 100644
--- a/tests/test_tensor/test_dtensor/test_layout_converter.py
+++ b/tests/test_tensor/test_dtensor/test_layout_converter.py
@@ -12,9 +12,9 @@
 from colossalai.tensor.d_tensor.sharding_spec import DimSpec, ShardingSpec
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
-entire_shape = torch.Size((64, 32, 16))
+global_shape = torch.Size((64, 32, 16))
 layout_converter = LayoutConverter()
-physical_mesh_id = torch.arange(0, 4).reshape(2, 2)
+physical_mesh_id = torch.arange(0, 4)
 mesh_shape = (2, 2)
 
 
@@ -30,10 +30,7 @@ def check_one_step_transform(rank, world_size, port):
     #     shard_sequence: S0,S1,R
     #     device_mesh_shape: (2, 2)
     sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
-    layout = Layout(device_mesh=device_mesh,
-                    device_type=torch.device('cuda'),
-                    sharding_spec=sharding_spec,
-                    entire_shape=entire_shape)
+    layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=global_shape)
 
     rst_dict = layout_converter.all_gather_transform_layouts(layout)
 
@@ -49,10 +46,7 @@ def check_one_step_transform(rank, world_size, port):
     #     shard_sequence: S0,S1,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_all2all = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict_all2all)
-    layout_all2all = Layout(device_mesh=device_mesh,
-                            device_type=torch.device('cuda'),
-                            sharding_spec=sharding_spec_all2all,
-                            entire_shape=entire_shape)
+    layout_all2all = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_all2all, global_shape=global_shape)
 
     rst_dict_all2all = layout_converter.all_to_all_transform_layout(layout_all2all)
 
@@ -71,10 +65,7 @@ def check_one_step_transform(rank, world_size, port):
     #     shard_sequence: S0,R,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_shard = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_shard)
-    shard_layout = Layout(device_mesh=device_mesh,
-                          device_type=torch.device('cuda'),
-                          sharding_spec=sharding_spec_shard,
-                          entire_shape=entire_shape)
+    shard_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_shard, global_shape=global_shape)
 
     rst_dict_shard = layout_converter.shard_transform_layout(shard_layout)
 
@@ -100,19 +91,13 @@ def check_layout_converting(rank, world_size, port):
     #     shard_sequence: R,S01,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
-    source_layout = Layout(device_mesh=device_mesh,
-                           device_type=torch.device('cuda'),
-                           sharding_spec=sharding_spec_source,
-                           entire_shape=entire_shape)
+    source_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_source, global_shape=global_shape)
 
     # DistSpec:
     #     shard_sequence: S01,R,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
-    target_layout = Layout(device_mesh=device_mesh,
-                           device_type=torch.device('cuda'),
-                           sharding_spec=sharding_spec_target,
-                           entire_shape=entire_shape)
+    target_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_target, global_shape=global_shape)
 
     transform_path, comm_action_sequence = layout_converter.layout_converting(source_layout, target_layout)
 
@@ -159,21 +144,15 @@ def check_layout_converting_apply(rank, world_size, port):
     #     shard_sequence: R,S01,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
-    source_layout = Layout(device_mesh=device_mesh,
-                           device_type=torch.device('cuda'),
-                           sharding_spec=sharding_spec_source,
-                           entire_shape=entire_shape)
+    source_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_source, global_shape=global_shape)
 
     # DistSpec:
     #     shard_sequence: S01,R,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
-    target_layout = Layout(device_mesh=device_mesh,
-                           device_type=torch.device('cuda'),
-                           sharding_spec=sharding_spec_target,
-                           entire_shape=entire_shape)
+    target_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_target, global_shape=global_shape)
 
-    original_tensor = torch.rand(entire_shape).cuda()
+    original_tensor = torch.rand(global_shape).cuda()
 
     # tensor_to_apply: [R, S01, R]
     tensor_to_apply = original_tensor.narrow(1, rank * 8, 8)
diff --git a/tests/test_tensor/test_shape_consistency.py b/tests/test_tensor/test_shape_consistency.py
index 6fe9ee292cd0..859eef051256 100644
--- a/tests/test_tensor/test_shape_consistency.py
+++ b/tests/test_tensor/test_shape_consistency.py
@@ -1,9 +1,10 @@
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager, CollectiveCommPattern
 import torch
-from colossalai.tensor.sharding_spec import _DimSpec, ShardingSpec
+
 from colossalai.device.device_mesh import DeviceMesh
+from colossalai.tensor.shape_consistency import CollectiveCommPattern, ShapeConsistencyManager
+from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
 
-physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
+physical_mesh_id = torch.arange(0, 16)
 mesh_shape = (4, 4)
 # [[0, 1, 2, 3],
 #  [4, 5, 6, 7],
diff --git a/tests/test_tensor/test_sharded_linear.py b/tests/test_tensor/test_sharded_linear.py
index d66d4fec14d1..9bd9805e9b8f 100644
--- a/tests/test_tensor/test_sharded_linear.py
+++ b/tests/test_tensor/test_sharded_linear.py
@@ -26,7 +26,7 @@ def run_dist(rank, world_size, port):
     # the mesh is in the following topo
     # [[0, 1],
     #  [2, 3]]
-    physical_mesh_id = torch.arange(0, 4).reshape(2, 2)
+    physical_mesh_id = torch.arange(0, 4)
     mesh_shape = (2, 2)
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
     row_id = rank // 2
diff --git a/tests/test_tensor/test_sharding_spec.py b/tests/test_tensor/test_sharding_spec.py
index 909c84ef0f0e..5007c4141849 100644
--- a/tests/test_tensor/test_sharding_spec.py
+++ b/tests/test_tensor/test_sharding_spec.py
@@ -5,7 +5,7 @@
 
 
 def test_sharding_spec():
-    physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
+    physical_mesh_id = torch.arange(0, 16)
     mesh_shape = (4, 4)
     # [[0, 1, 2, 3],
     #  [4, 5, 6, 7],

From cf4792c9757e071217f0b99f4e2bcc85f2d048b7 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Thu, 8 Jun 2023 11:15:10 +0800
Subject: [PATCH 30/52] modify shell for check

---
 examples/images/dreambooth/test_ci.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index 8ba413a149b5..8d18e1d4a45c 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -8,7 +8,7 @@ DIFFUSERS_OFFLINE=1
 
 #  "torch_ddp" "torch_ddp_fp16"
 for plugin in "low_level_zero" "gemini"; do
-  torchrun --nproc_per_node 8 --standalone train_dreambooth_colossalai.py \
+  torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
   --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4"  \
   --instance_data_dir="/data/dreambooth/Teyvat/data" \
   --output_dir="./weight_output" \

From e417dd004ee166e6787c1c6325bfc037d3f8b83e Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <56809903+Fridge003@users.noreply.github.com>
Date: Thu, 8 Jun 2023 11:27:05 +0800
Subject: [PATCH 31/52] [example] update opt example using booster api (#3918)

---
 examples/language/opt/README.md           |  32 ++-
 examples/language/opt/args.py             | 120 +++++++++++
 examples/language/opt/benchmark.sh        |  21 --
 examples/language/opt/data.py             |  37 ++++
 examples/language/opt/opt_benchmark.py    | 146 ++++++++++++++
 examples/language/opt/opt_train_demo.py   | 149 ++++++++++++++
 examples/language/opt/requirements.txt    |   2 +
 examples/language/opt/run_benchmark.sh    |  30 +++
 examples/language/opt/run_demo.sh         |  44 ++++
 examples/language/opt/run_gemini.sh       |  28 ---
 examples/language/opt/test_ci.sh          |  19 +-
 examples/language/opt/train_gemini_opt.py | 233 ----------------------
 12 files changed, 571 insertions(+), 290 deletions(-)
 create mode 100644 examples/language/opt/args.py
 delete mode 100644 examples/language/opt/benchmark.sh
 create mode 100644 examples/language/opt/data.py
 create mode 100755 examples/language/opt/opt_benchmark.py
 create mode 100644 examples/language/opt/opt_train_demo.py
 create mode 100644 examples/language/opt/run_benchmark.sh
 create mode 100644 examples/language/opt/run_demo.sh
 delete mode 100644 examples/language/opt/run_gemini.sh
 delete mode 100755 examples/language/opt/train_gemini_opt.py

diff --git a/examples/language/opt/README.md b/examples/language/opt/README.md
index c2fd254571c7..37e1ff4d9008 100644
--- a/examples/language/opt/README.md
+++ b/examples/language/opt/README.md
@@ -19,15 +19,35 @@ Meta recently released [Open Pretrained Transformer (OPT)](https://github.com/fa
 
 The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates fine-tuning Casual Language Modelling at low cost.
 
-We are using the pre-training weights of the OPT model provided by Hugging Face Hub on the raw WikiText-2 (no tokens were replaced before
-the tokenization). This training script is adapted from the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling).
 
 ## Our Modifications
-We adapt the OPT training code to ColossalAI by leveraging Gemini and ZeRO DDP.
 
-## Quick Start
-You can launch training by using the following bash script
+We are using the pre-training weights of the OPT model provided by Hugging Face Hub on the raw WikiText-2 (no tokens were replaced before
+the tokenization). 
+
+We adapt the OPT training code to ColossalAI by leveraging the [Booster API](https://colossalai.org/docs/basics/booster_api) with a chosen plugin, where each plugin corresponds to a specific training strategy. This example supports TorchDDPPlugin, LowLevelZeroPlugin, and GeminiPlugin.
+
+## Run Demo
 
+By running the following script:
 ```bash
-bash ./run_gemini.sh
+bash run_demo.sh
 ```
+You will finetune a [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) model on this [dataset](https://huggingface.co/datasets/hugginglearners/netflix-shows), which contains more than 8000 descriptions of Netflix shows.
+
+You can modify the script to try a different set of hyperparameters or to switch to an OPT model of a different size.
+
+The demo code is adapted from this [blog](https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475) and the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling).
+
+
+
+## Run Benchmark
+
+You can benchmark the OPT model by running the following script:
+```bash
+bash run_benchmark.sh
+```
+The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your set of hyperparameters for testing.
+
+
+
diff --git a/examples/language/opt/args.py b/examples/language/opt/args.py
new file mode 100644
index 000000000000..16730be7ebea
--- /dev/null
+++ b/examples/language/opt/args.py
@@ -0,0 +1,120 @@
+from colossalai import get_default_parser
+
+
+def parse_demo_args():
+
+    parser = get_default_parser()
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="facebook/opt-350m",
+        help="Path to pretrained model or model identifier from huggingface.co/models."
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        default="./output_model.bin",
+        help="The path of your saved model after finetuning."
+    )
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'."
+    )
+    parser.add_argument(
+        "--num_epoch",
+        type=int,
+        default=10,
+        help="Number of epochs."
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=32,
+        help="Batch size (per dp group) for the training dataloader."
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use."
+    )
+    parser.add_argument(
+        "--warmup_ratio",
+        type=float,
+        default=0.1,
+        help="Ratio of warmup steps against total training steps."
+    )
+    parser.add_argument(
+        "--weight_decay", 
+        type=float, 
+        default=0.01, 
+        help="Weight decay to use."
+    )
+    parser.add_argument(
+        "--seed", 
+        type=int, 
+        default=42, 
+        help="A seed for reproducible training."
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+
+def parse_benchmark_args():
+
+    parser = get_default_parser()
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="facebook/opt-125m",
+        help="Path to pretrained model or model identifier from huggingface.co/models."
+    )
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'."
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=32,
+        help="Batch size (per dp group) for the training dataloader."
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use."
+    )
+    parser.add_argument(
+        "--weight_decay", 
+        type=float, 
+        default=0.0, 
+        help="Weight decay to use."
+    )
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=20,
+        help="Total number of training steps to perform."
+    )
+    parser.add_argument(
+        "--seed", 
+        type=int, 
+        default=42, 
+        help="A seed for reproducible training."
+    )
+    parser.add_argument(
+        "--mem_cap", 
+        type=int, 
+        default=0, 
+        help="Limit on the usage of space for each GPU (in GB)."
+    )
+    args = parser.parse_args()
+
+    return args
\ No newline at end of file
diff --git a/examples/language/opt/benchmark.sh b/examples/language/opt/benchmark.sh
deleted file mode 100644
index 0d04b5e9b33c..000000000000
--- a/examples/language/opt/benchmark.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-export BS=16
-export MEMCAP=0
-export MODEL="6.7b"
-export GPUNUM=1
-
-for MODEL in "6.7b" "13b" "1.3b"
-do
-for GPUNUM in 8 1
-do
-for BS in 16 24 32 8
-do
-for MEMCAP in 0 40
-do
-pkill -9 torchrun
-pkill -9 python
-
-env BS=$BS MEM_CAP=$MEMCAP MODEL=$MODEL GPUNUM=$GPUNUM bash ./run_gemini.sh
-done
-done
-done
-done
diff --git a/examples/language/opt/data.py b/examples/language/opt/data.py
new file mode 100644
index 000000000000..6cfffb5fc95b
--- /dev/null
+++ b/examples/language/opt/data.py
@@ -0,0 +1,37 @@
+import torch
+from torch.utils.data import Dataset
+from datasets import load_dataset
+
+
+class NetflixDataset(Dataset):
+    """Tokenized `description` column of the hugginglearners/netflix-shows train split, padded to one fixed length."""
+    def __init__(self, tokenizer):
+
+        super().__init__()
+
+        self.tokenizer = tokenizer
+        self.input_ids = []
+        self.attn_masks = []
+        self.labels = []    # NOTE: never filled in this class; netflix_collator builds labels from input_ids
+        self.txt_list = netflix_descriptions = load_dataset("hugginglearners/netflix-shows", split="train")['description']
+        self.max_length = max([len(self.tokenizer.encode(description)) for description in netflix_descriptions])    # token count of the longest description
+
+        for txt in self.txt_list:    # every sample is truncated/padded to self.max_length tokens
+            encodings_dict = self.tokenizer('</s>' + txt + '</s>',
+                                        truncation=True,
+                                        max_length=self.max_length, 
+                                        padding="max_length")
+            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
+            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
+
+    def __len__(self):
+        return len(self.input_ids)
+
+    def __getitem__(self, idx):
+        return self.input_ids[idx], self.attn_masks[idx]    # (input_ids, attention_mask) pair for sample idx
+    
+
+def netflix_collator(data):
+    return {'input_ids': torch.stack([x[0] for x in data]),       
+            'attention_mask': torch.stack([x[1] for x in data]),
+            'labels': torch.stack([x[0] for x in data])}
diff --git a/examples/language/opt/opt_benchmark.py b/examples/language/opt/opt_benchmark.py
new file mode 100755
index 000000000000..da2be4055fa3
--- /dev/null
+++ b/examples/language/opt/opt_benchmark.py
@@ -0,0 +1,146 @@
+import time
+
+import torch
+import transformers
+from transformers import AutoConfig, OPTForCausalLM
+from transformers.utils.versions import require_version
+import tqdm
+
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.tensor import ProcessGroup, ShardSpec
+from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+
+from args import parse_benchmark_args
+
+require_version("transformers>=4.20.0", "To fix: pip install -r requirements.txt")
+
+
+def format_num(num: int, bytes=False):
+    """Scale a number to a readable string, e.g. format_num(1253656, bytes=True) => '1.20 MB' (1024-based when bytes)."""
+    factor = 1024 if bytes else 1000
+    suffix = "B" if bytes else ""
+    for unit in ["", " K", " M", " G", " T", " P"]:    # NOTE: num >= factor**6 exhausts the units and returns None
+        if num < factor:
+            return f"{num:.2f}{unit}{suffix}"
+        num /= factor
+
+
+def get_data(batch_size, seq_len, vocab_size):
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+    attention_mask = torch.ones_like(input_ids)
+    return input_ids, attention_mask
+
+
+def colo_memory_cap(size_in_GB):
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    cuda_capacity = colo_device_memory_capacity(get_current_device())
+    if size_in_GB * (1024**3) < cuda_capacity:
+        colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
+        print(f"Limiting GPU memory usage to {size_in_GB} GB")
+
+
+def main():
+
+    args = parse_benchmark_args()
+
+    # Launch ColossalAI
+    colossalai.launch_from_torch(config={}, seed=args.seed)
+    coordinator = DistCoordinator()
+    world_size = coordinator.world_size
+
+    # Manage loggers
+    disable_existing_loggers()
+    logger = get_dist_logger()
+    if coordinator.is_master():
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+    
+    # Whether to set limit of memory capacity
+    if args.mem_cap > 0:
+        colo_memory_cap(args.mem_cap)
+    
+    # Build OPT model
+    # Initialize the model under ColoInitContext if using GeminiPlugin
+    config = AutoConfig.from_pretrained(args.model_name_or_path)
+    if args.plugin == 'gemini':
+        shard_pg = ProcessGroup(tp_degree=world_size)
+        default_dist_spec = ShardSpec([-1], [world_size])
+        with ColoInitContext(device='cpu',
+                            default_dist_spec=default_dist_spec,
+                            default_pg=shard_pg):
+            model = OPTForCausalLM(config)
+    else:
+        model = OPTForCausalLM(config)
+    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
+
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+
+    # Set plugin
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(device=get_current_device(),
+                        placement_policy='cpu',
+                        pin_memory=True,
+                        strict_ddp_mode=True,
+                        initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+    logger.info(f"Set plugin as {args.plugin}", ranks=[0])
+
+    # Set optimizer
+    optimizer = HybridAdam(model.parameters(), lr=args.learning_rate)
+
+    # Set booster
+    booster = Booster(plugin=plugin, **booster_kwargs)
+    model, optimizer, _, _, _ = booster.boost(model, optimizer)
+    
+    SEQ_LEN = 1024
+    VOCAB_SIZE = 50257
+
+    # Start training.
+    logger.info(f"Start testing", ranks=[0])
+    progress_bar = tqdm.tqdm(total=args.max_train_steps, desc="Training Step", disable=not coordinator.is_master())
+    
+    torch.cuda.synchronize()
+    model.train()
+    start_time = time.time()
+   
+    for _ in range(args.max_train_steps):
+
+        input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE)
+        optimizer.zero_grad()
+        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False)
+        loss = outputs['loss']
+        booster.backward(loss, optimizer)
+        optimizer.step()
+
+        torch.cuda.synchronize()
+        progress_bar.update(1)
+       
+    # Compute Statistics   
+    end_time = time.time()
+    throughput = "{:.4f}".format((world_size * args.max_train_steps * args.batch_size) / (end_time - start_time))
+    max_mem = format_num(torch.cuda.max_memory_allocated(device=torch.cuda.current_device()), bytes=True)
+    
+    logger.info(f"Testing finished, " 
+                f"batch size per gpu: {args.batch_size}, "
+                f"plugin: {args.plugin}, "
+                f"throughput: {throughput}, "
+                f"maximum memory usage per gpu: {max_mem}.",
+                ranks=[0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/language/opt/opt_train_demo.py b/examples/language/opt/opt_train_demo.py
new file mode 100644
index 000000000000..8a2ad5f55b10
--- /dev/null
+++ b/examples/language/opt/opt_train_demo.py
@@ -0,0 +1,149 @@
+import time
+
+import torch
+import datasets
+import transformers
+from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer
+from transformers import get_linear_schedule_with_warmup
+from transformers.utils.versions import require_version
+from tqdm import tqdm
+
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.tensor import ProcessGroup, ShardSpec
+from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+
+from args import parse_demo_args
+from data import NetflixDataset, netflix_collator
+
+require_version("datasets>=1.8.0", "To fix: pip install -r requirements.txt")
+require_version("transformers>=4.20.0", "To fix: pip install -r requirements.txt")
+
+
+def move_to_cuda(batch, device):
+    return {k: v.to(device) for k, v in batch.items()}
+
+
+def train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator):
+    """Run one training pass over `dataloader`; the progress bar is shown only where coordinator.is_master() is true."""
+    torch.cuda.synchronize()
+    model.train()
+
+    with tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()) as pbar:
+        
+        for batch in pbar:
+
+            # Forward
+            optimizer.zero_grad()
+            batch = move_to_cuda(batch, torch.cuda.current_device())
+            
+            outputs = model(use_cache=False, **batch)
+            loss = outputs['loss']
+
+            # Backward, then step the optimizer and the learning-rate scheduler once per batch
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            lr_scheduler.step()
+
+            # Show the current batch loss on the progress bar
+            pbar.set_postfix({'loss': loss.item()})
+
+
+def main():
+
+    args = parse_demo_args()
+
+    # Launch ColossalAI
+    colossalai.launch_from_torch(config={}, seed=args.seed)
+    coordinator = DistCoordinator()
+    world_size = coordinator.world_size
+
+    # Manage loggers
+    disable_existing_loggers()
+    logger = get_dist_logger()
+    if coordinator.is_master():
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+    
+    # Build OPT model
+    # Initialize the model under ColoInitContext if using GeminiPlugin
+    config = AutoConfig.from_pretrained(args.model_name_or_path)
+    if args.plugin == 'gemini':
+        shard_pg = ProcessGroup(tp_degree=world_size)
+        default_dist_spec = ShardSpec([-1], [world_size])
+        with ColoInitContext(device='cpu',
+                            default_dist_spec=default_dist_spec,
+                            default_pg=shard_pg):
+            model = OPTForCausalLM(config)
+    else:
+        model = OPTForCausalLM(config)
+    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
+
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+
+    # Set plugin
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(device=get_current_device(),
+                        placement_policy='cpu',
+                        pin_memory=True,
+                        strict_ddp_mode=True,
+                        initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+    logger.info(f"Set plugin as {args.plugin}", ranks=[0])
+
+    # Prepare tokenizer and dataloader
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)   
+    dataset = NetflixDataset(tokenizer)
+    dataloader = plugin.prepare_dataloader(dataset,
+                                           batch_size=args.batch_size,
+                                           shuffle=True,
+                                           drop_last=True,
+                                           collate_fn=netflix_collator)
+    
+    # Set optimizer
+    optimizer = HybridAdam(model.parameters(), lr=(args.learning_rate * world_size))
+
+    # Set lr scheduler
+    total_steps = len(dataloader) * args.num_epoch
+    num_warmup_steps = int(args.warmup_ratio * total_steps)
+    lr_scheduler = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=len(dataloader) * args.num_epoch
+    )
+
+    # Set booster
+    booster = Booster(plugin=plugin, **booster_kwargs)
+    model, optimizer, _, dataloader, lr_scheduler = booster.boost(model=model, 
+                                                                  optimizer=optimizer, 
+                                                                  dataloader=dataloader, 
+                                                                  lr_scheduler=lr_scheduler)
+
+    # Start finetuning
+    logger.info(f"Start finetuning", ranks=[0])
+    for epoch in range(args.num_epoch):
+        train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator)
+
+    # Finish training and evaluate
+    logger.info(f"Finish finetuning", ranks=[0])
+    booster.save_model(model, args.output_path)
+    logger.info(f"Saving model checkpoint to {args.output_path}", ranks=[0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/language/opt/requirements.txt b/examples/language/opt/requirements.txt
index 137a69e80498..4422216e6a1c 100644
--- a/examples/language/opt/requirements.txt
+++ b/examples/language/opt/requirements.txt
@@ -1,2 +1,4 @@
 colossalai >= 0.1.12
 torch >= 1.8.1
+datasets >= 1.8.0
+transformers >= 4.20.0
\ No newline at end of file
diff --git a/examples/language/opt/run_benchmark.sh b/examples/language/opt/run_benchmark.sh
new file mode 100644
index 000000000000..76c5e8601989
--- /dev/null
+++ b/examples/language/opt/run_benchmark.sh
@@ -0,0 +1,30 @@
+set -xe
+pip install -r requirements.txt
+
+export BS=32
+export MEMCAP=0
+export GPUNUM=1
+
+# acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`
+export MODEL="125m"
+
+for BS in 8 32 128
+do
+for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
+do
+for GPUNUM in 1 4
+do
+
+MODLE_PATH="facebook/opt-${MODEL}"
+torchrun \
+  --standalone \
+  --nproc_per_node ${GPUNUM} \
+  opt_benchmark.py \
+  --model_name_or_path ${MODLE_PATH} \
+  --mem_cap ${MEMCAP} \
+  --plugin ${PLUGIN} \
+  --batch_size ${BS}
+  
+done
+done
+done
diff --git a/examples/language/opt/run_demo.sh b/examples/language/opt/run_demo.sh
new file mode 100644
index 000000000000..0c9759c34039
--- /dev/null
+++ b/examples/language/opt/run_demo.sh
@@ -0,0 +1,44 @@
+set -xe
+pip install -r requirements.txt
+
+# model name or path
+MODEL="facebook/opt-350m"
+
+# path for saving model
+OUTPUT_PATH="./output_model.bin"
+
+# plugin(training strategy)
+# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"
+PLUGIN="gemini"
+
+# number of gpus to use
+GPUNUM=4
+
+# batch size per gpu
+BS=16
+
+# learning rate
+LR="5e-5"
+
+# number of epoch
+EPOCH=10
+
+# weight decay
+WEIGHT_DECAY=0.01
+
+# ratio of warmup steps
+WARMUP_RATIO=0.1
+
+# run the script for demo
+torchrun \
+  --standalone \
+  --nproc_per_node ${GPUNUM} \
+  opt_train_demo.py \
+  --model_name_or_path ${MODEL} \
+  --output_path ${OUTPUT_PATH} \
+  --plugin ${PLUGIN} \
+  --batch_size ${BS} \
+  --num_epoch ${EPOCH} \
+  --learning_rate ${LR} \
+  --weight_decay ${WEIGHT_DECAY} \
+  --warmup_ratio ${WARMUP_RATIO}
diff --git a/examples/language/opt/run_gemini.sh b/examples/language/opt/run_gemini.sh
deleted file mode 100644
index 73f231292a13..000000000000
--- a/examples/language/opt/run_gemini.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-set -x
-export BS=${BS:-16}
-export MEMCAP=${MEMCAP:-0}
-# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b`
-export MODEL=${MODEL:-"125m"}
-export GPUNUM=${GPUNUM:-1}
-export USE_SHARD_INIT=${USE_SHARD_INIT:-"false"}
-
-# make directory for logs
-mkdir -p ./logs
-
-if [ ${USE_SHARD_INIT} = "true" ]; then
-  USE_SHARD_INIT="--shardinit"
-else
-  USE_SHARD_INIT=""
-fi
-
-export MODLE_PATH="facebook/opt-${MODEL}"
-
-# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1
-torchrun \
-  --nproc_per_node ${GPUNUM} \
-  --master_port 19198 \
-  train_gemini_opt.py \
-  --mem_cap ${MEMCAP} \
-  --model_name_or_path ${MODLE_PATH} \
-  ${USE_SHARD_INIT} \
-  --batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log
diff --git a/examples/language/opt/test_ci.sh b/examples/language/opt/test_ci.sh
index 317f602cda3c..fa14f52b70d2 100644
--- a/examples/language/opt/test_ci.sh
+++ b/examples/language/opt/test_ci.sh
@@ -1,4 +1,19 @@
-for GPUNUM in 2 1
+set -xe
+pip install -r requirements.txt
+
+BS=4
+for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
 do
-env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh
+for GPUNUM in 1 4
+do
+
+torchrun \
+  --standalone \
+  --nproc_per_node ${GPUNUM} \
+  opt_benchmark.py \
+  --model_name_or_path "facebook/opt-125m" \
+  --plugin ${PLUGIN} \
+  --batch_size ${BS}
+
+done
 done
diff --git a/examples/language/opt/train_gemini_opt.py b/examples/language/opt/train_gemini_opt.py
deleted file mode 100755
index 3614b689de26..000000000000
--- a/examples/language/opt/train_gemini_opt.py
+++ /dev/null
@@ -1,233 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
-on a text file or a dataset without using HuggingFace Trainer.
-
-Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
-https://huggingface.co/models?filter=text-generation
-"""
-# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
-
-import time
-from functools import partial
-
-import datasets
-import torch
-import torch.distributed as dist
-import transformers
-from transformers import CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, OPTForCausalLM
-from transformers.utils.versions import require_version
-
-import colossalai
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.tensor import ProcessGroup, ShardSpec
-from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, GeminiDDP
-
-
-def get_data(batch_size, seq_len, vocab_size):
-    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
-    attention_mask = torch.ones_like(input_ids)
-    return input_ids, attention_mask
-
-
-require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
-
-MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-def get_time_stamp():
-    torch.cuda.synchronize()
-    return time.time()
-
-
-def get_tflops(model_numel, batch_size, seq_len, step_time):
-    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
-
-
-def parse_args():
-    parser = colossalai.get_default_parser()
-    parser.add_argument(
-        "--model_name_or_path",
-        type=str,
-        help="Path to pretrained model or model identifier from huggingface.co/models.",
-        required=True,
-    )
-    parser.add_argument(
-        "--config_name",
-        type=str,
-        default=None,
-        help="Pretrained config name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per dp group) for the training dataloader.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-5,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=20,
-        help="Total number of training steps to perform.",
-    )
-    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
-    parser.add_argument(
-        "--model_type",
-        type=str,
-        default=None,
-        help="Model type to use if training from scratch.",
-        choices=MODEL_TYPES,
-    )
-    parser.add_argument(
-        "--shardinit",
-        action="store_true",
-        help="Initialize the model with tensor parallel",
-    )
-    parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap")
-    parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu")
-    args = parser.parse_args()
-
-    return args
-
-
-def colo_memory_cap(size_in_GB):
-    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
-    cuda_capacity = colo_device_memory_capacity(get_current_device())
-    if size_in_GB * (1024**3) < cuda_capacity:
-        colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
-        print("Using {} GB of GPU memory".format(size_in_GB))
-
-
-def main():
-    args = parse_args()
-    disable_existing_loggers()
-    colossalai.launch_from_torch({})
-    logger = get_dist_logger()
-    is_main_process = dist.get_rank() == 0
-
-    if is_main_process:
-        datasets.utils.logging.set_verbosity_warning()
-        transformers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        transformers.utils.logging.set_verbosity_error()
-
-    if args.mem_cap > 0:
-        colo_memory_cap(args.mem_cap)
-
-    # If passed along, set the training seed now.
-    if args.seed is not None:
-        torch.mannul_seed(args.seed)
-        logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}")
-
-    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    # Load pretrained model
-    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-    if args.config_name:
-        config = AutoConfig.from_pretrained(args.config_name)
-    elif args.model_name_or_path:
-        config = AutoConfig.from_pretrained(args.model_name_or_path)
-    else:
-        config = CONFIG_MAPPING[args.model_type]()
-        logger.warning("You are instantiating a new config instance from scratch.")
-    logger.info("Model config has been created", ranks=[0])
-
-    if args.init_in_cpu:
-        init_dev = torch.device('cpu')
-    else:
-        init_dev = get_current_device()
-
-    # shard init parameters
-    if args.shardinit:
-        logger.info("Sharding initialization !", ranks=[0])
-    else:
-        logger.info("Skipping sharding initialization", ranks=[0])
-
-    world_size = torch.distributed.get_world_size()
-    shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
-    default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
-
-    # build model
-    if args.model_name_or_path is None:
-        logger.info("Train a new model from scratch", ranks=[0])
-        with ColoInitContext(device=init_dev,
-                             dtype=torch.half,
-                             default_dist_spec=default_dist_spec,
-                             default_pg=shard_pg):
-            model = OPTForCausalLM(config)
-    else:
-        logger.info("Finetune a pre-trained model", ranks=[0])
-        with ColoInitContext(device=init_dev,
-                             dtype=torch.half,
-                             default_dist_spec=default_dist_spec,
-                             default_pg=shard_pg):
-            model = OPTForCausalLM.from_pretrained(args.model_name_or_path,
-                                                   from_tf=bool(".ckpt" in args.model_name_or_path),
-                                                   config=config,
-                                                   local_files_only=False)
-
-    # enable gradient checkpointing
-    model.gradient_checkpointing_enable()
-
-    numel = sum([p.numel() for p in model.parameters()])
-    PLACEMENT_POLICY = 'cpu'
-    model = GeminiDDP(model,
-                      device=get_current_device(),
-                      placement_policy=PLACEMENT_POLICY,
-                      pin_memory=True,
-                      strict_ddp_mode=args.shardinit)
-    optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0)
-
-    SEQ_LEN = 1024
-    VOCAB_SIZE = 50257
-
-    get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
-
-    model.train()
-    for step in range(args.max_train_steps):
-        st_time = time.time()
-        input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE)
-
-        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False)
-        loss = outputs['loss']
-        optimizer.backward(loss)
-
-        optimizer.step()
-        optimizer.zero_grad()
-        torch.cuda.synchronize()
-        step_time = time.time() - st_time
-        step_tflops = get_tflops_func(step_time)
-
-        logger.info("step {} finished, Tflops {}".format(step, step_tflops), ranks=[0])
-
-    logger.info("Training finished", ranks=[0])
-
-
-if __name__ == "__main__":
-    main()

From 039854b39165ab7f2a4fa7ab3d67e47daa325d1c Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Thu, 8 Jun 2023 13:17:58 +0800
Subject: [PATCH 32/52] modify shell for check

---
 examples/images/dreambooth/test_ci.sh                     | 6 +++---
 examples/images/dreambooth/train_dreambooth_colossalai.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index 8d18e1d4a45c..35c81b325ff6 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -6,8 +6,8 @@ HF_DATASETS_OFFLINE=1
 TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
-#  "torch_ddp" "torch_ddp_fp16"
-for plugin in "low_level_zero" "gemini"; do
+#  "torch_ddp" "torch_ddp_fp16" "low_level_zero"
+for plugin in "gemini"; do
   torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
   --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4"  \
   --instance_data_dir="/data/dreambooth/Teyvat/data" \
@@ -20,5 +20,5 @@ for plugin in "low_level_zero" "gemini"; do
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
   --num_class_images=200 \
-  --placement="cuda"
+  --placement="cpu" # "cuda"
 done
diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index eae52b5ecd7e..44bde922629f 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -487,7 +487,7 @@ def main(args):
     if args.plugin.startswith('torch_ddp'):
         plugin = TorchDDPPlugin()
     elif args.plugin == 'gemini':
-        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5)
+        plugin = GeminiPlugin(placement_policy=args.placement, strict_ddp_mode=True, initial_scale=2 ** 5)
     elif args.plugin == 'low_level_zero':
         plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
 

From 49567d56d161dba7889496abd4e74e19ed8d1195 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Thu, 8 Jun 2023 13:36:05 +0800
Subject: [PATCH 33/52] modify shell for check

---
 examples/images/dreambooth/test_ci.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index 35c81b325ff6..0e3f6efa4b7c 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -20,5 +20,5 @@ for plugin in "gemini"; do
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
   --num_class_images=200 \
-  --placement="cpu" # "cuda"
+  --placement="auto" # "cuda"
 done

From 730a092ba2dd98464bd18789b7f78d2ec2d3a165 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Thu, 8 Jun 2023 13:38:18 +0800
Subject: [PATCH 34/52] modify shell for check

---
 examples/images/dreambooth/colossalai.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh
index 3b15ad887b0a..b2a544928760 100755
--- a/examples/images/dreambooth/colossalai.sh
+++ b/examples/images/dreambooth/colossalai.sh
@@ -14,4 +14,4 @@ torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
   --num_class_images=200 \
-  --placement="cuda" \
+  --placement="auto" \

From 407aa4846151b2cd87371b9052f5615cf02e0cee Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Thu, 8 Jun 2023 14:28:34 +0800
Subject: [PATCH 35/52] fix typo examples/community/roberta (#3925)

---
 examples/community/roberta/README.md                        | 2 +-
 examples/community/roberta/preprocessing/README.md          | 6 +++---
 examples/community/roberta/pretraining/README.md            | 2 +-
 examples/community/roberta/pretraining/arguments.py         | 6 +++---
 examples/community/roberta/pretraining/model/bert.py        | 2 +-
 examples/community/roberta/pretraining/run_pretraining.py   | 2 +-
 examples/community/roberta/pretraining/utils/exp_util.py    | 2 +-
 examples/community/roberta/pretraining/utils/global_vars.py | 2 +-
 8 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/examples/community/roberta/README.md b/examples/community/roberta/README.md
index 8aefa327a4b4..000fce63f35f 100644
--- a/examples/community/roberta/README.md
+++ b/examples/community/roberta/README.md
@@ -44,7 +44,7 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr
 
 ## 3. Finetune
 
-The checkpoint produced by this repo can replace `pytorch_model.bin` from  [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transfomers from Hugging Face to finetune downstream application.
+The checkpoint produced by this repo can replace `pytorch_model.bin` from  [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from Hugging Face to finetune downstream application.
 
 ## Contributors
 The example is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution!
diff --git a/examples/community/roberta/preprocessing/README.md b/examples/community/roberta/preprocessing/README.md
index 17cc2f4dc22c..2ed747541280 100644
--- a/examples/community/roberta/preprocessing/README.md
+++ b/examples/community/roberta/preprocessing/README.md
@@ -25,10 +25,10 @@ Firstly, each file has multiple documents, and each document contains multiple s
 In this example, split 200G Corpus into 100 shard, and each shard is about 2G. The size of the shard is memory-dependent, taking into account the number of servers, the memory used by the tokenizer, and the memory used by the multi-process training to read the shard (n data parallel requires n\*shard_size memory). **To sum up, data preprocessing and model pretraining requires fighting with hardware, not just GPU.**
 
 ```python
-python sentence_split.py --input_path /orginal_corpus --output_path /shard --shard 100
+python sentence_split.py --input_path /original_corpus --output_path /shard --shard 100
 # This step takes a short time
 ```
-* `--input_path`: all original corpus, e.g., /orginal_corpus/0.json /orginal_corpus/1.json ...
+* `--input_path`: all original corpus, e.g., /original_corpus/0.json /original_corpus/1.json ...
 * `--output_path`: all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--shard`: Number of shard, e.g., 10, 50, or 100
 
@@ -76,7 +76,7 @@ make
 
 * `--input_path`: location of all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--output_path`: location of all h5 with token_id, input_mask, segment_ids and masked_lm_positions, e.g., /h5/0.h5, /h5/1.h5 ...
-* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenzier.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
+* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenizer.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
 * `--backend`: python or c++, **specifies c++ can obtain faster preprocess speed**
 * `--dupe_factor`: specifies how many times the preprocessor repeats to create the input from the same article/document
 * `--worker`: number of process
diff --git a/examples/community/roberta/pretraining/README.md b/examples/community/roberta/pretraining/README.md
index c248fc1f5708..8abe48aa6c0e 100644
--- a/examples/community/roberta/pretraining/README.md
+++ b/examples/community/roberta/pretraining/README.md
@@ -13,7 +13,7 @@ bash run_pretrain.sh
 * `--bert_config`: config.json which represent model
 * `--mlm`: model type of backbone, bert or deberta_v2
 
-2. if resume training from earylier checkpoint, run the script below.
+2. if resume training from earlier checkpoint, run the script below.
 
 ```shell
 bash run_pretrain_resume.sh
diff --git a/examples/community/roberta/pretraining/arguments.py b/examples/community/roberta/pretraining/arguments.py
index 40210c4b1be7..e0702ceb59b0 100644
--- a/examples/community/roberta/pretraining/arguments.py
+++ b/examples/community/roberta/pretraining/arguments.py
@@ -46,7 +46,7 @@ def parse_args():
                         type=int,
                         default=1,
                         help="This param makes sure that a certain task is repeated for this time steps to \
-        optimise on the back propogation speed with APEX's DistributedDataParallel")
+        optimize on the back propagation speed with APEX's DistributedDataParallel")
     parser.add_argument("--max_predictions_per_seq",
                         "--max_pred",
                         default=80,
@@ -73,12 +73,12 @@ def parse_args():
                         help="location of saving checkpoint, which contains model and optimizer")
     parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
     parser.add_argument('--vscode_debug', action='store_true', help="use vscode to debug")
-    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoin")
+    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoint")
     parser.add_argument(
         '--load_optimizer_lr',
         default='',
         type=str,
-        help="location of checkpoint, which contains optimerzier, learning rate, epoch, shard and global_step")
+        help="location of checkpoint, which contains optimizer, learning rate, epoch, shard and global_step")
     parser.add_argument('--resume_train', action='store_true', help="whether resume training from a early checkpoint")
     parser.add_argument('--mlm', default='bert', type=str, help="model type, bert or deberta")
     parser.add_argument('--checkpoint_activations', action='store_true', help="whether to use gradient checkpointing")
diff --git a/examples/community/roberta/pretraining/model/bert.py b/examples/community/roberta/pretraining/model/bert.py
index a5da1bea6f65..abdf925d0540 100644
--- a/examples/community/roberta/pretraining/model/bert.py
+++ b/examples/community/roberta/pretraining/model/bert.py
@@ -327,7 +327,7 @@ def forward(
                 attention_scores = attention_scores + relative_position_scores
             elif self.position_embedding_type == "relative_key_query":
                 relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
-                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding)
                 attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
 
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
diff --git a/examples/community/roberta/pretraining/run_pretraining.py b/examples/community/roberta/pretraining/run_pretraining.py
index 9a6ffc1c5661..a72bdf775644 100644
--- a/examples/community/roberta/pretraining/run_pretraining.py
+++ b/examples/community/roberta/pretraining/run_pretraining.py
@@ -78,7 +78,7 @@ def main():
                              default_pg=shard_pg):
             config, model, numel = get_model(args, logger)
 
-        # asign running configurations
+        # assign running configurations
         gemini_config = None
         if args.distplan.startswith("CAI_ZeRO"):
             optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
diff --git a/examples/community/roberta/pretraining/utils/exp_util.py b/examples/community/roberta/pretraining/utils/exp_util.py
index 0cdb56bad031..4a2c9d8a47ad 100644
--- a/examples/community/roberta/pretraining/utils/exp_util.py
+++ b/examples/community/roberta/pretraining/utils/exp_util.py
@@ -97,7 +97,7 @@ def throughput_calculator(numel, args, config, iteration_time, total_iterations,
 def synchronize():
     if not torch.distributed.is_available():
         return
-    if not torch.distributed.is_intialized():
+    if not torch.distributed.is_initialized():
         return
     world_size = torch.distributed.get_world_size()
     if world_size == 1:
diff --git a/examples/community/roberta/pretraining/utils/global_vars.py b/examples/community/roberta/pretraining/utils/global_vars.py
index 7b0c5a2be73d..9eef19e71614 100644
--- a/examples/community/roberta/pretraining/utils/global_vars.py
+++ b/examples/community/roberta/pretraining/utils/global_vars.py
@@ -110,7 +110,7 @@ def write(self, names, writer, iteration, normalizer=1.0, reset=False):
         """Write timers to a tensorboard writer"""
         # currently when using add_scalars,
         # torch.utils.add_scalars makes each timer its own run, which
-        # polutes the runs list, so we just add each as a scalar
+        # pollutes the runs list, so we just add each as a scalar
         assert normalizer > 0.0
         for name in names:
             value = self.timers[name].elapsed(reset=reset) / normalizer

From 9b5e7ce21feb51977d11da4e6a0ed35f502dbfb5 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya <china6280111@126.com>
Date: Thu, 8 Jun 2023 14:56:56 +0800
Subject: [PATCH 36/52] modify shell for check

---
 examples/images/dreambooth/colossalai.sh                  | 1 +
 examples/images/dreambooth/test_ci.sh                     | 1 +
 examples/images/dreambooth/train_dreambooth_colossalai.py | 5 +++++
 3 files changed, 7 insertions(+)

diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh
index b2a544928760..db4562dbc921 100755
--- a/examples/images/dreambooth/colossalai.sh
+++ b/examples/images/dreambooth/colossalai.sh
@@ -14,4 +14,5 @@ torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
   --num_class_images=200 \
+  --test_run=True \
   --placement="auto" \
diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index 0e3f6efa4b7c..21f45adae2a0 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -19,6 +19,7 @@ for plugin in "gemini"; do
   --learning_rate=5e-6 \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
+  --test_run=True \
   --num_class_images=200 \
   --placement="auto" # "cuda"
 done
diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index 44bde922629f..888b28de8306 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -198,6 +198,7 @@ def parse_args(input_args=None):
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
     parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument("--test_run", default=False, help="Whether to use a smaller dataset for test run.")
     parser.add_argument(
         "--hub_model_id",
         type=str,
@@ -267,6 +268,7 @@ def __init__(
         class_prompt=None,
         size=512,
         center_crop=False,
+        test=False,
     ):
         self.size = size
         self.center_crop = center_crop
@@ -277,6 +279,8 @@ def __init__(
             raise ValueError("Instance images root doesn't exists.")
 
         self.instance_images_path = list(Path(instance_data_root).iterdir())
+        if test:
+            self.instance_images_path = self.instance_images_path[:10]
         self.num_instance_images = len(self.instance_images_path)
         self.instance_prompt = instance_prompt
         self._length = self.num_instance_images
@@ -509,6 +513,7 @@ def main(args):
         tokenizer=tokenizer,
         size=args.resolution,
         center_crop=args.center_crop,
+        test=args.test_run
     )
 
     def collate_fn(examples):

From 6a69b44dfcf0a3963b7e8d626b0250b3551861c5 Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Mon, 22 May 2023 15:02:17 +0800
Subject: [PATCH 37/52] [shardformer] init shardformer code structure (#3731)

* init shardformer code structure

* add implement of sharder (inject and replace)

* add implement of replace layer to colossal layer

* separate different layer policy, add some notion

* implement 1d and 2d slicer, can tell col or row

* fix bug when slicing and inject model

* fix some bug; add inference test example
---
 colossalai/shardformer/__init__.py            |   0
 colossalai/shardformer/model/__init__.py      |   0
 colossalai/shardformer/model/modeling_bert.py |  63 +++++
 colossalai/shardformer/policies/__init__.py   |   0
 colossalai/shardformer/policies/autopolicy.py |  41 +++
 colossalai/shardformer/policies/basepolicy.py | 182 ++++++++++++++
 colossalai/shardformer/policies/bert.py       | 168 +++++++++++++
 colossalai/shardformer/shard/__init__.py      |   0
 colossalai/shardformer/shard/shardconfig.py   |  18 ++
 colossalai/shardformer/shard/sharder.py       | 238 ++++++++++++++++++
 colossalai/shardformer/shard/shardmodel.py    |  58 +++++
 colossalai/shardformer/shard/slicer.py        | 167 ++++++++++++
 colossalai/shardformer/test/config.py         |   5 +
 colossalai/shardformer/test/test.py           |  37 +++
 colossalai/shardformer/utils/__init__.py      |   0
 colossalai/shardformer/utils/utils.py         |  56 +++++
 16 files changed, 1033 insertions(+)
 create mode 100644 colossalai/shardformer/__init__.py
 create mode 100644 colossalai/shardformer/model/__init__.py
 create mode 100644 colossalai/shardformer/model/modeling_bert.py
 create mode 100644 colossalai/shardformer/policies/__init__.py
 create mode 100644 colossalai/shardformer/policies/autopolicy.py
 create mode 100644 colossalai/shardformer/policies/basepolicy.py
 create mode 100644 colossalai/shardformer/policies/bert.py
 create mode 100644 colossalai/shardformer/shard/__init__.py
 create mode 100644 colossalai/shardformer/shard/shardconfig.py
 create mode 100644 colossalai/shardformer/shard/sharder.py
 create mode 100644 colossalai/shardformer/shard/shardmodel.py
 create mode 100644 colossalai/shardformer/shard/slicer.py
 create mode 100644 colossalai/shardformer/test/config.py
 create mode 100644 colossalai/shardformer/test/test.py
 create mode 100644 colossalai/shardformer/utils/__init__.py
 create mode 100644 colossalai/shardformer/utils/utils.py

diff --git a/colossalai/shardformer/__init__.py b/colossalai/shardformer/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/shardformer/model/__init__.py b/colossalai/shardformer/model/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/shardformer/model/modeling_bert.py b/colossalai/shardformer/model/modeling_bert.py
new file mode 100644
index 000000000000..87ed8ac308a5
--- /dev/null
+++ b/colossalai/shardformer/model/modeling_bert.py
@@ -0,0 +1,63 @@
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+from typing import Any, Dict, List, Type
+
+
+from transformers import BertForMaskedLM
+from transformers.models.bert.modeling_bert import MaskedLMOutput
+class BertForMaskedLM_(BertForMaskedLM):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        **kwargs,
+    ):
+        print("[Inject OK] Injected forward method")
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+
+        # if input_ids is not None:
+        #     masked_lm_loss = applyDistCrossEntropy(prediction_scores, input_ids, self.config.vocab_size) 
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
\ No newline at end of file
diff --git a/colossalai/shardformer/policies/__init__.py b/colossalai/shardformer/policies/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
new file mode 100644
index 000000000000..9142e0dae22e
--- /dev/null
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -0,0 +1,41 @@
+import torch.nn as nn
+
+def build_policies():
+    """
+    Map each supported huggingface model class to its shard policy class.
+
+    The imports are done lazily inside the function body.
+
+    Return:
+        The dict whose keys are model classes and whose values are policy classes
+    """
+    from transformers.models.bert.modeling_bert import BertForMaskedLM
+    from .bert import BertForMaskedLMPolicy
+    from transformers.models.bert.modeling_bert import BertForSequenceClassification
+    from .bert import BertForSequenceClassificationPolicy
+
+    return {
+        BertForMaskedLM: BertForMaskedLMPolicy,
+        BertForSequenceClassification: BertForSequenceClassificationPolicy,
+    }
+
+def get_autopolicy(model:nn.Module):
+    """
+    Look up the predefined policy class registered for the class of ``model``
+
+    Args:
+        model: The model object whose exact class is used as the lookup key
+
+    Return:
+        The policy class; NotImplementedError is raised if the class is not registered
+    """
+    registry = build_policies()
+    model_cls = model.__class__
+    if registry.get(model_cls) is not None:
+        return registry[model_cls]
+    raise NotImplementedError(f"Auto policy for {model_cls.__qualname__} is not implemented\n Supported models are {[i.__qualname__ for i in registry.keys()]}")
+
+# from transformers.models.bert.modeling_bert import BertForMaskedLM, BertForPreTraining
+# model = BertForPreTraining
+# policy = get_autopolicy(model)
+# print(policy)
diff --git a/colossalai/shardformer/policies/basepolicy.py b/colossalai/shardformer/policies/basepolicy.py
new file mode 100644
index 000000000000..d444aeb53bf8
--- /dev/null
+++ b/colossalai/shardformer/policies/basepolicy.py
@@ -0,0 +1,182 @@
+# part of code modified from https://github.com/tunib-ai/parallelformers
+
+import torch
+import torch.nn as nn
+import colossalai.nn as col_nn
+from typing import Any, Dict, List, Type, Tuple, Callable
+from transformers import AutoConfig
+from dataclasses import dataclass, field
+
+@dataclass
+class Argument:
+    attr_dict : Dict[str, Any]    # dotted attribute path -> value the sharder sets on each matched layer
+    param_funcs : List[Callable]    # functions returning the Layer descriptions used to shard the parameters
+    binding_layers : List[nn.Module] = field(default_factory=list)    # layer classes recorded in the sharder's binding_map with the sliced weight/bias
+
+@dataclass
+class Layer:
+    """
+    The layer object for the policy
+
+    Args:
+        weight: The dotted attribute path of the weight, relative to the layer being sharded
+        bias: The dotted attribute path of the bias, relative to the layer being sharded
+        replace_layer: The layer class used to replace the original layer, None keeps the original object
+        ignore: Whether to ignore this layer if it is not in the model
+    """
+    weight: str = None
+    bias: str = None
+    replace_layer: Any = None
+    ignore: bool = False
+
+
+@dataclass
+class Col_Layer(Layer):
+    """
+    Class for col shard layer in MegatronLM
+    """
+    gather_output: bool = False    # forwarded by the sharder to a Linear1D_Col replacement layer
+
+
+@dataclass
+class Row_Layer(Layer):
+    """
+    Class for row shard layer in MegatronLM
+    """
+    pass
+
+
+class Policy():
+    """
+    The base class for all the policies
+    For each different model, it should have a different policy class, like BertPolicy for Bert Model
+    or OPTPolicy for OPT model.
+    AutoPolicy:
+        shardformer already defined some policies for huggingface model, just set custom_policy = None
+        to use the auto policy. In shardformer autopolicy, we define a base policy for one type model,
+        like BertPolicy, and for each different Bert model in huggingface like, BertForMaskedLM,
+        BertForSequenceClassification, etc., for each different Bert model we define different policy class
+        and overwrite the method inject_policy
+
+    CustomPolicy:
+    """
+    @staticmethod
+    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module,Argument]:
+        """
+        Return a dict, the key is layer will be modified and the value is the Argument class with param setting and param functions
+
+        Args:
+            model_config: The config of transformer model
+            shard_config: The shard setting, ModelSharder passes shard_config.world_size here
+
+        Return:
+            Dict for the modify policy,
+            {
+                origin layer class1 (nn.Module): Argument(
+                    attr_dict = {
+                        argument1: value1,
+                        argument2: value2,
+                        ...
+                    },
+                    param_funcs = [
+                        staticmethod1,
+                        staticmethod2,
+                        ...
+                    ]
+                ),
+                origin layer class2 (nn.Module): Argument(
+                    attr_dict = {
+                        argument1: value1,
+                        argument2: value2,
+                        ...
+                    },
+                    param_funcs = [
+                        staticmethod1,
+                        staticmethod2,
+                        ...
+                    ]
+                ),
+                ...
+            }
+
+        """
+        raise NotImplementedError
+
+
+    @staticmethod
+    def inject_policy() -> Tuple[nn.Module, nn.Module]:
+        """
+        Return the pair of classes for the inject model
+
+        Return:
+            A tuple (original model class, new shard model class), the empty tuple means no injection
+        """
+        return ()
+
+
+    @staticmethod
+    def attn_in() -> List:
+        """
+        Attention qkv layer
+
+        Returns:
+            List[Layer]: List of layer object, each one describes a layer to shard
+        """
+        raise NotImplementedError
+
+
+    @staticmethod
+    def attn_out() -> List:
+        """
+        Attention output projection layer
+
+        Returns:
+            List[Layer]: List of layer object
+        """
+        raise NotImplementedError
+
+
+    @staticmethod
+    def mlp_in() -> List:
+        """
+        h -> 4h mlp layer
+
+        Returns:
+            List[Layer]: List of layer object
+        """
+        raise NotImplementedError
+
+
+    @staticmethod
+    def mlp_out() -> List:
+        """
+        4h -> h mlp layer
+
+        Returns:
+            List[Layer]: List of layer object
+        """
+        raise NotImplementedError
+
+
+    @staticmethod
+    def embedding()->List:
+        """
+        Partially slice the embedding layer
+        vocab_size->vocab_size//gpu_nums
+
+        Return:
+            List[Layer]: List of layer object
+        """
+        raise NotImplementedError
+
+
+    @staticmethod
+    def unembedding()->List:
+        """
+        Partially slice the output projection (unembedding) layer
+        vocab_size->vocab_size//gpu_nums
+
+        Return:
+            List[Layer]: List of layer object
+        """
+        raise NotImplementedError
diff --git a/colossalai/shardformer/policies/bert.py b/colossalai/shardformer/policies/bert.py
new file mode 100644
index 000000000000..24b95e827347
--- /dev/null
+++ b/colossalai/shardformer/policies/bert.py
@@ -0,0 +1,168 @@
+from typing import Dict, List, Tuple, Type, Any, Callable
+import torch.nn as nn
+from .basepolicy import Policy, Layer, Argument, Col_Layer, Row_Layer
+import colossalai.nn as col_nn
+from transformers.models.bert.modeling_bert import BertLayer, BertEmbeddings, BertLMPredictionHead
+from dataclasses import dataclass
+
+
+class BertPolicy(Policy):
+    @staticmethod
+    def argument_policy(config, world_size: int) -> Dict[nn.Module,Argument]:
+        return {
+            BertLayer: Argument(
+                attr_dict = {
+                    # 1. shard hidden size: each rank keeps hidden_size // world_size
+                    "attention.self.all_head_size": config.hidden_size // world_size,
+                    "crossattention.self.all_head_size": config.hidden_size // world_size,
+                    # 2. shard number of heads
+                    "attention.self.num_attention_heads": config.num_attention_heads // world_size,
+                    "crossattention.self.num_attention_heads": config.num_attention_heads // world_size,
+
+                },
+                param_funcs = [
+                    BertPolicy.attn_in,
+                    BertPolicy.attn_out,
+                    BertPolicy.mlp_in,
+                    BertPolicy.mlp_out
+                ]
+            ),
+            BertEmbeddings: Argument(
+                attr_dict = {
+                    # 1. shard vocab size
+                    # "word_embeddings.num_embeddings": config.vocab_size // world_size,
+                    # 2. size of one slice of the embedding (ceil division), the last slice may be smaller
+                    "word_embeddings.dim_size": (config.vocab_size+world_size-1) // world_size,
+                },
+                param_funcs = [
+                    BertPolicy.embedding,
+                ],
+                binding_layers = [
+                    BertLMPredictionHead,
+                ]
+            ),
+            BertLMPredictionHead: Argument(
+                attr_dict = {
+                    # no attribute of the prediction head is overwritten,
+                    # only the decoder weight and bias listed in
+                    # BertPolicy.unembedding are sliced
+                },
+                param_funcs = [
+                    BertPolicy.unembedding,
+                ]
+            )
+        }
+
+    @staticmethod
+    def attn_in() -> List:  # q/k/v projections; the crossattention entries are skipped when absent (ignore=True)
+        return [
+            Col_Layer(
+                weight="attention.self.query.weight",
+                bias="attention.self.query.bias",
+                replace_layer=col_nn.Linear1D_Col,
+            ),
+            Col_Layer(
+                weight="attention.self.key.weight",
+                bias="attention.self.key.bias",
+                replace_layer=col_nn.Linear1D_Col,
+            ),
+            Col_Layer(
+                weight="attention.self.value.weight",
+                bias="attention.self.value.bias",
+                replace_layer=col_nn.Linear1D_Col,
+            ),
+            Col_Layer(
+                weight="crossattention.self.query.weight",
+                bias="crossattention.self.query.bias",
+                replace_layer=col_nn.Linear1D_Col,
+                ignore=True,
+            ),
+            Col_Layer(
+                weight="crossattention.self.key.weight",
+                bias="crossattention.self.key.bias",
+                replace_layer=col_nn.Linear1D_Col,
+                ignore=True,
+            ),
+            Col_Layer(
+                weight="crossattention.self.value.weight",
+                bias="crossattention.self.value.bias",
+                replace_layer=col_nn.Linear1D_Col,
+                ignore=True,
+            ),
+
+        ]
+
+    @staticmethod
+    def attn_out() -> List:  # attention output projections, described as Row_Layer
+        return [
+            Row_Layer(
+                weight="attention.output.dense.weight",
+                bias="attention.output.dense.bias",
+                replace_layer=col_nn.Linear1D_Row,
+            ),
+            Row_Layer(
+                weight="crossattention.output.dense.weight",
+                bias="crossattention.output.dense.bias",
+                replace_layer=col_nn.Linear1D_Row,
+                ignore=True,
+            ),
+        ]
+
+    @staticmethod
+    def mlp_in() -> List:  # intermediate dense layer, described as Col_Layer
+        return [
+             Col_Layer(
+                weight="intermediate.dense.weight",
+                bias="intermediate.dense.bias",
+                replace_layer=col_nn.Linear1D_Col,
+            ),
+        ]
+
+    @staticmethod
+    def mlp_out() -> List:  # output dense layer, described as Row_Layer
+        return [
+            Row_Layer(
+                weight="output.dense.weight",
+                bias="output.dense.bias",
+                replace_layer=col_nn.Linear1D_Row,
+            ),
+        ]
+
+    @staticmethod
+    def embedding() -> List:  # word embedding weight only, no bias is listed
+        return [
+            Col_Layer(
+                weight="word_embeddings.weight",
+                replace_layer=col_nn.VocabParallelEmbedding1D,
+            )
+        ]
+
+    @staticmethod
+    def unembedding() -> List:  # decoder of the LM prediction head, created with gather_output=True
+        return [
+            Col_Layer(
+                weight="decoder.weight",
+                bias="decoder.bias",
+                replace_layer=col_nn.Linear1D_Col,
+                gather_output=True,
+            )
+        ]
+
+from transformers import BertForMaskedLM
+from colossalai.shardformer.model.modeling_bert import BertForMaskedLM_
+class BertForMaskedLMPolicy(BertPolicy):
+    """BertPolicy plus the (original class, shard class) pair used for model injection"""
+    @staticmethod
+    def inject_policy() -> Tuple[nn.Module, nn.Module]:
+        return BertForMaskedLM, BertForMaskedLM_
+
+    
+class BertForSequenceClassificationPolicy(BertPolicy):
+    @staticmethod
+    def inject_policy() -> Tuple[nn.Module, nn.Module]:  # same contract as Policy.inject_policy: empty tuple, nothing to inject
+        return ()
+
+
+# model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+# _ = BertForMaskedLMPolicy(model)
+# print(isinstance(model,list(_.inject_policy().keys())[0]))
\ No newline at end of file
diff --git a/colossalai/shardformer/shard/__init__.py b/colossalai/shardformer/shard/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/shardformer/shard/shardconfig.py b/colossalai/shardformer/shard/shardconfig.py
new file mode 100644
index 000000000000..be265ff0c8c1
--- /dev/null
+++ b/colossalai/shardformer/shard/shardconfig.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class ShardConfig:
+    """
+    The config for sharding the huggingface model for test
+    """
+    rank: int
+    fp16: bool = True
+    num_gpus: int = 2
+    world_size: int = 2
+    verbose: str = 'simple'
+    seed: int = None
+    require_grad: bool = False
+    master_addr: str = "127.0.0.1"
+    master_port: int = 29500
+    backend: str = "nccl"    # annotated so it is a real dataclass field; kept last to preserve the positional order
\ No newline at end of file
diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py
new file mode 100644
index 000000000000..ef785cfee9da
--- /dev/null
+++ b/colossalai/shardformer/shard/sharder.py
@@ -0,0 +1,238 @@
+import torch
+import torch.nn as nn
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union, Callable
+from .shardconfig import ShardConfig
+from dataclasses import dataclass
+from ..policies.basepolicy import Policy, Layer
+from ..policies.autopolicy import get_autopolicy
+from .slicer import Slicer
+from ..utils.utils import hasattr_, setattr_, getattr_
+import colossalai.nn as col_nn
+from colossalai.logging import get_dist_logger
+import os
+
+
+logger = get_dist_logger()
+
+class ModelSharder(object):
+    """
+    Shard the original huggingface model according to the policy
+
+    Args:
+        policy: The policy to shard the model
+        model: The model to shard
+        dist_setting: The setting of distributed model
+    """
+    def __init__(
+            self,
+            model: nn.Module,
+            policy: Policy,
+            shard_config: ShardConfig = None, # TODO
+        ) -> None:
+        self.model = model
+        self.policy = policy if policy is not None else get_autopolicy(model)  # auto policy when none is given
+        self.model_config = model.config
+        self.shard_config = shard_config
+        self.slicer = Slicer(shard_config)
+        self.binding_map = {}
+
+
+    def shard(self) -> None:  # entry point: inject the model class first, then replace the layers
+        self.inject_model(self.model)
+        self.replace_layer(self.model)
+  
+        
+    def inject_model(
+            self,
+            model: nn.Module,
+        ) -> None:
+        """
+        Replace the model to policy defined model
+        Mainly modify the forward and backward to fit distributed model
+        Nothing is injected when the policy returns an empty inject_policy
+
+        e.g.
+            BertForMaskedLM.forward -> BertForMaskedLM_.forward
+
+        Raises:
+            NotImplementedError: if the class of ``model`` is not the one named by the policy
+        """
+        inject_policy = self.policy.inject_policy()
+        # Policy.inject_policy returns () by default: indexing it would fail, so skip the injection
+        if not inject_policy:
+            return
+        org_model_cls, shard_model_cls = inject_policy[0], inject_policy[1]
+        if model.__class__ != org_model_cls:
+            raise NotImplementedError(f"{model.__class__} is not implemented so far")
+        # copy every attribute defined on the shard class that the original class also has
+        for key in shard_model_cls.__dict__.keys():
+            if hasattr(model.__class__, key):
+                setattr(model.__class__, key, getattr(shard_model_cls, key))
+
+
+    def replace_layer(
+            self,
+            model: nn.Module,
+        ) -> None:
+        """
+        Walk through the argument policy and shard every layer class it lists, one by one
+
+        Args:
+            model: The model whose layers are sharded in place
+        """
+        policy_dict = self.policy.argument_policy(self.model_config, self.shard_config.world_size)
+        for origin_layer_cls, argument in policy_dict.items():
+            self.reverse_replace_layer(
+                model,
+                origin_layer_cls,
+                argument.attr_dict,
+                argument.param_funcs,
+                argument.binding_layers,
+            )
+
+
+    def reverse_replace_layer(
+            self,
+            layer: nn.Module,
+            origin_cls: nn.Module,
+            attr_dict: Dict[str, Any],
+            param_funcs: List[Callable],
+            binding_layers: List[nn.Module]
+        ) -> None:
+        """
+        Recursively visit the children of ``layer`` and shard those whose class is ``origin_cls``
+
+        Args:
+            layer: The object of layer to search in
+            origin_cls: The origin layer class to look for
+            attr_dict: The attributes to overwrite on each matched child
+            param_funcs: The functions describing how to shard the parameters of a matched child
+            binding_layers: The layer classes bound to the matched child
+
+        Return:
+            The ``layer`` object that was passed in
+        """
+        for _, sub_layer in layer.named_children():
+            if sub_layer.__class__ != origin_cls:
+                self.reverse_replace_layer(sub_layer, origin_cls, attr_dict, param_funcs, binding_layers)
+            else:
+                for attr_name, attr_value in attr_dict.items():
+                    setattr_(sub_layer, attr_name, attr_value, ignore=True)
+                self.shard_one_layer(sub_layer, param_funcs, binding_layers)
+        return layer
+
+
+    def shard_one_layer(
+            self,
+            org_layer: nn.Module,
+            param_funcs: List[Callable],
+            binding_layers: List[nn.Module]
+        ) -> None:
+        """
+        Shard one layer according to the policy, the layer should be the same class as the key in policy's argument_policy return dict
+
+        Args:
+            org_layer: The origin layer object to shard
+            param_funcs: The function list to get shard information in policy class
+            binding_layers: The layer classes recorded in binding_map with the sliced weight and bias
+
+        Raises:
+            ValueError: if a weight or bias named by the policy is missing and ignore is False
+            NotImplementedError: if the layer to replace or the replace class is not supported
+        """
+        for func in param_funcs:
+            for policy_layer in func():
+                weight, bias = None, None
+                weight_attr = policy_layer.weight
+                bias_attr = policy_layer.bias
+                replace_layer_cls = policy_layer.replace_layer
+                ignore = policy_layer.ignore
+                # only Col_Layer carries gather_output, default to False so the name is always bound
+                gather_output = getattr(policy_layer, "gather_output", False)
+
+                if weight_attr is not None:
+                    if hasattr_(org_layer, weight_attr):
+                        weight = getattr_(org_layer, weight_attr)
+                    elif not ignore:
+                        raise ValueError(f"Layer {org_layer.__class__.__qualname__} has no attribute {weight_attr}")
+
+                if bias_attr is not None:
+                    if hasattr_(org_layer, bias_attr):
+                        bias = getattr_(org_layer, bias_attr)
+                    elif not ignore:
+                        raise ValueError(f"Layer {org_layer.__class__.__qualname__} has no attribute {bias_attr}")
+
+                # dont have the attribute in policy, and ignore is true
+                if weight is None and bias is None and ignore:
+                    continue
+
+                assert weight is not None or bias is not None
+                param_attr = weight_attr or bias_attr
+                layer_attr = param_attr[:param_attr.rfind(".")]
+                # slice weight and bias
+                weight, bias = self.slicer.slice_weight_bias(weight, bias, policy_layer.__class__)
+                # save the binding information
+                for binding_layer in binding_layers:
+                    self.binding_map[binding_layer] = dict(weight=weight, bias=bias)
+
+                # do not replace the layer object, just replace the weight and bias
+                if replace_layer_cls is None:
+                    self.set_param(org_layer, layer_attr, weight, bias)
+                    continue
+
+                # create new object to replace the origin layer
+                target = getattr_(org_layer, layer_attr)
+                if isinstance(target, nn.Linear):
+                    if replace_layer_cls.__name__ == "Linear1D_Row":
+                        replace_layer = replace_layer_cls(weight.shape[1], weight.shape[0], bias=bias is not None)
+                    elif replace_layer_cls.__name__ == "Linear1D_Col":
+                        replace_layer = replace_layer_cls(weight.shape[0], weight.shape[1], bias=bias is not None, gather_output=gather_output)
+                    else:
+                        raise NotImplementedError(f"Replacing {target.__class__} with {replace_layer_cls} is not implemented so far")
+                elif isinstance(target, nn.Embedding):
+                    replace_layer = replace_layer_cls(weight.shape[0], weight.shape[1], getattr_(org_layer, f"{layer_attr}.padding_idx", ignore=True))
+                else:
+                    raise NotImplementedError(f"Replacing {target.__class__} is not implemented so far")
+                setattr_(org_layer, layer_attr, replace_layer, ignore=ignore)
+                # set_param takes layer_attr as its second positional parameter, so pass the tensors by keyword
+                self.set_param(replace_layer, weight=weight, bias=bias)
+
+
+    def set_param(
+            self,
+            layer: Any,
+            layer_attr: str = "",
+            weight: torch.Tensor = None,
+            bias: torch.Tensor = None
+        ) -> None:
+        """
+        Wrap the given tensors in nn.Parameter and assign them to the layer object
+
+        Args:
+            layer: The layer object
+            layer_attr: The dotted attribute path of the sub layer, "" means the layer object itself
+            weight: The new weight, skipped when None
+            bias: The new bias, skipped when None
+        """
+        assert not (weight is None and bias is None)
+        if weight is not None:
+            setattr_(layer, layer_attr+".weight" if layer_attr != "" else "weight", nn.Parameter(weight))
+            self.set_layer_size(layer, layer_attr, weight.shape)
+        if bias is not None:
+            setattr_(layer, layer_attr+".bias" if layer_attr != "" else "bias", nn.Parameter(bias))
+
+
+    def set_layer_size(self, layer: nn.Module, layer_attr: str, size: torch.Size) -> None:
+        """
+        Update out_features / in_features of the sub layer from the shape of its new weight
+
+        Args:
+            layer: The layer object
+            layer_attr: The dotted attribute path of the sub layer
+            size: The shape of the weight, size[0] is out_features and size[1] is in_features
+        """
+        for axis, feature in ((0, "out_features"), (1, "in_features")):
+            target = f"{layer_attr}.{feature}"
+            # only touch the attribute when the sub layer already defines it
+            if hasattr_(layer, target):
+                setattr_(layer, target, size[axis])
diff --git a/colossalai/shardformer/shard/shardmodel.py b/colossalai/shardformer/shard/shardmodel.py
new file mode 100644
index 000000000000..54d7b5ba02d9
--- /dev/null
+++ b/colossalai/shardformer/shard/shardmodel.py
@@ -0,0 +1,58 @@
+import os
+import torch
+import torch.nn as nn
+import transformers
+import torch.distributed as dist
+from dataclasses import dataclass
+from contextlib import suppress
+
+from colossalai.tensor.d_tensor.layout import Layout
+from ..policies.basepolicy import Policy
+from .sharder import ModelSharder
+from .shardconfig import ShardConfig
+
+
+class ShardModel(object):
+    """
+    The class for sharding the huggingface model, self.model is the sharded model
+    Just create a new ShardModel object to shard huggingface model
+
+    Args:
+        model: the origin huggingface model
+        shard_config: the config for distribute information
+        custom_policy: the custom policy for sharding
+    """
+    def __init__(
+            self,
+            model: nn.Module,
+            shard_config: ShardConfig = None, # TODO
+            custom_policy: Policy = None,
+        ) -> None:
+        self.model = model
+        self.shard_config = shard_config
+        self.policy = custom_policy
+        # self.layout=,  # TODO
+
+        sharder = ModelSharder(model=self.model, policy=self.policy, shard_config=self.shard_config)
+        sharder.shard()
+
+    def set_environ(self) -> None:
+        """
+        Export the distributed settings of shard_config to the environment and init the process group
+        """
+        config = self.shard_config  # __init__ stores the config under this name, there is no self.dist_config
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
+        os.environ["MKL_SERVICE_FORCE_INTEL"] = "GNU"
+        os.environ["MASTER_ADDR"] = str(config.master_addr)
+        os.environ["MASTER_PORT"] = str(config.master_port)
+        os.environ["WORLD_SIZE"] = str(config.num_gpus)
+        os.environ["RANK"] = str(config.rank)
+        os.environ["LOCAL_RANK"] = str(config.rank)
+        if not dist.is_initialized():
+            dist.init_process_group(backend=config.backend)
+
+        torch.cuda.set_device(int(os.getenv("LOCAL_RANK", "0")))
+
+    @staticmethod
+    def back_to_org() -> None:
+        pass
\ No newline at end of file
diff --git a/colossalai/shardformer/shard/slicer.py b/colossalai/shardformer/shard/slicer.py
new file mode 100644
index 000000000000..1849cdc99c72
--- /dev/null
+++ b/colossalai/shardformer/shard/slicer.py
@@ -0,0 +1,167 @@
+import os
+from typing import Dict, Tuple
+from dataclasses import dataclass
+
+import torch
+import torch.distributed as dist
+from ..policies.basepolicy import Layer, Col_Layer, Row_Layer
+from .shardconfig import ShardConfig
+
+
+dim_mapping = {Col_Layer: 1, Row_Layer: 0}    # dim argument used by Slicer.slice_weight for each policy layer class
+
+class Slicer():
+
+    def __init__(
+        self,
+        shardconfig: ShardConfig #TODO
+    ) -> None:
+        self.shardconfig = shardconfig    # rank and world_size are read from it by the slice_* methods
+
+    
+    def slice_weight_bias(
+        self,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        policy_layer_cls: Layer,
+    ):
+        """
+        Slice the weight and bias according to policy layer cls
+        Layer -> return both tensors untouched
+        Col_Layer -> slice the weight with slice_col (first axis) and the bias with slice_1d
+        Row_Layer -> slice the weight with slice_row (second axis) and keep the bias whole
+
+        Args:
+            weight: The weight of the layer
+            bias: The bias of the layer
+            policy_layer_cls: The class represent how to slice the tensor
+
+        Return:
+            The (weight, bias) pair after slicing
+        """
+        if policy_layer_cls == Col_Layer:
+            return self.slice_tensor(weight, 1, False), self.slice_tensor(bias, 0, True)
+        if policy_layer_cls == Row_Layer:
+            return self.slice_tensor(weight, 0, False), bias
+        if policy_layer_cls != Layer:
+            raise NotImplementedError(f"The policy layer class {policy_layer_cls} is not supported")
+        return weight, bias
+    
+
+    def slice_weight(
+        self,
+        weight: torch.Tensor,
+        policy_layer_cls: Layer,
+    ) -> torch.Tensor:
+        """
+        Slice the weight along the dim registered in dim_mapping for the policy layer class
+        A None weight is returned unchanged and dim_mapping is not consulted
+
+        Args:
+            weight: The weight of the layer
+            policy_layer_cls: The class represent how to slice the tensor
+        """
+        if weight is None:
+            return weight
+        split_dim = dim_mapping[policy_layer_cls]
+        return self.slice_tensor(weight, split_dim, False)
+
+
+    def slice_bias(
+        self,
+        bias: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Slice the 1D bias according to the shardconfig
+
+        Args:
+            bias: The bias of the layer, must not be None
+        """
+        assert bias is not None, "The bias is None"
+        # slice_tensor does not use the dim argument for a bias
+        sliced = self.slice_tensor(bias, 1, True)
+        return sliced
+
+
+    def slice_tensor(
+        self,
+        tensor_in: torch.Tensor,
+        dim: int,
+        is_bias: bool,
+    ) -> torch.Tensor:
+        """
+        Dispatch to slice_1d for a bias and to slice_2d otherwise, None is passed through
+        """
+        if tensor_in is None:
+            return None
+        if is_bias:
+            return self.slice_1d(tensor_in)
+        # dim is only used for the 2D case
+        return self.slice_2d(tensor_in, dim)
+
+
+    def slice_2d(
+        self,
+        tensor: torch.Tensor,
+        dim: int,
+    ) -> torch.Tensor:
+        """
+        Slice the 2D tensor with slice_row when dim is 0 and with slice_col when dim is 1
+
+        Args:
+            tensor: The tensor to slice
+        """
+        assert dim in [0,1], f"Only support 2D tensor, but got {dim}D tensor"
+        if dim == 1:
+            return self.slice_col(tensor)
+        if dim == 0:
+            return self.slice_row(tensor)
+
+
+    def slice_1d(
+        self,
+        tensor: torch.Tensor,
+        dim: int = None,
+    ) -> torch.Tensor:
+        """
+        Return the chunk of the 1D tensor owned by the rank in shardconfig (dim is not used)
+
+        Args:
+            tensor: The tensor to slice
+        """
+        world_size = self.shardconfig.world_size
+        chunk = (tensor.shape[0] + world_size - 1) // world_size
+        start = self.shardconfig.rank * chunk
+        return tensor[start:start + chunk]
+
+    def slice_col(
+        self,
+        tensor: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Return the chunk along the first axis (tensor.shape[0]) owned by the rank in shardconfig
+
+        Args:
+            tensor: The tensor to slice
+        """
+        world_size = self.shardconfig.world_size
+        chunk = (tensor.shape[0] + world_size - 1) // world_size
+        start = self.shardconfig.rank * chunk
+        return tensor[start:start + chunk, :]
+
+
+    def slice_row(
+        self,
+        tensor: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Return the chunk along the second axis (tensor.shape[1]) owned by the rank in shardconfig
+
+        Args:
+            tensor: The tensor to slice
+        """
+        world_size = self.shardconfig.world_size
+        chunk = (tensor.shape[1] + world_size - 1) // world_size
+        start = self.shardconfig.rank * chunk
+        return tensor[:, start:start + chunk]
+    
\ No newline at end of file
diff --git a/colossalai/shardformer/test/config.py b/colossalai/shardformer/test/config.py
new file mode 100644
index 000000000000..295529429237
--- /dev/null
+++ b/colossalai/shardformer/test/config.py
@@ -0,0 +1,5 @@
+parallel = dict(
+        data=1,
+        pipeline=1,
+        tensor=dict(size=2, mode='1d')    # tensor parallel setting: size 2, mode '1d'
+)
\ No newline at end of file
diff --git a/colossalai/shardformer/test/test.py b/colossalai/shardformer/test/test.py
new file mode 100644
index 000000000000..c2a9053ca2f6
--- /dev/null
+++ b/colossalai/shardformer/test/test.py
@@ -0,0 +1,37 @@
+from transformers import AutoTokenizer
+from transformers import BertForMaskedLM
+import colossalai
+from colossalai.shardformer.shard.shardmodel import ShardModel
+from colossalai.utils import get_current_device, print_rank_0
+from colossalai.logging import get_dist_logger
+from colossalai.shardformer.shard.shardconfig import ShardConfig
+import inspect
+import argparse
+import torch.nn as nn
+import os
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+def get_args():
+    parser = colossalai.get_default_parser()
+    return parser.parse_args()
+
+def inference(model: nn.Module):
+    # print(model)
+    token = "Hello, my dog is cute"
+    inputs = tokenizer(token, return_tensors="pt")
+    inputs.to("cuda")
+    model.to("cuda")
+    outputs = model(**inputs)
+    print(outputs)
+
+if __name__ == "__main__":
+    args = get_args()
+    colossalai.launch_from_torch(config=args.config)
+    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+    shard_config = ShardConfig(
+        rank = int(str(get_current_device()).split(':')[-1]),
+        world_size= int(os.environ['WORLD_SIZE']),
+    )
+    shardmodel = ShardModel(model, shard_config)
+    inference(shardmodel.model)
diff --git a/colossalai/shardformer/utils/__init__.py b/colossalai/shardformer/utils/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/shardformer/utils/utils.py b/colossalai/shardformer/utils/utils.py
new file mode 100644
index 000000000000..5eba87f6fe09
--- /dev/null
+++ b/colossalai/shardformer/utils/utils.py
@@ -0,0 +1,56 @@
+def hasattr_(obj, attr: str):
+    """
+    Check whether the object has the multi sublevel attr
+
+    Args:
+        obj: The object to check
+        attr: The multi level attr to check
+    """
+    attrs = attr.split('.')
+    for a in attrs:
+        try:
+            obj = getattr(obj, a)
+        except AttributeError:
+            return False
+    return True
+
+def setattr_(obj, attr: str, value, ignore: bool=False):
+    """
+    Set the object's multi sublevel attr to value, if ignore, ignore when it doesn't exist
+
+    Args:
+        obj: The object to set
+        attr: The multi level attr to set
+        value: The value to set
+        ignore: Whether to ignore when the attr doesn't exist
+    """
+
+    attrs = attr.split('.')
+    for a in attrs[:-1]:
+        try:
+            obj = getattr(obj, a)
+        except AttributeError:
+            if ignore:
+                 return
+            raise AttributeError(f"Object {obj} has no attribute {attr}")
+    setattr(obj, attrs[-1], value)
+
+def getattr_(obj, attr: str, ignore: bool=None):
+    """
+    Get the object's multi sublevel attr
+    
+    Args:
+        obj: The object to set
+        attr: The multi level attr to set
+        ignore: Whether to ignore when the attr doesn't exist
+    """
+
+    attrs = attr.split('.')
+    for a in attrs:
+        try:
+            obj = getattr(obj, a)
+        except AttributeError:
+            if ignore:
+                return None
+            raise AttributeError(f"Object {obj} has no attribute {attr}")
+    return obj
\ No newline at end of file

From 58f6432416127e9d5e4ad1e3ac6b8200dcb41c56 Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Wed, 24 May 2023 10:26:46 +0800
Subject: [PATCH 38/52] [shardformer]: Feature/shardformer, add some docstring
 and readme (#3816)

* init shardformer code structure

* add implement of sharder (inject and replace)

* add implement of replace layer to colossal layer

* separate different layer policy, add some notion

* implement 1d and 2d slicer, can tell col or row

* fix bug when slicing and inject model

* fix some bug; add inference test example

* add share weight and train example

* add train

* add docstring and readme

* add docstring for other files

* pre-commit
---
 colossalai/nn/layer/parallel_1d/_operation.py |   2 +
 colossalai/nn/layer/parallel_1d/layers.py     |   9 +-
 colossalai/shardformer/README.md              | 177 +++++++++++++++++
 colossalai/shardformer/model/modeling_bert.py |  16 +-
 colossalai/shardformer/policies/autopolicy.py |  25 ++-
 colossalai/shardformer/policies/basepolicy.py | 128 +++++++-----
 colossalai/shardformer/policies/bert.py       | 125 ++++++------
 colossalai/shardformer/shard/shardconfig.py   |   4 +-
 colossalai/shardformer/shard/sharder.py       | 187 +++++++++---------
 colossalai/shardformer/shard/shardmodel.py    |  36 ++--
 colossalai/shardformer/shard/slicer.py        | 113 +++++------
 colossalai/shardformer/test/config.py         |   6 +-
 colossalai/shardformer/test/test.py           |  87 ++++++--
 colossalai/shardformer/utils/utils.py         |  36 ++--
 14 files changed, 612 insertions(+), 339 deletions(-)
 create mode 100644 colossalai/shardformer/README.md

diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/nn/layer/parallel_1d/_operation.py
index 394334558275..c5e33fd497cd 100644
--- a/colossalai/nn/layer/parallel_1d/_operation.py
+++ b/colossalai/nn/layer/parallel_1d/_operation.py
@@ -1,5 +1,6 @@
 import torch
 import torch.distributed as dist
+
 from colossalai.core import global_context as gpc
 
 try:
@@ -72,6 +73,7 @@ def backward(ctx, grad_output):
         total_input = input
         grad_input = grad_output.matmul(weight)
 
+        grad_output = grad_output.contiguous()
         # Convert the tensor shapes to 2D for execution compatibility
         grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2])
         total_input = total_input.view(total_input.shape[0] * total_input.shape[1], total_input.shape[2])
diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py
index 406173a18c60..0ee3b4fcb502 100644
--- a/colossalai/nn/layer/parallel_1d/layers.py
+++ b/colossalai/nn/layer/parallel_1d/layers.py
@@ -469,7 +469,8 @@ def __init__(self,
         if skip_bias_add and not bias:
             raise ValueError('cannot skip bias addition if bias is None')
 
-        self.out_features_per_partition = divide(out_features, gpc.tensor_parallel_size)
+        # self.out_features_per_partition = divide(out_features*2, gpc.tensor_parallel_size)
+        self.out_features_per_partition = out_features
 
         # Parameters.
         # Initialize weight.
@@ -612,7 +613,8 @@ def __init__(self,
             raise ValueError('cannot skip bias addition if bias is None')
 
         # Divide the weight matrix along the last dimension.
-        self.input_size_per_partition = divide(in_features, gpc.tensor_parallel_size)
+        # self.input_size_per_partition = divide(in_features*2, gpc.tensor_parallel_size)
+        self.input_size_per_partition = in_features
 
         # Parameters.
         # Initialize weight.
@@ -884,7 +886,8 @@ def __init__(self,
 
         tensor_parallel_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
         tensor_parallel_rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
-        self.num_embeddings_per_partition = divide(num_embeddings, tensor_parallel_size)
+        # self.num_embeddings_per_partition = divide(num_embeddings, tensor_parallel_size)
+        self.num_embeddings_per_partition = num_embeddings
         self.vocab_start_index = tensor_parallel_rank * self.num_embeddings_per_partition
         self.vocab_end_index = self.vocab_start_index + self.num_embeddings_per_partition
 
diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
new file mode 100644
index 000000000000..a47e280f2be4
--- /dev/null
+++ b/colossalai/shardformer/README.md
@@ -0,0 +1,177 @@
+## ShardFormer
+
+### Intro
+ShardFormer makes models from huggingface.co parallelizable and usable with colossalai according to a custom policy.
+
+### Quick start
+1. Usage
+- Use
+``` python
+from colossalai.shardformer.shard.shardmodel import ShardModel
+from transformers import BertForMaskedLM
+
+# create huggingface model as normal
+model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+
+# make the huggingface model paralleled to ShardModel
+# auto policy:
+shardmodel = ShardModel(model).model
+
+# custom policy:
+from xxx import <POLICYCLASS>
+shardmodel = ShardModel(model, <POLICYCLASS>).model
+
+
+# do anything as normal
+...
+```
+- Policy
+
+If you want to parallelize the model in a custom way, just override the policy class for the huggingface model.
+
+You should do:
+
+1. Inherit Policy class
+2. Overwrite argument_policy method
+    - In this method you need to list which layer classes you want to modify, along with the attributes and parameters to change in those layers.
+3. Overwrite inject_policy method [Optional]
+    - Only needed if you have to modify the forward or backward process.
+4. Overwrite or add the param recording functions
+    - These functions use a suffix to record the path of the weight or bias for the layer.
+5. Overwrite binding
+
+More details can be found in shardformer/policies/basepolicy.py
+``` python
+from colossalai.shardformer.policies.basepolicy import Policy, Layer, Col_Layer, Row_Layer, Argument
+
+class CustomPolicy(Policy):
+    @staticmethod
+    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module,Argument]:
+        """
+        Return a dict, the key is layer will be modified and the value is the Argument class with param setting and param functions
+
+        Args:
+            model_config: The config of transformer model
+            shard_config: The config of distributed model
+
+        Return:
+            Dict for the modify policy,
+            {
+                origin layer class1 (nn.Module): Argument(
+                    attr_dict = {
+                        argument1: value1,
+                        argument2: value2,
+                        ...
+                    },
+                    param_funcs = [
+                        staticmethod1,
+                        staticmethod2,
+                        ...
+                    ]
+                ),
+                origin layer class2 (nn.Module): Argument(
+                    attr_dict = {
+                        argument1: value1,
+                        argument2: value2,
+                        ...
+                    },
+                    param_funcs = [
+                        staticmethod1,
+                        staticmethod2,
+                        ...
+                    ]
+                ),
+                ...
+            }
+
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def inject_policy() -> Tuple[nn.Module, nn.Module]:
+        """
+        Return the dict for the inject model
+
+        Return:
+            The injected model, key is the original model and value is the new shardmodel
+        """
+        return ()
+
+    @staticmethod
+    def binding_policy() -> Dict:
+        """
+        Return the dict for the binding model
+        """
+        return NotImplementedError
+
+    @staticmethod
+    def attn_in() -> List:
+        """
+        Attention qkv layer
+
+        Returns:
+            List[Layer]: List of layer object, each layer is the new
+        """
+        return NotImplementedError
+
+    @staticmethod
+    def attn_out() -> List:
+        """
+        Attention output projection layer
+
+        Returns:
+            List[Layer]: List of layer object
+        """
+        return NotImplementedError
+
+    @staticmethod
+    def mlp_in() -> List:
+        """
+        h -> 4h mlp layer
+
+        Returns:
+            List[Layer]: List of layer object
+        """
+        return NotImplementedError
+
+    @staticmethod
+    def mlp_out() -> List:
+        """
+        4h -> h mlp layer
+
+        Returns:
+            List[Layer]: List of layer object
+        """
+        return NotImplementedError
+
+    @staticmethod
+    def embedding() -> List:
+        """
+        Partially slice the embedding layer
+        vocab_size->vocab_size//gpu_nums
+
+        Return:
+            List[Layer]: List of layer object
+        """
+        return NotImplementedError
+
+    @staticmethod
+    def unembedding() -> List:
+        """
+        Partially slice the embedding layer
+        vocab_size->vocab_size//gpu_nums
+
+        Return:
+            List[Layer]: List of layer object
+        """
+        return NotImplementedError
+
+```
+
+2. Simple example
+``` shell
+# inference
+colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode inference
+# train
+colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode train
+```
diff --git a/colossalai/shardformer/model/modeling_bert.py b/colossalai/shardformer/model/modeling_bert.py
index 87ed8ac308a5..6741ae866991 100644
--- a/colossalai/shardformer/model/modeling_bert.py
+++ b/colossalai/shardformer/model/modeling_bert.py
@@ -1,12 +1,14 @@
+from typing import Any, Dict, List, Type
+
 import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
-from typing import Any, Dict, List, Type
-
-
 from transformers import BertForMaskedLM
 from transformers.models.bert.modeling_bert import MaskedLMOutput
+
+
 class BertForMaskedLM_(BertForMaskedLM):
+
     def forward(
         self,
         input_ids=None,
@@ -23,7 +25,7 @@ def forward(
         return_dict=None,
         **kwargs,
     ):
-        print("[Inject OK] Injected forward method")
+        # print("[Inject OK] Injected forward method")
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         outputs = self.bert(
@@ -46,9 +48,9 @@ def forward(
         masked_lm_loss = None
 
         # if input_ids is not None:
-        #     masked_lm_loss = applyDistCrossEntropy(prediction_scores, input_ids, self.config.vocab_size) 
+        #     masked_lm_loss = applyDistCrossEntropy(prediction_scores, input_ids, self.config.vocab_size)
         if labels is not None:
-            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            loss_fct = CrossEntropyLoss()    # -100 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
 
         if not return_dict:
@@ -60,4 +62,4 @@ def forward(
             logits=prediction_scores,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
\ No newline at end of file
+        )
diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index 9142e0dae22e..e096c2b13a59 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -1,40 +1,47 @@
 import torch.nn as nn
 
+
 def build_policies():
-    """
+    r"""
     Build the policies for the model
-    
+
     Return:
         The dict for the policies
     """
     auto_policy_dict = {}
 
     from transformers.models.bert.modeling_bert import BertForMaskedLM
+
     from .bert import BertForMaskedLMPolicy
     auto_policy_dict[BertForMaskedLM] = BertForMaskedLMPolicy
 
     from transformers.models.bert.modeling_bert import BertForSequenceClassification
+
     from .bert import BertForSequenceClassificationPolicy
     auto_policy_dict[BertForSequenceClassification] = BertForSequenceClassificationPolicy
-    
+
     return auto_policy_dict
 
-def get_autopolicy(model:nn.Module):
-    """
+
+def get_autopolicy(model: nn.Module):
+    r"""
     Return the auto policy for the model
 
     Args:
-        model: The model to be used
+        model (:class:`nn.Module`): The model to get the auto policy
 
     Return:
-        The auto policy for the model
+        :class:`Policy`: The auto policy for the model
     """
     auto_policy_dict = build_policies()
     policy = auto_policy_dict.get(model.__class__, None)
-    if policy is None:   
-        raise NotImplementedError(f"Auto policy for {model.__class__.__qualname__} is not implemented\n Supported models are {[i.__qualname__ for i in auto_policy_dict.keys()]}")
+    if policy is None:
+        raise NotImplementedError(
+            f"Auto policy for {model.__class__.__qualname__} is not implemented\n Supported models are {[i.__qualname__ for i in auto_policy_dict.keys()]}"
+        )
     return policy
 
+
 # from transformers.models.bert.modeling_bert import BertForMaskedLM, BertForPreTraining
 # model = BertForPreTraining
 # policy = get_autopolicy(model)
diff --git a/colossalai/shardformer/policies/basepolicy.py b/colossalai/shardformer/policies/basepolicy.py
index d444aeb53bf8..a5cc0bc68df6 100644
--- a/colossalai/shardformer/policies/basepolicy.py
+++ b/colossalai/shardformer/policies/basepolicy.py
@@ -1,28 +1,38 @@
 # part of code modified from https://github.com/tunib-ai/parallelformers
 
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Tuple, Type
+
 import torch
 import torch.nn as nn
-import colossalai.nn as col_nn
-from typing import Any, Dict, List, Type, Tuple, Callable
 from transformers import AutoConfig
-from dataclasses import dataclass, field
+
+import colossalai.nn as col_nn
+
 
 @dataclass
 class Argument:
-    attr_dict : Dict[str, Any]
-    param_funcs : List[Callable]
-    binding_layers : List[nn.Module] = field(default_factory=list)
+    r"""
+    The argument class for the policy
+
+    Args:
+        attr_dict (Dict[str, Any]): The dict for the param setting
+        param_funcs (:class:`List[Callable]`): The list for the param functions
+    """
+    attr_dict: Dict[str, Any]
+    param_funcs: List[Callable]
+
 
 @dataclass
 class Layer:
-    """
+    r"""
     The layer object for the policy
 
     Args:
-        weight: The weight name of the layer
-        bias: The bias name of the layer
-        replace_layer: The layer to replace the original layer
-        ignore: Whether to ignore this layer if it is not in the model
+        weight (str): The weight suffix of the layer
+        bias (str): The bias suffix of the layer
+        replace_layer (:class:`colossalai.nn`): The layer to replace the original layer
+        ignore (bool): Whether to ignore this layer if it is not in the model
     """
     weight: str = None
     bias: str = None
@@ -32,45 +42,55 @@ class Layer:
 
 @dataclass
 class Col_Layer(Layer):
-    """
+    r"""
     Class for col shard layer in MegatronLM
+
+    Args:
+        gather_output (bool): Whether to gather the output of the layer
     """
     gather_output: bool = False
 
 
 @dataclass
 class Row_Layer(Layer):
-    """
+    r"""
     Class for col shard layer in MegatronLM
     """
     pass
 
 
 class Policy():
-    """
+    r"""
     The base class for all the policies
-    For each different model, it should have a different policy class, like BertPolicy for Bert Model 
-    or OPTPolicy for OPT model. 
+    For each different model, it should have a different policy class, like BertPolicy for Bert Model
+    or OPTPolicy for OPT model.
     AutoPolicy:
-        shardformer already defined some policies for huggingface model, just set custom_policy = None
+        Shardformer already defined some policies for huggingface model, just set ``custom_policy`` = None
         to use the auto policy. In shardformer autopolicy, we define a base policy for one type model,
-        like BertPolicy, and for each different Bert modle in huggingface like, BertForMaskedLM, 
+        like BertPolicy, and for each different Bert model in huggingface like, BertForMaskedLM,
         BertForSequenceClassification, etc., for each different Bert model we difine different policy class
-        and overwrite the method inject_policy
-    
+        and overwrite the method like ``inject_policy`` to modify the forward and backward process.
+
     CustomPolicy:
+        If you want to define your own policy, you can set ``custom_policy`` = CustomPolicy, and overwrite
+        all the methods in ``Policy`` class. You can refer to any policy we defined like the ``BertPolicy``
+        class for the example.
+
     """
+
     @staticmethod
-    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module,Argument]:
-        """
-        Return a dict, the key is layer will be modified and the value is the Argument class with param setting and param functions
+    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module, Argument]:
+        r"""
+        Return the dict for the modify policy, the key is the original layer class and the value is the
+        argument for the modify layer
 
         Args:
-            model_config: The config of transformer model
-            shard_setting: The config of distributed model
-        
+            model_config (:class:`transformers.PretrainedConfig`): The config of transformer model
+            shard_config (:class:`ShardConfig`): The config for sharding model
+
         Return:
             Dict for the modify policy,
+            ::
             {
                 origin layer class1 (nn.Module): Argument(
                     attr_dict = {
@@ -101,33 +121,51 @@ def argument_policy(model_config, shard_config: int) -> Dict[nn.Module,Argument]
 
         """
         raise NotImplementedError
-    
 
     @staticmethod
     def inject_policy() -> Tuple[nn.Module, nn.Module]:
-        """
-        Return the dict for the inject model 
+        r"""
+        Return the dict for the inject model
 
         Return:
             The injected model, key is the original model and value is the new shardmodel
+            ::
+            (OriginModel, CustomModel)
+            in `CustomModel`, we can overwrite the forward and backward process
         """
         return ()
-    
 
     @staticmethod
-    def attn_in() -> List:
+    def binding_policy() -> Dict:
+        r"""
+        Return the dict for the binding model
+
+        Return:
+            This method should return the binding relationship for layers that share the weight or bias,
+            the key and value is the suffix of the weight or bias of the model
+        ::
+            return {
+                "bert.embeddings.word_embeddings.weight": "cls.predictions.decoder.weight",
+            }
         """
+        return NotImplementedError
+
+    @staticmethod
+    def attn_in() -> List:
+        r"""
         Attention qkv layer
+        In this kind of method, we should return the list of ``Layer`` object, each ``Layer`` object should be
+        ``Layer`` for no slicing, ``Col_Layer`` for col slicing, ``Row_Layer`` for row slicing. And the parameters
+        in ``Layer`` object can refer to the ``Layer`` class.
 
         Returns:
-            List[Layer]: List of layer object, each layer is the new 
+            List[Layer]: List of layer object, each layer is the new
         """
         return NotImplementedError
 
-
     @staticmethod
     def attn_out() -> List:
-        """
+        r"""
         Attention output projection layer
 
         Returns:
@@ -135,46 +173,40 @@ def attn_out() -> List:
         """
         return NotImplementedError
 
-
     @staticmethod
     def mlp_in() -> List:
-        """
+        r"""
         h -> 4h mlp layer
 
         Returns:
             List[Layer]: List of layer object
         """
         return NotImplementedError
-        
 
     @staticmethod
     def mlp_out() -> List:
-        """
+        r"""
         4h -> h mlp layer
 
         Returns:
             List[Layer]: List of layer object
         """
         return NotImplementedError
-        
-    
+
     @staticmethod
-    def embedding()->List:
-        """
+    def embedding() -> List:
+        r"""
         Partially slice the embedding layer
-        vocab_size->vocab_size//gpu_nums
 
         Return:
             List[Layer]: List of layer object
         """
         return NotImplementedError
-        
-    
+
     @staticmethod
-    def unembedding()->List:
-        """
+    def unembedding() -> List:
+        r"""
         Partially slice the embedding layer
-        vocab_size->vocab_size//gpu_nums
 
         Return:
             List[Layer]: List of layer object
diff --git a/colossalai/shardformer/policies/bert.py b/colossalai/shardformer/policies/bert.py
index 24b95e827347..5d91d8ddc766 100644
--- a/colossalai/shardformer/policies/bert.py
+++ b/colossalai/shardformer/policies/bert.py
@@ -1,56 +1,57 @@
-from typing import Dict, List, Tuple, Type, Any, Callable
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Tuple, Type
+
 import torch.nn as nn
-from .basepolicy import Policy, Layer, Argument, Col_Layer, Row_Layer
+from transformers.models.bert.modeling_bert import BertEmbeddings, BertLayer, BertLMPredictionHead
+
 import colossalai.nn as col_nn
-from transformers.models.bert.modeling_bert import BertLayer, BertEmbeddings, BertLMPredictionHead
-from dataclasses import dataclass
+
+from .basepolicy import Argument, Col_Layer, Layer, Policy, Row_Layer
 
 
 class BertPolicy(Policy):
+
     @staticmethod
-    def argument_policy(config, world_size: int) -> Dict[nn.Module,Argument]:
+    def argument_policy(config, world_size: int) -> Dict[nn.Module, Argument]:
         return {
-            BertLayer: Argument(
-                attr_dict = {
-                    # 1. shard hidden size
-                    "attention.self.all_head_size": config.hidden_size // world_size,
-                    "crossattention.self.all_head_size": config.hidden_size // world_size,
-                    # 2. shard number of heads
-                    "attention.self.num_attention_heads": config.num_attention_heads // world_size,
-                    "crossattention.self.num_attention_heads": config.num_attention_heads // world_size,
-       
-                },
-                param_funcs = [
-                    BertPolicy.attn_in,
-                    BertPolicy.attn_out,
-                    BertPolicy.mlp_in,
-                    BertPolicy.mlp_out
-                ]
-            ), 
-            BertEmbeddings: Argument(
-                attr_dict = {
-                    # 1. shard vocab size
-                    # "word_embeddings.num_embeddings": config.vocab_size // world_size,
-                    # 2. add the size of the sliced embedding layer excluding the last slice
-                    "word_embeddings.dim_size": (config.vocab_size+world_size-1) // world_size,
-                },
-                param_funcs = [
-                    BertPolicy.embedding,
-                ],
-                binding_layers = [
-                    BertLMPredictionHead,
-                ]
-            ),
-            BertLMPredictionHead: Argument(
-                attr_dict = {
-                    # 1. shard vocab size
-                    # "word_embeddings.num_embeddings": config.vocab_size // world_size,
-                    # 2. add the size of the sliced embedding layer excluding the last slice
-                },
-                param_funcs = [
-                    BertPolicy.unembedding,
-                ]
-            )
+            BertLayer:
+                Argument(
+                    attr_dict={
+        # 1. shard hidden size
+                        "attention.self.all_head_size": config.hidden_size // world_size,
+                        "crossattention.self.all_head_size": config.hidden_size // world_size,
+        # 2. shard number of heads
+                        "attention.self.num_attention_heads": config.num_attention_heads // world_size,
+                        "crossattention.self.num_attention_heads": config.num_attention_heads // world_size,
+                    },
+                    param_funcs=[BertPolicy.attn_in, BertPolicy.attn_out, BertPolicy.mlp_in, BertPolicy.mlp_out]),
+            BertEmbeddings:
+                Argument(
+                    attr_dict={
+        # 1. shard vocab size
+        # "word_embeddings.num_embeddings": config.vocab_size // world_size,
+        # 2. add the size of the sliced embedding layer excluding the last slice
+                        "word_embeddings.dim_size": (config.vocab_size + world_size - 1) // world_size,
+                    },
+                    param_funcs=[
+                        BertPolicy.embedding,
+                    ]),
+            BertLMPredictionHead:
+                Argument(
+                    attr_dict={
+        # 1. shard vocab size
+        # "word_embeddings.num_embeddings": config.vocab_size // world_size,
+        # 2. add the size of the sliced embedding layer excluding the last slice
+                    },
+                    param_funcs=[
+                        BertPolicy.unembedding,
+                    ])
+        }
+
+    @staticmethod
+    def binding_policy() -> Dict:
+        return {
+            "bert.embeddings.word_embeddings.weight": "cls.predictions.decoder.weight",
         }
 
     @staticmethod
@@ -89,9 +90,8 @@ def attn_in() -> List:
                 replace_layer=col_nn.Linear1D_Col,
                 ignore=True,
             ),
-
         ]
-    
+
     @staticmethod
     def attn_out() -> List:
         return [
@@ -107,17 +107,17 @@ def attn_out() -> List:
                 ignore=True,
             ),
         ]
-    
+
     @staticmethod
     def mlp_in() -> List:
         return [
-             Col_Layer(
+            Col_Layer(
                 weight="intermediate.dense.weight",
                 bias="intermediate.dense.bias",
                 replace_layer=col_nn.Linear1D_Col,
             ),
         ]
-    
+
     @staticmethod
     def mlp_out() -> List:
         return [
@@ -130,13 +130,11 @@ def mlp_out() -> List:
 
     @staticmethod
     def embedding() -> List:
-        return [
-            Col_Layer(
-                weight="word_embeddings.weight",
-                replace_layer=col_nn.VocabParallelEmbedding1D,
-            )
-        ]
-    
+        return [Col_Layer(
+            weight="word_embeddings.weight",
+            replace_layer=col_nn.VocabParallelEmbedding1D,
+        )]
+
     @staticmethod
     def unembedding() -> List:
         return [
@@ -148,16 +146,21 @@ def unembedding() -> List:
             )
         ]
 
+
 from transformers import BertForMaskedLM
+
 from colossalai.shardformer.model.modeling_bert import BertForMaskedLM_
+
+
 class BertForMaskedLMPolicy(BertPolicy):
+
     @staticmethod
     def inject_policy() -> Tuple[nn.Module, nn.Module]:
         return (BertForMaskedLM, BertForMaskedLM_)
-    
 
-    
+
 class BertForSequenceClassificationPolicy(BertPolicy):
+
     @staticmethod
     def inject_policy() -> Dict:
         return {}
@@ -165,4 +168,4 @@ def inject_policy() -> Dict:
 
 # model = BertForMaskedLM.from_pretrained("bert-base-uncased")
 # _ = BertForMaskedLMPolicy(model)
-# print(isinstance(model,list(_.inject_policy().keys())[0]))
\ No newline at end of file
+# print(isinstance(model,list(_.inject_policy().keys())[0]))
diff --git a/colossalai/shardformer/shard/shardconfig.py b/colossalai/shardformer/shard/shardconfig.py
index be265ff0c8c1..c6a2513a6eff 100644
--- a/colossalai/shardformer/shard/shardconfig.py
+++ b/colossalai/shardformer/shard/shardconfig.py
@@ -10,9 +10,9 @@ class ShardConfig:
     fp16: bool = True
     num_gpus: int = 2
     world_size: int = 2
-    backend="nccl"
+    backend = "nccl"
     verbose: str = 'simple'
     seed: int = None
     require_grad: bool = False
     master_addr: str = "127.0.0.1"
-    master_port: int = 29500
\ No newline at end of file
+    master_port: int = 29500
diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py
index ef785cfee9da..2f6bb4265a11 100644
--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@@ -1,56 +1,59 @@
+import os
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
+
 import torch
 import torch.nn as nn
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union, Callable
-from .shardconfig import ShardConfig
-from dataclasses import dataclass
-from ..policies.basepolicy import Policy, Layer
-from ..policies.autopolicy import get_autopolicy
-from .slicer import Slicer
-from ..utils.utils import hasattr_, setattr_, getattr_
+
 import colossalai.nn as col_nn
 from colossalai.logging import get_dist_logger
-import os
 
+from ..policies.autopolicy import get_autopolicy
+from ..policies.basepolicy import Layer, Policy
+from ..utils.utils import getattr_, hasattr_, setattr_
+from .shardconfig import ShardConfig
+from .slicer import Slicer
 
 logger = get_dist_logger()
 
+
 class ModelSharder(object):
-    """
+    r"""
     Shard the original huggingface model according to the policy
 
     Args:
-        policy: The policy to shard the model
-        model: The model to shard
-        dist_setting: The setting of distributed model
+        policy (:class:`Policy`): The policy to shard the model
+        model (:class:`torch.nn.Module`): The model to shard
+        shard_config: The setting of distributed model
     """
+
     def __init__(
             self,
             model: nn.Module,
             policy: Policy,
-            shard_config: ShardConfig = None, # TODO
-        ) -> None:
+            shard_config: ShardConfig = None,    # TODO
+    ) -> None:
         self.model = model
         self.policy = get_autopolicy(self.model) if policy is None else policy
         self.slicer = Slicer(shard_config)
         self.shard_config = shard_config
         self.model_config = self.model.config
-        self.binding_map = {}
-
 
     def shard(self) -> None:
         self.inject_model(self.model)
         self.replace_layer(self.model)
-  
-        
+        self.bind_layer(self.model)
+
     def inject_model(
-            self,
-            model: nn.Module,
-        ) -> None:
-        """
+        self,
+        model: nn.Module,
+    ) -> None:
+        r"""
         Replace the model to policy defined model
         Mainly modify the forward and backward to fit distributed model
-        
+
         e.g.
+        ::
             BertForMaskedLM.forward -> BertForMaskedLM_.forward
         """
         inject_policy = self.policy.inject_policy()
@@ -64,49 +67,43 @@ def inject_model(
                     setattr(
                         model.__class__,
                         key,
-                        getattr(shard_model_cls,key),
+                        getattr(shard_model_cls, key),
                     )
         else:
             raise NotImplementedError(f"{model.__class__} is not implemented so far")
 
-
     def replace_layer(
-            self,
-            model: nn.Module,
-        ) -> None:
-        """
+        self,
+        model: nn.Module,
+    ) -> None:
+        r"""
         Replace the layer according to the policy, and replace the layer one by one
 
         Args:
-            layer: The layer to shard
+            model (:class:`torch.nn.Module`): The layer to shard
         """
         argument_policies = self.policy.argument_policy(self.model_config, self.shard_config.world_size)
         for argument_policy in argument_policies.items():
             origin_layer_cls = argument_policy[0]
             attr_dict = argument_policy[1].attr_dict
             param_funcs = argument_policy[1].param_funcs
-            binding_layers = argument_policy[1].binding_layers
-            # if binding_layer is not None:
-            #     self.binding_map[origin_layer_cls] = binding_layer
-            self.reverse_replace_layer(model, origin_layer_cls, attr_dict, param_funcs, binding_layers)
-
+            self.reverse_replace_layer(model, origin_layer_cls, attr_dict, param_funcs)
 
     def reverse_replace_layer(
-            self,
-            layer: nn.Module,
-            origin_cls: nn.Module,
-            attr_dict: Dict[str, Any],
-            param_funcs: List[Callable],
-            binding_layers: List[nn.Module]
-        ) -> None:
-        """
+        self,
+        layer: nn.Module,
+        origin_cls: nn.Module,
+        attr_dict: Dict[str, Any],
+        param_funcs: List[Callable],
+    ) -> None:
+        r"""
         Reverse the replace layer operation
 
         Args:
-            layer: The object of layer to shard
-            origin_cls: The origin layer class
-            attr_dict: The attribute dict to modify
-            policy_cls: The policy class
+            layer (:class:`torch.nn.Module`): The object of layer to shard
+            origin_cls (:class:`transformers.model`): The origin layer class
+            attr_dict (Dict): The attribute dict to modify
+            policy_cls (:class:`Policy`): The policy class
         """
         for name, child in layer.named_children():
             if child.__class__ == origin_cls:
@@ -115,25 +112,23 @@ def reverse_replace_layer(
                     setattr_(child, k, v, ignore=True)
                 # print(f"Sharding {name} layer", replac_layer.attention.self.__dict__)
                 # setattr_(layer, name, self.shard_one_layer(child, policy_cls))
-                self.shard_one_layer(child, param_funcs, binding_layers)
+                self.shard_one_layer(child, param_funcs)
                 continue
 
-            self.reverse_replace_layer(child, origin_cls, attr_dict, param_funcs, binding_layers)
+            self.reverse_replace_layer(child, origin_cls, attr_dict, param_funcs)
         return layer
 
-
     def shard_one_layer(
-            self, 
-            org_layer: nn.Module, 
-            param_funcs: List[Callable],
-            binding_layers: List[nn.Module]
-        ) -> None:
-        """
+        self,
+        org_layer: nn.Module,
+        param_funcs: List[Callable],
+    ) -> None:
+        r"""
         Shard one layer according to the policy, the layer should be the same class as the key in policy's argument_policy return dict
 
         Args:
-            org_layer: The origin layer object to shard
-            param_funcs: The function list to get shard information in policy class
+            org_layer (:class:`torch.nn.Module`): The origin layer object to shard
+            param_funcs (:class:`List[typing.Callable]`): The function list to get shard information in policy class
 
         """
         # print(org_layer)
@@ -148,7 +143,7 @@ def shard_one_layer(
                 ignore = policy_layer.ignore
                 if policy_layer.__class__.__name__ == "Col_Layer":
                     gather_output = policy_layer.gather_output
-                    print(gather_output)
+                    # print(gather_output)
 
                 if weight_attr is not None:
                     if hasattr_(org_layer, weight_attr):
@@ -172,67 +167,81 @@ def shard_one_layer(
 
                 # slice weight and bias
                 weight, bias = self.slicer.slice_weight_bias(weight, bias, policy_layer.__class__)
-                print(os.environ['RANK'], policy_layer.__class__, weight.shape, bias.shape if bias is not None else None)
-                # save the binding information
-                for binding_layer in binding_layers:
-                    self.binding_map[binding_layer] = dict(weight=weight, bias=bias)
+                # print(os.environ['RANK'], policy_layer.__class__, weight.shape, bias.shape if bias is not None else None)
 
                 # create new object to replace the origin layer
                 if replace_layer_cls is not None:
                     # print(f"RANK {os.environ['RANK']}: replace {getattr_(org_layer, layer_attr).__class__} to {replace_layer_cls}, shape is {weight.shape}")
                     if isinstance(getattr_(org_layer, layer_attr), nn.Linear):
                         if replace_layer_cls.__name__ == "Linear1D_Row":
-                            replace_layer = replace_layer_cls(weight.shape[1], weight.shape[0], bias=False if bias is None else True)
+                            replace_layer = replace_layer_cls(weight.shape[1],
+                                                              weight.shape[0],
+                                                              bias=False if bias is None else True)
                         elif replace_layer_cls.__name__ == "Linear1D_Col":
-                            replace_layer = replace_layer_cls(weight.shape[0], weight.shape[1], bias=False if bias is None else True, gather_output=gather_output)
+                            replace_layer = replace_layer_cls(weight.shape[0],
+                                                              weight.shape[1],
+                                                              bias=False if bias is None else True,
+                                                              gather_output=gather_output)
                         setattr_(org_layer, layer_attr, replace_layer, ignore=ignore)
                         self.set_param(replace_layer, weight, bias)
-                    elif isinstance(getattr_(org_layer, layer_attr), nn.Embedding):    
-                        replace_layer = replace_layer_cls(weight.shape[0], weight.shape[1], getattr_(org_layer, f"{layer_attr}.padding_idx", ignore=True))
+                    elif isinstance(getattr_(org_layer, layer_attr), nn.Embedding):
+                        replace_layer = replace_layer_cls(weight.shape[0], weight.shape[1],
+                                                          getattr_(org_layer, f"{layer_attr}.padding_idx", ignore=True))
                         setattr_(org_layer, layer_attr, replace_layer, ignore=ignore)
                         self.set_param(replace_layer, weight, bias)
                     else:
-                        raise NotImplementedError(f"Replacing {getattr_(org_layer, layer_attr).__class__} is not implemented so far")
+                        raise NotImplementedError(
+                            f"Replacing {getattr_(org_layer, layer_attr).__class__} is not implemented so far")
                 # do not replace the layer object, just replace the weight and bias
                 else:
                     self.set_param(org_layer, layer_attr, weight, bias)
 
-
-    def set_param(
-            self, 
-            layer: Any, 
-            layer_attr: str = "", 
-            weight: torch.Tensor = None, 
-            bias: torch.Tensor = None
-        ) -> None:
-        """
+    def set_param(self,
+                  layer: Any,
+                  weight: torch.Tensor = None,
+                  bias: torch.Tensor = None,
+                  layer_attr: str = "") -> None:
+        r"""
         Reset the weight and bias of the layer object
 
         Args:
-            layer: The layer object
-            layer_attr: The attribute name of the layer
-            weight: The weight of the layer
-            bias: The bias of the layer
+            layer (:class:`torch.nn.Module`): The layer object
+            layer_attr (str): The attribute name of the layer
+            weight (:class:`torch.Tensor`): The weight of the layer
+            bias (:class:`torch.Tensor`): The bias of the layer
         """
         assert weight is not None or bias is not None
         if weight is not None:
-            setattr_(layer, "weight" if layer_attr == "" else layer_attr+".weight", nn.Parameter(weight))
+            setattr_(layer, "weight" if layer_attr == "" else layer_attr + ".weight", nn.Parameter(weight.contiguous()))
             self.set_layer_size(layer, layer_attr, weight.shape)
         if bias is not None:
-            setattr_(layer, "bias" if layer_attr == "" else layer_attr+".bias", nn.Parameter(bias))
-
+            setattr_(layer, "bias" if layer_attr == "" else layer_attr + ".bias", nn.Parameter(bias.contiguous()))
 
     def set_layer_size(self, layer: nn.Module, layer_attr: str, size: torch.Size) -> None:
-        """
+        r"""
         Set the layer attribute
 
         Args:
-            layer: The layer object
-            layer_attr: The attribute name of the layer
-            size: Torch.size
+            layer (:class:`torch.nn.Module`): The layer object
+            layer_attr (str): The attribute name of the layer
+            size (:class:`torch.Size`): The size of the tensor
         """
         # Tensor.shape[0] -> out_features, Tensor.shape[1] -> in_features
         attrs = ["out_features", "in_features"]
         for i, attr in enumerate(attrs):
             if hasattr_(layer, f"{layer_attr}.{attr}"):
-                setattr_(layer, f"{layer_attr}.{attr}", size[i])    
+                setattr_(layer, f"{layer_attr}.{attr}", size[i])
+
+    def bind_layer(self, model: nn.Module) -> None:
+        r"""
+        Bind the layer according to the binding policy
+
+        Args:
+            model (:class:`torch.nn.Module`): The shard model
+        """
+        binding_map = self.policy.binding_policy()
+        for k, v in binding_map.items():
+            param = getattr_(model, k)
+            param = nn.Parameter(param)
+            setattr_(model, k, param)
+            setattr_(model, v, param)
diff --git a/colossalai/shardformer/shard/shardmodel.py b/colossalai/shardformer/shard/shardmodel.py
index 54d7b5ba02d9..7e7d1576afd6 100644
--- a/colossalai/shardformer/shard/shardmodel.py
+++ b/colossalai/shardformer/shard/shardmodel.py
@@ -1,46 +1,48 @@
 import os
+from contextlib import suppress
+from dataclasses import dataclass
+
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 import transformers
-import torch.distributed as dist
-from dataclasses import dataclass
-from contextlib import suppress
 
 from colossalai.tensor.d_tensor.layout import Layout
+
 from ..policies.basepolicy import Policy
-from .sharder import ModelSharder
 from .shardconfig import ShardConfig
+from .sharder import ModelSharder
 
 
 class ShardModel(object):
-    """
-    The class for sharding the huggingface model, self.model is the sharded model
+    r"""
+    The class for sharding the huggingface model, ''self.model'' is the sharded model
     Just creat a new ShardModel object to shard huggingface model
 
     Args:
-        model: the origin huggingface model
-        dist_config: the config for distribute information
-        custom_policy: the custom policy for sharding
+        model (:class:`torch.nn.Model`): the origin huggingface model
+        dist_config (:class:`ShardConfig`): the config for distribute information
+        custom_policy (:class:`Policy`): the custom policy for sharding
     """
+
     def __init__(
-            self,
-            model: nn.Module,
-            shard_config: ShardConfig = None, # TODO
-            custom_policy: Policy = None,
-        ) -> None:
+        self,
+        model: nn.Module,
+        shard_config: ShardConfig = None,    # TODO
+        custom_policy: Policy = None,
+    ) -> None:
         self.model = model
         self.shard_config = shard_config
         self.policy = custom_policy
         # self.layout=,  # TODO
 
-        sharder=ModelSharder(
+        sharder = ModelSharder(
             model=self.model,
             policy=self.policy,
             shard_config=self.shard_config,
         )
         sharder.shard()
 
-
     def set_environ(self) -> None:
         os.environ["TOKENIZERS_PARALLELISM"] = "true"
         os.environ["MKL_SERVICE_FORCE_INTEL"] = "GNU"
@@ -55,4 +57,4 @@ def set_environ(self) -> None:
         torch.cuda.set_device(int(os.getenv("LOCAL_RANK", "0")))
 
     def back_to_org() -> None:
-        pass
\ No newline at end of file
+        pass
diff --git a/colossalai/shardformer/shard/slicer.py b/colossalai/shardformer/shard/slicer.py
index 1849cdc99c72..096f5db95f49 100644
--- a/colossalai/shardformer/shard/slicer.py
+++ b/colossalai/shardformer/shard/slicer.py
@@ -1,40 +1,40 @@
 import os
-from typing import Dict, Tuple
 from dataclasses import dataclass
+from typing import Dict, Tuple
 
 import torch
 import torch.distributed as dist
-from ..policies.basepolicy import Layer, Col_Layer, Row_Layer
-from .shardconfig import ShardConfig
 
+from ..policies.basepolicy import Col_Layer, Layer, Row_Layer
+from .shardconfig import ShardConfig
 
 dim_mapping = {Col_Layer: 1, Row_Layer: 0}
 
+
 class Slicer():
 
     def __init__(
-        self, 
-        shardconfig: ShardConfig #TODO
+            self,
+            shardconfig: ShardConfig    #TODO
     ) -> None:
         self.shardconfig = shardconfig
 
-    
     def slice_weight_bias(
         self,
         weight: torch.Tensor,
         bias: torch.Tensor,
         policy_layer_cls: Layer,
     ):
-        """
+        r"""
         Slice the weight and bias according to policy layer cls
-        Layer -> do nothing
-        Col_Layer -> slice the weight and bias along dim 1
-        Row_Layer -> slice the weight along dim 0 and do not slice bias
+        ``Layer`` -> do nothing
+        ``Col_Layer`` -> slice the weight and bias along dim 1
+        ``Row_Layer`` -> slice the weight along dim 0 and do not slice bias
 
         Args:
-            weight: The weight of the layer
-            bias: The bias of the layer
-            policy_layer_class: The class represent how to slice the tensor
+            weight (:class:`torch.Tensor`): The weight of the layer
+            bias (:class:`torch.Tensor`): The bias of the layer
+            policy_layer_cls (:class:`Layer`): The class representing how to slice the tensor
         """
         if policy_layer_cls == Layer:
             return weight, bias
@@ -46,42 +46,6 @@ def slice_weight_bias(
         else:
             raise NotImplementedError(f"The policy layer class {policy_layer_cls} is not supported")
         return weight, bias
-    
-
-    def slice_weight(
-        self,
-        weight: torch.Tensor,
-        policy_layer_cls: Layer,
-    ) -> torch.Tensor:
-        """
-        Slice the weight and bias according to the shardconfig
-
-        Args:
-            weight: The weight of the layer
-            bias: The bias of the layer
-            policy_layer_class: The class represent how to slice the tensor
-        """
-        if weight is not None:
-            dim = dim_mapping[policy_layer_cls]
-            weight = self.slice_tensor(weight, dim, False)
-        return weight
-
-
-    def slice_bias(
-        self,
-        bias: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Slice the bias according to the shardconfig
-        
-        Args:
-            bias: The bias of the layer
-        """
-        assert bias is not None, "The bias is None"
-        if bias is not None:
-            bias = self.slice_tensor(bias, 1, True)
-        return bias
-
 
     def slice_tensor(
         self,
@@ -89,8 +53,13 @@ def slice_tensor(
         dim: int,
         is_bias: bool,
     ) -> torch.Tensor:
-        """
+        r"""
         Slice tensor according to the config
+
+        Args:
+            tensor_in (:class:`torch.Tensor`): The tensor to slice
+            dim (int): The dimension to slice
+            is_bias (bool): Whether the tensor is bias
         """
         if tensor_in is None:
             return None
@@ -99,69 +68,75 @@ def slice_tensor(
         else:
             return self.slice_1d(tensor_in)
 
-
     def slice_2d(
         self,
         tensor: torch.Tensor,
         dim: int,
     ) -> torch.Tensor:
-        """
-        Slice the 2D tensor 
+        r"""
+        Slice the 2D tensor
 
         Args:
-            tensor: The tensor to slice
+            tensor (:class:`torch.Tensor`): The tensor to slice
+            dim (int): The dimension to slice
         """
-        assert dim in [0,1], f"Only support 2D tensor, but got {dim}D tensor"
+        assert dim in [0, 1], f"Only support 2D tensor, but got {dim}D tensor"
         if dim == 0:
             return self.slice_row(tensor)
         elif dim == 1:
             return self.slice_col(tensor)
 
-
     def slice_1d(
         self,
         tensor: torch.Tensor,
-        dim: int = None,
     ) -> torch.Tensor:
-        """
-        Slice the 1D tensor 
+        r"""
+        Slice the 1D tensor
 
         Args:
-            tensor: The tensor to slice
+            tensor (:class:`torch.Tensor`): The tensor to slice
+
+        Returns:
+            :class:`torch.Tensor`: The sliced tensor
         """
         delta = (tensor.shape[0] + self.shardconfig.world_size - 1) // self.shardconfig.world_size
         down_idx = self.shardconfig.rank * delta
         up_idx = down_idx + delta
-        return tensor[down_idx:up_idx]
+        return tensor[down_idx:up_idx].contiguous()
 
     def slice_col(
         self,
         tensor: torch.Tensor,
     ) -> torch.Tensor:
-        """
+        r"""
         Slice the tensor in column
 
         Args:
-            tensor: The tensor to slice
+            tensor (:class:`torch.Tensor`): The tensor to slice
+
+        Returns:
+            :class:`torch.Tensor`: The sliced tensor
+
         """
         delta = (tensor.shape[0] + self.shardconfig.world_size - 1) // self.shardconfig.world_size
         down_idx = self.shardconfig.rank * delta
         up_idx = down_idx + delta
-        return tensor[down_idx:up_idx,:]
-
+        return tensor[down_idx:up_idx, :].contiguous()
 
     def slice_row(
         self,
         tensor: torch.Tensor,
     ) -> torch.Tensor:
-        """
+        r"""
         Slice the tensor in column
 
         Args:
-            tensor: The tensor to slice
+            tensor (:class:`torch.Tensor`): The tensor to slice
+
+        Returns:
+            :class:`torch.Tensor`: The sliced tensor
         """
         delta = (tensor.shape[1] + self.shardconfig.world_size - 1) // self.shardconfig.world_size
         down_idx = self.shardconfig.rank * delta
         up_idx = down_idx + delta
-        return tensor[:,down_idx:up_idx]
-    
\ No newline at end of file
+        return tensor[:, down_idx:up_idx].contiguous()
diff --git a/colossalai/shardformer/test/config.py b/colossalai/shardformer/test/config.py
index 295529429237..2b80d8b3ca12 100644
--- a/colossalai/shardformer/test/config.py
+++ b/colossalai/shardformer/test/config.py
@@ -1,5 +1 @@
-parallel = dict(
-        data=1,
-        pipeline=1,
-        tensor=dict(size=2, mode='1d')
-)
\ No newline at end of file
+parallel = dict(data=1, pipeline=1, tensor=dict(size=2, mode='1d'))
diff --git a/colossalai/shardformer/test/test.py b/colossalai/shardformer/test/test.py
index c2a9053ca2f6..0cdc6ef38fd2 100644
--- a/colossalai/shardformer/test/test.py
+++ b/colossalai/shardformer/test/test.py
@@ -1,23 +1,51 @@
-from transformers import AutoTokenizer
-from transformers import BertForMaskedLM
+import argparse
+import inspect
+import os
+
+import torch
+import torch.nn as nn
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+
 import colossalai
-from colossalai.shardformer.shard.shardmodel import ShardModel
-from colossalai.utils import get_current_device, print_rank_0
 from colossalai.logging import get_dist_logger
 from colossalai.shardformer.shard.shardconfig import ShardConfig
-import inspect
-import argparse
-import torch.nn as nn
-import os
+from colossalai.shardformer.shard.shardmodel import ShardModel
+from colossalai.utils import get_current_device, print_rank_0
 
+os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
 tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 
+
 def get_args():
     parser = colossalai.get_default_parser()
+    parser.add_argument("--mode", type=str, default='inference')
     return parser.parse_args()
 
+
+def load_data():
+    datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
+    # datasets=load_dataset("yelp_review_full")
+    tokenized_datasets = datasets.map(
+        lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length"), batched=True)
+    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
+    # tokenized_datasets=tokenized_datasets.rename_column("label","labels")
+    tokenized_datasets.set_format("torch")
+
+    train_dataset = tokenized_datasets["train"].select(range(500))
+    test_dataset = tokenized_datasets["test"].select(range(100))
+
+    datacollector = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15, return_tensors="pt")
+    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=datacollector)
+    eval_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=datacollector)
+    return train_dataloader, eval_dataloader
+
+
 def inference(model: nn.Module):
-    # print(model)
+    print(model)
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
     token = "Hello, my dog is cute"
     inputs = tokenizer(token, return_tensors="pt")
     inputs.to("cuda")
@@ -25,13 +53,48 @@ def inference(model: nn.Module):
     outputs = model(**inputs)
     print(outputs)
 
+
+def train(model: nn.Module, num_epoch: int = 2):
+    train_dataloader, eval_dataloader = load_data()
+    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
+    progress_bar = tqdm(range((num_epoch) * len(train_dataloader)))
+    criterion = nn.CrossEntropyLoss()
+    model.to("cuda")
+    model.train()
+    for epoch in range(num_epoch):
+        progress_bar.set_description(f"Rank {get_current_device()} epoch {epoch}")
+
+        for batch in train_dataloader:
+            optimizer.zero_grad()
+            batch = {k: v.to('cuda') for k, v in batch.items()}
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss.backward()
+            optimizer.step()
+            progress_bar.update(1)
+        train_loss = loss
+
+        loss = 0.0
+        for batch in eval_dataloader:
+            batch = {k: v.to('cuda') for k, v in batch.items()}
+            outputs = model(**batch)
+            # loss = outputs.loss
+            loss += outputs.loss.item()
+            # loss = criterion(outputs.logits, batch["input_ids"])
+        test_loss = loss / len(eval_dataloader)
+        print_rank_0(f"Train Loss: {train_loss:.4f} Test Loss:{test_loss:.4f}")
+
+
 if __name__ == "__main__":
     args = get_args()
     colossalai.launch_from_torch(config=args.config)
     model = BertForMaskedLM.from_pretrained("bert-base-uncased")
     shard_config = ShardConfig(
-        rank = int(str(get_current_device()).split(':')[-1]),
-        world_size= int(os.environ['WORLD_SIZE']),
+        rank=int(str(get_current_device()).split(':')[-1]),
+        world_size=int(os.environ['WORLD_SIZE']),
     )
     shardmodel = ShardModel(model, shard_config)
-    inference(shardmodel.model)
+    if args.mode == "train":
+        train(shardmodel.model)
+    elif args.mode == "inference":
+        inference(shardmodel.model)
diff --git a/colossalai/shardformer/utils/utils.py b/colossalai/shardformer/utils/utils.py
index 5eba87f6fe09..eb84edd88404 100644
--- a/colossalai/shardformer/utils/utils.py
+++ b/colossalai/shardformer/utils/utils.py
@@ -1,10 +1,10 @@
 def hasattr_(obj, attr: str):
-    """
+    r"""
     Check whether the object has the multi sublevel attr
 
     Args:
-        obj: The object to check
-        attr: The multi level attr to check
+        obj (object): The object to check
+        attr (str): The multi level attr to check
     """
     attrs = attr.split('.')
     for a in attrs:
@@ -14,15 +14,16 @@ def hasattr_(obj, attr: str):
             return False
     return True
 
-def setattr_(obj, attr: str, value, ignore: bool=False):
-    """
+
+def setattr_(obj, attr: str, value, ignore: bool = False):
+    r"""
     Set the object's multi sublevel attr to value, if ignore, ignore when it doesn't exist
 
     Args:
-        obj: The object to set
-        attr: The multi level attr to set
-        value: The value to set
-        ignore: Whether to ignore when the attr doesn't exist
+        obj (object): The object to set
+        attr (str): The multi level attr to set
+        value (Any): The value to set
+        ignore (bool): Whether to ignore when the attr doesn't exist
     """
 
     attrs = attr.split('.')
@@ -31,18 +32,19 @@ def setattr_(obj, attr: str, value, ignore: bool=False):
             obj = getattr(obj, a)
         except AttributeError:
             if ignore:
-                 return
+                return
             raise AttributeError(f"Object {obj} has no attribute {attr}")
     setattr(obj, attrs[-1], value)
 
-def getattr_(obj, attr: str, ignore: bool=None):
-    """
+
+def getattr_(obj, attr: str, ignore: bool = None):
+    r"""
     Get the object's multi sublevel attr
-    
+
     Args:
-        obj: The object to set
-        attr: The multi level attr to set
-        ignore: Whether to ignore when the attr doesn't exist
+        obj (object): The object to set
+        attr (str): The multi level attr to set
+        ignore (bool): Whether to ignore when the attr doesn't exist
     """
 
     attrs = attr.split('.')
@@ -53,4 +55,4 @@ def getattr_(obj, attr: str, ignore: bool=None):
             if ignore:
                 return None
             raise AttributeError(f"Object {obj} has no attribute {attr}")
-    return obj
\ No newline at end of file
+    return obj

From bc19024bf9db549ed5c22e890715267f8d877eaa Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 24 May 2023 11:51:48 +0800
Subject: [PATCH 39/52] [shardformer] updated readme (#3827)

---
 colossalai/shardformer/README.md | 53 ++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index a47e280f2be4..f76cbac8d7b8 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -1,11 +1,22 @@
-## ShardFormer
+# ⚡️ ShardFormer
 
-### Intro
-Make the model in huggingface.co can be paralleled and can be used with colossalai according to custom policy.
+## 📚 Table of Contents
+
+- [⚡️ ShardFormer](#️-shardformer)
+  - [📚 Table of Contents](#-table-of-contents)
+  - [🔗 Introduction](#-introduction)
+  - [🔨 Usage](#-usage)
+  - [🔮 Simple example](#-simple-example)
+  - [💡 Policy](#-policy)
+
+## 🔗 Introduction
+
+**Shardformer** is a module that automatically parallelizes mainstream models from libraries such as HuggingFace and TIMM. It aims to make parallelization hassle-free for users who do not have a systems background.
+
+## 🔨 Usage
+
+The sample API usage is given below:
 
-### Quick start
-1. Usage
-- Use
 ``` python
 from colossalai.shardformer.shard.shardmodel import ShardModel
 from transformers import BertForMaskedLM
@@ -21,23 +32,33 @@ shardmodel = ShardModel(model).model
 from xxx import <POLICYCLASS>
 shardmodel = ShardModel(model, <POLICYCLASS>).model
 
-
 # do angthing as normal
 ...
 ```
-- Policy
 
-If you wanna parallel the model in custom way, just overwrite the policy class for the huggingface model.
+## 🔮 Simple example
+
+``` shell
+# inference
+colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode inference
+# train
+colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode train
+```
+
+
+## 💡 Policy
+
+If you want to parallelize the model in a custom way, just overwrite the policy class for the Hugging Face model.
 
 You should do:
 
 1. Inherit Policy class
 2. Overwrite argument_policy method
-    - In this method you need to list which layers class you wanna modify and the attributes and parameters in those layers.
-3. Overwrite inject_policy method [Optional]
+    - In this method, you need to list which layer classes you want to modify and the attributes and parameters in those layers.
+3. Overwrite inject_policy method (Optional)
     - If you need to modify the forward or backward progress.
 4. Overwrite or add the param recording functions
-    - These function use suffix to record the path of weight or bias for the layer.
+    - These functions use a suffix to record the path of weight or bias for the layer.
 5. Overwrite binding
 
 More details can be found in shardformer/policies/basepolicy.py
@@ -167,11 +188,3 @@ CustomPolicy(Policy):
         return NotImplementedError
 
 ```
-
-2. Simple example
-``` shell
-# inference
-colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode inference
-# train
-colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode train
-```

From 537a52b7a279f8e40e06d3c820c803290d796c19 Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 24 May 2023 16:01:26 +0800
Subject: [PATCH 40/52] [shardformer] refactored the user api (#3828)

* [shardformer] refactored the user api

* polish code
---
 colossalai/shardformer/README.md              |  6 +-
 colossalai/shardformer/shard/__init__.py      |  5 ++
 .../shard/{shardconfig.py => shard_config.py} |  2 +
 colossalai/shardformer/shard/sharder.py       | 27 ++++++---
 colossalai/shardformer/shard/shardmodel.py    | 60 -------------------
 colossalai/shardformer/shard/slicer.py        |  7 +--
 colossalai/shardformer/test/test.py           | 15 ++---
 7 files changed, 35 insertions(+), 87 deletions(-)
 rename colossalai/shardformer/shard/{shardconfig.py => shard_config.py} (93%)
 delete mode 100644 colossalai/shardformer/shard/shardmodel.py

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index f76cbac8d7b8..10fd1809b287 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -18,7 +18,7 @@
 The sample API usage is given below:
 
 ``` python
-from colossalai.shardformer.shard.shardmodel import ShardModel
+from colossalai.shardformer import shard_model
 from transformers import BertForMaskedLM
 
 # create huggingface model as normal
@@ -26,11 +26,11 @@ model = BertForMaskedLM.from_pretrained("bert-base-uncased")
 
 # make the huggingface model paralleled to ShardModel
 # auto policy:
-shardmodel = ShardModel(model).model
+sharded_model = shard_model(model)
 
 # custom policy:
 from xxx import <POLICYCLASS>
-shardmodel = ShardModel(model, <POLICYCLASS>).model
+sharded_model = shard_model(model, policy=<POLICYCLASS>)
 
 # do angthing as normal
 ...
diff --git a/colossalai/shardformer/shard/__init__.py b/colossalai/shardformer/shard/__init__.py
index e69de29bb2d1..d5f70163ad57 100644
--- a/colossalai/shardformer/shard/__init__.py
+++ b/colossalai/shardformer/shard/__init__.py
@@ -0,0 +1,5 @@
+from .shard_config import ShardConfig
+from .sharder import ModelSharder, shard_model
+from .slicer import Slicer
+
+__all__ = ['ShardConfig', 'ModelSharder', 'shard_model', 'Slicer']
diff --git a/colossalai/shardformer/shard/shardconfig.py b/colossalai/shardformer/shard/shard_config.py
similarity index 93%
rename from colossalai/shardformer/shard/shardconfig.py
rename to colossalai/shardformer/shard/shard_config.py
index c6a2513a6eff..4cf9162b9548 100644
--- a/colossalai/shardformer/shard/shardconfig.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -1,5 +1,7 @@
 from dataclasses import dataclass
 
+__all__ = ['ShardConfig']
+
 
 @dataclass
 class ShardConfig:
diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py
index 2f6bb4265a11..2218661889f8 100644
--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@@ -1,20 +1,15 @@
-import os
-from dataclasses import dataclass
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, Dict, List
 
 import torch
 import torch.nn as nn
 
-import colossalai.nn as col_nn
-from colossalai.logging import get_dist_logger
-
 from ..policies.autopolicy import get_autopolicy
-from ..policies.basepolicy import Layer, Policy
+from ..policies.basepolicy import Policy
 from ..utils.utils import getattr_, hasattr_, setattr_
-from .shardconfig import ShardConfig
+from .shard_config import ShardConfig
 from .slicer import Slicer
 
-logger = get_dist_logger()
+__all__ = ['ModelSharder', 'shard_model']
 
 
 class ModelSharder(object):
@@ -245,3 +240,17 @@ def bind_layer(self, model: nn.Module) -> None:
             param = nn.Parameter(param)
             setattr_(model, k, param)
             setattr_(model, v, param)
+
+
+def shard_model(model: nn.Module, shard_config: ShardConfig = None, policy: Policy = None):
+    r"""
+    Shard the given PyTorch model by running a `ModelSharder` on it.
+
+    Args:
+        model (`torch.nn.Module`): the original huggingface model
+        shard_config (`ShardConfig`): the config for distributed information
+        policy (`Policy`): the custom policy for sharding
+    """
+    sharder = ModelSharder(model=model, shard_config=shard_config, policy=policy)
+    sharder.shard()
+    return model    # returns the same object that was passed in; no copy is made in this function
diff --git a/colossalai/shardformer/shard/shardmodel.py b/colossalai/shardformer/shard/shardmodel.py
deleted file mode 100644
index 7e7d1576afd6..000000000000
--- a/colossalai/shardformer/shard/shardmodel.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import os
-from contextlib import suppress
-from dataclasses import dataclass
-
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import transformers
-
-from colossalai.tensor.d_tensor.layout import Layout
-
-from ..policies.basepolicy import Policy
-from .shardconfig import ShardConfig
-from .sharder import ModelSharder
-
-
-class ShardModel(object):
-    r"""
-    The class for sharding the huggingface model, ''self.model'' is the sharded model
-    Just creat a new ShardModel object to shard huggingface model
-
-    Args:
-        model (:class:`torch.nn.Model`): the origin huggingface model
-        dist_config (:class:`ShardConfig`): the config for distribute information
-        custom_policy (:class:`Policy`): the custom policy for sharding
-    """
-
-    def __init__(
-        self,
-        model: nn.Module,
-        shard_config: ShardConfig = None,    # TODO
-        custom_policy: Policy = None,
-    ) -> None:
-        self.model = model
-        self.shard_config = shard_config
-        self.policy = custom_policy
-        # self.layout=,  # TODO
-
-        sharder = ModelSharder(
-            model=self.model,
-            policy=self.policy,
-            shard_config=self.shard_config,
-        )
-        sharder.shard()
-
-    def set_environ(self) -> None:
-        os.environ["TOKENIZERS_PARALLELISM"] = "true"
-        os.environ["MKL_SERVICE_FORCE_INTEL"] = "GNU"
-        os.environ["MASTER_ADDR"] = str(self.dist_config.master_addr)
-        os.environ["MASTER_PORT"] = str(self.dist_config.master_port)
-        os.environ["WORLD_SIZE"] = str(self.dist_config.num_gpus)
-        os.environ["RANK"] = str(self.dist_config.rank)
-        os.environ["LOCAL_RANK"] = str(self.dist_config.rank)
-        if not dist.is_initialized():
-            dist.init_process_group(backend=self.dist_config.backend)
-
-        torch.cuda.set_device(int(os.getenv("LOCAL_RANK", "0")))
-
-    def back_to_org() -> None:
-        pass
diff --git a/colossalai/shardformer/shard/slicer.py b/colossalai/shardformer/shard/slicer.py
index 096f5db95f49..957ce1f85814 100644
--- a/colossalai/shardformer/shard/slicer.py
+++ b/colossalai/shardformer/shard/slicer.py
@@ -1,12 +1,7 @@
-import os
-from dataclasses import dataclass
-from typing import Dict, Tuple
-
 import torch
-import torch.distributed as dist
 
 from ..policies.basepolicy import Col_Layer, Layer, Row_Layer
-from .shardconfig import ShardConfig
+from .shard_config import ShardConfig
 
 dim_mapping = {Col_Layer: 1, Row_Layer: 0}
 
diff --git a/colossalai/shardformer/test/test.py b/colossalai/shardformer/test/test.py
index 0cdc6ef38fd2..202208123ced 100644
--- a/colossalai/shardformer/test/test.py
+++ b/colossalai/shardformer/test/test.py
@@ -1,5 +1,3 @@
-import argparse
-import inspect
 import os
 
 import torch
@@ -7,12 +5,10 @@
 from datasets import load_dataset
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
 
 import colossalai
-from colossalai.logging import get_dist_logger
-from colossalai.shardformer.shard.shardconfig import ShardConfig
-from colossalai.shardformer.shard.shardmodel import ShardModel
+from colossalai.shardformer.shard import ShardConfig, shard_model
 from colossalai.utils import get_current_device, print_rank_0
 
 os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
@@ -93,8 +89,9 @@ def train(model: nn.Module, num_epoch: int = 2):
         rank=int(str(get_current_device()).split(':')[-1]),
         world_size=int(os.environ['WORLD_SIZE']),
     )
-    shardmodel = ShardModel(model, shard_config)
+    sharded_model = shard_model(model, shard_config)
+
     if args.mode == "train":
-        train(shardmodel.model)
+        train(sharded_model)
     elif args.mode == "inference":
-        inference(shardmodel.model)
+        inference(sharded_model)

From 997544c1f90b9a1549e91a6d97ee3902c2ac0ed4 Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Wed, 24 May 2023 18:02:54 +0800
Subject: [PATCH 41/52] [shardformer] update readme with modules implement doc
 (#3834)

* update readme with modules content

* remove img
---
 colossalai/shardformer/README.md | 69 ++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index 10fd1809b287..55b6aa75ef84 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -8,6 +8,8 @@
   - [🔨 Usage](#-usage)
   - [🔮 Simple example](#-simple-example)
   - [💡 Policy](#-policy)
+  - [😊 Module](#-module)
+
 
 ## 🔗 Introduction
 
@@ -188,3 +190,70 @@ CustomPolicy(Policy):
         return NotImplementedError
 
 ```
+
+
+## 😊 Module
+
+  1. Flowchart
+
+  <p align="center">
+      <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/shardformer/shardformer_flowchart.png" width="600" />
+  </p>
+
+  2. Important Modules
+
+  - FUNCTION `shard_model`:
+
+    This is the user API of Shardformer. Create a model from transformers, then either define a custom policy or rely on the Shardformer auto policy to obtain a sharded model.
+
+  - CLASS `Layer`:
+
+    Parameters:
+    - weight (str): The weight suffix of the layer
+    - bias (str): The bias suffix of the layer
+    - replace_layer (:class:`colossalai.nn`): The layer to replace the original layer
+    - ignore (bool): Whether to ignore this layer if it is not in the model
+
+    This class is used to specify the replacement policy for a particular layer. If `replace_layer` is None, only parameter partitioning will be performed without replacing the layer class.
+
+    CLASS `Col_Layer(Layer)`:
+      - gather_output (bool): Whether to gather the output of the layer
+
+      This class inherits from `Layer` and indicates that the layer will be sliced along the column.
+
+    CLASS `Row_Layer(Layer)`:
+
+      This class inherits from `Layer` and indicates that the layer will be sliced along the row.
+
+  - CLASS `Policy`:
+
+    In Shardformer, this class holds significant importance as it defines the model partitioning methods, required parameter modifications, and model injection techniques all within a single Policy class.
+    - `Policy.attn_in()/attn_out()/mlp_in()/mlp_out()/embedding()/unembedding()`......
+
+      These functions define the partitioning methods of the parameters at different locations in the model. Each function returns a list of objects of Layer class that specify the replacement approach for these parameters. Shardformer also supports user-defined functions for modifying their models, in addition to the listed functions.
+    - `Policy.argument_policy()`
+
+      In this function, the user should use multiple dict to define which class of layers will require replacement. This includes the attributes and parameters that need to be modified or replaced. Attributes are stored in the form of a "suffix-string: value" dict, while parameters are stored via multiple static methods that return the replacement approach.
+    - `Policy.inject_policy()`
+
+      This function will return the injected model to replace the original model. The new model should be a nn.Module class which includes modified forward or backward functions or anything else.
+    - `Policy.binding_policy()`
+
+      This function will return the weight sharing information in the model in some dict. The key and value are both the suffixes of the shared parameters.
+
+  - CLASS `ModelSharder(model, policy)`:
+
+    This class helps shard the model. Its parameters are the created transformers model and the custom policy. If the custom policy is None, Shardformer will automatically pick the policy already defined for the model.
+    - `ModelSharder.inject_model()`
+
+      This function is used to inject the model in order to modify the forward and backward process.
+    - `ModelSharder.replace_layer()`
+
+      This function is used to replace the original layers with colossalai layers so that they are parallelized and can perform distributed communication.
+    - `ModelSharder.bind_layer()`
+
+      This function is used to help different layers share weight or bias.
+
+  - CLASS `Slicer`:
+
+    This class is used to slice tensor according to policy.

From 21a3915c9874c722eacf769728aa727a9b5d0b82 Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Thu, 1 Jun 2023 16:21:02 +0800
Subject: [PATCH 42/52] [shardformer] add Dropout layer support different
 dropout pattern (#3856)

* add dropout layer, add dropout test

* modify seed manager as context manager

* add a copy of col_nn.layer

* add dist_crossentropy loss; separate module test

* polish the code

* fix dist crossentropy loss
---
 colossalai/nn/layer/parallel_1d/_operation.py |    1 -
 colossalai/nn/layer/parallel_1d/layers.py     |    9 +-
 colossalai/shardformer/README.md              |   19 +
 colossalai/shardformer/layer/__init__.py      |    0
 colossalai/shardformer/layer/_operation.py    |   97 ++
 .../shardformer/layer/dist_crossentropy.py    |  105 ++
 colossalai/shardformer/layer/dropout.py       |   58 +
 colossalai/shardformer/layer/layers.py        | 1043 +++++++++++++++++
 colossalai/shardformer/model/modeling_bert.py |   10 +-
 colossalai/shardformer/policies/basepolicy.py |    2 -
 colossalai/shardformer/policies/bert.py       |    4 +-
 colossalai/shardformer/shard/slicer.py        |   15 +-
 colossalai/shardformer/test/module_test.py    |   50 +
 colossalai/shardformer/test/test.py           |   41 +-
 14 files changed, 1413 insertions(+), 41 deletions(-)
 create mode 100644 colossalai/shardformer/layer/__init__.py
 create mode 100644 colossalai/shardformer/layer/_operation.py
 create mode 100644 colossalai/shardformer/layer/dist_crossentropy.py
 create mode 100644 colossalai/shardformer/layer/dropout.py
 create mode 100644 colossalai/shardformer/layer/layers.py
 create mode 100644 colossalai/shardformer/test/module_test.py

diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/nn/layer/parallel_1d/_operation.py
index c5e33fd497cd..300baf9c12ba 100644
--- a/colossalai/nn/layer/parallel_1d/_operation.py
+++ b/colossalai/nn/layer/parallel_1d/_operation.py
@@ -73,7 +73,6 @@ def backward(ctx, grad_output):
         total_input = input
         grad_input = grad_output.matmul(weight)
 
-        grad_output = grad_output.contiguous()
         # Convert the tensor shapes to 2D for execution compatibility
         grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2])
         total_input = total_input.view(total_input.shape[0] * total_input.shape[1], total_input.shape[2])
diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py
index 0ee3b4fcb502..406173a18c60 100644
--- a/colossalai/nn/layer/parallel_1d/layers.py
+++ b/colossalai/nn/layer/parallel_1d/layers.py
@@ -469,8 +469,7 @@ def __init__(self,
         if skip_bias_add and not bias:
             raise ValueError('cannot skip bias addition if bias is None')
 
-        # self.out_features_per_partition = divide(out_features*2, gpc.tensor_parallel_size)
-        self.out_features_per_partition = out_features
+        self.out_features_per_partition = divide(out_features, gpc.tensor_parallel_size)
 
         # Parameters.
         # Initialize weight.
@@ -613,8 +612,7 @@ def __init__(self,
             raise ValueError('cannot skip bias addition if bias is None')
 
         # Divide the weight matrix along the last dimension.
-        # self.input_size_per_partition = divide(in_features*2, gpc.tensor_parallel_size)
-        self.input_size_per_partition = in_features
+        self.input_size_per_partition = divide(in_features, gpc.tensor_parallel_size)
 
         # Parameters.
         # Initialize weight.
@@ -886,8 +884,7 @@ def __init__(self,
 
         tensor_parallel_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
         tensor_parallel_rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
-        # self.num_embeddings_per_partition = divide(num_embeddings, tensor_parallel_size)
-        self.num_embeddings_per_partition = num_embeddings
+        self.num_embeddings_per_partition = divide(num_embeddings, tensor_parallel_size)
         self.vocab_start_index = tensor_parallel_rank * self.num_embeddings_per_partition
         self.vocab_end_index = self.vocab_start_index + self.num_embeddings_per_partition
 
diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index 55b6aa75ef84..3394e9457da3 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -257,3 +257,22 @@ CustomPolicy(Policy):
   - CLASS `Slicer`:
 
     This class is used to slice tensor according to policy.
+
+
+  3. DistCrossEntropy Loss
+  - Overview
+
+    To reduce the communication volume, the cross entropy is calculated before the all-gather (following [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)), which reduces the communicated data from [batch_size * seq_length * vocab_size] to [batch_size * seq_length]. The original loss function is:
+    $$ loss = -\log(\frac{\exp(x[class])}{\sum_i\exp(x[i])})$$
+
+    which can also be written as:
+
+    $$ loss = \log(\sum_i\exp(x[i])) - x[class]$$
+
+  - Step
+
+    - First get the maximum logit across all the devices and subtract it from every logit, so that all values are less than or equal to zero and the result of exp cannot become too large
+
+    - Get a mask that marks the targets whose class index is not held on the local device
+
+    - Calculate the loss according to the second formula
diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
new file mode 100644
index 000000000000..e817ea3ebbee
--- /dev/null
+++ b/colossalai/shardformer/layer/_operation.py
@@ -0,0 +1,97 @@
+import torch
+import torch.distributed as dist
+
+from colossalai.core import global_context as gpc
+
+try:
+    import fused_mix_prec_layer_norm_cuda
+except:
+    fused_mix_prec_layer_norm_cuda = None
+
+
+class FusedLayerNormAffineFunction1D(torch.autograd.Function):
+    r"""Layer normalization with affine weight and bias, computed by the ``fused_mix_prec_layer_norm_cuda`` extension.
+
+    Args:
+        input: input matrix.
+        weight: weight matrix.
+        bias: bias matrix.
+        normalized_shape: input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps: a value added to the denominator for numerical stability
+    """
+
+    @staticmethod
+    def forward(ctx, input, weight, bias, normalized_shape, eps):
+        ctx.normalized_shape = normalized_shape
+        ctx.eps = eps
+        input_ = input.contiguous()    # every tensor handed to the extension is made contiguous first
+        weight_ = weight.contiguous()
+        bias_ = bias.contiguous()
+        output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(input_, ctx.normalized_shape, weight_,
+                                                                             bias_, ctx.eps)    # NOTE(review): the import guard at module top can leave the extension as None - confirm callers check
+        ctx.save_for_backward(input_, weight_, bias_, mean, invvar)    # kept for backward_affine
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
+        grad_input = grad_weight = grad_bias = None    # placeholders; overwritten by the call below
+        grad_input, grad_weight, grad_bias \
+          = fused_mix_prec_layer_norm_cuda.backward_affine(
+            grad_output.contiguous(), mean, invvar,
+            input_, ctx.normalized_shape,
+            weight_, bias_, ctx.eps)
+
+        return grad_input, grad_weight, grad_bias, None, None    # the two None entries line up with normalized_shape and eps
+
+
+class LinearWithAsyncCommunication(torch.autograd.Function):
+    """
+    Linear layer whose input-gradient all-reduce can be launched asynchronously in backprop, before the weight gradient is computed.
+    """
+
+    @staticmethod
+    def forward(ctx, input_, weight, bias, parallel_mode, async_grad_allreduce):
+        ctx.save_for_backward(input_, weight)
+        ctx.use_bias = bias is not None
+        ctx.parallel_mode = parallel_mode
+        ctx.async_grad_allreduce = async_grad_allreduce
+
+        output = torch.matmul(input_, weight.t())    # input_ @ weight^T
+        if bias is not None:
+            output = output + bias
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, weight = ctx.saved_tensors
+        use_bias = ctx.use_bias
+
+        total_input = input
+        grad_input = grad_output.matmul(weight)
+        grad_output = grad_output.contiguous()
+        # Convert the tensor shapes to 2D for execution compatibility
+        grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2])    # indexes shape[0..2], so a 3-D tensor is expected here
+        total_input = total_input.view(total_input.shape[0] * total_input.shape[1], total_input.shape[2])
+
+        if ctx.async_grad_allreduce:
+            # Asynchronous all-reduce
+            handle = dist.all_reduce(grad_input, group=gpc.get_group(ctx.parallel_mode), async_op=True)
+            # Delay the start of weight gradient computation shortly (3us) to have
+            # all-reduce scheduled first and have GPU resources allocated
+            _ = torch.empty(1, device=grad_output.device) + 1
+
+        grad_weight = grad_output.t().matmul(total_input)
+        grad_bias = grad_output.sum(dim=0) if use_bias else None
+
+        if ctx.async_grad_allreduce:
+            handle.wait()    # the all-reduce of grad_input must have finished before it is returned
+
+        return grad_input, grad_weight, grad_bias, None, None, None    # NOTE(review): six values for five forward inputs - confirm the extra trailing None is intended
+
+
+def linear_with_async_comm(input_, weight, bias, parallel_mode, async_grad_allreduce):    # functional wrapper around the autograd function
+    return LinearWithAsyncCommunication.apply(input_, weight, bias, parallel_mode, async_grad_allreduce)
diff --git a/colossalai/shardformer/layer/dist_crossentropy.py b/colossalai/shardformer/layer/dist_crossentropy.py
new file mode 100644
index 000000000000..1869594670ce
--- /dev/null
+++ b/colossalai/shardformer/layer/dist_crossentropy.py
@@ -0,0 +1,105 @@
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+
+
+class DistCrossEntropy(Function):
+    r"""
+    Autograd function that computes the cross entropy loss on vocabulary-partitioned logits before any gather.
+
+    Use it through ``DistCrossEntropy.apply(vocab_logits, target)``; ``forward`` issues its
+    ``torch.distributed`` all-reduce calls without a ``group`` argument, i.e. on the default process group.
+    """
+
+    @staticmethod
+    def forward(ctx, vocab_logits: torch.Tensor, target: torch.Tensor):
+        r"""
+        Calculate the cross entropy loss before gather, the original loss function is as follows:
+        loss = -log(exp(x[class])/sum(exp(x[i])))
+        and can be rewritten as:
+        loss = log(sum(exp(x[i]))) - x[class]
+
+        To avoid the `nan` of log(sum(exp(x[i]))), we subtract the max of x[i]
+
+        Args:
+            vocab_logits (:class:`torch.Tensor`): The logits of the vocabulary, shape is
+              [batch_size, seq_len, vocab_size], where the last dimension is the local partition
+            target (:class:`torch.Tensor`): The labels of the vocabulary, shape is
+              [batch_size, seq_len]
+
+        Returns:
+            :class:`torch.Tensor`: The cross entropy loss averaged over all elements of `target`
+        """
+        # get the max over the local partition, then over all ranks
+        logits_max = torch.max(vocab_logits, dim=-1)[0]
+        dist.all_reduce(logits_max, op=dist.ReduceOp.MAX)
+
+        # subtract the max so that the sum of exp cannot become too large and the log cannot be nan
+        vocab_logits = vocab_logits - logits_max.unsqueeze(dim=-1)    # creates a new tensor; the caller's tensor is not modified
+
+        # mask the targets that are not held on the local device
+        partition_vocab_size = vocab_logits.size()[-1]
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+        global_vocab_size = partition_vocab_size * world_size
+
+        # targets inside this rank's range [down, up) => False; other ranks' targets and negative values such as -100 => True
+        delta = (global_vocab_size + world_size - 1) // world_size
+        down_shreshold = rank * delta
+        up_shreshold = down_shreshold + delta
+        mask = (target < down_shreshold) | (target >= up_shreshold)
+        masked_target = target.clone() - down_shreshold    # index relative to the local partition
+        masked_target[mask] = 0    # dummy index for masked positions; their contribution is zeroed below
+
+        # reshape the logits and target
+        # reshape the vocab_logits to [batch_size * seq_len, vocab_size]
+        # reshape the target to [batch_size * seq_len]
+        logits_2d = vocab_logits.view(-1, partition_vocab_size)
+        masked_target_1d = masked_target.view(-1)
+
+        # extract the x[class] and set the x[other device] to zero
+        pred_logits_1d = logits_2d[torch.arange(start=0, end=logits_2d.shape[0], device=logits_2d.device),
+                                   masked_target_1d]
+        pred_logits_1d = pred_logits_1d.clone().contiguous()
+        pred_logits = pred_logits_1d.view_as(target)
+        pred_logits[mask] = 0.0
+
+        # all-reduce to collect x[class] from the rank that holds it
+        dist.all_reduce(pred_logits, op=dist.ReduceOp.SUM)
+        exp_logits = vocab_logits    # alias, not a copy: exp is written in place on the next line
+        torch.exp(vocab_logits, out=exp_logits)
+        sum_exp_logits = torch.sum(exp_logits, dim=-1)
+        dist.all_reduce(sum_exp_logits, op=dist.ReduceOp.SUM)
+
+        # calculate the loss
+        # loss = log(sum(exp(x[i]))) - x[class]
+        loss = torch.log(sum_exp_logits) - pred_logits
+        loss = torch.sum(loss).div_(loss.numel())    # NOTE(review): positions masked on every rank (e.g. -100) are still counted in the mean - confirm intended
+
+        # calculate the softmax
+        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
+        ctx.save_for_backward(exp_logits, mask, masked_target_1d)
+
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # retrieve the saved tensors
+        exp_logits, mask, masked_target_1d = ctx.saved_tensors
+
+        # use the saved softmax as the input grad
+        grad_logits = exp_logits    # alias, not a copy: the saved tensor is modified in place below
+        partion_vocab_size = grad_logits.shape[-1]
+        grad_logits_2d = grad_logits.view(-1, partion_vocab_size)
+
+        update = 1.0 - mask.view(-1).float()    # 1 where the target is held locally, 0 where it was masked
+        grad_logits_2d[torch.arange(0, grad_logits_2d.shape[0]), masked_target_1d] -= update    # NOTE(review): unlike forward, this arange has no device argument - confirm
+
+        grad_logits.mul_(grad_output.unsqueeze(dim=-1))    # NOTE(review): forward divides by loss.numel() but no matching factor appears here - confirm gradient scale
+        return grad_logits, None, None
+
+
+def applyDistCrossEntropy(vocab_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    return DistCrossEntropy.apply(vocab_logits, labels)    # functional entry point; `labels` is passed as forward's `target`
diff --git a/colossalai/shardformer/layer/dropout.py b/colossalai/shardformer/layer/dropout.py
new file mode 100644
index 000000000000..acc114029ac1
--- /dev/null
+++ b/colossalai/shardformer/layer/dropout.py
@@ -0,0 +1,58 @@
+import os
+import time
+from contextlib import contextmanager
+
+import torch
+import torch.nn as nn
+
+
+class SeedManager:
+    """
+    Keeps a separate CUDA RNG state for dropout and swaps it in and out around dropout calls.
+    The state is seeded from the wall-clock time concatenated with the RANK environment variable.
+    """
+
+    def __init__(self):
+        original_state = torch.cuda.get_rng_state()    # remembered so the seeding below leaves the current RNG state untouched
+        seed = int(f"{int(time.time())}{os.environ['RANK']}")    # raises KeyError when RANK is not set
+        torch.cuda.manual_seed(int(seed))    # NOTE(review): seed is already an int, the second int() looks redundant
+        self.dropout_state = torch.cuda.get_rng_state()
+        torch.cuda.set_rng_state(original_state)
+
+    def set_mode(self, rng_state):
+        torch.cuda.set_rng_state(rng_state)    # returns None
+
+    def get_current_mode(self):
+        current_state = torch.cuda.get_rng_state()
+        return current_state
+
+    @contextmanager
+    def dropout_mode(self):
+        """
+        This is a context manager to switch to the dropout state and restore the original state on exit.
+
+        Usage:
+        ::
+            >>> with _seed_manager.dropout_mode():
+            >>>     input = super().forward(input)
+        """
+        try:
+            current_mode = self.get_current_mode()    # NOTE(review): if this raises, current_mode is unbound when finally runs
+            yield self.set_mode(self.dropout_state)    # set_mode returns None, so the value yielded to the with-statement is None
+        finally:
+            self.dropout_state = self.get_current_mode()    # keep the advanced dropout state for the next use
+            self.set_mode(current_mode)
+
+
+_seed_manager = SeedManager()    # module-level instance used by Dropout1D.forward; created when this module is imported
+
+
+class Dropout1D(nn.Dropout):    # nn.Dropout whose forward pass runs under the separate dropout RNG state
+
+    def __init__(self, p=0.5, inplace=False):
+        super().__init__(p, inplace)
+
+    def forward(self, input):
+        with _seed_manager.dropout_mode():    # swap in the dropout RNG state for the call, restore the previous state afterwards
+            input = super().forward(input)
+        return input
diff --git a/colossalai/shardformer/layer/layers.py b/colossalai/shardformer/layer/layers.py
new file mode 100644
index 000000000000..f5123885bbe4
--- /dev/null
+++ b/colossalai/shardformer/layer/layers.py
@@ -0,0 +1,1043 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import math
+from collections import OrderedDict
+from typing import Callable, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.parameter import Parameter
+
+from colossalai.communication import broadcast
+from colossalai.context import ParallelMode, seed
+from colossalai.core import global_context as gpc
+from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.kernel import LayerNorm
+from colossalai.nn import init as init
+from colossalai.nn.layer.base_layer import ParallelLayer
+from colossalai.nn.layer.colossalai_layer._utils import ColossalaiModule
+from colossalai.nn.layer.parallel_1d._utils import (
+    gather_forward_split_backward,
+    get_parallel_input,
+    reduce_grad,
+    reduce_input,
+    set_parallel_input,
+    split_forward_gather_backward,
+)
+from colossalai.nn.layer.utils import divide, set_tensor_parallel_attribute_by_partition
+from colossalai.nn.layer.vanilla import VanillaLayerNorm, VanillaPatchEmbedding
+from colossalai.registry import LAYERS
+from colossalai.utils.checkpointing import (
+    broadcast_state_dict,
+    gather_tensor_parallel_state_dict,
+    partition_tensor_parallel_state_dict,
+)
+from colossalai.utils.cuda import get_current_device
+
+from ._operation import linear_with_async_comm
+
+Fast_LN = None    # stays None when apex's FastLayerNorm cannot be imported
+try:
+    from apex.contrib.layer_norm.layer_norm import FastLayerNorm
+    Fast_LN = FastLayerNorm
+except ImportError:
+    pass    # apex is optional here; LayerNorm1D checks ``Fast_LN is not None`` before using it
+
+
+# @LAYERS.register_module
+class Linear1D(ColossalaiModule):
+    r"""Linear layer for 1D parallelism.
+
+    Wraps a :class:`Linear1D_Col` when ``get_parallel_input()`` is falsy and ``gather_output`` is ``False``;
+    wraps a :class:`Linear1D_Row` in every other case.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        gather_output (bool, optional): Selects the wrapped layer type; it is not forwarded to it, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(self,
+                 in_features: int,
+                 out_features: int,
+                 bias: bool = True,
+                 dtype: torch.dtype = None,
+                 gather_output: bool = False,
+                 skip_bias_add: bool = False,
+                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
+        # Keyword arguments accepted by both wrapped layer types.
+        shared_kwargs = dict(bias=bias,
+                             dtype=dtype,
+                             skip_bias_add=skip_bias_add,
+                             weight_initializer=weight_initializer,
+                             bias_initializer=bias_initializer)
+
+        input_is_parallel = get_parallel_input()
+        if input_is_parallel or gather_output:
+            wrapped = Linear1D_Row(in_features,
+                                   out_features,
+                                   parallel_input=input_is_parallel,
+                                   **shared_kwargs)
+        else:
+            wrapped = Linear1D_Col(in_features, out_features, **shared_kwargs)
+        super().__init__(wrapped)
+
+
+# @LAYERS.register_module
+class LayerNorm1D(ColossalaiModule):
+    r"""
+    Layer normalization wrapper that uses apex ``FastLayerNorm`` when it was imported and the size is in
+    ``_fast_ln_supported_sizes``, else apex ``FusedLayerNorm``, else ``colossalai.kernel.LayerNorm`` on ``ImportError``.
+
+    Args:
+        normalized_shape (int): input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
+            \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
+        bias (bool, optional): Accepted in the signature but not read by this constructor, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+    """
+
+    _fast_ln_supported_sizes = [
+        1024, 1536, 2048, 2304, 3072, 3840, 4096, 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
+        24576, 25600, 30720, 32768, 40960, 49152, 65536
+    ]
+
+    def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None):
+        use_fast_ln = Fast_LN is not None and normalized_shape in self._fast_ln_supported_sizes
+        if use_fast_ln:
+            norm = Fast_LN(normalized_shape, eps=eps).to(dtype)
+        else:
+            try:
+                from apex.normalization import FusedLayerNorm
+                norm = FusedLayerNorm(normalized_shape, eps=eps).to(dtype)
+            except ImportError:
+                norm = LayerNorm(normalized_shape, eps=eps).to(dtype)
+        super().__init__(norm)
+
+    def _load_from_state_dict(self, state_dict, prefix, *args):
+        # Only tensor-parallel rank 0 pops entries out of ``state_dict``; every rank then passes its
+        # (possibly empty) dict through ``broadcast_state_dict`` and loads what comes back.
+        keys = (prefix + 'weight', prefix + 'bias')
+        picked = OrderedDict()
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            for key in keys:
+                tensor = state_dict.pop(key, None)
+                if tensor is not None:
+                    picked[key] = tensor
+
+        picked = broadcast_state_dict(picked, ParallelMode.PARALLEL_1D)
+        super()._load_from_state_dict(picked, prefix, *args)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        # Only tensor-parallel rank 0 writes into ``destination``.
+        if gpc.get_local_rank(ParallelMode.TENSOR) != 0:
+            return
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+
+
+# @LAYERS.register_module
+class Classifier1D(ParallelLayer):
+    r"""RowLinear with given weight. Classifier of 1D parallelism.
+
+    The weight has shape ``(num_classes, divide(in_features, gpc.tensor_parallel_size))``. A ``weight`` passed
+    by the caller is stored as-is and is then skipped by initialisation, partition tagging, loading and saving.
+    The bias, when enabled, has shape ``(num_classes,)``.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(self,
+                 in_features: int,
+                 num_classes: int,
+                 weight: Parameter = None,
+                 bias: bool = True,
+                 dtype: torch.dtype = None,
+                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
+        super().__init__()
+        self.in_features = in_features
+        self.num_classes = num_classes
+        self.parallel_input = get_parallel_input()
+
+        # Per-rank width of the weight's last (input) dimension.
+        self.input_size_per_partition = divide(in_features, gpc.tensor_parallel_size)
+
+        # Parameters.
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        # ``has_weight`` records whether this layer allocated its own weight; a caller-supplied
+        # weight is stored untouched.
+        self.has_weight = weight is None
+        if self.has_weight:
+            local_shape = (self.num_classes, self.input_size_per_partition)
+            self.weight = Parameter(torch.empty(*local_shape, **factory_kwargs))
+        else:
+            self.weight = weight
+        self.bias = Parameter(torch.empty(self.num_classes, **factory_kwargs)) if bias else None
+
+        with seed(ParallelMode.TENSOR):
+            self.reset_parameters(weight_initializer, bias_initializer)
+        self._set_tensor_parallel_attributes()
+        # Update the global parallel-input and vocab-parallel flags.
+        set_parallel_input(False)
+        env.vocab_parallel = False
+
+    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
+        """Initialise the self-allocated weight and the bias, then ``broadcast`` the bias from the first 1D rank."""
+        if self.has_weight:
+            weight_initializer(self.weight, fan_in=self.in_features, fan_out=self.num_classes)
+        if self.bias is None:
+            return
+        bias_initializer(self.bias, fan_in=self.in_features)
+        src_rank = gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D)[0]
+        broadcast(self.bias, src_rank, ParallelMode.PARALLEL_1D)
+
+    def _set_tensor_parallel_attributes(self):
+        """Tag the self-allocated weight with the tensor-parallel world size; the bias is not tagged."""
+        if not self.has_weight:
+            return
+        set_tensor_parallel_attribute_by_partition(self.weight, gpc.get_world_size(ParallelMode.TENSOR))
+
+    def _load_from_global_state_dict(self, state_dict, prefix, *args):
+        """Load this layer's entries from ``state_dict`` via ``partition_tensor_parallel_state_dict``."""
+        weight_key, bias_key = prefix + 'weight', prefix + 'bias'
+        # Only tensor-parallel rank 0 pops entries from ``state_dict``.
+        collected = OrderedDict()
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            wanted = []
+            if self.has_weight:
+                wanted.append(weight_key)
+            if self.bias is not None:
+                wanted.append(bias_key)
+            for key in wanted:
+                tensor = state_dict.pop(key, None)
+                if tensor is not None:
+                    collected[key] = tensor
+
+        # The weight is flagged as partitioned along its last dim; the bias is flagged as not partitioned.
+        split_dims = {weight_key: -1, bias_key: 0}
+        is_partitioned = {weight_key: True, bias_key: False}
+        collected = partition_tensor_parallel_state_dict(collected,
+                                                         ParallelMode.PARALLEL_1D,
+                                                         dims=split_dims,
+                                                         partition_states=is_partitioned)
+        super()._load_from_global_state_dict(collected, prefix, *args)
+
+    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
+        """Write this layer's entries into ``destination`` via ``gather_tensor_parallel_state_dict``."""
+        weight_key, bias_key = prefix + 'weight', prefix + 'bias'
+        shards = OrderedDict()
+        if self.has_weight:
+            shards[weight_key] = self.weight
+        if self.bias is not None:
+            shards[bias_key] = self.bias
+        # Same dims/partition flags as used when loading.
+        gathered = gather_tensor_parallel_state_dict(shards,
+                                                     ParallelMode.PARALLEL_1D,
+                                                     dims={weight_key: -1, bias_key: 0},
+                                                     partition_states={weight_key: True, bias_key: False},
+                                                     keep_vars=keep_vars)
+        destination.update(gathered)
+
+    def forward(self, input_: Tensor) -> Tensor:
+        """Check the input width, apply the local weight, ``reduce_input`` over the 1D group, add the bias."""
+        local_in_dim = self.weight.shape[-1]
+        if not self.parallel_input:
+            # Full-width input: validate it, then pass it through ``split_forward_gather_backward`` on the last dim.
+            assert divide(input_.shape[-1], gpc.tensor_parallel_size) == local_in_dim, \
+                'Invalid shapes in Classifier1D forward: input={}, weight={}. Expected last dim of input {}.'.format(
+                input_.shape, self.weight.shape, local_in_dim * gpc.tensor_parallel_size)
+            input_ = split_forward_gather_backward(input_, ParallelMode.PARALLEL_1D, dim=-1)
+        else:
+            # Input flagged as parallel: its last dim must already match the local weight.
+            assert input_.shape[-1] == local_in_dim, \
+                'Invalid shapes in Classifier1D forward: input={}, weight={}. Expected last dim of input {}.'.format(
+                input_.shape, self.weight.shape, local_in_dim)
+
+        # The bias is added after ``reduce_input``, not inside the local matmul.
+        partial = F.linear(input_, self.weight)
+        output = reduce_input(partial, ParallelMode.PARALLEL_1D)
+        return output if self.bias is None else output + self.bias
+
+
+# @LAYERS.register_module
+class VocabParallelClassifier1D(ParallelLayer):
+    r"""ColLinear with given weight. Classifier of 1D parallelism.
+
+    The weight has shape ``(divide(num_classes, gpc.tensor_parallel_size), in_features)`` and the bias, when
+    enabled, has the same per-partition length. A ``weight`` passed by the caller is stored as-is and is
+    then skipped by initialisation, partition tagging, loading and saving.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        gather_output (bool, optional): If true, ``forward`` all-gathers the output on its last dim, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(self,
+                 in_features: int,
+                 num_classes: int,
+                 weight: Parameter = None,
+                 bias: bool = True,
+                 dtype: torch.dtype = None,
+                 gather_output: bool = False,
+                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
+        super().__init__()
+        self.in_features = in_features
+        self.num_classes = num_classes
+        self.gather_output = gather_output
+        self.parallel_input = get_parallel_input()
+
+        # Per-rank size of the weight's first (class) dimension.
+        self.num_classes_per_partition = divide(num_classes, gpc.tensor_parallel_size)
+
+        # Parameters.
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        # ``has_weight`` records whether this layer allocated its own weight; a caller-supplied
+        # weight is stored untouched.
+        self.has_weight = weight is None
+        if self.has_weight:
+            local_shape = (self.num_classes_per_partition, self.in_features)
+            self.weight = Parameter(torch.empty(*local_shape, **factory_kwargs))
+        else:
+            self.weight = weight
+        if not bias:
+            self.bias = None
+        else:
+            self.bias = Parameter(torch.empty(self.num_classes_per_partition, **factory_kwargs))
+
+        with seed(ParallelMode.TENSOR):
+            self.reset_parameters(weight_initializer, bias_initializer)
+        self._set_tensor_parallel_attributes()
+        # Update the global parallel-input and vocab-parallel flags.
+        set_parallel_input(False)
+        env.vocab_parallel = True
+
+    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
+        """Initialise the self-allocated weight (if any) and the bias (if any)."""
+        if self.has_weight:
+            weight_initializer(self.weight, fan_in=self.in_features, fan_out=self.num_classes)
+        if self.bias is not None:
+            bias_initializer(self.bias, fan_in=self.in_features)
+
+    def _set_tensor_parallel_attributes(self):
+        """Tag the self-allocated weight and the bias, when present, with the tensor-parallel world size."""
+        num_partition = gpc.get_world_size(ParallelMode.TENSOR)
+        owned = [self.weight] if self.has_weight else []
+        if self.bias is not None:
+            owned.append(self.bias)
+        for param in owned:
+            set_tensor_parallel_attribute_by_partition(param, num_partition)
+
+    def _load_from_global_state_dict(self, state_dict, prefix, *args):
+        """Load this layer's entries from ``state_dict`` via ``partition_tensor_parallel_state_dict``."""
+        weight_key, bias_key = prefix + 'weight', prefix + 'bias'
+        # Only tensor-parallel rank 0 pops entries from ``state_dict``.
+        collected = OrderedDict()
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            wanted = []
+            if self.has_weight:
+                wanted.append(weight_key)
+            if self.bias is not None:
+                wanted.append(bias_key)
+            for key in wanted:
+                tensor = state_dict.pop(key, None)
+                if tensor is not None:
+                    collected[key] = tensor
+
+        # Both weight and bias are flagged as partitioned along dim 0.
+        split_dims = {weight_key: 0, bias_key: 0}
+        is_partitioned = {weight_key: True, bias_key: True}
+        collected = partition_tensor_parallel_state_dict(collected,
+                                                         ParallelMode.PARALLEL_1D,
+                                                         dims=split_dims,
+                                                         partition_states=is_partitioned)
+        super()._load_from_global_state_dict(collected, prefix, *args)
+
+    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
+        """Write this layer's entries into ``destination`` via ``gather_tensor_parallel_state_dict``."""
+        weight_key, bias_key = prefix + 'weight', prefix + 'bias'
+        shards = OrderedDict()
+        if self.has_weight:
+            shards[weight_key] = self.weight
+        if self.bias is not None:
+            shards[bias_key] = self.bias
+        # Same dims/partition flags as used when loading.
+        gathered = gather_tensor_parallel_state_dict(shards,
+                                                     ParallelMode.PARALLEL_1D,
+                                                     dims={weight_key: 0, bias_key: 0},
+                                                     partition_states={weight_key: True, bias_key: True},
+                                                     keep_vars=keep_vars)
+        destination.update(gathered)
+
+    def forward(self, input_: Tensor) -> Tensor:
+        """Apply the local weight and bias; all-gather the result on the last dim when ``gather_output`` is set."""
+        assert input_.shape[-1] == self.weight.shape[-1], \
+            'Invalid shapes in VocabParallelClassifier1D forward: input={}, weight={}. Expected last dim of input {}.'.format(
+                input_.shape, self.weight.shape, self.weight.shape[-1])
+        # Set up backprop all-reduce.
+        grad_synced_input = reduce_grad(input_, ParallelMode.PARALLEL_1D)
+        local_logits = F.linear(grad_synced_input, self.weight, self.bias)
+        if not self.gather_output:
+            return local_logits
+        # All-gather across the partitions.
+        return gather_forward_split_backward(local_logits, ParallelMode.PARALLEL_1D, dim=-1)
+
+
+# @LAYERS.register_module
+class Linear1D_Col(ParallelLayer):
+    r"""Linear layer with column parallelism.
+
+    The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
+    its second dimension as :math:`A = [A_1, ..., A_p]`.
+
+    Note: this implementation allocates the weight as ``(out_features, in_features)`` without dividing
+    ``out_features`` by the tensor parallel size, i.e. ``out_features`` is used as the local output width.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        gather_output (bool, optional): If true, call all-gather on output and make Y available
+                    to all GPUs, otherwise, every GPU will have its output
+                    which is :math:`Y_i = XA_i`, defaults to False
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(self,
+                 in_features: int,
+                 out_features: int,
+                 bias: bool = True,
+                 dtype: torch.dtype = None,
+                 gather_output: bool = False,
+                 skip_bias_add: bool = False,
+                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
+        super().__init__()
+
+        # Keep the constructor arguments.
+        self.in_features = in_features
+        self.out_features = out_features
+        self.gather_output = gather_output
+        self.skip_bias_add = skip_bias_add
+
+        if skip_bias_add and not bias:
+            raise ValueError('cannot skip bias addition if bias is None')
+
+        # ``out_features`` is taken unchanged as the local output width; no division by
+        # ``gpc.tensor_parallel_size`` happens in this constructor.
+        self.out_features_per_partition = out_features
+
+        # Parameters.
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        self.weight = Parameter(torch.empty(self.out_features_per_partition, self.in_features, **factory_kwargs))
+        if not bias:
+            self.bias = None
+        else:
+            self.bias = Parameter(torch.empty(self.out_features_per_partition, **factory_kwargs))
+
+        with seed(ParallelMode.TENSOR):
+            self.reset_parameters(weight_initializer, bias_initializer)
+        self._set_tensor_parallel_attributes()
+        # The output counts as parallel input for the next layer unless it is gathered.
+        set_parallel_input(not self.gather_output)
+
+    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
+        """Initialise the weight and, when present, the bias."""
+        weight_initializer(self.weight, fan_in=self.in_features, fan_out=self.out_features)
+        if self.bias is not None:
+            bias_initializer(self.bias, fan_in=self.in_features)
+
+    def _set_tensor_parallel_attributes(self):
+        """Tag the weight and, when present, the bias with the tensor-parallel world size."""
+        num_partition = gpc.get_world_size(ParallelMode.TENSOR)
+        for param in (self.weight, self.bias):
+            if param is not None:
+                set_tensor_parallel_attribute_by_partition(param, num_partition)
+
+    def _load_from_global_state_dict(self, state_dict, prefix, *args):
+        """Load this layer's entries from ``state_dict`` via ``partition_tensor_parallel_state_dict``."""
+        weight_key, bias_key = prefix + 'weight', prefix + 'bias'
+        # Only tensor-parallel rank 0 pops entries from ``state_dict``.
+        collected = OrderedDict()
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            wanted = [weight_key]
+            if self.bias is not None:
+                wanted.append(bias_key)
+            for key in wanted:
+                tensor = state_dict.pop(key, None)
+                if tensor is not None:
+                    collected[key] = tensor
+
+        # Both weight and bias are flagged as partitioned along dim 0.
+        split_dims = {weight_key: 0, bias_key: 0}
+        is_partitioned = {weight_key: True, bias_key: True}
+        collected = partition_tensor_parallel_state_dict(
+            collected,
+            ParallelMode.PARALLEL_1D,
+            dims=split_dims,
+            partition_states=is_partitioned,
+        )
+        super()._load_from_global_state_dict(collected, prefix, *args)
+
+    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
+        """Write this layer's entries into ``destination`` via ``gather_tensor_parallel_state_dict``."""
+        weight_key, bias_key = prefix + 'weight', prefix + 'bias'
+        shards = OrderedDict({weight_key: self.weight})
+        if self.bias is not None:
+            shards[bias_key] = self.bias
+        # Same dims/partition flags as used when loading.
+        gathered = gather_tensor_parallel_state_dict(
+            shards,
+            ParallelMode.PARALLEL_1D,
+            dims={weight_key: 0, bias_key: 0},
+            partition_states={weight_key: True, bias_key: True},
+            keep_vars=keep_vars,
+        )
+        destination.update(gathered)
+
+    def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
+        """Apply the linear transform through ``linear_with_async_comm``.
+
+        Args:
+            input_ (Tensor): input whose last dimension must equal the weight's last dimension.
+
+        Returns:
+            The output tensor alone, or the tuple ``(output, bias)`` when ``skip_bias_add`` is set; in that
+            case the bias is not passed to the linear call.
+        """
+        assert input_.shape[-1] == self.weight.shape[-1], \
+            'Invalid shapes in Linear1D_Col forward: input={}, weight={}. Expected last dim of input {}.'.format(
+                input_.shape, self.weight.shape, self.weight.shape[-1])
+        # The bias is left out of the linear call when it is to be returned separately.
+        fused_bias = None if self.skip_bias_add else self.bias
+        output = linear_with_async_comm(input_, self.weight, fused_bias, ParallelMode.PARALLEL_1D, True)
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = gather_forward_split_backward(output, ParallelMode.PARALLEL_1D, dim=-1)
+        if self.skip_bias_add:
+            return output, self.bias
+        return output
+
+
+# @LAYERS.register_module
+class Linear1D_Row(ParallelLayer):
+    r""" Linear layer with row parallelism
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(self,
+                 in_features: int,
+                 out_features: int,
+                 bias: bool = True,
+                 dtype: torch.dtype = None,
+                 parallel_input: bool = True,
+                 skip_bias_add: bool = False,
+                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+                 stream_chunk_num: int = 1):
+        """Allocate the ``(out_features, in_features)`` weight and, optionally, a bias of length ``out_features``.
+
+        When ``stream_chunk_num > 1`` the weight is additionally chunked into ``weight_list``.
+        """
+        super().__init__()
+
+        self.stream_chunk_num = stream_chunk_num
+
+        # Keep the constructor arguments.
+        self.in_features = in_features
+        self.out_features = out_features
+        self.parallel_input = parallel_input
+        self.skip_bias_add = skip_bias_add
+
+        if skip_bias_add and not bias:
+            raise ValueError('cannot skip bias addition if bias is None')
+
+        # ``in_features`` is taken unchanged as the local input width; no division by
+        # ``gpc.tensor_parallel_size`` happens in this constructor.
+        self.input_size_per_partition = in_features
+
+        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
+        self.weight = Parameter(torch.empty(self.out_features, self.input_size_per_partition, **factory_kwargs))
+
+        if self.stream_chunk_num > 1:
+            # TODO() work for inference only
+            self.chunk_weight()
+        self.bias = Parameter(torch.empty(self.out_features, **factory_kwargs)) if bias else None
+
+        with seed(ParallelMode.TENSOR):
+            self.reset_parameters(weight_initializer, bias_initializer)
+        self._set_tensor_parallel_attributes()
+        set_parallel_input(False)
+
+    def chunk_weight(self):
+        self.weight_list = torch.chunk(self.weight, self.stream_chunk_num, dim=0)    # pieces along dim 0, kept for later use
+
+    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
+        weight_initializer(self.weight, fan_in=self.in_features, fan_out=self.out_features)
+        if self.bias is None:
+            return
+        bias_initializer(self.bias, fan_in=self.in_features)
+        broadcast(self.bias, gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D)[0], ParallelMode.PARALLEL_1D)
+
+    def _set_tensor_parallel_attributes(self):
+        # Only the weight is tagged with the tensor-parallel world size; the bias is not.
+        set_tensor_parallel_attribute_by_partition(self.weight, gpc.get_world_size(ParallelMode.TENSOR))
+
+    def _load_from_global_state_dict(self, state_dict, prefix, *args):
+        """Load this layer's entries from ``state_dict`` via ``partition_tensor_parallel_state_dict``."""
+        weight_key, bias_key = prefix + 'weight', prefix + 'bias'
+        # Only tensor-parallel rank 0 pops entries from ``state_dict``; the other ranks hand an
+        # empty dict to the partition helper.
+        collected = OrderedDict()
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            wanted = [weight_key]
+            if self.bias is not None:
+                wanted.append(bias_key)
+            for key in wanted:
+                tensor = state_dict.pop(key, None)
+                if tensor is not None:
+                    collected[key] = tensor
+
+        # The weight is flagged as partitioned along its last dim; the bias is flagged as
+        # not partitioned.
+        split_dims = {weight_key: -1, bias_key: 0}
+        is_partitioned = {weight_key: True, bias_key: False}
+        collected = partition_tensor_parallel_state_dict(
+            collected,
+            ParallelMode.PARALLEL_1D,
+            dims=split_dims,
+            partition_states=is_partitioned,
+        )
+        super()._load_from_global_state_dict(collected, prefix, *args)
+
+    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
+        weight_key = prefix + 'weight'
+        bias_key = prefix + 'bias'
+        local_state = OrderedDict({weight_key: self.weight})
+        if self.bias is not None:
+            local_state[bias_key] = self.bias
+        local_state = gather_tensor_parallel_state_dict(local_state,
+                                                        ParallelMode.PARALLEL_1D,
+                                                        dims={
+                                                            weight_key: -1,
+                                                            bias_key: 0
+                                                        },
+                                                        partition_states={
+                                                            weight_key: True,
+                                                            bias_key: False
+                                                        },
+                                                        keep_vars=keep_vars)
+        destination.update(local_state)
+
+    def forward(self, input_: Tensor) -> Tensor:
+        # Set up backprop all-reduce.
+        if self.parallel_input:
+            assert input_.shape[-1] == self.weight.shape[-1], \
+                'Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.'.format(
+                input_.shape, self.weight.shape, self.weight.shape[-1])
+            input_ = input_
+        else:
+            assert divide(input_.shape[-1], gpc.tensor_parallel_size) == self.weight.shape[-1], \
+                'Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.'.format(
+                input_.shape, self.weight.shape, self.weight.shape[-1] * gpc.tensor_parallel_size)
+            input_ = split_forward_gather_backward(input_, ParallelMode.PARALLEL_1D, dim=-1)
+
+        if self.stream_chunk_num > 1:
+            if self.training:
+                raise RuntimeError("use stream_chunk_num=1 in Linear1D_Row for training!")
+            with torch.no_grad():
+                output_parallel_list = [None for i in range(self.stream_chunk_num)]
+                handle_list = []
+                for i in range(self.stream_chunk_num):
+                    output_parallel_list[i] = F.linear(input_, self.weight_list[i])
+                    handle = torch.distributed.all_reduce(output_parallel_list[i],
+                                                          group=gpc.get_group(ParallelMode.PARALLEL_1D),
+                                                          async_op=True)
+                    handle_list.append(handle)
+                    # output_parallel_list[i] = reduce_input(output_parallel_list[i], ParallelMode.PARALLEL_1D)
+                for handle in handle_list:
+                    handle.wait()
+                output = torch.cat(output_parallel_list, dim=-1)
+        else:
+            output_parallel = F.linear(input_, self.weight)
+            # output_parallel = linear_with_async_comm(input_, self.weight, None, ParallelMode.PARALLEL_1D, False)
+            output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
+        if not self.skip_bias_add:
+            if self.bias is not None:
+                output = output + self.bias
+            return output
+        else:
+            return output, self.bias
+
+
+# @LAYERS.register_module
+class Embedding1D(ParallelLayer):
+    r"""Embedding for 1D parallelism.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+                    renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+                    of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 padding_idx: int = None,
+                 dtype: torch.dtype = None,
+                 weight_initializer: Callable = init.normal_(),
+                 *args,
+                 **kwargs):
+        super().__init__()
+
+        self.num_embeddings = num_embeddings
+        self.embed_dim = embedding_dim
+        embed_dim_per_partition = divide(embedding_dim, gpc.tensor_parallel_size)
+
+        self.padding_idx = padding_idx
+        self.embed_args = args
+        self.embed_kwargs = kwargs
+
+        self.weight = Parameter(
+            torch.empty((num_embeddings, embed_dim_per_partition), device=get_current_device(), dtype=dtype))
+
+        self.reset_parameters(weight_initializer)
+        self._set_tensor_parallel_attributes()
+        set_parallel_input(False)
+
+    def _set_tensor_parallel_attributes(self):
+        set_tensor_parallel_attribute_by_partition(self.weight, gpc.tensor_parallel_size)
+
+    def reset_parameters(self, weight_initializer) -> None:
+        with seed(ParallelMode.TENSOR):
+            fan_in, fan_out = self.num_embeddings, self.embed_dim
+            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
+            self._fill_padding_idx_with_zero()
+
+    def _fill_padding_idx_with_zero(self) -> None:
+        if self.padding_idx is not None:
+            with torch.no_grad():
+                self.weight[self.padding_idx].fill_(0)
+
+    def _load_from_global_state_dict(self, state_dict, prefix, *args):
+        local_state = OrderedDict()
+        weight_key = prefix + 'weight'
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            # weight
+            weight = state_dict.pop(weight_key, None)
+            if weight is not None:
+                local_state[weight_key] = weight
+
+        local_state = partition_tensor_parallel_state_dict(local_state,
+                                                           ParallelMode.PARALLEL_1D,
+                                                           dims={weight_key: -1},
+                                                           partition_states={weight_key: True})
+        super()._load_from_global_state_dict(local_state, prefix, *args)
+
+    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
+        weight_key = prefix + 'weight'
+        local_state = OrderedDict({weight_key: self.weight})
+        local_state = gather_tensor_parallel_state_dict(local_state,
+                                                        ParallelMode.PARALLEL_1D,
+                                                        dims={weight_key: -1},
+                                                        partition_states={weight_key: True},
+                                                        keep_vars=keep_vars)
+        destination.update(local_state)
+
+    def forward(self, input_: Tensor) -> Tensor:
+
+        output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs)
+
+        output = gather_forward_split_backward(output_parallel, ParallelMode.PARALLEL_1D, dim=-1)
+
+        return output
+
+
+# @LAYERS.register_module
+class VocabParallelEmbedding1D(ParallelLayer):
+    r"""Embedding parallelized in the vocabulary dimension.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+                    renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+                    of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about initializer please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 padding_idx: int = None,
+                 dtype: torch.dtype = None,
+                 weight_initializer: Callable = init.normal_(),
+                 *args,
+                 **kwargs):
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embed_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.embed_args = args
+        self.embed_kwargs = kwargs
+
+        tensor_parallel_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
+        tensor_parallel_rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
+        # self.num_embeddings_per_partition = divide(num_embeddings, tensor_parallel_size)
+        self.num_embeddings_per_partition = num_embeddings
+        self.vocab_start_index = tensor_parallel_rank * self.num_embeddings_per_partition
+        self.vocab_end_index = self.vocab_start_index + self.num_embeddings_per_partition
+
+        self.weight = Parameter(
+            torch.empty((self.num_embeddings_per_partition, self.embed_dim), device=get_current_device(), dtype=dtype))
+
+        self.reset_parameters(weight_initializer)
+        self._set_tensor_parallel_attributes()
+        set_parallel_input(False)
+        env.vocab_parallel = True
+
+    def _set_tensor_parallel_attributes(self):
+        set_tensor_parallel_attribute_by_partition(self.weight, gpc.tensor_parallel_size)
+
+    def reset_parameters(self, weight_initializer) -> None:
+        with seed(ParallelMode.TENSOR):
+            fan_in, fan_out = self.num_embeddings, self.embed_dim
+            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
+            self._fill_padding_idx_with_zero()
+
+    def _fill_padding_idx_with_zero(self) -> None:
+        if self.padding_idx is not None and \
+                self.padding_idx >= self.vocab_start_index and self.padding_idx < self.vocab_end_index:
+            with torch.no_grad():
+                self.weight[self.padding_idx - self.vocab_start_index].fill_(0)
+
+    def _load_from_global_state_dict(self, state_dict, prefix, *args):
+        local_state = OrderedDict()
+        weight_key = prefix + 'weight'
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            # weight
+            weight = state_dict.pop(weight_key, None)
+            if weight is not None:
+                local_state[weight_key] = weight
+
+        local_state = partition_tensor_parallel_state_dict(local_state,
+                                                           ParallelMode.PARALLEL_1D,
+                                                           dims={weight_key: 0},
+                                                           partition_states={weight_key: True})
+        super()._load_from_global_state_dict(local_state, prefix, *args)
+
+    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
+        weight_key = prefix + 'weight'
+        local_state = OrderedDict({weight_key: self.weight})
+        local_state = gather_tensor_parallel_state_dict(local_state,
+                                                        ParallelMode.PARALLEL_1D,
+                                                        dims={weight_key: 0},
+                                                        partition_states={weight_key: True},
+                                                        keep_vars=keep_vars)
+        destination.update(local_state)
+
+    def forward(self, input_: Tensor) -> Tensor:
+        # Build the mask.
+        input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
+        # Mask the input.
+        masked_input = input_.clone() - self.vocab_start_index
+        masked_input[input_mask] = 0
+
+        output_parallel = F.embedding(masked_input, self.weight, self.padding_idx, *self.embed_args,
+                                      **self.embed_kwargs)
+
+        # Mask the output embedding.
+        output_parallel[input_mask, :] = 0.
+        # Reduce across all the model parallel GPUs.
+        output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
+        return output
+
+
+# @LAYERS.register_module
+class Dropout1D(ParallelLayer):
+    """Dropout layer of 1D parallelism.
+
+    Args:
+        p (float, optional): probability of an element to be zeroed, defaults to 0.5.
+        inplace (bool, optional): whether to do dropout in-place, defaults to False.
+    """
+
+    def __init__(self, p: float = 0.5, inplace: bool = False):
+        super().__init__()
+        self.parallel_input = get_parallel_input()
+        self.p = p
+        self.inplace = inplace
+
+    def forward(self, input_: Tensor) -> Tensor:
+        if self.parallel_input:
+            with seed(ParallelMode.TENSOR):
+                output = F.dropout(input_, self.p, self.training, self.inplace)
+        else:
+            output = F.dropout(input_, self.p, self.training, self.inplace)
+        return output
+
+
+# @LAYERS.register_module
+class PatchEmbedding1D(ColossalaiModule):
+    """
+    2D Image to Patch Embedding
+
+    :param img_size: image size
+    :type img_size: int
+    :param patch_size: patch size
+    :type patch_size: int
+    :param in_chans: number of channels of input image
+    :type in_chans: int
+    :param embed_size: size of embedding
+    :type embed_size: int
+    :param dtype: The dtype of parameters, defaults to None
+    :type dtype: torch.dtype, optional
+    :param flatten: whether to flatten output tensor, defaults to True
+    :type flatten: bool, optional
+    :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
+    :type weight_initializer: typing.Callable, optional
+    :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
+    :type bias_initializer: typing.Callable, optional
+    :param position_embed_initializer: The initializer of position embedding, defaults to zero
+    :type position_embed_initializer: typing.Callable, optional
+    """
+
+    def __init__(self,
+                 img_size: int,
+                 patch_size: int,
+                 in_chans: int,
+                 embed_size: int,
+                 dtype: torch.dtype = None,
+                 flatten: bool = True,
+                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+                 position_embed_initializer: Callable = init.zeros_()):
+        embed = VanillaPatchEmbedding(img_size,
+                                      patch_size,
+                                      in_chans,
+                                      embed_size,
+                                      dtype=dtype,
+                                      flatten=flatten,
+                                      weight_initializer=weight_initializer,
+                                      bias_initializer=bias_initializer,
+                                      position_embed_initializer=position_embed_initializer)
+        super().__init__(embed)
+
+    def _load_from_state_dict(self, state_dict, prefix, *args):
+        local_state = OrderedDict()
+        param_keys = [prefix + 'weight', prefix + 'bias', prefix + 'cls_token', prefix + 'pos_embed']
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            for key in param_keys:
+                param = state_dict.pop(key, None)
+                if param is not None:
+                    local_state[key] = param
+
+        local_state = broadcast_state_dict(local_state, ParallelMode.PARALLEL_1D)
+        super()._load_from_state_dict(local_state, prefix, *args)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
+            super()._save_to_state_dict(destination, prefix, keep_vars)
diff --git a/colossalai/shardformer/model/modeling_bert.py b/colossalai/shardformer/model/modeling_bert.py
index 6741ae866991..bd07ab80c00d 100644
--- a/colossalai/shardformer/model/modeling_bert.py
+++ b/colossalai/shardformer/model/modeling_bert.py
@@ -6,6 +6,8 @@
 from transformers import BertForMaskedLM
 from transformers.models.bert.modeling_bert import MaskedLMOutput
 
+from ..layer.dist_crossentropy import applyDistCrossEntropy
+
 
 class BertForMaskedLM_(BertForMaskedLM):
 
@@ -47,11 +49,11 @@ def forward(
 
         masked_lm_loss = None
 
-        # if input_ids is not None:
-        #     masked_lm_loss = applyDistCrossEntropy(prediction_scores, input_ids, self.config.vocab_size)
         if labels is not None:
-            loss_fct = CrossEntropyLoss()    # -100 index = padding token
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+            masked_lm_loss = applyDistCrossEntropy(prediction_scores, labels)
+        # if labels is not None:
+        #     loss_fct = CrossEntropyLoss()    # -100 index = padding token
+        #     masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
 
         if not return_dict:
             output = (prediction_scores,) + outputs[2:]
diff --git a/colossalai/shardformer/policies/basepolicy.py b/colossalai/shardformer/policies/basepolicy.py
index a5cc0bc68df6..2eb7eb29e1a4 100644
--- a/colossalai/shardformer/policies/basepolicy.py
+++ b/colossalai/shardformer/policies/basepolicy.py
@@ -7,8 +7,6 @@
 import torch.nn as nn
 from transformers import AutoConfig
 
-import colossalai.nn as col_nn
-
 
 @dataclass
 class Argument:
diff --git a/colossalai/shardformer/policies/bert.py b/colossalai/shardformer/policies/bert.py
index 5d91d8ddc766..ab77b29f71f4 100644
--- a/colossalai/shardformer/policies/bert.py
+++ b/colossalai/shardformer/policies/bert.py
@@ -4,7 +4,7 @@
 import torch.nn as nn
 from transformers.models.bert.modeling_bert import BertEmbeddings, BertLayer, BertLMPredictionHead
 
-import colossalai.nn as col_nn
+import colossalai.shardformer.layer.layers as col_nn
 
 from .basepolicy import Argument, Col_Layer, Layer, Policy, Row_Layer
 
@@ -142,7 +142,7 @@ def unembedding() -> List:
                 weight="decoder.weight",
                 bias="decoder.bias",
                 replace_layer=col_nn.Linear1D_Col,
-                gather_output=True,
+        # gather_output=True,
             )
         ]
 
diff --git a/colossalai/shardformer/shard/slicer.py b/colossalai/shardformer/shard/slicer.py
index 957ce1f85814..26053b9f7408 100644
--- a/colossalai/shardformer/shard/slicer.py
+++ b/colossalai/shardformer/shard/slicer.py
@@ -94,10 +94,7 @@ def slice_1d(
         Returns:
             :class:`torch.Tensor`: The sliced tensor
         """
-        delta = (tensor.shape[0] + self.shardconfig.world_size - 1) // self.shardconfig.world_size
-        down_idx = self.shardconfig.rank * delta
-        up_idx = down_idx + delta
-        return tensor[down_idx:up_idx].contiguous()
+        return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
 
     def slice_col(
         self,
@@ -113,10 +110,7 @@ def slice_col(
             :class:`torch.Tensor`: The sliced tensor
 
         """
-        delta = (tensor.shape[0] + self.shardconfig.world_size - 1) // self.shardconfig.world_size
-        down_idx = self.shardconfig.rank * delta
-        up_idx = down_idx + delta
-        return tensor[down_idx:up_idx, :].contiguous()
+        return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
 
     def slice_row(
         self,
@@ -131,7 +125,4 @@ def slice_row(
         Returns:
             :class:`torch.Tensor`: The sliced tensor
         """
-        delta = (tensor.shape[1] + self.shardconfig.world_size - 1) // self.shardconfig.world_size
-        down_idx = self.shardconfig.rank * delta
-        up_idx = down_idx + delta
-        return tensor[:, down_idx:up_idx].contiguous()
+        return tensor.chunk(self.shardconfig.world_size, dim=1)[self.shardconfig.rank].contiguous()
diff --git a/colossalai/shardformer/test/module_test.py b/colossalai/shardformer/test/module_test.py
new file mode 100644
index 000000000000..83dc7ec6cf4a
--- /dev/null
+++ b/colossalai/shardformer/test/module_test.py
@@ -0,0 +1,50 @@
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import colossalai
+from colossalai.shardformer.layer.dist_crossentropy import applyDistCrossEntropy
+from colossalai.shardformer.layer.dropout import Dropout1D
+
+
+def get_args():
+    parser = colossalai.get_default_parser()
+    parser.add_argument("--module", type=str, default='distloss')
+    return parser.parse_args()
+
+
+def test_dist_crossentropy():
+    pred = torch.randn(2, 4, 8, requires_grad=True)
+    labels = torch.randint(8, (1, 4)).repeat(2, 1)
+
+    pred_ = pred.view(-1, 8)
+    labels_ = labels.view(-1)
+    loss = F.cross_entropy(pred_, labels_)
+    loss.backward()
+    print(f"normal loss:{loss}")
+
+    pred = pred.chunk(int(os.environ['WORLD_SIZE']), -1)[int(os.environ['RANK'])]
+    loss = applyDistCrossEntropy(pred.to('cuda'), labels.to('cuda'))
+    loss.backward()
+    print(f"dist loss:{loss}")
+
+
+def test_dropout():
+    input = torch.randn(5, 4).to("cuda")
+    m = Dropout1D(p=0.2).to("cuda")
+    for i in range(2):
+        print(f"Output: {m(input)}")
+        print(torch.randn(1))
+
+
+if __name__ == '__main__':
+    args = get_args()
+    colossalai.launch_from_torch(config={})
+    if args.module == 'distloss':
+        test_dist_crossentropy()
+    elif args.module == 'dropout':
+        test_dropout()
+    else:
+        print("not implemented yet")
diff --git a/colossalai/shardformer/test/test.py b/colossalai/shardformer/test/test.py
index 202208123ced..b896fd4a4020 100644
--- a/colossalai/shardformer/test/test.py
+++ b/colossalai/shardformer/test/test.py
@@ -1,11 +1,12 @@
 import os
+import random
 
 import torch
 import torch.nn as nn
 from datasets import load_dataset
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
+from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, get_scheduler
 
 import colossalai
 from colossalai.shardformer.shard import ShardConfig, shard_model
@@ -18,6 +19,7 @@
 def get_args():
     parser = colossalai.get_default_parser()
     parser.add_argument("--mode", type=str, default='inference')
+    parser.add_argument("--save_model", action='store_true')
     return parser.parse_args()
 
 
@@ -30,36 +32,40 @@ def load_data():
     # tokenized_datasets=tokenized_datasets.rename_column("label","labels")
     tokenized_datasets.set_format("torch")
 
-    train_dataset = tokenized_datasets["train"].select(range(500))
-    test_dataset = tokenized_datasets["test"].select(range(100))
+    train_dataset = tokenized_datasets["train"]
+    test_dataset = tokenized_datasets["test"]
 
     datacollector = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15, return_tensors="pt")
-    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=datacollector)
-    eval_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=datacollector)
+    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=datacollector)
+    eval_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, collate_fn=datacollector)
     return train_dataloader, eval_dataloader
 
 
-def inference(model: nn.Module):
-    print(model)
+def inference(model: nn.Module, args):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
     token = "Hello, my dog is cute"
     inputs = tokenizer(token, return_tensors="pt")
     inputs.to("cuda")
+    model.eval()
     model.to("cuda")
     outputs = model(**inputs)
     print(outputs)
 
 
-def train(model: nn.Module, num_epoch: int = 2):
+def train(model: nn.Module, args, num_epoch: int = 3):
     train_dataloader, eval_dataloader = load_data()
     optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
-    progress_bar = tqdm(range((num_epoch) * len(train_dataloader)))
-    criterion = nn.CrossEntropyLoss()
+    num_training = num_epoch * len(train_dataloader)
+    progress_bar = tqdm(range(num_training))
+    lr_scheduler = get_scheduler(name="linear",
+                                 optimizer=optimizer,
+                                 num_warmup_steps=0,
+                                 num_training_steps=num_training)
+    best_test_loss = float("inf")
     model.to("cuda")
     model.train()
     for epoch in range(num_epoch):
         progress_bar.set_description(f"Rank {get_current_device()} epoch {epoch}")
-
         for batch in train_dataloader:
             optimizer.zero_grad()
             batch = {k: v.to('cuda') for k, v in batch.items()}
@@ -67,6 +73,7 @@ def train(model: nn.Module, num_epoch: int = 2):
             loss = outputs.loss
             loss.backward()
             optimizer.step()
+            lr_scheduler.step()
             progress_bar.update(1)
         train_loss = loss
 
@@ -75,16 +82,20 @@ def train(model: nn.Module, num_epoch: int = 2):
             batch = {k: v.to('cuda') for k, v in batch.items()}
             outputs = model(**batch)
             # loss = outputs.loss
+            assert not torch.isnan(outputs.loss), f"{batch}"
             loss += outputs.loss.item()
             # loss = criterion(outputs.logits, batch["input_ids"])
         test_loss = loss / len(eval_dataloader)
         print_rank_0(f"Train Loss: {train_loss:.4f} Test Loss:{test_loss:.4f}")
+        if args.save_model and test_loss < best_test_loss:
+            best_test_loss = test_loss
+            torch.save(model.state_dict(), "./checkpoints/best_model.pth")
 
 
 if __name__ == "__main__":
     args = get_args()
-    colossalai.launch_from_torch(config=args.config)
     model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+    colossalai.launch_from_torch(config=args.config)
     shard_config = ShardConfig(
         rank=int(str(get_current_device()).split(':')[-1]),
         world_size=int(os.environ['WORLD_SIZE']),
@@ -92,6 +103,8 @@ def train(model: nn.Module, num_epoch: int = 2):
     sharded_model = shard_model(model, shard_config)
 
     if args.mode == "train":
-        train(sharded_model)
+        train(sharded_model, args)
     elif args.mode == "inference":
-        inference(sharded_model)
+        inference(sharded_model, args)
+    else:
+        raise NotImplementedError

From 6370a935f6d9180a2f4af054708dfe5193619d76 Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Tue, 6 Jun 2023 15:31:52 +0800
Subject: [PATCH 43/52] update README (#3909)

---
 colossalai/shardformer/README.md | 46 ++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index 3394e9457da3..93a4f1e578e4 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -55,30 +55,37 @@ If you wanna parallel the model in a custom way, just overwrite the policy class
 You should do:
 
 1. Inherit Policy class
-2. Overwrite argument_policy method
-    - In this method, you need to list which layers class you wanna modify and the attributes and parameters in those layers.
-3. Overwrite inject_policy method (Optional)
-    - If you need to modify the forward or backward progress.
-4. Overwrite or add the param recording functions
+2. Overwrite `argument_policy` method
+    - In this method, you need to list which layer classes you want to modify, together with the attributes and parameters in those layers. Shardformer will replace all the layers belonging to the classes you specify.
+    - `attr_dict` is a dict containing all the attributes that need to be modified in this layer.
+    - `param_funcs` is a list of functions that return the paths of the weight and bias in the layer.
+3. Overwrite `inject_policy` method (Optional)
+    - Shardformer will inject the model according to this method. If you need to modify the forward or backward process (like the distributed cross-entropy loss in Bert), you need to overwrite this method.
+4. Overwrite or add the param functions
     - These functions use a suffix to record the path of weight or bias for the layer.
-5. Overwrite binding
+    - Each function returns a list of `Col_Layer` or `Row_Layer` objects, which mean the weight is sliced along the column or the row respectively.
+5. Overwrite `binding_policy` (Optional)
+    - Overwrite this to specify which weights Shardformer will bind between layers, like the embedding and unembedding layers.
+    - This function returns a dict whose keys and values are the suffixes of the weights that need to be bound.
 
 More details can be found in shardformer/policies/basepolicy.py
 ``` python
 from colossalai.shardformer.policies.basepolicy import Policy, Layer, Col_Layer, Row_Layer, Argument
 
 CustomPolicy(Policy):
-   @staticmethod
-    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module,Argument]:
-        """
-        Return a dict, the key is layer will be modified and the value is the Argument class with param setting and param functions
+@staticmethod
+    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module, Argument]:
+        r"""
+        Return the dict for the modify policy, the key is the original layer class and the value is the
+        argument for the modify layer
 
         Args:
-            model_config: The config of transformer model
-            shard_setting: The config of distributed model
+            model_config (:class:`transformer.Config`): The config of transformer model
+            shard_config (:class:`ShardConfig`): The config for sharding model
 
         Return:
             Dict for the modify policy,
+            ::
             {
                 origin layer class1 (nn.Module): Argument(
                     attr_dict = {
@@ -112,18 +119,29 @@ CustomPolicy(Policy):
 
     @staticmethod
     def inject_policy() -> Tuple[nn.Module, nn.Module]:
-        """
+        r"""
         Return the dict for the inject model
 
         Return:
             The injected model, key is the original model and value is the new shardmodel
+            ::
+            (OriginModel, CustomModel)
+            in `CustomModel`, we can overwrite the forward and backward process
         """
         return ()
 
     @staticmethod
     def binding_policy() -> Dict:
-        """
+        r"""
         Return the dict for the binding model
+
+        Return:
+            This method should return the binding relationship for layers that share a weight or bias,
+            the key and value are the suffixes of the weight or bias in the model
+        ::
+            return {
+                "bert.embeddings.word_embeddings.weight": "cls.predictions.decoder.weight",
+            }
         """
         return NotImplementedError
 

From ef1537759c1bf3fcddf389e6b857eac9b22bb444 Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Wed, 7 Jun 2023 16:09:40 +0800
Subject: [PATCH 44/52] [shardformer] add gpt2 policy and modify shard and
 slicer to support (#3883)

* add gpt2 policy and modify shard and slicer to support

* remove unused code

* polish code
---
 colossalai/shardformer/policies/autopolicy.py |  14 ++-
 colossalai/shardformer/policies/basepolicy.py |  17 ++-
 colossalai/shardformer/policies/bert.py       |   1 -
 colossalai/shardformer/policies/gpt2.py       | 118 ++++++++++++++++++
 colossalai/shardformer/shard/sharder.py       |  46 ++++---
 colossalai/shardformer/shard/slicer.py        |  53 ++++++--
 colossalai/shardformer/test/test.py           |  28 +++--
 7 files changed, 233 insertions(+), 44 deletions(-)
 create mode 100644 colossalai/shardformer/policies/gpt2.py

diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index e096c2b13a59..54cc63ba124f 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -10,16 +10,26 @@ def build_policies():
     """
     auto_policy_dict = {}
 
-    from transformers.models.bert.modeling_bert import BertForMaskedLM
+    from transformers import BertForMaskedLM
 
     from .bert import BertForMaskedLMPolicy
     auto_policy_dict[BertForMaskedLM] = BertForMaskedLMPolicy
 
-    from transformers.models.bert.modeling_bert import BertForSequenceClassification
+    from transformers import BertForSequenceClassification
 
     from .bert import BertForSequenceClassificationPolicy
     auto_policy_dict[BertForSequenceClassification] = BertForSequenceClassificationPolicy
 
+    from transformers import GPT2Model
+
+    from .gpt2 import GPT2Policy
+    auto_policy_dict[GPT2Model] = GPT2Policy
+
+    from transformers import GPT2LMHeadModel
+
+    from .gpt2 import GPT2LMHeadModelPolicy
+    auto_policy_dict[GPT2LMHeadModel] = GPT2LMHeadModelPolicy
+
     return auto_policy_dict
 
 
diff --git a/colossalai/shardformer/policies/basepolicy.py b/colossalai/shardformer/policies/basepolicy.py
index 2eb7eb29e1a4..644d115a270e 100644
--- a/colossalai/shardformer/policies/basepolicy.py
+++ b/colossalai/shardformer/policies/basepolicy.py
@@ -1,11 +1,9 @@
 # part of code modified from https://github.com/tunib-ai/parallelformers
 
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Tuple, Type
 
-import torch
 import torch.nn as nn
-from transformers import AutoConfig
 
 
 @dataclass
@@ -31,11 +29,18 @@ class Layer:
         bias (str): The bias suffix of the layer
         replace_layer (:class:`colosalai.nn`): The layer to replace the original layer
         ignore (bool): Whether to ignore this layer if it is not in the model
+        reversed (bool): Whether the weight in layer is reversed, commonly the weight in `torch.nn.Linear` is [out, in],
+                        but in GPT2 `Conv1D` layer is [in, out] which is reversed.
+        n_cast (int): The number of sub-weights fused in this weight, e.g. n_cast should be 3 for a fused q, k, v weight in the attention layer. Commonly in TP, we just chunk the weight with the number of devices,
+                        but for such a fused weight, we need to chunk the weight with the number of devices * n_cast, so that
+                        each device holds a part of each of the Q, K and V weights.
     """
     weight: str = None
     bias: str = None
     replace_layer: Any = None
     ignore: bool = False
+    reversed: bool = False
+    n_cast: int = None
 
 
 @dataclass
@@ -131,7 +136,7 @@ def inject_policy() -> Tuple[nn.Module, nn.Module]:
             (OrignModel, CustomModel)
             in `CustomModel`, we can overwrite the forward and backward process
         """
-        return ()
+        return None
 
     @staticmethod
     def binding_policy() -> Dict:
@@ -146,7 +151,7 @@ def binding_policy() -> Dict:
                 "bert.embeddings.word_embeddings.weight": "cls.predictions.decoder.weight",
             }
         """
-        return NotImplementedError
+        return None
 
     @staticmethod
     def attn_in() -> List:
@@ -209,4 +214,4 @@ def unembedding() -> List:
         Return:
             List[Layer]: List of layer object
         """
-        return NotImplementedError
+        return None
diff --git a/colossalai/shardformer/policies/bert.py b/colossalai/shardformer/policies/bert.py
index ab77b29f71f4..89b32f065c27 100644
--- a/colossalai/shardformer/policies/bert.py
+++ b/colossalai/shardformer/policies/bert.py
@@ -1,4 +1,3 @@
-from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Tuple, Type
 
 import torch.nn as nn
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
new file mode 100644
index 000000000000..44dc9c72f986
--- /dev/null
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -0,0 +1,118 @@
+from typing import Any, Callable, Dict, List, Tuple, Type
+
+import torch.nn as nn
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model
+
+import colossalai.shardformer.layer.layers as col_nn
+
+from .basepolicy import Argument, Col_Layer, Layer, Policy, Row_Layer
+
+
+class GPT2Policy(Policy):
+
+    @staticmethod
+    def argument_policy(config, world_size):
+        return {
+            GPT2Model:
+                Argument(attr_dict={}, param_funcs=[
+                    GPT2Policy.embedding,
+                ]),
+            GPT2Block:
+                Argument(
+                    attr_dict={
+        # 1. reduce hidden size
+                        "attn.embed_dim": config.hidden_size // world_size,
+                        "attn.split_size": config.hidden_size // world_size,
+                        "crossattention.embed_dim": config.hidden_size // world_size,
+                        "crossattention.split_size": config.hidden_size // world_size,
+        # 2. reduce number of heads
+                        "attn.num_heads": config.num_attention_heads // world_size,
+                        "crossattention.num_heads": config.num_attention_heads // world_size,
+                    },
+                    param_funcs=[
+                        GPT2Policy.attn_in,
+                        GPT2Policy.attn_out,
+                        GPT2Policy.mlp_in,
+                        GPT2Policy.mlp_out,
+                    ]),
+        }
+
+    @staticmethod
+    def attn_in() -> List:
+        return [
+            Col_Layer(weight="attn.c_attn.weight",
+                      bias="attn.c_attn.bias",
+                      n_cast=3,
+                      reversed=True,
+                      replace_layer=col_nn.Linear1D_Col),
+            Col_Layer(weight="crossattention.c_attn.weight",
+                      bias="crossattention.c_attn.bias",
+                      n_cast=2,
+                      reversed=True,
+                      ignore=True,
+                      replace_layer=col_nn.Linear1D_Col),
+            Col_Layer(weight="crossattention.q_attn.weight",
+                      bias="crossattention.q_attn.bias",
+                      reversed=True,
+                      ignore=True,
+                      replace_layer=col_nn.Linear1D_Col)
+        ]
+
+    @staticmethod
+    def attn_out() -> List:
+        return [
+            Row_Layer(weight="attn.c_proj.weight",
+                      bias="attn.c_proj.bias",
+                      reversed=True,
+                      replace_layer=col_nn.Linear1D_Row),
+            Row_Layer(weight="crossattention.c_proj.weight",
+                      bias="crossattention.c_proj.bias",
+                      reversed=True,
+                      ignore=True,
+                      replace_layer=col_nn.Linear1D_Row)
+        ]
+
+    @staticmethod
+    def mlp_in() -> List:
+        return [
+            Col_Layer(weight="mlp.c_fc.weight", bias="mlp.c_fc.bias", reversed=True, replace_layer=col_nn.Linear1D_Col),
+        ]
+
+    @staticmethod
+    def mlp_out() -> List:
+        return [
+            Row_Layer(weight="mlp.c_proj.weight",
+                      bias="mlp.c_proj.bias",
+                      reversed=True,
+                      replace_layer=col_nn.Linear1D_Row)
+        ]
+
+    @staticmethod
+    def embedding() -> List:
+        return [Col_Layer(weight="wte.weight", replace_layer=col_nn.VocabParallelEmbedding1D)]
+
+
+from transformers import GPT2LMHeadModel
+
+
+class GPT2LMHeadModelPolicy(GPT2Policy):
+
+    @staticmethod
+    def argument_policy(config, world_size):
+        base_argument = GPT2Policy.argument_policy(config, world_size)
+        argument = {
+            GPT2LMHeadModel: Argument(attr_dict={}, param_funcs=[
+                GPT2LMHeadModelPolicy.unembedding,
+            ]),
+        }
+        argument.update(base_argument)
+        return argument
+
+    @staticmethod
+    def unembedding() -> List:
+        return [
+            Col_Layer(weight="lm_head.weight",
+                      bias="lm_head.bias",
+                      replace_layer=col_nn.Linear1D_Col,
+                      gather_output=True)
+        ]
diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py
index 2218661889f8..1ada75e06b67 100644
--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn as nn
+from transformers.pytorch_utils import Conv1D
 
 from ..policies.autopolicy import get_autopolicy
 from ..policies.basepolicy import Policy
@@ -35,10 +36,22 @@ def __init__(
         self.model_config = self.model.config
 
     def shard(self) -> None:
+        self.reshape_embedding()
         self.inject_model(self.model)
         self.replace_layer(self.model)
         self.bind_layer(self.model)
 
+    def reshape_embedding(self,) -> None:
+        r"""
+        Reshape the Embedding layer to make the vocabulary size divisible by world_size
+        """
+        vocab_size = self.model_config.vocab_size
+        world_size = self.shard_config.world_size
+        if vocab_size % world_size != 0:
+            new_vocab_size = vocab_size + world_size - vocab_size % world_size
+            self.model.resize_token_embeddings(new_vocab_size)
+            self.model_config = self.model.config
+
     def inject_model(
         self,
         model: nn.Module,
@@ -53,6 +66,8 @@ def inject_model(
         """
         inject_policy = self.policy.inject_policy()
 
+        if inject_policy is None:
+            return
         org_model_cls = inject_policy[0]
         shard_model_cls = inject_policy[1]
 
@@ -82,9 +97,9 @@ def replace_layer(
             origin_layer_cls = argument_policy[0]
             attr_dict = argument_policy[1].attr_dict
             param_funcs = argument_policy[1].param_funcs
-            self.reverse_replace_layer(model, origin_layer_cls, attr_dict, param_funcs)
+            self.traverse_replace_layer(model, origin_layer_cls, attr_dict, param_funcs)
 
-    def reverse_replace_layer(
+    def traverse_replace_layer(
         self,
         layer: nn.Module,
         origin_cls: nn.Module,
@@ -100,17 +115,12 @@ def reverse_replace_layer(
             attr_dict (Dict): The attribute dict to modify
             policy_cls (:class:`Policy`): The policy class
         """
+        if layer.__class__ == origin_cls:
+            for k, v in attr_dict.items():
+                setattr_(layer, k, v, ignore=True)
+            self.shard_one_layer(layer, param_funcs)
         for name, child in layer.named_children():
-            if child.__class__ == origin_cls:
-                # replac_layer = child
-                for k, v in attr_dict.items():
-                    setattr_(child, k, v, ignore=True)
-                # print(f"Sharding {name} layer", replac_layer.attention.self.__dict__)
-                # setattr_(layer, name, self.shard_one_layer(child, policy_cls))
-                self.shard_one_layer(child, param_funcs)
-                continue
-
-            self.reverse_replace_layer(child, origin_cls, attr_dict, param_funcs)
+            self.traverse_replace_layer(child, origin_cls, attr_dict, param_funcs)
         return layer
 
     def shard_one_layer(
@@ -126,7 +136,6 @@ def shard_one_layer(
             param_funcs (:class:`List[typing.Callable]`): The function list to get shard information in policy class
 
         """
-        # print(org_layer)
         for func in param_funcs:
             policy_layers = func()
             for policy_layer in policy_layers:
@@ -136,9 +145,10 @@ def shard_one_layer(
                 bias_attr = policy_layer.bias
                 replace_layer_cls = policy_layer.replace_layer
                 ignore = policy_layer.ignore
+                n_cast = policy_layer.n_cast
+                reversed = policy_layer.reversed
                 if policy_layer.__class__.__name__ == "Col_Layer":
                     gather_output = policy_layer.gather_output
-                    # print(gather_output)
 
                 if weight_attr is not None:
                     if hasattr_(org_layer, weight_attr):
@@ -161,13 +171,11 @@ def shard_one_layer(
                 layer_attr = (lambda x: x[:x.rfind(".")])(weight_attr or bias_attr)
 
                 # slice weight and bias
-                weight, bias = self.slicer.slice_weight_bias(weight, bias, policy_layer.__class__)
-                # print(os.environ['RANK'], policy_layer.__class__, weight.shape, bias.shape if bias is not None else None)
+                weight, bias = self.slicer.slice_weight_bias(weight, bias, policy_layer.__class__, n_cast, reversed)
 
                 # create new object to replace the origin layer
                 if replace_layer_cls is not None:
-                    # print(f"RANK {os.environ['RANK']}: replace {getattr_(org_layer, layer_attr).__class__} to {replace_layer_cls}, shape is {weight.shape}")
-                    if isinstance(getattr_(org_layer, layer_attr), nn.Linear):
+                    if isinstance(getattr_(org_layer, layer_attr), (nn.Linear, Conv1D)):
                         if replace_layer_cls.__name__ == "Linear1D_Row":
                             replace_layer = replace_layer_cls(weight.shape[1],
                                                               weight.shape[0],
@@ -235,6 +243,8 @@ def bind_layer(self, model: nn.Module) -> None:
             model (:class:`torch.nn.Module`): The shard model
         """
         binding_map = self.policy.binding_policy()
+        if binding_map is None:
+            return
         for k, v in binding_map.items():
             param = getattr_(model, k)
             param = nn.Parameter(param)
diff --git a/colossalai/shardformer/shard/slicer.py b/colossalai/shardformer/shard/slicer.py
index 26053b9f7408..6d35bd193fed 100644
--- a/colossalai/shardformer/shard/slicer.py
+++ b/colossalai/shardformer/shard/slicer.py
@@ -19,6 +19,8 @@ def slice_weight_bias(
         weight: torch.Tensor,
         bias: torch.Tensor,
         policy_layer_cls: Layer,
+        n_cast: int = None,
+        reversed: bool = False,
     ):
         r"""
         Slice the weight and bias according to policy layer cls
@@ -33,13 +35,18 @@ def slice_weight_bias(
         """
         if policy_layer_cls == Layer:
             return weight, bias
-        elif policy_layer_cls == Col_Layer:
-            weight = self.slice_tensor(weight, 1, False)
+
+        dim = dim_mapping[policy_layer_cls] if not reversed else (1 - dim_mapping[policy_layer_cls])
+        # print(weight.shape, dim)
+        if policy_layer_cls == Col_Layer:
+            weight = self.slice_tensor(weight, dim, False, n_cast)
             bias = self.slice_tensor(bias, 0, True)
         elif policy_layer_cls == Row_Layer:
-            weight = self.slice_tensor(weight, 0, False)
+            weight = self.slice_tensor(weight, dim, False, n_cast)
         else:
             raise NotImplementedError(f"The policy layer class {policy_layer_cls} is not supported")
+        if reversed:
+            weight = weight.transpose(0, 1).contiguous()
         return weight, bias
 
     def slice_tensor(
@@ -47,6 +54,7 @@ def slice_tensor(
         tensor_in: torch.Tensor,
         dim: int,
         is_bias: bool,
+        n_cast: int = None,
     ) -> torch.Tensor:
         r"""
         Slice tensor according to the config
@@ -59,14 +67,15 @@ def slice_tensor(
         if tensor_in is None:
             return None
         if not is_bias:
-            return self.slice_2d(tensor_in, dim)
+            return self.slice_2d(tensor_in, dim, n_cast)
         else:
-            return self.slice_1d(tensor_in)
+            return self.slice_1d(tensor_in, n_cast)
 
     def slice_2d(
         self,
         tensor: torch.Tensor,
         dim: int,
+        n_cast: int = None,
     ) -> torch.Tensor:
         r"""
         Slice the 2D tensor
@@ -77,13 +86,14 @@ def slice_2d(
         """
         assert dim in [0, 1], f"Only support 2D tensor, but got {dim}D tensor"
         if dim == 0:
-            return self.slice_row(tensor)
+            return self.slice_row(tensor, n_cast)
         elif dim == 1:
-            return self.slice_col(tensor)
+            return self.slice_col(tensor, n_cast)
 
     def slice_1d(
         self,
         tensor: torch.Tensor,
+        n_cast: int = None,
     ) -> torch.Tensor:
         r"""
         Slice the 1D tensor
@@ -94,11 +104,19 @@ def slice_1d(
         Returns:
             :class:`torch.Tensor`: The sliced tensor
         """
-        return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
+        if n_cast is None:
+            return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
+        else:
+            tensor_chunks = tensor.chunk(self.shardconfig.world_size * n_cast, dim=0)
+            chunk_list = [
+                tensor_chunks[i] for i in range(self.shardconfig.rank, len(tensor_chunks), self.shardconfig.world_size)
+            ]
+            return torch.cat(chunk_list, dim=0).contiguous()
 
     def slice_col(
         self,
         tensor: torch.Tensor,
+        n_cast: int = None,
     ) -> torch.Tensor:
         r"""
         Slice the tensor in column
@@ -110,11 +128,19 @@ def slice_col(
             :class:`torch.Tensor`: The sliced tensor
 
         """
-        return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
+        if n_cast is None:
+            return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
+        else:
+            tensor_chunks = tensor.chunk(self.shardconfig.world_size * n_cast, dim=0)
+            chunk_list = [
+                tensor_chunks[i] for i in range(self.shardconfig.rank, len(tensor_chunks), self.shardconfig.world_size)
+            ]
+            return torch.cat(chunk_list, dim=0).contiguous()
 
     def slice_row(
         self,
         tensor: torch.Tensor,
+        n_cast: int = None,
     ) -> torch.Tensor:
         r"""
         Slice the tensor in column
@@ -125,4 +151,11 @@ def slice_row(
         Returns:
             :class:`torch.Tensor`: The sliced tensor
         """
-        return tensor.chunk(self.shardconfig.world_size, dim=1)[self.shardconfig.rank].contiguous()
+        if n_cast is None:
+            return tensor.chunk(self.shardconfig.world_size, dim=1)[self.shardconfig.rank].contiguous()
+        else:
+            tensor_chunks = tensor.chunk(self.shardconfig.world_size * n_cast, dim=1)
+            chunk_list = [
+                tensor_chunks[i] for i in range(self.shardconfig.rank, len(tensor_chunks), self.shardconfig.world_size)
+            ]
+            return torch.cat(chunk_list, dim=1).contiguous()
diff --git a/colossalai/shardformer/test/test.py b/colossalai/shardformer/test/test.py
index b896fd4a4020..e2d5a94c782a 100644
--- a/colossalai/shardformer/test/test.py
+++ b/colossalai/shardformer/test/test.py
@@ -6,24 +6,28 @@
 from datasets import load_dataset
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, get_scheduler
+from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, GPT2LMHeadModel, get_scheduler
 
 import colossalai
 from colossalai.shardformer.shard import ShardConfig, shard_model
 from colossalai.utils import get_current_device, print_rank_0
 
 os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
-tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 
 
 def get_args():
     parser = colossalai.get_default_parser()
     parser.add_argument("--mode", type=str, default='inference')
     parser.add_argument("--save_model", action='store_true')
+    parser.add_argument("--model", type=str, default='bert-base-uncased')
     return parser.parse_args()
 
 
-def load_data():
+def load_data(args):
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+        # tokenizer.pad_token_id = 0
     datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
     # datasets=load_dataset("yelp_review_full")
     tokenized_datasets = datasets.map(
@@ -42,18 +46,23 @@ def load_data():
 
 
 def inference(model: nn.Module, args):
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    print(model)
+    # print(model.wte.weight.shape)
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+        tokenizer.pad_token_id = 0
     token = "Hello, my dog is cute"
     inputs = tokenizer(token, return_tensors="pt")
     inputs.to("cuda")
     model.eval()
     model.to("cuda")
     outputs = model(**inputs)
-    print(outputs)
+    print(outputs[0])
 
 
 def train(model: nn.Module, args, num_epoch: int = 3):
-    train_dataloader, eval_dataloader = load_data()
+    train_dataloader, eval_dataloader = load_data(args)
     optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
     num_training = num_epoch * len(train_dataloader)
     progress_bar = tqdm(range(num_training))
@@ -94,8 +103,13 @@ def train(model: nn.Module, args, num_epoch: int = 3):
 
 if __name__ == "__main__":
     args = get_args()
-    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
     colossalai.launch_from_torch(config=args.config)
+    if args.model == 'bert-base-uncased':
+        model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+    elif args.model == 'gpt2':
+        model = GPT2LMHeadModel.from_pretrained("gpt2")
+    else:
+        raise AttributeError("model not supported")
     shard_config = ShardConfig(
         rank=int(str(get_current_device()).split(':')[-1]),
         world_size=int(os.environ['WORLD_SIZE']),

From 33eef714db460d3db42698a2d969cb6a669dc583 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Thu, 8 Jun 2023 16:09:32 +0800
Subject: [PATCH 45/52] fix typo examples and docs (#3932)

---
 .../parallelize_your_training_like_Megatron.md            | 6 +++---
 .../parallelize_your_training_like_Megatron.md            | 6 +++---
 examples/images/dreambooth/README.md                      | 2 +-
 examples/language/bert/README.md                          | 2 +-
 examples/language/gpt/gemini/train_gpt_demo.py            | 8 ++++----
 examples/language/gpt/titans/model/embed.py               | 2 +-
 examples/language/opt/opt_train_demo.py                   | 2 +-
 examples/language/palm/train.py                           | 6 +++---
 8 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
index 22d52fb3cd1a..978ac32fc78e 100644
--- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -141,16 +141,16 @@ for mn, module in model.named_modules():
 
         if 'mlp.c_fc' in mn:
             if 'weight' in pn or 'bias' in pn:
-                split_param_col_tp1d(param, pg)  # colmn slice
+                split_param_col_tp1d(param, pg)  # column slice
                 # keep the shape of the output from c_fc
                 param.compute_spec.set_output_replicate(False)
         elif 'mlp.c_proj' in mn:
             if 'weight' in pn:
                 split_param_row_tp1d(param, pg)  # row slice
         elif 'wte' in mn or 'wpe' in mn:
-            split_param_col_tp1d(param, pg)  # colmn slice
+            split_param_col_tp1d(param, pg)  # column slice
         elif 'c_attn' in mn or 'c_proj' in mn:
-            split_param_col_tp1d(param, pg)  # colmn slice
+            split_param_col_tp1d(param, pg)  # column slice
 ```
 
 The modified model is illustrated below.
diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
index c4131e593437..b4e0d18a2647 100644
--- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -126,16 +126,16 @@ for mn, module in model.named_modules():
 
         if 'mlp.c_fc' in mn:
             if 'weight' in pn or 'bias' in pn:
-                split_param_col_tp1d(param, pg)  # colmn slice
+                split_param_col_tp1d(param, pg)  # column slice
                 # keep the shape of the output from c_fc
                 param.compute_spec.set_output_replicate(False)
         elif 'mlp.c_proj' in mn:
             if 'weight' in pn:
                 split_param_row_tp1d(param, pg)  # row slice
         elif 'wte' in mn or 'wpe' in mn:
-            split_param_col_tp1d(param, pg)  # colmn slice
+            split_param_col_tp1d(param, pg)  # column slice
         elif 'c_attn' in mn or 'c_proj' in mn:
-            split_param_col_tp1d(param, pg)  # colmn slice
+            split_param_col_tp1d(param, pg)  # column slice
 ```
 
 修改后的模型如下图所示。
diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md
index 7c117d841e24..5b350bc95b8e 100644
--- a/examples/images/dreambooth/README.md
+++ b/examples/images/dreambooth/README.md
@@ -37,7 +37,7 @@ The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Mode
 
 ## Training
 
-We provide the script `colossalai.sh` to run the training task with colossalai. Meanwhile, we also provided traditional training process of dreambooth, `dreambooth.sh`, for possible comparation. For instance, the script of training process for [stable-diffusion-v1-4] model can be modified into:
+We provide the script `colossalai.sh` to run the training task with colossalai. Meanwhile, we also provided traditional training process of dreambooth, `dreambooth.sh`, for possible comparison. For instance, the script of training process for [stable-diffusion-v1-4] model can be modified into:
 
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
diff --git a/examples/language/bert/README.md b/examples/language/bert/README.md
index c845a5c50387..81c3f03fffca 100644
--- a/examples/language/bert/README.md
+++ b/examples/language/bert/README.md
@@ -1,6 +1,6 @@
 ## Overview
 
-This directory includes two parts: Using the Booster API fintune Huggingface Bert and AlBert models and benchmarking Bert and AlBert models with different Booster Plugin.
+This directory includes two parts: Using the Booster API finetune Huggingface Bert and AlBert models and benchmarking Bert and AlBert models with different Booster Plugin.
 
 ## Finetune
 ```
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 92751c7e2f47..4b78624f0110 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -162,7 +162,7 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
             # shard it w.r.t tp pattern
             if 'mlp.c_fc' in mn:
                 if 'weight' in pn or 'bias' in pn:
-                    split_param_col_tp1d(param, pg)    # colmn slice
+                    split_param_col_tp1d(param, pg)    # column slice
                     # keep the shape of the output from c_fc
                     param.compute_spec.set_output_replicate(False)
                 else:
@@ -173,9 +173,9 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
                 else:
                     param.set_dist_spec(ReplicaSpec())
             elif 'wte' in mn or 'wpe' in mn:
-                split_param_col_tp1d(param, pg)    # colmn slice
+                split_param_col_tp1d(param, pg)    # column slice
             elif 'c_attn' in mn or 'c_proj' in mn:
-                split_param_col_tp1d(param, pg)    # colmn slice
+                split_param_col_tp1d(param, pg)    # column slice
             else:
                 param.set_dist_spec(ReplicaSpec())
             param.visited = True
@@ -237,7 +237,7 @@ def main():
         if args.tp_degree > 1:
             tensor_parallelize(model, tp_pg)
 
-        # asign running configurations
+        # assign running configurations
         if args.distplan == "CAI_ZeRO1":
             zero_stage = 1
         elif args.distplan == "CAI_ZeRO2":
diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py
index 6369b9f8c5a1..d825ae92a285 100644
--- a/examples/language/gpt/titans/model/embed.py
+++ b/examples/language/gpt/titans/model/embed.py
@@ -305,7 +305,7 @@ def forward(ctx, vocab_parallel_logits, target):
     @staticmethod
     def backward(ctx, grad_output):
 
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target_1d = ctx.saved_tensors
 
         # All the inputs have softmax as their gradient.
diff --git a/examples/language/opt/opt_train_demo.py b/examples/language/opt/opt_train_demo.py
index 8a2ad5f55b10..bb2eb52ce560 100644
--- a/examples/language/opt/opt_train_demo.py
+++ b/examples/language/opt/opt_train_demo.py
@@ -38,7 +38,7 @@ def train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coor
         
         for batch in pbar:
 
-            # Foward
+            # Forward
             optimizer.zero_grad()
             batch = move_to_cuda(batch, torch.cuda.current_device())
             
diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py
index 62062e8bd272..a0600db1bc5b 100644
--- a/examples/language/palm/train.py
+++ b/examples/language/palm/train.py
@@ -140,15 +140,15 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
                 continue
             param.set_dist_spec(ReplicaSpec())
             if 'net.0' in mn:
-                split_param_col_tp1d(param, pg)    # colmn slice
+                split_param_col_tp1d(param, pg)    # column slice
             elif 'to_q' in mn:
-                split_param_col_tp1d(param, pg)    # colmn slice
+                split_param_col_tp1d(param, pg)    # column slice
             elif 'to_kv' in mn:
                 split_param_row_tp1d(param, pg)    # row slice
             elif 'to_out' in mn:
                 split_param_row_tp1d(param, pg)    # row slice
             elif '1.1' in mn:
-                split_param_col_tp1d(param, pg)    # colmn slice
+                split_param_col_tp1d(param, pg)    # column slice
             elif '1.2' in mn:
                 split_param_row_tp1d(param, pg)    # row slice
             else:

From 21c4c0b1a01c13a0bc9b515e061a4fe19de2a341 Mon Sep 17 00:00:00 2001
From: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Date: Thu, 8 Jun 2023 17:38:47 +0800
Subject: [PATCH 46/52] support UniEval and add CHRF metric (#3924)

Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
---
 applications/Chat/evaluate/README.md          | 105 ++++--
 .../Chat/evaluate/config/config_cn.json       |  12 +-
 .../Chat/evaluate/config/config_en.json       |  73 +++-
 applications/Chat/evaluate/eval.py            |   2 +-
 applications/Chat/evaluate/evaluator.py       | 142 ++++++--
 applications/Chat/evaluate/gpt_evaluate.py    |   2 +-
 applications/Chat/evaluate/metrics.py         |  22 ++
 .../Chat/evaluate/unieval/__init__.py         |  12 +
 .../Chat/evaluate/unieval/evaluator.py        | 330 ++++++++++++++++++
 applications/Chat/evaluate/unieval/scorer.py  | 101 ++++++
 applications/Chat/evaluate/unieval/utils.py   | 248 +++++++++++++
 applications/Chat/evaluate/utils.py           |   2 +-
 12 files changed, 984 insertions(+), 67 deletions(-)
 create mode 100644 applications/Chat/evaluate/unieval/__init__.py
 create mode 100644 applications/Chat/evaluate/unieval/evaluator.py
 create mode 100644 applications/Chat/evaluate/unieval/scorer.py
 create mode 100644 applications/Chat/evaluate/unieval/utils.py

diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md
index e3510e3522f6..077193b63ce0 100644
--- a/applications/Chat/evaluate/README.md
+++ b/applications/Chat/evaluate/README.md
@@ -12,12 +12,13 @@ pip install -r requirements.txt
 
 ## Evaluation Pipeline
 
-The whole evaluation pipeline consists of two methods:
+The whole evaluation pipeline consists of three methods:
 
 1. `GPT Evaluation`: evaluates model predictions using GPT models.
    * Compare the performance of two different models (battle).
    * Rate the model according to pre-defined metrics using prompting design.
 2. `Automatic Evaluation`: evaluates model predictions using automatic metrics.
+3. `UniEval`: evaluates model predictions using UniEval models (English only).
 
 ### Evaluation Category
 
@@ -75,7 +76,9 @@ GPT evaluation uses GPT models to evaluate the prediction of different models an
 
 GPT models evaluate the quality of model predictions based on the given prompt words and gives a score between 1-5.
 
-> **NOTE:**  Even for the same metric, the details of its prompt words and CoT(Chain-of-Thought) can differ based on which category you want to evaluate. For example, prompt words for metric `correctness` showed here is "The answer should be in line with common sense, life experience, etc."(this is for category `brainstorming`), but for category `extraction`, prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT(Chain-of-Thought) in `prompt/evaluation_prompt`.
+> **NOTE 1:**  Even for the same metric, the details of its prompt words and CoT(Chain-of-Thought) can differ based on which category you want to evaluate. For example, the prompt words for metric `correctness` shown here are "The answer should be in line with common sense, life experience, etc." (this is for category `brainstorming`), but for category `extraction`, prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT(Chain-of-Thought) in `prompt/evaluation_prompt`.
+
+> **NOTE 2:** To add customized metrics, you can refer to [FAQ](#faq).
 
 #### Automatic Evaluation
 
@@ -85,7 +88,7 @@ There are two ways to obtain reference answers:
 * For instruction coming from human-designed problems, the reference answers are generated by GPT-3.5, such as roleplay, chat.
 * For instruction related with classic NLP problems, the reference answers are collected from open-sourced dataset with target answers, such as classification, extraction, summarization.
 
-There are 5 types of automatic evaluation metrics listed in the table below:
+There are 6 types of automatic evaluation metrics listed in the table below:
 
 |     Automatic Evaluation Metric     | Description                                                  |
 | :---------------------------------: | :----------------------------------------------------------- |
@@ -94,6 +97,25 @@ There are 5 types of automatic evaluation metrics listed in the table below:
 |              Distinct               | Measure the diversity of generation text by counting the unique n-grams. |
 |              BERTScore              | Measure the semantic similarity between tokens of predictions and references with BERT. |
 | Precision<br/> Recall<br/> F1 Score | Measure the number of overlaps between prediction and reference (design for classification and extraction categories). |
+|                CHRF                 | Measure the similarity of character n-grams between prediction and reference. |
+
+#### UniEval Evaluation
+
+UniEval converts all evaluation tasks of different dimensions (metrics) into Boolean QA problems and utilizes the model to answer with “Yes” or “No”. Compared with similarity-based metrics such as ROUGE and BLEU, UniEval can achieve a more comprehensive evaluation. In addition, UniEval also demonstrates its ability to transfer to unseen dimensions and tasks.
+
+In our evaluation pipeline, two pre-trained UniEval evaluators are used. One is [unieval-sum](https://huggingface.co/MingZhong/unieval-sum) and the other is [unieval-dialog](https://huggingface.co/MingZhong/unieval-dialog). The two models can be used for the 3 tasks, `summarization`, `dialogue` and `data2text`. Each task has different evaluation dimensions.
+
+| UniEval Model  | Task               | Dimension(Metric) |
+| :------------: | :----------------- | :--- |
+| unieval-sum    | summarization | coherence: whether the summary is coherent<br/>consistency: whether the claim is consistent with the given document<br/>fluency: whether the paragraph is fluent<br/>relevance: whether the summary is relevant to the reference |
+| unieval-sum | data2text | naturalness: whether the utterance is fluent<br/>informativeness: whether the utterance is informative according to the reference |
+| unieval-dialog | dialogue | naturalness: whether the response is natural in the dialogue<br/>coherence: whether the response is coherent in the dialogue history<br/>understandability: whether the response is understandable in the dialogue |
+
+> **NOTE 1:**  Task "data2text" uses the same model as task "summarization".
+
+> **NOTE 2:**  In the UniEval paper, the `unieval-sum` model demonstrates the best transfer ability, so you can evaluate your customized metric with this model. Details of adding customized metrics can be found in [FAQ](#faq).
+
+> **NOTE 3:**  We do not include all metrics provided in UniEval in our pipeline because the data structure and content of the instructions we want to evaluate are not suitable for direct use of some UniEval metrics.
 
 ## Evaluation Process
 
@@ -215,19 +237,26 @@ The following is an example of a Chinese GPT evaluation prompt. In an evaluation
 
 #### Configuration
 
-The following is an example of a Chinese config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics and automatic metrics in key `GPT` and `Metrics`. You can find an example Chinese config file in `config`.
+The following is an example of an English config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics, automatic metrics and UniEval metrics in the keys `GPT`, `Metrics` and `UniEval` (English only). You can find an example English config file in `config`.
 
 ```json
 {
-    "language": "cn",
+    "language": "en",
+    "path_for_UniEval": {
+        "summarization": "path to unieval-sum model",
+        "dialogue": "path to unieval-dialog model",
+        "data2text": "path to unieval-sum model"
+    },
     "category": {
         "brainstorming": {
             "GPT": ["relevance", "creativity", "practicality", "correctness"],
-            "Metrics": ["Distinct"]
+            "Metrics": ["Distinct"],
+            "UniEval": ["summarization-fluency", "data2text-naturalness", "data2text-informativeness"]
         },
         "chat": {
             "GPT": [ "relevance", "naturalness", "engagingness", "reasonableness"],
-            "Metrics": ["Distinct"]
+            "Metrics": ["Distinct"],
+            "UniEval": ["dialogue-naturalness", "dialogue-coherence", "dialogue-understandability"]
         }
     }
 }
@@ -235,27 +264,33 @@ The following is an example of a Chinese config file. The configuration file can
 
 `"language"`: the language used to evaluate the model capability. We only support Chinese `"cn"` for now.
 
+`"path_for_UniEval"`: path to the UniEval model.
+
 `"category"`: the category/categories needed to evaluate the model capability.
 
 `"GPT"`: the metrics you want to use for GPT evaluation.
 
 `"Metrics"`: the metrics you want to use for automatic metrics evaluation.
 
+`"UniEval"`: the metrics you want to use for UniEval metrics evaluation. The metric has to be in the `"{task}-{metric}"` format because different tasks can share the same metric names, such as naturalness and coherence.
+
+You can remove a key such as `"Metrics"` to skip evaluating answers with its corresponding evaluation metrics.
+
 You can create your config file based on available settings listed in following table.
 
-|    "category"    |          "GPT"          |  "Metrics"  |
-| :--------------: | :---------------------: | :---------: |
-| "brainstorming"  | "language organization" |   "BLEU"    |
-|      "chat"      |       "relevance"       |   "ROUGE"   |
-| "classification" |      "creativity"       | "Distinct"  |
-|   "closed_qa"    |     "practicality"      | "BERTScore" |
-|   "extraction"   |      "correctness"      | "Precision" |
-|   "generation"   |      "naturalness"      |  "Recall"   |
-|    "open_qa"     |     "engagingness"      | "F1 score"  |
-|   "rewriting"    |    "reasonableness"     |             |
-|    "roleplay"    |       "diversity"       |             |
-| "summarization"  |       "fidelity"        |             |
-|                  |      "conciseness"      |             |
+|    "category"    |          "GPT"          |  "Metrics"  |          "UniEval"           |
+| :--------------: | :---------------------: | :---------: | :--------------------------: |
+| "brainstorming"  | "language organization" |   "BLEU"    |    "dialogue-naturalness"    |
+|      "chat"      |       "relevance"       |   "ROUGE"   |     "dialogue-coherence"     |
+| "classification" |      "creativity"       | "Distinct"  | "dialogue-understandability" |
+|   "closed_qa"    |     "practicality"      | "BERTScore" |   "data2text-naturalness"    |
+|   "extraction"   |      "correctness"      | "Precision" | "data2text-informativeness"  |
+|   "generation"   |      "naturalness"      |  "Recall"   |  "summarization-coherence"   |
+|    "open_qa"     |     "engagingness"      | "F1 score"  | "summarization-consistency"  |
+|   "rewriting"    |    "reasonableness"     |   "CHRF"    |   "summarization-fluency"    |
+|    "roleplay"    |       "diversity"       |             |  "summarization-relevance"   |
+| "summarization"  |       "fidelity"        |             |                              |
+|                  |      "conciseness"      |             |                              |
 
 > **NOTE:**  For categories which don't have standard answers such as `brainstorming`, you should avoid using automatic metrics such as `BLEU` and `ROUGE` which are based on similarity measures and you should use `Distinct` instead in your config file.
 
@@ -290,23 +325,36 @@ For example, if you want to add a new metric `persuasiveness` into category `bra
     "id": 1,
     "category": "brainstorming",
     "metrics": {
-      "persuasiveness": "说服力(1-5)：XXX"
+      "persuasiveness": "persuasiveness(1-5)：a short description for persuasiveness"
     },
     "CoT": {
-      "persuasiveness": "XXX\n\n说服力："
+      "persuasiveness": "CoT for persuasiveness\n\npersuasiveness："
     },
-    "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下：\n\n{question}\n\n答案如下：\n\n{answer}\n\n评分的指标如下：\n\n{metric}\n\n请你遵照以下的评分步骤：\n\n{steps}"
+    "prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
   }
 }
 ```
 
 </details>
 
+<details><summary><b>How can I add a new UniEval evaluation metric?</b></summary>
+
+For example, if you want to add a new metric `persuasiveness` into task `data2text`, you should add a Boolean QA question about the metric in function `add_question` in `unieval/utils.py`. Please do note that how effectively the model would evaluate this metric is unknown and you may need some experiments to test whether the model is capable of evaluating this metric.
+
+```python
+if task == 'data2text':
+	if dimension == 'persuasiveness':
+		cur_input = 'question: Is this a persuasive utterance </s> utterance: ' + output[i]
+```
+
+</details>
+
 ## To Do
 
 - [x] Add evaluation for English capability
-- [ ] Support UniEval
+- [x] Support UniEval
 - [x] Support GPT-4 evaluation
+- [ ] Support GPT evaluation with reference in the prompt
 
 ## Citations
 
@@ -327,4 +375,13 @@ For example, if you want to add a new metric `persuasiveness` into category `bra
       archivePrefix={arXiv},
       primaryClass={cs.CL}
 }
+
+@misc{zhong2022unified,
+      title={Towards a Unified Multi-Dimensional Evaluator for Text Generation},
+      author={Ming Zhong and Yang Liu and Da Yin and Yuning Mao and Yizhu Jiao and Pengfei Liu and Chenguang Zhu and Heng Ji and Jiawei Han},
+      year={2022},
+      eprint={2210.07197},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
 ```
diff --git a/applications/Chat/evaluate/config/config_cn.json b/applications/Chat/evaluate/config/config_cn.json
index a8c7ea8a3135..cf647f79bbf8 100644
--- a/applications/Chat/evaluate/config/config_cn.json
+++ b/applications/Chat/evaluate/config/config_cn.json
@@ -34,7 +34,8 @@
       "Metrics": [
         "Precision",
         "Recall",
-        "F1 score"
+        "F1 score",
+        "CHRF"
       ]
     },
     "closed_qa": {
@@ -46,7 +47,8 @@
       "Metrics": [
         "BLEU",
         "ROUGE",
-        "BERTScore"
+        "BERTScore",
+        "CHRF"
       ]
     },
     "extraction": {
@@ -58,7 +60,8 @@
       "Metrics": [
         "Precision",
         "Recall",
-        "F1 score"
+        "F1 score",
+        "CHRF"
       ]
     },
     "generation": {
@@ -116,7 +119,8 @@
       "Metrics": [
         "BLEU",
         "ROUGE",
-        "BERTScore"
+        "BERTScore",
+        "CHRF"
       ]
     }
   }
diff --git a/applications/Chat/evaluate/config/config_en.json b/applications/Chat/evaluate/config/config_en.json
index 5b6272b97084..014c61d93a54 100644
--- a/applications/Chat/evaluate/config/config_en.json
+++ b/applications/Chat/evaluate/config/config_en.json
@@ -1,5 +1,10 @@
 {
   "language": "en",
+  "path_for_UniEval": {
+    "summarization": "path to unieval-sum",
+    "dialogue": "path to unieval-dialog",
+    "data2text": "path to unieval-sum"
+  },
   "category": {
     "brainstorming": {
       "GPT": [
@@ -11,6 +16,11 @@
       ],
       "Metrics": [
         "Distinct"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "chat": {
@@ -23,6 +33,14 @@
       ],
       "Metrics": [
         "Distinct"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "dialogue-naturalness",
+        "dialogue-coherence",
+        "dialogue-understandability",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "classification": {
@@ -34,7 +52,13 @@
       "Metrics": [
         "Precision",
         "Recall",
-        "F1 score"
+        "F1 score",
+        "CHRF"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "closed_qa": {
@@ -46,7 +70,13 @@
       "Metrics": [
         "BLEU",
         "ROUGE",
-        "BERTScore"
+        "BERTScore",
+        "CHRF"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "extraction": {
@@ -58,7 +88,13 @@
       "Metrics": [
         "Precision",
         "Recall",
-        "F1 score"
+        "F1 score",
+        "CHRF"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "generation": {
@@ -71,6 +107,11 @@
         "BLEU",
         "ROUGE",
         "BERTScore"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "open_qa": {
@@ -81,6 +122,11 @@
       ],
       "Metrics": [
         "Distinct"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "rewriting": {
@@ -93,6 +139,11 @@
         "BLEU",
         "ROUGE",
         "BERTScore"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "roleplay": {
@@ -104,6 +155,11 @@
       ],
       "Metrics": [
         "Distinct"
+      ],
+      "UniEval": [
+        "summarization-fluency",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     },
     "summarization": {
@@ -116,7 +172,16 @@
       "Metrics": [
         "BLEU",
         "ROUGE",
-        "BERTScore"
+        "BERTScore",
+        "CHRF"
+      ],
+      "UniEval": [
+        "summarization-coherence",
+        "summarization-consistency",
+        "summarization-fluency",
+        "summarization-relevance",
+        "data2text-naturalness",
+        "data2text-informativeness"
       ]
     }
   }
diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py
index 8388d95f748a..180ef438cc43 100644
--- a/applications/Chat/evaluate/eval.py
+++ b/applications/Chat/evaluate/eval.py
@@ -40,7 +40,7 @@ def main(args):
 
         # initialize evaluator
         evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
-                              config["language"])
+                              config["language"], config.get("path_for_UniEval", None))
         if len(args.model_name_list) == 2:
             answers1 = jload(args.answer_file_list[0])
             answers2 = jload(args.answer_file_list[1])
diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py
index 0bf55ca80d7c..6bb8cdb29431 100644
--- a/applications/Chat/evaluate/evaluator.py
+++ b/applications/Chat/evaluate/evaluator.py
@@ -4,6 +4,7 @@
 import gpt_evaluate
 import metrics
 import pandas as pd
+import unieval
 from utils import analyze_automatic_results, get_data_per_category, save_automatic_results
 
 
@@ -15,13 +16,15 @@ class Evaluator(object):
     """
 
     def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
-                 gpt_model: str, language: str) -> None:
+                 gpt_model: str, language: str, path_for_UniEval: Dict[str, str]) -> None:
         self.params = params
         self.battle_prompt = battle_prompt
         self.gpt_evaluation_prompt = gpt_evaluation_prompt
         self.gpt_model = gpt_model
         self.language = language
+        self.path_for_UniEval = path_for_UniEval
         self.automatic_metric_stats = dict()
+        self.unieval_metric_stats = dict()
         self.gpt_evaluation_results = dict()
         self.battle_results = []
 
@@ -47,16 +50,18 @@ def switch(metric, language):
                 return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language)
             elif metric == "ROUGE":
                 return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language)
-            elif (metric == "Distinct"):
+            elif metric == "Distinct":
                 return metrics.distinct_score(preds=predicts_list, language=language)
-            elif (metric == "BERTScore"):
+            elif metric == "BERTScore":
                 return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language)
-            elif (metric == "Precision"):
+            elif metric == "Precision":
                 return metrics.precision(preds=predicts_list, targets=targets_list, language=language)
-            elif (metric == "Recall"):
+            elif metric == "Recall":
                 return metrics.recall(preds=predicts_list, targets=targets_list, language=language)
-            elif (metric == "F1 score"):
+            elif metric == "F1 score":
                 return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language)
+            elif metric == "CHRF":
+                return metrics.chrf_score(preds=predicts_list, targets=targets_list, language=language)
             else:
                 raise ValueError(f"Unexpected metric")
 
@@ -69,6 +74,9 @@ def switch(metric, language):
                 print(f"Category {category} specified in your config doesn't have corresponding answers!")
                 continue
 
+            if self.params[category].get("Metrics", None) is None:
+                continue
+
             category_metrics = self.params[category]["Metrics"]
             self.automatic_metric_stats[category] = {}
 
@@ -80,12 +88,68 @@ def switch(metric, language):
             for metric in category_metrics:
                 self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language))
 
+        # UniEval evaluation
+        # self.unieval_metric_stats's key is "task" instead of "category".
+        # Iterating "task" first will avoid repeated loading models because one task corresponds to one UniEval model.
+        # If key is "category", different models will be loaded for multiple times across categories because the user may require different task(models) to evaluate one category.
+        for category in self.params:
+            if len(answers_per_category[category]) == 0:
+                print(f"Category {category} specified in your config doesn't have corresponding answers!")
+                continue
+
+            if self.params[category].get("UniEval", None) is None:
+                continue
+
+            if self.params[category]["UniEval"] and self.language == "cn":
+                raise Exception(
+                    "UniEval doesn't support Chinese! Please remove UniEval config in your Chinese config file.")
+
+            category_metrics = self.params[category]["UniEval"]
+
+            for task, metric in [tuple(category_metric.split("-")) for category_metric in category_metrics]:
+                if self.unieval_metric_stats.get(task, None) is None:
+                    self.unieval_metric_stats[task] = {category: {metric: 0}}
+                elif self.unieval_metric_stats[task].get(category, None) is None:
+                    self.unieval_metric_stats[task][category] = {metric: 0}
+                else:
+                    self.unieval_metric_stats[task][category][metric] = 0
+
+        for task in self.unieval_metric_stats:
+            if self.path_for_UniEval is None:
+                raise Exception(f"Please specify the path for UniEval model in the config file!")
+
+            if self.path_for_UniEval.get(task, None) is None:
+                raise Exception(f"Please specify the model path for task {task} in the config file!")
+
+            print(f"Load UniEval model for task {task}.")
+
+            uni_evaluator = unieval.get_evaluator(task, model_name_or_path=self.path_for_UniEval[task])
+            for category in self.unieval_metric_stats[task]:
+                targets_list = [
+                    target["target"] if target["target"] else target["output"]
+                    for target in targets_per_category[category]
+                ]
+                predicts_list = [answer["output"] for answer in answers_per_category[category]]
+                sources_list = [answer["instruction"] + answer["input"] for answer in answers_per_category[category]]
+
+                data = unieval.convert_data_to_unieval_format(predicts_list, sources_list, targets_list)
+                scores = uni_evaluator.evaluate(data,
+                                                category,
+                                                dims=list(self.unieval_metric_stats[task][category].keys()),
+                                                overall=False)
+                avg_scores = unieval.calculate_average_score(scores)
+
+                self.unieval_metric_stats[task][category].update(avg_scores)
+
         # gpt evaluation
         for category in self.params:
             if len(answers_per_category[category]) == 0:
                 print(f"Category {category} specified in your config doesn't have corresponding answers!")
                 continue
 
+            if self.params[category].get("GPT", None) is None:
+                continue
+
             category_metrics = self.params[category]["GPT"]
 
             prompt = self.gpt_evaluation_prompt.get(category, None)
@@ -106,29 +170,43 @@ def save(self, path: str, model_name_list: List[str]) -> None:
             save_path = os.path.join(path, "gpt_evaluate", "battle_results")
             gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
         else:
-            # Save evaluation results for automatic metrics
-            automatic_base_save_path = os.path.join(path, "automatic_results")
-            automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")
-
-            save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)
-
-            # Save charts and csv.
-            automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
-            analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)
-
-            # Save evaluation results for GPT evaluation metrics.
-            gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
-            gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
-
-            all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results,
-                                                                       gpt_evaluation_results_save_path)
-
-            # Start to calculate scores and save statistics.
-            gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
-            gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
-                                                        gpt_evaluation_statistics_save_path)
-
-            # Save charts and csv.
-            gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
-            gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
-                                                           gpt_evaluation_analyses_save_path)
+            if self.automatic_metric_stats:
+                # Save evaluation results for automatic metrics
+                automatic_base_save_path = os.path.join(path, "automatic_results")
+                automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")
+
+                save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)
+
+                # Save charts and csv.
+                automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
+                analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)
+
+            if self.unieval_metric_stats:
+                # Save evaluation results for UniEval metrics
+                unieval_base_save_path = os.path.join(path, "unieval_results")
+                unieval_results_save_path = os.path.join(unieval_base_save_path, "evaluation_results")
+
+                unieval.save_unieval_results(model_name_list[0], self.unieval_metric_stats, unieval_results_save_path)
+
+                # Save charts and csv.
+                unieval_analyses_save_path = os.path.join(unieval_base_save_path, "evaluation_analyses")
+                unieval.analyze_unieval_results(unieval_results_save_path, unieval_analyses_save_path)
+
+            if self.gpt_evaluation_results:
+                # Save evaluation results for GPT evaluation metrics.
+                gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
+                gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
+
+                all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0],
+                                                                           self.gpt_evaluation_results,
+                                                                           gpt_evaluation_results_save_path)
+
+                # Start to calculate scores and save statistics.
+                gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
+                gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
+                                                            gpt_evaluation_statistics_save_path)
+
+                # Save charts and csv.
+                gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
+                gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
+                                                               gpt_evaluation_analyses_save_path)
diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/Chat/evaluate/gpt_evaluate.py
index b433500dfa04..6702526ac5e6 100644
--- a/applications/Chat/evaluate/gpt_evaluate.py
+++ b/applications/Chat/evaluate/gpt_evaluate.py
@@ -599,7 +599,7 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> N
 
     for category in tqdm.tqdm(
             frame_per_category.keys(),
-            desc=f"category: ",
+            desc=f"GPT evaluation: ",
             total=len(frame_per_category.keys()),
     ):
         data = pd.DataFrame(frame_per_category[category])
diff --git a/applications/Chat/evaluate/metrics.py b/applications/Chat/evaluate/metrics.py
index 031f6fa83926..e220226ec041 100644
--- a/applications/Chat/evaluate/metrics.py
+++ b/applications/Chat/evaluate/metrics.py
@@ -4,6 +4,7 @@
 import jieba
 from bert_score import score
 from nltk.translate.bleu_score import sentence_bleu
+from nltk.translate.chrf_score import sentence_chrf
 from rouge_chinese import Rouge as Rouge_cn
 from rouge_score import rouge_scorer as Rouge_en
 from sklearn.metrics import f1_score, precision_score, recall_score
@@ -40,6 +41,27 @@ def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str,
     return bleu_scores
 
 
+def chrf_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
+    """Calculate the mean sentence-level CHRF score of preds against targets.
+
+    Chinese ("cn") text is segmented with jieba; English ("en") text is split on whitespace.
+    Raises ValueError for any other language; empty input makes statistics.mean raise.
+    """
+    if language not in ("cn", "en"):
+        raise ValueError(f"Unsupported language {language} for CHRF score!")
+
+    cumulative_chrf = []
+    for pred, target in zip(preds, targets):
+        if language == "cn":
+            pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
+            target_list = ' '.join(jieba.cut(preprocessing_text(target))).split()
+        else:
+            pred_list = preprocessing_text(pred).split()
+            target_list = preprocessing_text(target).split()
+        cumulative_chrf.append(sentence_chrf(target_list, pred_list))
+    return {"chrf": statistics.mean(cumulative_chrf)}
+
+
 def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
     """Calculate Chinese ROUGE Score Metric
 
diff --git a/applications/Chat/evaluate/unieval/__init__.py b/applications/Chat/evaluate/unieval/__init__.py
new file mode 100644
index 000000000000..dad8d6ad09fa
--- /dev/null
+++ b/applications/Chat/evaluate/unieval/__init__.py
@@ -0,0 +1,12 @@
+from .evaluator import get_evaluator
+from .utils import (
+    analyze_unieval_results,
+    calculate_average_score,
+    convert_data_to_unieval_format,
+    save_unieval_results,
+)
+
+__all__ = [
+    'get_evaluator', 'convert_data_to_unieval_format', 'calculate_average_score', 'save_unieval_results',
+    'analyze_unieval_results'
+]
diff --git a/applications/Chat/evaluate/unieval/evaluator.py b/applications/Chat/evaluate/unieval/evaluator.py
new file mode 100644
index 000000000000..385425e4a576
--- /dev/null
+++ b/applications/Chat/evaluate/unieval/evaluator.py
@@ -0,0 +1,330 @@
+# MIT License
+
+# Copyright (c) 2022 Ming Zhong
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import numpy as np
+from nltk import sent_tokenize
+
+from .scorer import UniEvaluator
+from .utils import add_question
+
+
+class SumEvaluator:
+
+    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
+        """ Set up evaluator for text summarization """
+        self.scorer = UniEvaluator(
+            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
+            max_length=max_length,
+            device=device,
+            cache_dir=cache_dir)
+        self.task = 'summarization'
+        self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance']
+
+    def evaluate(self, data, category, dims=None, overall=True):
+        """
+            Get the scores of all the given dimensions
+
+            data: A list of dicts; each is read for 'source' and 'system_output'
+                  (and 'reference' when evaluating relevance).
+            category: The category to be evaluated (forwarded to the scorer).
+            dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
+                  four dimensions: coherence, consistency, fluency, relevance.
+            overall: indicates whether the overall score is to be calculated.
+                     The default here is the average score of all the given dimensions.
+            Returns a list with one {dimension: score} dict per sample.
+        """
+        n_data = len(data)
+        eval_scores = [{} for _ in range(n_data)]
+
+        if dims is None:
+            eval_dims = self.dimensions
+        else:
+            assert isinstance(dims, list)
+            eval_dims = dims
+
+        for dim in eval_dims:
+            # Calculate average sentence-level scores for 'consistency' and 'fluency'
+            if dim == 'consistency' or dim == 'fluency':
+                src_list, output_list = [], []
+                n_sents = []    # the number of sentences in each generated summary
+                for i in range(n_data):
+                    source = data[i]['source']
+                    system_outputs = sent_tokenize(data[i]['system_output'])
+                    n_sents.append(len(system_outputs))
+                    for sentence in system_outputs:
+                        src_list.append(source)
+                        output_list.append(sentence)
+                input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
+                sent_score = self.scorer.score(input_list, self.task, category, dim)
+
+                # Get average score for each sample (0 when the output has no sentences)
+                start_idx = 0
+                score = []
+                for cur_n_sent in n_sents:
+                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / max(cur_n_sent, 1))
+                    start_idx += cur_n_sent
+
+            # Calculate summary-level score for 'coherence' and 'relevance'
+            elif dim == 'coherence' or dim == 'relevance':
+                src_list, output_list, ref_list = [], [], []
+                for i in range(n_data):
+                    src_list.append(data[i]['source'])
+                    output_list.append(data[i]['system_output'])
+                    if dim == 'relevance':
+                        ref_list.append(data[i]['reference'])
+                input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
+                score = self.scorer.score(input_list, self.task, category, dim)
+
+            # Please customize other dimensions here for summarization
+            else:
+                raise NotImplementedError('The input format for this dimension is still undefined. '
+                                          'Please customize it first.')
+
+            for i in range(n_data):
+                eval_scores[i][dim] = score[i]
+
+        # Customize your overall score here.
+        if overall:
+            for i in range(n_data):
+                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
+
+        return eval_scores
+
+
+class DialogEvaluator:
+
+    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
+        """ Set up evaluator for dialogues """
+        self.scorer = UniEvaluator(
+            model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path,
+            max_length=max_length,
+            device=device,
+            cache_dir=cache_dir)
+        self.task = 'dialogue'
+        self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability']
+
+    def evaluate(self, data, category, dims=None, overall=True):
+        """
+            Get the scores of all the given dimensions
+
+            data: A list of dicts; each is read for 'source', 'context' and 'system_output'.
+            category: The category to be evaluated (forwarded to the scorer).
+            dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
+                  five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
+            overall: indicates whether the overall score is to be calculated.
+                     The default here is the average score of all the given dimensions.
+
+            Returns a list with one {dimension: score} dict per sample.
+        """
+        n_data = len(data)
+        eval_scores = [{} for _ in range(n_data)]
+
+        if dims is None:
+            eval_dims = self.dimensions
+        else:
+            assert isinstance(dims, list)
+            eval_dims = dims
+
+        for dim in eval_dims:
+            # Calculate summation score for 'engagingness'
+            if dim == 'engagingness':
+                src_list, output_list, context_list = [], [], []
+                n_sents = []    # the number of sentences in each generated response
+                for i in range(n_data):
+                    source = data[i]['source']
+                    context = data[i]['context']
+                    system_outputs = sent_tokenize(data[i]['system_output'])
+                    n_sents.append(len(system_outputs))
+                    for sentence in system_outputs:
+                        src_list.append(source)
+                        context_list.append(context)
+                        output_list.append(sentence)
+                input_list = add_question(dimension=dim,
+                                          output=output_list,
+                                          src=src_list,
+                                          context=context_list,
+                                          task=self.task)
+                sent_score = self.scorer.score(input_list, self.task, category, dim)
+
+                # Get the summation score for each sample
+                start_idx = 0
+                score = []
+                for cur_n_sent in n_sents:
+                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]))
+                    start_idx += cur_n_sent
+
+            # Calculate turn-level score for other dimensions
+            elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']:
+                src_list, output_list, context_list = [], [], []
+                for i in range(n_data):
+                    src_list.append(data[i]['source'])
+                    output_list.append(data[i]['system_output'])
+                    context_list.append(data[i]['context'])
+                input_list = add_question(dimension=dim,
+                                          output=output_list,
+                                          src=src_list,
+                                          context=context_list,
+                                          task=self.task)
+                score = self.scorer.score(input_list, self.task, category, dim)
+
+            # Please customize other dimensions here for dialogue
+            else:
+                raise NotImplementedError('The input format for this dimension is still undefined. '
+                                          'Please customize it first.')
+
+            for i in range(n_data):
+                eval_scores[i][dim] = score[i]
+
+        # Customize your overall score here.
+        if overall:
+            for i in range(n_data):
+                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
+
+        return eval_scores
+
+
+class D2tEvaluator:
+
+    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
+        """ Set up evaluator for data-to-text """
+        self.scorer = UniEvaluator(
+            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
+            max_length=max_length,
+            device=device,
+            cache_dir=cache_dir)
+        self.task = 'data2text'
+        self.dimensions = ['naturalness', 'informativeness']
+
+    def evaluate(self, data, category, dims=None, overall=True):
+        """
+            Get the scores of all the given dimensions
+
+            data: A list of dicts; each is read for 'system_output' and 'reference'.
+            category: The category to be evaluated (forwarded to the scorer).
+            dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
+                  two dimensions: naturalness and informativeness.
+            overall: indicates whether the overall score is to be calculated.
+                     The default here is the average score of all the given dimensions.
+
+            Returns a list with one {dimension: score} dict per sample.
+        """
+        n_data = len(data)
+        eval_scores = [{} for _ in range(n_data)]
+
+        if dims is None:
+            eval_dims = self.dimensions
+        else:
+            assert isinstance(dims, list)
+            eval_dims = dims
+
+        for dim in eval_dims:
+            output_list, ref_list = [], []
+            for i in range(n_data):
+                output_list.append(data[i]['system_output'])
+                ref_list.append(data[i]['reference'])
+
+            input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
+            score = self.scorer.score(input_list, self.task, category, dim)
+
+            for i in range(n_data):
+                eval_scores[i][dim] = score[i]
+
+        # Customize your overall score here.
+        if overall:
+            for i in range(n_data):
+                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
+
+        return eval_scores
+
+
+class FactEvaluator:
+
+    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
+        """ Set up evaluator for factual consistency detection """
+        self.scorer = UniEvaluator(
+            model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path,
+            max_length=max_length,
+            device=device,
+            cache_dir=cache_dir)
+        self.task = 'fact'
+        self.dim = 'consistency'
+
+    def evaluate(self, data, category):
+        """
+            Get the factual consistency score (only 1 dimension for this task)
+            data: A list of dicts; each is read for 'source' and 'system_output'.
+            category: The category to be evaluated (forwarded to the scorer).
+        """
+        n_data = len(data)
+        eval_scores = [{} for _ in range(n_data)]
+
+        # Calculate average sentence-level scores for factual consistency
+        src_list, output_list = [], []
+        n_sents = []    # the number of sentences in the claim
+        for i in range(n_data):
+            source = data[i]['source']
+            system_outputs = sent_tokenize(data[i]['system_output'])
+            n_sents.append(len(system_outputs))
+            for sentence in system_outputs:
+                src_list.append(source)
+                output_list.append(sentence)
+        input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
+        sent_score = self.scorer.score(input_list, self.task, category, self.dim)
+
+        # Get average score for each sample (0 when the output has no sentences)
+        start_idx = 0
+        score = []
+        for cur_n_sent in n_sents:
+            score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / max(cur_n_sent, 1))
+            start_idx += cur_n_sent
+
+        for i in range(n_data):
+            eval_scores[i][self.dim] = score[i]
+
+        return eval_scores
+
+
+def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None):
+    """
+        Build the evaluator that matches the given task.
+
+        task: one of 'summarization', 'dialogue', 'data2text' or 'fact'.
+        model_name_or_path: forwarded to the evaluator; an empty string makes it
+                            fall back to its default UniEval checkpoint.
+        max_length, device, cache_dir: forwarded unchanged to the evaluator.
+    """
+    evaluator_classes = {
+        'summarization': SumEvaluator,
+        'dialogue': DialogEvaluator,
+        'data2text': D2tEvaluator,
+        'fact': FactEvaluator,
+    }
+    # Kept for backward compatibility: unknown tasks raise AssertionError unless asserts are disabled.
+    assert task in ['summarization', 'dialogue', 'data2text', 'fact']
+    if task not in evaluator_classes:
+        # Only reachable when asserts are stripped (python -O).
+        raise NotImplementedError('Other tasks are not implemented, '
+                                  'please customize specific tasks here.')
+    return evaluator_classes[task](model_name_or_path=model_name_or_path,
+                                   max_length=max_length,
+                                   device=device,
+                                   cache_dir=cache_dir)
diff --git a/applications/Chat/evaluate/unieval/scorer.py b/applications/Chat/evaluate/unieval/scorer.py
new file mode 100644
index 000000000000..2c70bb9f6ded
--- /dev/null
+++ b/applications/Chat/evaluate/unieval/scorer.py
@@ -0,0 +1,101 @@
+# MIT License
+
+# Copyright (c) 2022 Ming Zhong
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+
+class UniEvaluator:
+
+    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
+        """ Set up model: load config, tokenizer and seq2seq model, then move the model to device """
+        self.device = device
+        self.max_length = max_length
+
+        self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir)
+
+        self.model.eval()
+        self.model.to(device)
+
+        self.softmax = nn.Softmax(dim=1)
+
+        self.pos_id = self.tokenizer("Yes")["input_ids"][0]    # first token id of "Yes"
+        self.neg_id = self.tokenizer("No")["input_ids"][0]    # first token id of "No"
+
+    def score(self, inputs, task, category, dim, batch_size=8):
+        """
+            Return one score per input: final_score = positive_score / (positive_score + negative_score)
+            inputs: list of input strings. batch_size: number of inputs scored per forward pass.
+            task, category, dim: only used to label the progress bar.
+        """
+        # The implementation of "forward" in T5 still requires decoder_input_ids.
+        # Therefore, we construct a fixed one-word target sequence.
+        # The content of the target has no effect on the final scores.
+        tgts = ["No"] * len(inputs)
+
+        pos_score_list, neg_score_list = [], []
+        for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
+            src_list = inputs[i:i + batch_size]
+            tgt_list = tgts[i:i + batch_size]
+            try:
+                with torch.no_grad():
+                    encoded_src = self.tokenizer(src_list,
+                                                 max_length=self.max_length,
+                                                 truncation=True,
+                                                 padding=True,
+                                                 return_tensors='pt')
+                    encoded_tgt = self.tokenizer(tgt_list,
+                                                 max_length=self.max_length,
+                                                 truncation=True,
+                                                 padding=True,
+                                                 return_tensors='pt')
+
+                    src_tokens = encoded_src['input_ids'].to(self.device)
+                    src_mask = encoded_src['attention_mask'].to(self.device)
+
+                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1)
+
+                    output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
+                    logits = output.logits.view(-1, self.model.config.vocab_size)
+
+                    probs = self.softmax(logits)    # computed once and shared by both lookups
+                    pos_score = probs[:, self.pos_id]    # Yes
+                    neg_score = probs[:, self.neg_id]    # No
+
+                    # Collect the per-sample probabilities as Python floats.
+                    pos_score_list += pos_score.tolist()
+                    neg_score_list += neg_score.tolist()
+
+            except RuntimeError:
+                print(f'source: {src_list}')
+                print(f'target: {tgt_list}')
+                raise    # propagate the failure instead of exiting the process with status 0
+
+        # Normalize the "Yes" probability against the "Yes" + "No" mass so that
+        # each score lies between 0 and 1.
+        score_list = [pos / (pos + neg) for pos, neg in zip(pos_score_list, neg_score_list)]
+
+        return score_list
diff --git a/applications/Chat/evaluate/unieval/utils.py b/applications/Chat/evaluate/unieval/utils.py
new file mode 100644
index 000000000000..a77505faa0d2
--- /dev/null
+++ b/applications/Chat/evaluate/unieval/utils.py
@@ -0,0 +1,248 @@
+# MIT License
+
+# Copyright (c) 2022 Ming Zhong
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import os
+from typing import Dict
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import tqdm
+
+
+def add_question(dimension, output, src=None, ref=None, context=None, task=None):
+    """
+        Add questions to generate input in Bool-QA format for UniEval.
+
+        dimension: specific dimension to be evaluated
+        src: source input for different NLG tasks. For example, source document for summarization
+             and dialogue history for dialogue response generation.
+        output: output text generated by the models
+        ref: human-annotated groundtruth
+        context: the context needed to evaluate several specific dimensions. For example,
+                 additional factual information when evaluating engagingness and groundedness in dialogues.
+    """
+
+    input_with_question = []
+    for i in range(len(output)):
+        # For summarization
+        if task == 'summarization':
+            if dimension == 'fluency':
+                cur_input = 'question: Is this a fluent paragraph? </s> paragraph: ' + output[i]
+            elif dimension == 'coherence':
+                cur_input = 'question: Is this a coherent summary to the document? </s> summary: ' + output[
+                    i] + ' </s> document: ' + src[i]
+            elif dimension == 'consistency':
+                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[
+                    i] + ' </s> document: ' + src[i]
+            elif dimension == 'relevance':
+                cur_input = 'question: Is this summary relevant to the reference? </s> summary: ' + output[
+                    i] + ' </s> reference: ' + ref[i]
+            else:
+                raise NotImplementedError(
+                    'The input format for this dimension is still undefined. Please customize it first.')
+        # For dialogues
+        elif task == 'dialogue':
+            if dimension == 'naturalness':
+                cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + output[i]
+            elif dimension == 'coherence':
+                cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: '\
+                            + output[i] + ' </s> dialogue history: ' + src[i]
+            elif dimension == 'engagingness':
+                cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: '\
+                            + output[i] + ' </s> dialogue history: ' + src[i] + ' </s> fact: ' + context[i]
+            elif dimension == 'groundedness':
+                cur_input = 'question: Is this response consistent with knowledge in the fact? </s> response: '\
+                            + output[i] + ' </s> fact: ' + context[i]
+            elif dimension == 'understandability':
+                cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + output[i]
+            else:
+                raise NotImplementedError(
+                    'The input format for this dimension is still undefined. Please customize it first.')
+        # For data-to-text
+        elif task == 'data2text':
+            if dimension == 'naturalness':
+                cur_input = 'question: Is this a fluent utterance? </s> utterance: ' + output[i]
+            elif dimension == 'informativeness':
+                cur_input = 'question: Is this sentence informative according to the reference? </s> sentence: '\
+                            + output[i] + ' </s> reference: ' + ref[i]
+            else:
+                raise NotImplementedError(
+                    'The input format for this dimension is still undefined. Please customize it first.')
+        # For factual consistency detection
+        elif task == 'fact':
+            if dimension == 'consistency':
+                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[
+                    i] + ' </s> document: ' + src[i]
+            else:
+                raise NotImplementedError('No other dimensions for the factual consistency detection task.')
+        # For new customized tasks
+        else:
+            raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.')
+        input_with_question.append(cur_input)
+    return input_with_question
+
+
+def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
+    """
+        Convert the data into UniEval's format.
+
+        output_list: a list of model output
+
+        src_list: source input for different NLG tasks. For example, source document for summarization
+                  and dialogue history for dialogue response generation
+        ref_list: human-annotated groundtruth
+    """
+    json_data = []
+    for i in range(len(output_list)):
+        cur = {}
+        cur['system_output'] = output_list[i]
+        if src_list is not None:
+            cur['source'] = src_list[i]
+        if ref_list is not None:
+            cur['reference'] = ref_list[i]
+        cur['context'] = ""
+        json_data.append(cur)
+    return json_data
+
+
+def calculate_average_score(scores):
+    """
+        Calculate average scores for different metrics
+
+        scores: a list of scores for different metrics for each answer
+
+    """
+    metrics = {metric: 0 for metric in scores[0]}
+
+    for score in scores:
+        for metric in score:
+            metrics[metric] += score[metric]
+
+    for metric in metrics:
+        metrics[metric] /= len(scores)
+
+    return metrics
+
+
+def save_unieval_results(model_name: str, unieval_metric_stats: Dict[str, Dict], save_path: str) -> None:
+    """
+    Save UniEval evaluation results of different categories for one model.
+
+    """
+
+    if not os.path.exists(save_path):
+        os.makedirs(save_path)
+
+    unieval_metric_stats_per_category = {}
+    for task, category_stat in unieval_metric_stats.items():
+        for category, metric_stat in category_stat.items():
+            if unieval_metric_stats_per_category.get(category, None) is None:
+                unieval_metric_stats_per_category[category] = {}
+            for metric, score in metric_stat.items():
+                unieval_metric_stats_per_category[category][f"{metric}-{task}"] = score
+
+    automatic_df = pd.DataFrame(unieval_metric_stats_per_category)
+    automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
+
+
+def read_unieval_results(results_path: str, file_name: str) -> Dict[str, Dict]:
+    """
+    Read a csv file and return a dictionary which stores scores per metric.
+
+    """
+
+    results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
+
+    results_dict = {metric: {} for metric in list(results.index)}
+    for i, metric in enumerate(results_dict.keys()):
+        for j, category in enumerate(list(results.columns)):
+            if pd.isnull(results.iloc[i][j]):
+                continue
+            results_dict[metric][category] = results.iloc[i][j]
+
+    return results_dict
+
+
+def analyze_unieval_results(results_path: str, save_path: str) -> None:
+    """
+    Analyze and visualize all csv files in the given folder.
+
+    """
+
+    if not os.path.exists(results_path):
+        raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
+
+    all_statistics = {}
+
+    for file_name in os.listdir(results_path):
+        if file_name.endswith("_results.csv"):
+            model_name = file_name.split("_results.csv")[0]
+            all_statistics[model_name] = read_unieval_results(results_path, file_name)
+
+    if len(list(all_statistics.keys())) == 0:
+        raise Exception(f'There are no csv files in the given directory "{results_path}"!')
+
+    frame_all = {"model": [], "category": [], "metric": [], "score": []}
+    frame_per_metric = {}
+    for model_name, model_statistics in all_statistics.items():
+        for metric, metric_statistics in model_statistics.items():
+            if frame_per_metric.get(metric) is None:
+                frame_per_metric[metric] = {"model": [], "category": [], "score": []}
+
+            for category, category_score in metric_statistics.items():
+                frame_all["model"].append(model_name)
+                frame_all["category"].append(category)
+                frame_all["metric"].append(metric)
+                frame_all["score"].append(category_score)
+
+                frame_per_metric[metric]["model"].append(model_name)
+                frame_per_metric[metric]["category"].append(category)
+                frame_per_metric[metric]["score"].append(category_score)
+
+    if not os.path.exists(save_path):
+        os.makedirs(save_path)
+
+    frame_all = pd.DataFrame(frame_all)
+    frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))
+
+    for metric in tqdm.tqdm(
+            frame_per_metric.keys(),
+            desc=f"UniEval metrics: ",
+            total=len(frame_per_metric.keys()),
+    ):
+        data = pd.DataFrame(frame_per_metric[metric])
+
+        sns.set()
+        fig = plt.figure(figsize=(16, 10))
+
+        fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
+        fig.set_title(
+            f"Comparison between Different Models for Metric {metric.split('-')[0].title()} in Task {metric.split('-')[1].title()}"
+        )
+        plt.xlabel("Evaluation Category")
+        plt.ylabel("Score")
+
+        figure = fig.get_figure()
+        figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
+
+        plt.close()
diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py
index 1f4069386fcd..fefe25f5e764 100644
--- a/applications/Chat/evaluate/utils.py
+++ b/applications/Chat/evaluate/utils.py
@@ -199,7 +199,7 @@ def analyze_automatic_results(results_path: str, save_path: str) -> None:
 
     for metric in tqdm.tqdm(
             frame_per_metric.keys(),
-            desc=f"metric: ",
+            desc=f"automatic metrics: ",
             total=len(frame_per_metric.keys()),
     ):
         data = pd.DataFrame(frame_per_metric[metric])

From ddcf58cacf9581d9c59a18f8276d52a061818fab Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Fri, 9 Jun 2023 09:41:27 +0800
Subject: [PATCH 47/52] Revert "[sync] sync feature/shardformer with develop"

---
 colossalai/device/README.md                   |   73 --
 colossalai/device/device_mesh.py              |  444 +++----
 colossalai/lazy/lazy_init.py                  |   16 +-
 colossalai/nn/layer/parallel_1d/_operation.py |    1 -
 colossalai/shardformer/README.md              |  296 -----
 colossalai/shardformer/__init__.py            |    0
 colossalai/shardformer/layer/__init__.py      |    0
 colossalai/shardformer/layer/_operation.py    |   97 --
 .../shardformer/layer/dist_crossentropy.py    |  105 --
 colossalai/shardformer/layer/dropout.py       |   58 -
 colossalai/shardformer/layer/layers.py        | 1043 -----------------
 colossalai/shardformer/model/__init__.py      |    0
 colossalai/shardformer/model/modeling_bert.py |   67 --
 colossalai/shardformer/policies/__init__.py   |    0
 colossalai/shardformer/policies/autopolicy.py |   58 -
 colossalai/shardformer/policies/basepolicy.py |  217 ----
 colossalai/shardformer/policies/bert.py       |  170 ---
 colossalai/shardformer/policies/gpt2.py       |  118 --
 colossalai/shardformer/shard/__init__.py      |    5 -
 colossalai/shardformer/shard/shard_config.py  |   20 -
 colossalai/shardformer/shard/sharder.py       |  266 -----
 colossalai/shardformer/shard/slicer.py        |  161 ---
 colossalai/shardformer/test/config.py         |    1 -
 colossalai/shardformer/test/module_test.py    |   50 -
 colossalai/shardformer/test/test.py           |  124 --
 colossalai/shardformer/utils/__init__.py      |    0
 colossalai/shardformer/utils/utils.py         |   58 -
 colossalai/tensor/comm_spec.py                |   89 +-
 colossalai/tensor/d_tensor/RAEDME.md          |  103 --
 colossalai/tensor/d_tensor/__init__.py        |    4 -
 colossalai/tensor/d_tensor/comm_spec.py       |   88 +-
 colossalai/tensor/d_tensor/d_tensor.py        |  114 +-
 colossalai/tensor/d_tensor/layout.py          |   30 +-
 .../tensor/d_tensor/layout_converter.py       |   86 +-
 colossalai/tensor/d_tensor/sharding_spec.py   |   31 +-
 docs/sidebars.json                            |    1 -
 docs/source/en/features/lazy_init.md          |   71 --
 docs/source/zh-Hans/features/lazy_init.md     |   71 --
 tests/test_device/test_device_mesh.py         |   13 +-
 tests/test_device/test_init_logical_pg.py     |   16 +-
 tests/test_lazy/lazy_init_utils.py            |   10 +-
 tests/test_lazy/test_distribute.py            |   28 +-
 .../test_dtensor/test_comm_spec.py            |   33 +-
 .../test_tensor/test_dtensor/test_dtensor.py  |   17 +-
 .../test_dtensor/test_layout_converter.py     |   41 +-
 tests/test_tensor/test_shape_consistency.py   |    7 +-
 tests/test_tensor/test_sharded_linear.py      |    2 +-
 tests/test_tensor/test_sharding_spec.py       |    2 +-
 48 files changed, 437 insertions(+), 3868 deletions(-)
 delete mode 100644 colossalai/device/README.md
 delete mode 100644 colossalai/shardformer/README.md
 delete mode 100644 colossalai/shardformer/__init__.py
 delete mode 100644 colossalai/shardformer/layer/__init__.py
 delete mode 100644 colossalai/shardformer/layer/_operation.py
 delete mode 100644 colossalai/shardformer/layer/dist_crossentropy.py
 delete mode 100644 colossalai/shardformer/layer/dropout.py
 delete mode 100644 colossalai/shardformer/layer/layers.py
 delete mode 100644 colossalai/shardformer/model/__init__.py
 delete mode 100644 colossalai/shardformer/model/modeling_bert.py
 delete mode 100644 colossalai/shardformer/policies/__init__.py
 delete mode 100644 colossalai/shardformer/policies/autopolicy.py
 delete mode 100644 colossalai/shardformer/policies/basepolicy.py
 delete mode 100644 colossalai/shardformer/policies/bert.py
 delete mode 100644 colossalai/shardformer/policies/gpt2.py
 delete mode 100644 colossalai/shardformer/shard/__init__.py
 delete mode 100644 colossalai/shardformer/shard/shard_config.py
 delete mode 100644 colossalai/shardformer/shard/sharder.py
 delete mode 100644 colossalai/shardformer/shard/slicer.py
 delete mode 100644 colossalai/shardformer/test/config.py
 delete mode 100644 colossalai/shardformer/test/module_test.py
 delete mode 100644 colossalai/shardformer/test/test.py
 delete mode 100644 colossalai/shardformer/utils/__init__.py
 delete mode 100644 colossalai/shardformer/utils/utils.py
 delete mode 100644 colossalai/tensor/d_tensor/RAEDME.md
 delete mode 100644 docs/source/en/features/lazy_init.md
 delete mode 100644 docs/source/zh-Hans/features/lazy_init.md

diff --git a/colossalai/device/README.md b/colossalai/device/README.md
deleted file mode 100644
index 8f835735bef4..000000000000
--- a/colossalai/device/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# 🗄 Device
-
-## 📚 Table of Contents
-
-- [🗄 Device](#-device)
-  - [📚 Table of Contents](#-table-of-contents)
-  - [🔗 Introduction](#-introduction)
-  - [📝 Design](#-design)
-  - [🔨 Usage](#-usage)
-
-## 🔗 Introduction
-
-This module contains the implementation of the abstraction of the device topology. It is used to represent the device topology and manage the distributed information related to the network.
-
-## 📝 Design
-
-
-This module is inspired by the DeviceMesh in the [Alpa project](https://github.com/alpa-projects/alpa) and the device array can be represented as a 1D or 2D mesh. We will be extending the device mesh to support 3D mesh in the future.
-
-
-## 🔨 Usage
-
-- Create a device mesh
-
-```python
-# this is the list of global ranks involved in the device mesh
-# assume we have 4 GPUs and the global ranks for these GPUs are 0, 1, 2, 3
-physical_mesh_id = torch.arange(4)
-mesh_shape = [2, 2]
-device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-```
-
-- View the mesh
-
-
-```python
-# view the mesh shape
-# expect output
-# [2, 2]
-print(device_mesh.shape)
-
-
-# view the logical mesh with global ranks
-# expect output
-# [
-#   [0, 1],
-#   [2, 3]
-# ]
-print(device_mesh.logical_mesh_id)
-
-# view the number of devices in the mesh
-# expect output
-# 4
-print(device_mesh.num_devices)
-
-```
-
-- Initialize the process group
-
-```python
-# intialize process group
-device_mesh.init_logical_process_group()
-
-
-# get the process group for a rank with respect to an axis
-# this is the process group involving global ranks 0 and 2
-print(device_mesh.get_process_group(axis=0, global_rank=0))
-
-# get the ranks in the process with respect to an axis
-# expect output
-# [0, 2]
-print(device_mesh.get_ranks_in_process_group(axis=0, global_rank=0))
-```
diff --git a/colossalai/device/device_mesh.py b/colossalai/device/device_mesh.py
index 0490a440153e..2a5f747fbc23 100644
--- a/colossalai/device/device_mesh.py
+++ b/colossalai/device/device_mesh.py
@@ -3,19 +3,11 @@
    with some changes. """
 
 import operator
-from dataclasses import dataclass
 from functools import reduce
-from typing import Dict, List, Union
+from typing import List, Tuple
 
 import torch
 import torch.distributed as dist
-from torch.distributed import ProcessGroup
-
-
-@dataclass
-class ProcessGroupContainer:
-    process_group: ProcessGroup
-    ranks: List[int]
 
 
 # modified from alpa LogicalDeviceMesh(https://github.com/alpa-projects/alpa/blob/main/alpa/shard_parallel/auto_sharding.py)
@@ -35,11 +27,9 @@ class DeviceMesh:
             during initializing the DeviceMesh instance if the init_process_group set to True.
             Otherwise, users need to call create_process_groups_for_logical_mesh manually to init logical process group.
             (default: False)
-        device (str): the device for the process groups used by the DeviceMesh instance. (default: 'cuda')
+        need_flatten(bool, optional): initialize flatten_device_mesh during initializing the DeviceMesh instance if the need_flatten set to True.
     """
 
-    _DIST_BACKEND = {"cuda": "nccl", "cpu": "gloo"}
-
     def __init__(self,
                  physical_mesh_id: torch.Tensor,
                  mesh_shape: torch.Size = None,
@@ -47,140 +37,48 @@ def __init__(self,
                  mesh_alpha: List[float] = None,
                  mesh_beta: List[float] = None,
                  init_process_group: bool = False,
-                 device: str = 'cuda'):
-        # ============================
-        # Physical & Logical Mesh IDs
-        # ============================
-        self._physical_mesh_id = physical_mesh_id
-        assert physical_mesh_id.dim() == 1, "physical_mesh_id should be a 1D tensor."
-
-        # logical mesh ids can be obtained via two ways
-        # 1. provide physical mesh id and provide mesh shape
-        # 2. directly supply the logical mesh id
-        assert mesh_shape is None or logical_mesh_id is None, \
-            "Only one of mesh_shape and logical_mesh_id can be specified." \
-            "Logical mesh IDs are obtained from either mesh_shape + phyiscal_mesh_id or directly from the user-supplied logical_mesh_id"
-
+                 need_flatten: bool = True):
+        self.physical_mesh_id = physical_mesh_id
         if logical_mesh_id is None:
             self.mesh_shape = mesh_shape
-            self._logical_mesh_id = self._physical_mesh_id.reshape(self.mesh_shape)
+            self._logical_mesh_id = self.physical_mesh_id.reshape(self.mesh_shape)
         else:
             self._logical_mesh_id = logical_mesh_id
             self.mesh_shape = self._logical_mesh_id.shape
 
-        # ensure two things:
-        # 1. logical and physical mesh IDs should contain the same elements
-        # 2. there is no duplicate IDs in each mesh, e.g. [2, 2] is not allowed
-        assert torch.equal(torch.unique(self._physical_mesh_id), torch.unique(self.logical_mesh_id)), \
-            "physical and logical mesh IDs should contain the same elements, please check if you have consistent physical_mesh_id and logical_mesh_id."
-        assert torch.unique(self._physical_mesh_id).numel() == self._physical_mesh_id.numel(), \
-            "Found duplicate IDs in the phyiscal_mesh_id and this is not allowed, please check your physical_mesh_id again."
-        assert torch.unique(self.logical_mesh_id).numel() == self.logical_mesh_id.numel(), \
-            "Found duplicate IDs in the logical_mesh_id and this is not allowed, please check your logical_mesh_id again."
-
-        # ===============================================
+        # map global rank into logical rank
+        self.convert_map = {}
+        self._global_rank_to_logical_rank_map(self._logical_mesh_id, [])
         # coefficient for alpha-beta communication model
-        # alpha is latency and beta is bandwidth
-        # ===============================================
-        # if the values are not provided, we assume they are 1 for simplicity
         if mesh_alpha is None:
             mesh_alpha = [1] * len(self.mesh_shape)
         if mesh_beta is None:
             mesh_beta = [1] * len(self.mesh_shape)
-
         self.mesh_alpha = tuple(mesh_alpha)
         self.mesh_beta = tuple(mesh_beta)
-
-        # ensure the alpha and beta have the same shape
-        assert len(self.mesh_alpha) == len(self.mesh_beta), \
-            "mesh_alpha and mesh_beta should have the same length, please check your mesh_alpha and mesh_beta again."
-
-        # =========================
-        # Device for Process Group
-        # =========================
-        self._device = device
-        self._dist_backend = self._DIST_BACKEND[device]
-
-        # =========================
-        # Process Group Management
-        # =========================
-        # the _global_to_local_rank_mapping is structured as follows
-        # {
-        #    <global-rank>: [ <local-rank-on-axis-0>, <local-rank-on-axis-1>, <local-rank-on-axis-2>, ...]
-        # }
-        self._global_to_local_rank_mapping = dict()
-        self._init_global_to_logical_rank_mapping(mapping=self._global_to_local_rank_mapping,
-                                                  tensor=self.logical_mesh_id)
-
-        # create process group
-        self._process_group_dict = {}
-        self._ranks_in_the_process_group = {}
-        self._global_rank_of_current_process = None
-        self._is_initialized = False
-
-        # initialize process group if specified
-        self._init_ranks_in_the_same_group()
-        self._init_process_group = init_process_group
-        if init_process_group:
-            self.init_logical_process_group()
+        self.init_process_group = init_process_group
+        self.need_flatten = need_flatten
+        if self.init_process_group:
+            self.process_groups_dict = self.create_process_groups_for_logical_mesh()
+        if self.need_flatten and self._logical_mesh_id.dim() > 1:
+            self.flatten_device_mesh = self.flatten()
+            # Create a new member `flatten_device_meshes` to distinguish from original flatten methods (Because I'm not sure if there are functions that rely on the self.flatten())
+            # self.flatten_device_meshes = FlattenDeviceMesh(self.physical_mesh_id, self.mesh_shape, self.mesh_alpha,
+            #                                                self.mesh_beta)
 
     @property
-    def shape(self) -> torch.Size:
-        """
-        Return the shape of the logical mesh.
-        """
+    def shape(self):
         return self.mesh_shape
 
     @property
-    def num_devices(self) -> int:
-        """
-        Return the number of devices contained in the device mesh.
-        """
-        return reduce(operator.mul, self._physical_mesh_id.shape, 1)
+    def num_devices(self):
+        return reduce(operator.mul, self.physical_mesh_id.shape, 1)
 
     @property
-    def logical_mesh_id(self) -> torch.Tensor:
-        """
-        Return the logical mesh id.
-        """
+    def logical_mesh_id(self):
         return self._logical_mesh_id
 
-    def get_process_group(self, axis: int, global_rank: int = None) -> ProcessGroup:
-        """
-        Return the process group on the specified axis.
-
-        Args:
-            axis (int): the axis of the process group.
-            global_rank (int, optional): the global rank of the process group. If not specified, the current process is used. (default: None)
-        """
-        if global_rank is None:
-            global_rank = self._global_rank_of_current_process
-        return self._process_group_dict[global_rank][axis]
-
-    def get_process_group_for_all_axes(self, global_rank: int = None) -> Dict[int, ProcessGroup]:
-        """
-        Return the process groups for all axes.
-
-        Args:
-            global_rank (int, optional): the global rank of the process
-        """
-        if global_rank is None:
-            global_rank = self._global_rank_of_current_process
-        return self._process_group_dict[global_rank]
-
-    def get_ranks_in_process_group(self, axis: int, global_rank: int = None) -> List[int]:
-        """
-        Return the ranks in the process group on the specified axis.
-
-        Args:
-            axis (int): the axis of the process group.
-            global_rank (int, optional): the global rank of the process
-        """
-        if global_rank is None:
-            global_rank = self._global_rank_of_current_process
-        return self._ranks_in_the_process_group[global_rank][axis]
-
-    def __deepcopy__(self, memo) -> "DeviceMesh":
+    def __deepcopy__(self, memo):
         cls = self.__class__
         result = cls.__new__(cls)
         memo[id(self)] = result
@@ -188,206 +86,111 @@ def __deepcopy__(self, memo) -> "DeviceMesh":
             if k != 'process_groups_dict':
                 setattr(result, k, __import__("copy").deepcopy(v, memo))
             else:
-                # process group cannot be copied
-                # thus, we share them directly
                 setattr(result, k, v)
+
         return result
 
-    def _init_global_to_logical_rank_mapping(self,
-                                             mapping: Dict,
-                                             tensor: torch.Tensor,
-                                             index_list: List[int] = []) -> Dict[int, List[int]]:
+    def flatten(self):
         """
-        Build a global rank to local rank mapping for each process group in different axis in the logical device mesh.
-
-        Args:
-            mapping (Dict): a dictionary that maps the global rank to the local rank in the logical device mesh.
-            tensor (torch.Tensor): the tensor that contains the logical mesh ids.
-            index_list (List[int])
-
-        Returns:
-            mapping (Dict): a dictionary that maps the global rank to the local rank in the logical device mesh.
-                The value is a list of integers and each integer represents the local rank in the indexed axis.
+        Flatten the logical mesh into an effective 1d logical mesh.
         """
-        for index, inner_tensor in enumerate(tensor):
-            # index means the local rank in the current axis
-            # inner_tensor refers to the processes with the same local rank
+        flatten_mesh_shape_size = len(self.mesh_shape)
+        flatten_mesh_shape = [self.num_devices]
+        return DeviceMesh(self.physical_mesh_id,
+                          tuple(flatten_mesh_shape),
+                          mesh_alpha=[max(self.mesh_alpha)] * (flatten_mesh_shape_size - 1),
+                          mesh_beta=[max(self.mesh_beta)] * (flatten_mesh_shape_size - 1),
+                          init_process_group=self.init_process_group,
+                          need_flatten=False)
 
+    def _global_rank_to_logical_rank_map(self, tensor, index_list):
+        '''
+        Helper that recursively fills self.convert_map, mapping each global rank to its list of per-axis logical indices.
+        '''
+        for index, inner_tensor in enumerate(tensor):
             if inner_tensor.numel() == 1:
-                # if the inner_tensor only has one element, it means that
-                # it already reaches the last axis
-                # we append its local_rank in the last axis to the index_list
-                # and assign to the mapping
-                # the value of the mapping is the the local rank at the indexed axis of the device mesh
-                mapping[int(inner_tensor)] = index_list + [index]
+                self.convert_map[int(inner_tensor)] = index_list + [index]
             else:
-                # we recursively go into the function until we reach the last axis
-                # meanwhile, we should add the local rank in the current axis in the index_list
-                self._init_global_to_logical_rank_mapping(mapping, inner_tensor, index_list + [index])
+                self._global_rank_to_logical_rank_map(inner_tensor, index_list + [index])
 
-    def init_logical_process_group(self):
+    def create_process_groups_for_logical_mesh(self):
         '''
         This method is used to initialize the logical process groups which will be used in communications
         among logical device mesh.
         Note: if init_process_group set to False, you have to call this method manually. Otherwise,
         the communication related function, such as ShapeConsistencyManager.apply will raise errors.
         '''
-        # sanity check
-        assert dist.is_initialized, "The torch.distributed should be initialized before calling init_logical_process_group"
-        assert not self._is_initialized, "The logical process group has been initialized, do not call init_logical_process_group twice"
-
-        # update the global rank of the current process
-        self._global_rank_of_current_process = dist.get_rank()
-        duplicate_check_list = []
-
-        # flatten the global ranks to 1D list
-        global_rank_flatten_list = self._physical_mesh_id.view(-1).tolist()
-
-        for global_rank in global_rank_flatten_list:
-            # find the other ranks which are in the same process group as global_rank
-            ranks_in_same_group_by_axis = self._collate_global_ranks_in_same_process_group(global_rank)
-
-            for axis, ranks_in_same_group in ranks_in_same_group_by_axis.items():
-                # skip duplicated process group creation
-                if ranks_in_same_group in duplicate_check_list:
-                    continue
-
-                # create the process group
-                pg_handler = dist.new_group(ranks=ranks_in_same_group, backend=self._dist_backend)
-
-                # keep this process group in the process_groups_dict
-                for rank in ranks_in_same_group:
-                    if rank not in self._process_group_dict:
-                        self._process_group_dict[rank] = dict()
-                    self._process_group_dict[rank][axis] = pg_handler
-
-        # update the init flag
-        # we only allow init for once
-        self._is_initialized = True
-
-    def _init_ranks_in_the_same_group(self):
-        """
-        This method is used to initialize the ranks_in_the_same_group dictionary.
-        """
-        # flatten the global ranks to 1D list
-        global_rank_flatten_list = self._physical_mesh_id.view(-1).tolist()
-
+        process_groups_dict = {}
+        check_duplicate_list = []
+        global_rank_flatten_list = self.physical_mesh_id.view(-1).tolist()
         for global_rank in global_rank_flatten_list:
-            # find the other ranks which are in the same process group as global_rank
-            ranks_in_same_group_by_axis = self._collate_global_ranks_in_same_process_group(global_rank)
-
-            for axis, ranks_in_same_group in ranks_in_same_group_by_axis.items():
-                # create dict for each rank
-                if global_rank not in self._process_group_dict:
-                    self._ranks_in_the_process_group[global_rank] = dict()
+            process_groups = self.global_rank_to_process_groups_with_global_rank(global_rank)
+            for axis, process_group in process_groups.items():
+                if axis not in process_groups_dict:
+                    process_groups_dict[axis] = []
+                if process_group not in check_duplicate_list:
+                    check_duplicate_list.append(process_group)
+                    process_group_handler = dist.new_group(process_group)
+                    process_groups_dict[axis].append((process_group, process_group_handler))
 
-                # keep this process group in the process_groups_dict
-                self._ranks_in_the_process_group[global_rank][axis] = ranks_in_same_group
+        return process_groups_dict
 
-    def global_rank_to_local_rank(self, rank: int, axis: int = None) -> Union[List[int], int]:
-        """
-        Return the local rank of the given global rank in the logical device mesh.
+    def global_rank_to_logical_rank(self, rank):
+        return self.convert_map[rank]
 
-        Args:
-            rank (int): the global rank in the logical device mesh.
-            axis (int): the axis of the logical device mesh.
-        """
-        local_ranks = self._global_to_local_rank_mapping[rank]
-        if axis:
-            return local_ranks[axis]
-        else:
-            return local_ranks
-
-    def _collate_global_ranks_in_same_process_group(self, global_rank):
+    def global_rank_to_process_groups_with_logical_rank(self, rank):
         '''
-        Give a global rank and return all global ranks involved in its associated process group in each axis.
-
-        Example:
-
-        ```python
-        sphysical_mesh_id = torch.arange(0, 16)
-        mesh_shape = (4, 4)
-
-        # logical mesh will look like
-        # [[0, 1, 2, 3],
-        #  [4, 5, 6, 7],
-        #  [8, 9, 10,11],
-        #  [12,13,14,15]]
-
-        device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-        print(device_mesh.collate_global_ranks_in_same_process_group(0))
-
-        # key is axis name
-        # value is a list of global ranks in same axis with rank 0
-        # output will look like
-        # {
-            0: [0, 4, 8, 12],
-            1: [0, 1, 2, 3]
-        #  }
+        Given a global rank, return for each mesh axis the logical ranks of all members of its process group on that axis.
+        for example:
+            physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
+            mesh_shape = (4, 4)
+            # [[0, 1, 2, 3],
+            #  [4, 5, 6, 7],
+            #  [8, 9, 10,11],
+            #  [12,13,14,15]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+            print(device_mesh.global_rank_to_process_groups_with_logical_rank(0))
+        output:
+            # key is axis name
+            # value is a list of logical ranks in same axis with rank 0
+            {0: [[0, 0], [1, 0], [2, 0], [3, 0]], 1: [[0, 0], [0, 1], [0, 2], [0, 3]]}
         '''
-        # We have init the global rank to local rank by calling _init_global_to_logical_rank_mapping
-        # for self._global_to_local_rank_mapping
-        # the key is the global rank
-        # the value is the list of local ranks corresponding to the global rank with respect of different axes
-        # we can see the list of local ranks as the process coordinates for simplicity
-        # the key and value are all unique, therefore,
-        # we can also to use the coordinates to find the global rank
-
-        # =========================================================================
-        # Step 1
-        # find all the process_coordinates for processes in the same process group
-        # as the given global rank
-        # =========================================================================
-
-        # each
-        processes_in_the_same_process_group = {}
-
-        for dim in range(self.logical_mesh_id.dim()):
-            # iterate over the dimension size so that we can include all processes
-            # in the same process group in the given axis
-            # the _local_rank refers to the local rank of the current process
-            for _local_rank in range(self.logical_mesh_id.shape[dim]):
-
-                # if this dimension is not initailized yet,
-                # initialize it with an empty array
-                if dim not in processes_in_the_same_process_group:
-                    processes_in_the_same_process_group[dim] = []
-
-                # get the local rank corresponding to the global rank
-                process_coordinates = self._global_to_local_rank_mapping[global_rank].copy()
-
-                # replace the local rank in the given dimension with the
-                # lcoal rank of the current process iterated
-                process_coordinates[dim] = _local_rank
-                processes_in_the_same_process_group[dim].append(process_coordinates)
-
-        # =================================================================
-        # Step 2
-        # Use local rank combination to find its corresponding global rank
-        # =================================================================
-        # the key of the dict is the axis
-        # the value is the list of global ranks which are in the same process group as the given global rank
-        global_pg_ranks = {}
-        for dim, coordinates_of_all_processes in processes_in_the_same_process_group.items():
-            global_pg_ranks[dim] = []
-            for process_coordinates in coordinates_of_all_processes:
-                # find the global rank by local rank combination
-                for _global_rank, _process_coordinates in self._global_to_local_rank_mapping.items():
-                    if process_coordinates == _process_coordinates:
-                        global_pg_ranks[dim].append(_global_rank)
-        return global_pg_ranks
-
-    def flatten(self):
-        """
-        Flatten the logical mesh into an effective 1d logical mesh,
-        """
-        flatten_mesh_shape_size = len(self.mesh_shape)
-        flatten_mesh_shape = [self.num_devices]
-        return DeviceMesh(self._physical_mesh_id,
-                          tuple(flatten_mesh_shape),
-                          mesh_alpha=[max(self.mesh_alpha)] * (flatten_mesh_shape_size - 1),
-                          mesh_beta=[max(self.mesh_beta)] * (flatten_mesh_shape_size - 1),
-                          init_process_group=self._init_process_group)
+        process_groups = {}
+        for d in range(self.logical_mesh_id.dim()):
+            for replacer in range(self.logical_mesh_id.shape[d]):
+                if d not in process_groups:
+                    process_groups[d] = []
+                process_group_member = self.convert_map[rank].copy()
+                process_group_member[d] = replacer
+                process_groups[d].append(process_group_member)
+        return process_groups
+
+    def global_rank_to_process_groups_with_global_rank(self, rank):
+        '''
+        Given a global rank, return for each mesh axis the global ranks of all members of its process group on that axis.
+        for example:
+            physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
+            mesh_shape = (4, 4)
+            # [[0, 1, 2, 3],
+            #  [4, 5, 6, 7],
+            #  [8, 9, 10,11],
+            #  [12,13,14,15]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+            print(device_mesh.global_rank_to_process_groups_with_global_rank(0))
+        output:
+            # key is axis name
+            # value is a list of global ranks in same axis with rank 0
+            {0: [0, 4, 8, 12], 1: [0, 1, 2, 3]}
+        '''
+        logical_process_groups = self.global_rank_to_process_groups_with_logical_rank(rank)
+        process_groups = {}
+        for dim, logical_ranks in logical_process_groups.items():
+            process_groups[dim] = []
+            for logical_rank in logical_ranks:
+                for g_rank, l_rank in self.convert_map.items():
+                    if l_rank == logical_rank:
+                        process_groups[dim].append(g_rank)
+        return process_groups
 
     def all_gather_cost(self, num_bytes, mesh_dim):
         num_devices = self.logical_mesh_id.shape[mesh_dim]
@@ -409,3 +212,38 @@ def all_to_all_cost(self, num_bytes, mesh_dim):
         penalty_factor = num_devices / 2.0
         return (self.mesh_alpha[mesh_dim] + self.mesh_beta[mesh_dim] *
                 (num_devices - 1) / num_devices / num_devices * num_bytes * penalty_factor + 0.001)
+
+
+class FlattenDeviceMesh(DeviceMesh):
+
+    def __init__(self, physical_mesh_id, mesh_shape, mesh_alpha=None, mesh_beta=None):
+        super().__init__(physical_mesh_id,
+                         mesh_shape,
+                         mesh_alpha,
+                         mesh_beta,
+                         init_process_group=False,
+                         need_flatten=False)
+        # Unlike flatten(), mesh_shape is left unchanged; mesh_alpha (max) and mesh_beta (min) are reduced to scalars
+        self.mesh_alpha = max(self.mesh_alpha)
+        self.mesh_beta = min(self.mesh_beta)
+        # Different from original process_groups_dict, rank_list is not stored
+        self.process_number_dict = self.create_process_numbers_for_logical_mesh()
+
+    def create_process_numbers_for_logical_mesh(self):
+        '''
+        Build the flattened 1d process-number lists in column-major (key 0) and row-major (key 1) order.
+        for example:
+            mesh_shape = (2,4)
+            # [[0, 1, 2, 3],
+            #  [4, 5, 6, 7]]
+            # return {0: [0, 4, 1, 5, 2, 6, 3, 7], 1: [0, 1, 2, 3, 4, 5, 6, 7]}
+        '''
+        num_devices = reduce(operator.mul, self.mesh_shape, 1)
+        process_numbers_dict = {}
+        process_numbers_dict[0] = torch.arange(num_devices).reshape(self.mesh_shape).transpose(1, 0).flatten().tolist()
+        process_numbers_dict[1] = torch.arange(num_devices).reshape(self.mesh_shape).flatten().tolist()
+        return process_numbers_dict
+
+    def mix_gather_cost(self, num_bytes):
+        num_devices = reduce(operator.mul, self.mesh_shape, 1)
+        return (self.mesh_alpha + self.mesh_beta * (num_devices - 1) / num_devices * num_bytes + 0.1)
diff --git a/colossalai/lazy/lazy_init.py b/colossalai/lazy/lazy_init.py
index ca8914362cd6..76f550dc4392 100644
--- a/colossalai/lazy/lazy_init.py
+++ b/colossalai/lazy/lazy_init.py
@@ -1,5 +1,5 @@
 from types import MethodType
-from typing import Callable, Dict, Optional, Union
+from typing import Callable, Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -8,9 +8,8 @@
 from torch.utils._pytree import tree_map
 
 from colossalai._analyzer._subclasses import MetaTensor
-from colossalai.device.device_mesh import DeviceMesh
 from colossalai.tensor.d_tensor.d_tensor import DTensor
-from colossalai.tensor.d_tensor.sharding_spec import ShardingSpec
+from colossalai.tensor.d_tensor.layout import Layout
 
 # reference: https://pytorch.org/cppdocs/notes/tensor_creation.html
 _NORMAL_FACTORY = [
@@ -173,7 +172,7 @@ def materialize(self) -> torch.Tensor:
         self.clean()
         return _convert_cls(self, target)
 
-    def distribute(self, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> torch.Tensor:
+    def distribute(self, layout: Layout) -> torch.Tensor:
         """Distribute the ``LazyTensor`` to ``torch.Tensor`` by modifying __class__ (inplace), according to the layout.
 
         Args:
@@ -184,7 +183,7 @@ def distribute(self, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> to
         """
         target = self._materialize_data()
         self.clean()
-        local_tensor = DTensor(target, device_mesh, sharding_spec).local_tensor
+        local_tensor = DTensor(target, layout).local_tensor
         return _convert_cls(self, local_tensor)
 
     def clean(self) -> None:
@@ -537,10 +536,7 @@ def apply_fn(name: str, p: LazyTensor):
         return _apply_to_lazy_module(module, apply_fn, verbose)
 
     @staticmethod
-    def distribute(module: nn.Module,
-                   device_mesh: DeviceMesh,
-                   sharding_spec_dict: Dict[str, ShardingSpec],
-                   verbose: bool = False) -> nn.Module:
+    def distribute(module: nn.Module, layout_dict: dict, verbose: bool = False) -> nn.Module:
         """Distribute all ``nn.Parameter`` from ``LazyTensor``. This function will modify the module in-place.
 
         Args:
@@ -550,7 +546,7 @@ def distribute(module: nn.Module,
         """
 
         def apply_fn(name: str, p: LazyTensor):
-            p.distribute(device_mesh, sharding_spec_dict[name])
+            p.distribute(layout_dict[name])
 
         return _apply_to_lazy_module(module, apply_fn, verbose)
 
diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/nn/layer/parallel_1d/_operation.py
index 300baf9c12ba..394334558275 100644
--- a/colossalai/nn/layer/parallel_1d/_operation.py
+++ b/colossalai/nn/layer/parallel_1d/_operation.py
@@ -1,6 +1,5 @@
 import torch
 import torch.distributed as dist
-
 from colossalai.core import global_context as gpc
 
 try:
diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
deleted file mode 100644
index 93a4f1e578e4..000000000000
--- a/colossalai/shardformer/README.md
+++ /dev/null
@@ -1,296 +0,0 @@
-# ⚡️ ShardFormer
-
-## 📚 Table of Contents
-
-- [⚡️ ShardFormer](#️-shardformer)
-  - [📚 Table of Contents](#-table-of-contents)
-  - [🔗 Introduction](#-introduction)
-  - [🔨 Usage](#-usage)
-  - [🔮 Simple example](#-simple-example)
-  - [💡 Policy](#-policy)
-  - [😊 Module](#-module)
-
-
-## 🔗 Introduction
-
-**Shardformer** is a module that automatically parallelizes the mainstream models in libraries such as HuggingFace and TIMM. This module aims to make parallelization hassle-free for users who are not from the system background.
-
-## 🔨 Usage
-
-The sample API usage is given below:
-
-``` python
-from colossalai.shardformer import shard_model
-from transformers import BertForMaskedLM
-
-# create huggingface model as normal
-model = BertForMaskedLM.from_pretrained("bert-base-uncased")
-
-# make the huggingface model paralleled to ShardModel
-# auto policy:
-sharded_model = shard_model(model)
-
-# custom policy:
-from xxx import <POLICYCLASS>
-sharded_model = shard_model(model, <POLICYCLASS>)
-
-# do angthing as normal
-...
-```
-
-## 🔮 Simple example
-
-``` shell
-# inference
-colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode inference
-# train
-colossalai run --nproc_per_node 2 --master_port 29500 test.py --config config.py --mode train
-```
-
-
-## 💡 Policy
-
-If you wanna parallel the model in a custom way, just overwrite the policy class for the Hugging Face model.
-
-You should do:
-
-1. Inherit Policy class
-2. Overwrite `argument_policy` method
-    - In this method, you need to list which layers class you wanna modify and the attributes and parameters in those layers. Shardformer will replace all the layer belonging to the class you specified.
-    - `attr_dict` is dict contains all the attributes need to be modified in this layer.
-    - `param_funcs` is a list contains some functions which will return the path of the weight and bias from the layer.
-3. Overwrite `inject_policy` method (Optional)
-    - Shardformer will inject the model according to this method. If you need to modify the forward or backward progress (like distributed corssentropy loss in Bert) you need to overwrite this method.
-4. Overwrite or add the param functions
-    - These functions use a suffix to record the path of weight or bias for the layer.
-    - The return is a list contains some `Col_Layer` or `Row_Layer` objects, which means slice along col and row respectively.
-5. Overwrite `binding_policy` (Optional)
-    - Overwrite to specify Shardformer will bind some weight between layers, like embedding and unembedding layers.
-    - This function will return a dict, the key and value are the suffix of weight need to be binded.
-
-More details can be found in shardformer/policies/basepolicy.py
-``` python
-from colossalai.shardformer.policies.basepolicy import Policy, Layer, Col_Layer, Row_Layer, Argument
-
-CustomPolicy(Policy):
-@staticmethod
-    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module, Argument]:
-        r"""
-        Return the dict for the modify policy, the key is the original layer class and the value is the
-        argument for the modify layer
-
-        Args:
-            model_config (:class:`tansformer.Config`): The config of transformer model
-            shard_config (:class:`ShardConfig`): The config for sharding model
-
-        Return:
-            Dict for the modify policy,
-            ::
-            {
-                origin layer class1 (nn.Module): Argument(
-                    attr_dict = {
-                        argument1: value1,
-                        argument2: value2,
-                        ...
-                    },
-                    param_funcs = [
-                        staticmethod1,
-                        staticmethod2,
-                        ...
-                    ]
-                ),
-                origin layer class2 (nn.Module): Argument(
-                    attr_dict = {
-                        argument1: value1,
-                        argument2: value2,
-                        ...
-                    },
-                    param_funcs = [
-                        staticmethod1,
-                        staticmethod2,
-                        ...
-                    ]
-                ),
-                ...
-            }
-
-        """
-        raise NotImplementedError
-
-    @staticmethod
-    def inject_policy() -> Tuple[nn.Module, nn.Module]:
-        r"""
-        Return the dict for the inject model
-
-        Return:
-            The injected model, key is the original model and value is the new shardmodel
-            ::
-            (OrignModel, CustomModel)
-            in `CustomModel`, we can overwrite the forward and backward process
-        """
-        return ()
-
-    @staticmethod
-    def binding_policy() -> Dict:
-        r"""
-        Return the dict for the binding model
-
-        Return:
-            This method should return the binding relationship for some layers share the weight or bias,
-            the key and value is the suffix of the weight or bias of the model
-        ::
-            return {
-                "bert.embeddings.word_embeddings.weight": "cls.predictions.decoder.weight",
-            }
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def attn_in() -> List:
-        """
-        Attention qkv layer
-
-        Returns:
-            List[Layer]: List of layer object, each layer is the new
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def attn_out() -> List:
-        """
-        Attention output projection layer
-
-        Returns:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def mlp_in() -> List:
-        """
-        h -> 4h mlp layer
-
-        Returns:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def mlp_out() -> List:
-        """
-        4h -> h mlp layer
-
-        Returns:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def embedding() -> List:
-        """
-        Partially slice the embedding layer
-        vocab_size->vocab_size//gpu_nums
-
-        Return:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def unembedding() -> List:
-        """
-        Partially slice the embedding layer
-        vocab_size->vocab_size//gpu_nums
-
-        Return:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-```
-
-
-## 😊 Module
-
-  1. Flowchart
-
-  <p align="center">
-      <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/shardformer/shardformer_flowchart.png" width="600" />
-  </p>
-
-  2. Important Modules
-
-  - CLASS `shard_model`:
-
-    This is the user api to use shardformer, just create a model from transformers and define a custom policy or use shardformer autopolicy to make a shard model.
-
-  - CLASS `Layer`:
-
-    Parameters:
-    - weight (str): The weight suffix of the layer
-    - bias (str): The bias suffix of the layer
-    - replace_layer (:class:`colosalai.nn`): The layer to replace the original layer
-    - ignore (bool): Whether to ignore this layer if it is not in the model
-
-    This class is used to specify the replacement policy for a particular layer. If `replace_layer` is None, only parameter partitioning will be performed without replacing the layer class.
-
-    CLASS `Col_Layer(Layer)`:
-      - gather_output (bool): Whether to gather the output of the layer
-
-      This class inherited from `Layer`, representing the layer will be sliced along column.
-
-    CLASS `Row_Layer(Layer)`:
-
-      This class inherited from `Layer`, representing the layer will be sliced along row.
-
-  - CLASS `Policy`:
-
-    In Shardformer, this class holds significant importance as it defines the model partitioning methods, required parameter modifications, and model injection techniques all within a single Policy class.
-    - `Policy.attn_in()/attn_out()/mlp_in()/mlp_out()/embedding()/unembedding()`......
-
-      These functions define the partitioning methods of the parameters at different locations in the model. Each function returns a list of objects of Layer class that specify the replacement approach for these parameters. Shardformer also supports user-defined functions for modifying their models, in addition to the listed functions.
-    - `Policy.argument_policy()`
-
-      In this function, the user should use multiple dict to define which class of layers will require replacement. This includes the attributes and parameters that need to be modified or replaced. Attributes are stored in the form of a "suffix-string: value" dict, while parameters are stored via multiple static methods that return the replacement approach.
-    - `Policy.inject_policy()`
-
-      This function will return the injected model to replace the original model. The new model should be a nn.Module class which includes modified forward or backward functions or anything else.
-    - `Policy.binding_policy()`
-
-      This function will return the weight sharing information in the model in some dict. The key and value are both the suffixes of the shared parameters.
-
-  - CLASS `ModelSharder(model, policy)`:
-
-    This class helps shard the model, the parameter is the created transformers model and the custom policy. If custom policy is None, shardformer will automatically get already defined policy for the model.
-    - `ModelShard.inject_model()`
-
-      This function is used to inject the model to modify the forward and backward progress.
-    - `ModelShard.replace_layer()`
-
-      This function is used to replace the original layers with colossalai layer to make them paralleled and can do distributed communication.
-    - `ModelShard.bind_layer()`
-
-      This function is used to help different layers share weight or bias.
-
-  - CLASS `Slicer`:
-
-    This class is used to slice tensor according to policy.
-
-
-  3. DistCrossEntropy Loss
-  - Overview
-
-    In order to reduce the communication size, caculate the crossentropy before all gather, refer to [Megatron-LM](https://github.com/NVIDIA/Megatron-LM), reduce the communication size from [batch_size * seq_length * vocab_size] to [batch_size * seq_length]. The origin loss function is:
-    $$ loss = -\log(\frac{\exp(x[class])}{\sum_i\exp(x[i])})$$
-
-    alse can be represented as:
-
-    $$ loss = \log(\sum_i\exp(x[i])) - x[class]$$
-
-  - Step
-
-    - First get the maximum logits across all the devices, make all the logist minus the maximun value to scale the value less than zero to avoid the value of exp being too large
-
-    - Get a mask to mask the logits not in the local device
-
-    - Caculate the loss according to the second formula
diff --git a/colossalai/shardformer/__init__.py b/colossalai/shardformer/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
deleted file mode 100644
index e817ea3ebbee..000000000000
--- a/colossalai/shardformer/layer/_operation.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import torch
-import torch.distributed as dist
-
-from colossalai.core import global_context as gpc
-
-try:
-    import fused_mix_prec_layer_norm_cuda
-except:
-    fused_mix_prec_layer_norm_cuda = None
-
-
-class FusedLayerNormAffineFunction1D(torch.autograd.Function):
-    r"""Layernorm
-
-    Args:
-        input: input matrix.
-        weight: weight matrix.
-        bias: bias matrix.
-        normalized_shape: input shape from an expected input of size.
-            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
-            If a single integer is used, it is treated as a singleton list, and this module will
-            normalize over the last dimension which is expected to be of that specific size.
-        eps: a value added to the denominator for numerical stability
-  """
-
-    @staticmethod
-    def forward(ctx, input, weight, bias, normalized_shape, eps):
-        ctx.normalized_shape = normalized_shape
-        ctx.eps = eps
-        input_ = input.contiguous()
-        weight_ = weight.contiguous()
-        bias_ = bias.contiguous()
-        output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(input_, ctx.normalized_shape, weight_,
-                                                                             bias_, ctx.eps)
-        ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
-        grad_input = grad_weight = grad_bias = None
-        grad_input, grad_weight, grad_bias \
-          = fused_mix_prec_layer_norm_cuda.backward_affine(
-            grad_output.contiguous(), mean, invvar,
-            input_, ctx.normalized_shape,
-            weight_, bias_, ctx.eps)
-
-        return grad_input, grad_weight, grad_bias, None, None
-
-
-class LinearWithAsyncCommunication(torch.autograd.Function):
-    """
-    Linear layer execution with asynchronous communication in backprop.
-    """
-
-    @staticmethod
-    def forward(ctx, input_, weight, bias, parallel_mode, async_grad_allreduce):
-        ctx.save_for_backward(input_, weight)
-        ctx.use_bias = bias is not None
-        ctx.parallel_mode = parallel_mode
-        ctx.async_grad_allreduce = async_grad_allreduce
-
-        output = torch.matmul(input_, weight.t())
-        if bias is not None:
-            output = output + bias
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        input, weight = ctx.saved_tensors
-        use_bias = ctx.use_bias
-
-        total_input = input
-        grad_input = grad_output.matmul(weight)
-        grad_output = grad_output.contiguous()
-        # Convert the tensor shapes to 2D for execution compatibility
-        grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2])
-        total_input = total_input.view(total_input.shape[0] * total_input.shape[1], total_input.shape[2])
-
-        if ctx.async_grad_allreduce:
-            # Asynchronous all-reduce
-            handle = dist.all_reduce(grad_input, group=gpc.get_group(ctx.parallel_mode), async_op=True)
-            # Delay the start of weight gradient computation shortly (3us) to have
-            # all-reduce scheduled first and have GPU resources allocated
-            _ = torch.empty(1, device=grad_output.device) + 1
-
-        grad_weight = grad_output.t().matmul(total_input)
-        grad_bias = grad_output.sum(dim=0) if use_bias else None
-
-        if ctx.async_grad_allreduce:
-            handle.wait()
-
-        return grad_input, grad_weight, grad_bias, None, None, None
-
-
-def linear_with_async_comm(input_, weight, bias, parallel_mode, async_grad_allreduce):
-    return LinearWithAsyncCommunication.apply(input_, weight, bias, parallel_mode, async_grad_allreduce)
diff --git a/colossalai/shardformer/layer/dist_crossentropy.py b/colossalai/shardformer/layer/dist_crossentropy.py
deleted file mode 100644
index 1869594670ce..000000000000
--- a/colossalai/shardformer/layer/dist_crossentropy.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Function
-
-
-class DistCrossEntropy(Function):
-    r"""
-    Overwrite the forward and backward function to calculate the cross entropy loss before gather
-
-    Args:
-        Function (:class:`torch.autograd.Function`): default
-    """
-
-    @staticmethod
-    def forward(ctx, vocab_logits: torch.Tensor, target: torch.Tensor):
-        r"""
-        Calculate the cross entropy loss before gather, the origin loss function is as follows:
-        loss = -log(exp(x[class])/sum(exp(x[i]))
-        and can be rewrite as:
-        loss = log(sum(exp(x[i])) - x[class]
-
-        To avoid the `nan` of log(sim(exp(x[i]))), we minus the max of x[i]
-
-        Args:
-            vocab_logits (:class:`torch.Tensor`): The logits of the vocabulary, shape is
-              [batch_size, seq_len, vocab_size]
-            labels (:class:`torch.Tensor`): The labels of the vocabulary, shape is
-              [batch_size, seq_len]
-
-        Returns:
-            :class:`torch.Tensor`: The cross entropy loss
-        """
-        # get the max
-        logits_max = torch.max(vocab_logits, dim=-1)[0]
-        dist.all_reduce(logits_max, op=dist.ReduceOp.MAX)
-
-        # minus the max to avoid the result of sum of exp is too large and the log is nan
-        vocab_logits = vocab_logits - logits_max.unsqueeze(dim=-1)
-
-        # mask the target in the local device
-        partition_vocab_size = vocab_logits.size()[-1]
-        rank = dist.get_rank()
-        world_size = dist.get_world_size()
-        global_vocab_size = partition_vocab_size * world_size
-
-        # [down, up) => false, other device and -100 => true
-        delta = (global_vocab_size + world_size - 1) // world_size
-        down_shreshold = rank * delta
-        up_shreshold = down_shreshold + delta
-        mask = (target < down_shreshold) | (target >= up_shreshold)
-        masked_target = target.clone() - down_shreshold
-        masked_target[mask] = 0
-
-        # reshape the logist and target
-        # reshape the vocab_logits to [bath_size * seq_len, vocab_size]
-        # reshape the labels to [bath_size * seq_len]
-        logits_2d = vocab_logits.view(-1, partition_vocab_size)
-        masked_target_1d = masked_target.view(-1)
-
-        # extract the x[class] and set the x[other device] to zero
-        pred_logits_1d = logits_2d[torch.arange(start=0, end=logits_2d.shape[0], device=logits_2d.device),
-                                   masked_target_1d]
-        pred_logits_1d = pred_logits_1d.clone().contiguous()
-        pred_logits = pred_logits_1d.view_as(target)
-        pred_logits[mask] = 0.0
-
-        # allreduce the get all x(i,y)
-        dist.all_reduce(pred_logits, op=dist.ReduceOp.SUM)
-        exp_logits = vocab_logits
-        torch.exp(vocab_logits, out=exp_logits)
-        sum_exp_logits = torch.sum(exp_logits, dim=-1)
-        dist.all_reduce(sum_exp_logits, op=dist.ReduceOp.SUM)
-
-        # calculate the loss
-        # loss = log(sum(exp(x[i]))) - x[class]
-        loss = torch.log(sum_exp_logits) - pred_logits
-        loss = torch.sum(loss).div_(loss.numel())
-
-        # caculate the softmax
-        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
-        ctx.save_for_backward(exp_logits, mask, masked_target_1d)
-
-        return loss
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        # retrieve the saved tensors
-        exp_logits, mask, masked_target_1d = ctx.saved_tensors
-
-        # use exp logits as the input grad
-        grad_logits = exp_logits
-        partion_vocab_size = grad_logits.shape[-1]
-        grad_logits_2d = grad_logits.view(-1, partion_vocab_size)
-
-        update = 1.0 - mask.view(-1).float()
-        grad_logits_2d[torch.arange(0, grad_logits_2d.shape[0]), masked_target_1d] -= update
-
-        grad_logits.mul_(grad_output.unsqueeze(dim=-1))
-        return grad_logits, None, None
-
-
-def applyDistCrossEntropy(vocab_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
-    return DistCrossEntropy.apply(vocab_logits, labels)
diff --git a/colossalai/shardformer/layer/dropout.py b/colossalai/shardformer/layer/dropout.py
deleted file mode 100644
index acc114029ac1..000000000000
--- a/colossalai/shardformer/layer/dropout.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import os
-import time
-from contextlib import contextmanager
-
-import torch
-import torch.nn as nn
-
-
-class SeedManager:
-    """
-    This class is a random state manager to change random state for different random seed.
-
-    """
-
-    def __init__(self):
-        original_state = torch.cuda.get_rng_state()
-        seed = int(f"{int(time.time())}{os.environ['RANK']}")
-        torch.cuda.manual_seed(int(seed))
-        self.dropout_state = torch.cuda.get_rng_state()
-        torch.cuda.set_rng_state(original_state)
-
-    def set_mode(self, rng_state):
-        torch.cuda.set_rng_state(rng_state)
-
-    def get_current_mode(self):
-        current_state = torch.cuda.get_rng_state()
-        return current_state
-
-    @contextmanager
-    def dropout_mode(self):
-        """
-        This is a context manager to change the dropout state and recover the original state.
-
-        Usage:
-        ::
-            >>> with _seed_manager.dropout_mode():
-            >>>     input = super().forward(input)
-        """
-        try:
-            current_mode = self.get_current_mode()
-            yield self.set_mode(self.dropout_state)
-        finally:
-            self.dropout_state = self.get_current_mode()
-            self.set_mode(current_mode)
-
-
-_seed_manager = SeedManager()
-
-
-class Dropout1D(nn.Dropout):
-
-    def __init__(self, p=0.5, inplace=False):
-        super().__init__(p, inplace)
-
-    def forward(self, input):
-        with _seed_manager.dropout_mode():
-            input = super().forward(input)
-        return input
diff --git a/colossalai/shardformer/layer/layers.py b/colossalai/shardformer/layer/layers.py
deleted file mode 100644
index f5123885bbe4..000000000000
--- a/colossalai/shardformer/layer/layers.py
+++ /dev/null
@@ -1,1043 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import math
-from collections import OrderedDict
-from typing import Callable, Tuple
-
-import torch
-import torch.nn.functional as F
-from torch import Tensor
-from torch.nn.parameter import Parameter
-
-from colossalai.communication import broadcast
-from colossalai.context import ParallelMode, seed
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.kernel import LayerNorm
-from colossalai.nn import init as init
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.colossalai_layer._utils import ColossalaiModule
-from colossalai.nn.layer.parallel_1d._utils import (
-    gather_forward_split_backward,
-    get_parallel_input,
-    reduce_grad,
-    reduce_input,
-    set_parallel_input,
-    split_forward_gather_backward,
-)
-from colossalai.nn.layer.utils import divide, set_tensor_parallel_attribute_by_partition
-from colossalai.nn.layer.vanilla import VanillaLayerNorm, VanillaPatchEmbedding
-from colossalai.registry import LAYERS
-from colossalai.utils.checkpointing import (
-    broadcast_state_dict,
-    gather_tensor_parallel_state_dict,
-    partition_tensor_parallel_state_dict,
-)
-from colossalai.utils.cuda import get_current_device
-
-from ._operation import linear_with_async_comm
-
-Fast_LN = None
-try:
-    from apex.contrib.layer_norm.layer_norm import FastLayerNorm
-    Fast_LN = FastLayerNorm
-except ImportError:
-    pass
-
-
-# @LAYERS.register_module
-class Linear1D(ColossalaiModule):
-    r"""Linear layer for 1D parallelism.
-
-    Args:
-        in_features (int): size of each input sample.
-        out_features (int): size of each output sample.
-        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
-        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to False
-        weight_initializer (:class:`typing.Callable`, optional):
-            The initializer of weight, defaults to kaiming uniform initializer.
-        bias_initializer (:class:`typing.Callable`, optional):
-            The initializer of bias, defaults to xavier uniform initializer.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
-    """
-
-    def __init__(self,
-                 in_features: int,
-                 out_features: int,
-                 bias: bool = True,
-                 dtype: torch.dtype = None,
-                 gather_output: bool = False,
-                 skip_bias_add: bool = False,
-                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
-        parallel_input = get_parallel_input()
-        if not parallel_input and not gather_output:
-            layer = Linear1D_Col(in_features,
-                                 out_features,
-                                 bias=bias,
-                                 dtype=dtype,
-                                 skip_bias_add=skip_bias_add,
-                                 weight_initializer=weight_initializer,
-                                 bias_initializer=bias_initializer)
-        else:
-            layer = Linear1D_Row(in_features,
-                                 out_features,
-                                 bias=bias,
-                                 dtype=dtype,
-                                 parallel_input=parallel_input,
-                                 skip_bias_add=skip_bias_add,
-                                 weight_initializer=weight_initializer,
-                                 bias_initializer=bias_initializer)
-        super().__init__(layer)
-
-
-# @LAYERS.register_module
-class LayerNorm1D(ColossalaiModule):
-    r"""
-    Layer Normalization for colossalai
-
-    Args:
-        normalized_shape (int): input shape from an expected input of size.
-            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
-            \times \ldots \times \text{normalized_shape}[-1]]`
-            If a single integer is used, it is treated as a singleton list, and this module will
-            normalize over the last dimension which is expected to be of that specific size.
-        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
-        bias (bool, optional): Whether to add a bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-    """
-
-    _fast_ln_supported_sizes = [
-        1024, 1536, 2048, 2304, 3072, 3840, 4096, 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
-        24576, 25600, 30720, 32768, 40960, 49152, 65536
-    ]
-
-    def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None):
-        if Fast_LN is not None and normalized_shape in self._fast_ln_supported_sizes:
-            norm = Fast_LN(normalized_shape, eps=eps).to(dtype)
-        else:
-            norm = None
-            try:
-                from apex.normalization import FusedLayerNorm
-                norm = FusedLayerNorm(normalized_shape, eps=eps).to(dtype)
-            except ImportError:
-                norm = LayerNorm(normalized_shape, eps=eps).to(dtype)
-        super().__init__(norm)
-
-    def _load_from_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            # weight
-            weight = state_dict.pop(weight_key, None)
-            if weight is not None:
-                local_state[weight_key] = weight
-            # bias
-            bias = state_dict.pop(bias_key, None)
-            if bias is not None:
-                local_state[bias_key] = bias
-
-        local_state = broadcast_state_dict(local_state, ParallelMode.PARALLEL_1D)
-        super()._load_from_state_dict(local_state, prefix, *args)
-
-    def _save_to_state_dict(self, destination, prefix, keep_vars):
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            super()._save_to_state_dict(destination, prefix, keep_vars)
-
-
-# @LAYERS.register_module
-class Classifier1D(ParallelLayer):
-    r"""RowLinear with given weight. Classifier of 1D parallelism.
-
-    Args:
-        in_features (int): size of each input sample.
-        num_classes (int): number of classes.
-        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
-        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        weight_initializer (:class:`typing.Callable`, optional):
-            The initializer of weight, defaults to kaiming uniform initializer.
-        bias_initializer (:class:`typing.Callable`, optional):
-            The initializer of bias, defaults to xavier uniform initializer.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
-    """
-
-    def __init__(self,
-                 in_features: int,
-                 num_classes: int,
-                 weight: Parameter = None,
-                 bias: bool = True,
-                 dtype: torch.dtype = None,
-                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
-        super().__init__()
-        self.in_features = in_features
-        self.num_classes = num_classes
-        self.parallel_input = get_parallel_input()
-
-        # Divide the weight matrix along the last dimension.
-        self.input_size_per_partition = divide(in_features, gpc.tensor_parallel_size)
-
-        # Parameters.
-        # Initialize weight.
-        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
-        if weight is not None:
-            self.weight = weight
-            self.has_weight = False
-        else:
-            self.weight = Parameter(torch.empty(self.num_classes, self.input_size_per_partition, **factory_kwargs))
-            self.has_weight = True
-        if bias:
-            self.bias = Parameter(torch.empty(self.num_classes, **factory_kwargs))
-        else:
-            self.bias = None
-        with seed(ParallelMode.TENSOR):
-            self.reset_parameters(weight_initializer, bias_initializer)
-        self._set_tensor_parallel_attributes()
-        set_parallel_input(False)
-        env.vocab_parallel = False
-
-    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
-        fan_in, fan_out = self.in_features, self.num_classes
-        if self.has_weight:
-            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
-        if self.bias is not None:
-            bias_initializer(self.bias, fan_in=fan_in)
-            broadcast(self.bias, gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D)[0], ParallelMode.PARALLEL_1D)
-
-    def _set_tensor_parallel_attributes(self):
-        if self.has_weight:
-            num_partition = gpc.get_world_size(ParallelMode.TENSOR)
-            set_tensor_parallel_attribute_by_partition(self.weight, num_partition)
-
-    def _load_from_global_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            # weight
-            if self.has_weight:
-                weight = state_dict.pop(weight_key, None)
-                if weight is not None:
-                    local_state[weight_key] = weight
-            # bias
-            if self.bias is not None:
-                bias = state_dict.pop(bias_key, None)
-                if bias is not None:
-                    local_state[bias_key] = bias
-
-        local_state = partition_tensor_parallel_state_dict(local_state,
-                                                           ParallelMode.PARALLEL_1D,
-                                                           dims={
-                                                               weight_key: -1,
-                                                               bias_key: 0
-                                                           },
-                                                           partition_states={
-                                                               weight_key: True,
-                                                               bias_key: False
-                                                           })
-        super()._load_from_global_state_dict(local_state, prefix, *args)
-
-    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        local_state = OrderedDict()
-        if self.has_weight:
-            local_state[weight_key] = self.weight
-        if self.bias is not None:
-            local_state[bias_key] = self.bias
-        local_state = gather_tensor_parallel_state_dict(local_state,
-                                                        ParallelMode.PARALLEL_1D,
-                                                        dims={
-                                                            weight_key: -1,
-                                                            bias_key: 0
-                                                        },
-                                                        partition_states={
-                                                            weight_key: True,
-                                                            bias_key: False
-                                                        },
-                                                        keep_vars=keep_vars)
-        destination.update(local_state)
-
-    def forward(self, input_: Tensor) -> Tensor:
-        # Set up backprop all-reduce.
-        if self.parallel_input:
-            assert input_.shape[-1] == self.weight.shape[-1], \
-                'Invalid shapes in Classifier1D forward: input={}, weight={}. Expected last dim of input {}.'.format(
-                input_.shape, self.weight.shape, self.weight.shape[-1])
-            input_ = input_
-        else:
-            assert divide(input_.shape[-1], gpc.tensor_parallel_size) == self.weight.shape[-1], \
-                'Invalid shapes in Classifier1D forward: input={}, weight={}. Expected last dim of input {}.'.format(
-                input_.shape, self.weight.shape, self.weight.shape[-1] * gpc.tensor_parallel_size)
-            input_ = split_forward_gather_backward(input_, ParallelMode.PARALLEL_1D, dim=-1)
-
-        output_parallel = F.linear(input_, self.weight)
-        output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
-        if self.bias is not None:
-            output = output + self.bias
-        return output
-
-
-# @LAYERS.register_module
-class VocabParallelClassifier1D(ParallelLayer):
-    r"""ColLinear with given weight. Classifier of 1D parallelism.
-
-    Args:
-        in_features (int): size of each input sample.
-        num_classes (int): number of classes.
-        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
-        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        weight_initializer (:class:`typing.Callable`, optional):
-            The initializer of weight, defaults to kaiming uniform initializer.
-        bias_initializer (:class:`typing.Callable`, optional):
-            The initializer of bias, defaults to xavier uniform initializer.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
-    """
-
-    def __init__(self,
-                 in_features: int,
-                 num_classes: int,
-                 weight: Parameter = None,
-                 bias: bool = True,
-                 dtype: torch.dtype = None,
-                 gather_output: bool = False,
-                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
-        super().__init__()
-        self.in_features = in_features
-        self.num_classes = num_classes
-        self.gather_output = gather_output
-        self.parallel_input = get_parallel_input()
-
-        # Divide the weight matrix along the last dimension.
-        self.num_classes_per_partition = divide(num_classes, gpc.tensor_parallel_size)
-
-        # Parameters.
-        # Initialize weight.
-        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
-        if weight is not None:
-            self.weight = weight
-            self.has_weight = False
-        else:
-            self.weight = Parameter(torch.empty(self.num_classes_per_partition, self.in_features, **factory_kwargs))
-            self.has_weight = True
-        if bias:
-            self.bias = Parameter(torch.empty(self.num_classes_per_partition, **factory_kwargs))
-        else:
-            self.bias = None
-        with seed(ParallelMode.TENSOR):
-            self.reset_parameters(weight_initializer, bias_initializer)
-        self._set_tensor_parallel_attributes()
-        set_parallel_input(False)
-        env.vocab_parallel = True
-
-    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
-        fan_in, fan_out = self.in_features, self.num_classes
-        if self.has_weight:
-            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
-        if self.bias is not None:
-            bias_initializer(self.bias, fan_in=fan_in)
-
-    def _set_tensor_parallel_attributes(self):
-        num_partition = gpc.get_world_size(ParallelMode.TENSOR)
-        if self.has_weight:
-            set_tensor_parallel_attribute_by_partition(self.weight, num_partition)
-        if self.bias is not None:
-            set_tensor_parallel_attribute_by_partition(self.bias, num_partition)
-
-    def _load_from_global_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            # weight
-            if self.has_weight:
-                weight = state_dict.pop(weight_key, None)
-                if weight is not None:
-                    local_state[weight_key] = weight
-            # bias
-            if self.bias is not None:
-                bias = state_dict.pop(bias_key, None)
-                if bias is not None:
-                    local_state[bias_key] = bias
-
-        local_state = partition_tensor_parallel_state_dict(local_state,
-                                                           ParallelMode.PARALLEL_1D,
-                                                           dims={
-                                                               weight_key: 0,
-                                                               bias_key: 0
-                                                           },
-                                                           partition_states={
-                                                               weight_key: True,
-                                                               bias_key: True
-                                                           })
-        super()._load_from_global_state_dict(local_state, prefix, *args)
-
-    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        local_state = OrderedDict()
-        if self.has_weight:
-            local_state[weight_key] = self.weight
-        if self.bias is not None:
-            local_state[bias_key] = self.bias
-        local_state = gather_tensor_parallel_state_dict(local_state,
-                                                        ParallelMode.PARALLEL_1D,
-                                                        dims={
-                                                            weight_key: 0,
-                                                            bias_key: 0
-                                                        },
-                                                        partition_states={
-                                                            weight_key: True,
-                                                            bias_key: True
-                                                        },
-                                                        keep_vars=keep_vars)
-        destination.update(local_state)
-
-    def forward(self, input_: Tensor) -> Tensor:
-        assert input_.shape[-1] == self.weight.shape[-1], \
-            'Invalid shapes in VocabParallelClassifier1D forward: input={}, weight={}. Expected last dim of input {}.'.format(
-                input_.shape, self.weight.shape, self.weight.shape[-1])
-        # Set up backprop all-reduce.
-        input_parallel = reduce_grad(input_, ParallelMode.PARALLEL_1D)
-        # Matrix multiply.
-        output_parallel = F.linear(input_parallel, self.weight, self.bias)
-        if self.gather_output:
-            # All-gather across the partitions.
-            output = gather_forward_split_backward(output_parallel, ParallelMode.PARALLEL_1D, dim=-1)
-        else:
-            output = output_parallel
-        return output
-
-
-# @LAYERS.register_module
-class Linear1D_Col(ParallelLayer):
-    r"""Linear layer with column parallelism.
-
-    The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
-    its second dimension as :math:`A = [A_1, ..., A_p]`.
-
-    Args:
-        in_features (int): size of each input sample.
-        out_features (int): size of each output sample.
-        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        gather_output (bool, optional): If true, call all-gather on output and make Y available
-                    to all GPUs, otherwise, every GPU will have its output
-                    which is :math:`Y_i = XA_i`, defaults to False
-        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to False
-        weight_initializer (:class:`typing.Callable`, optional):
-            The initializer of weight, defaults to kaiming uniform initializer.
-        bias_initializer (:class:`typing.Callable`, optional):
-            The initializer of bias, defaults to xavier uniform initializer.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
-    """
-
-    def __init__(self,
-                 in_features: int,
-                 out_features: int,
-                 bias: bool = True,
-                 dtype: torch.dtype = None,
-                 gather_output: bool = False,
-                 skip_bias_add: bool = False,
-                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
-        super().__init__()
-
-        # Keep input parameters
-        self.in_features = in_features
-        self.out_features = out_features
-        self.gather_output = gather_output
-        self.skip_bias_add = skip_bias_add
-
-        if skip_bias_add and not bias:
-            raise ValueError('cannot skip bias addition if bias is None')
-
-        # self.out_features_per_partition = divide(out_features*2, gpc.tensor_parallel_size)
-        self.out_features_per_partition = out_features
-
-        # Parameters.
-        # Initialize weight.
-        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
-        self.weight = Parameter(torch.empty(self.out_features_per_partition, self.in_features, **factory_kwargs))
-
-        if bias:
-            self.bias = Parameter(torch.empty(self.out_features_per_partition, **factory_kwargs))
-        else:
-            self.bias = None
-        with seed(ParallelMode.TENSOR):
-            self.reset_parameters(weight_initializer, bias_initializer)
-        self._set_tensor_parallel_attributes()
-        is_parallel_output = not self.gather_output
-        set_parallel_input(is_parallel_output)
-
-    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
-        fan_in, fan_out = self.in_features, self.out_features
-        weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
-        if self.bias is not None:
-            bias_initializer(self.bias, fan_in=fan_in)
-
-    def _set_tensor_parallel_attributes(self):
-        num_partition = gpc.get_world_size(ParallelMode.TENSOR)
-        set_tensor_parallel_attribute_by_partition(self.weight, num_partition)
-        if self.bias is not None:
-            set_tensor_parallel_attribute_by_partition(self.bias, num_partition)
-
-    def _load_from_global_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            # weight
-            weight = state_dict.pop(weight_key, None)
-            if weight is not None:
-                local_state[weight_key] = weight
-            # bias
-            if self.bias is not None:
-                bias = state_dict.pop(bias_key, None)
-                if bias is not None:
-                    local_state[bias_key] = bias
-
-        local_state = partition_tensor_parallel_state_dict(local_state,
-                                                           ParallelMode.PARALLEL_1D,
-                                                           dims={
-                                                               weight_key: 0,
-                                                               bias_key: 0
-                                                           },
-                                                           partition_states={
-                                                               weight_key: True,
-                                                               bias_key: True
-                                                           })
-        super()._load_from_global_state_dict(local_state, prefix, *args)
-
-    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        local_state = OrderedDict({weight_key: self.weight})
-        if self.bias is not None:
-            local_state[bias_key] = self.bias
-        local_state = gather_tensor_parallel_state_dict(local_state,
-                                                        ParallelMode.PARALLEL_1D,
-                                                        dims={
-                                                            weight_key: 0,
-                                                            bias_key: 0
-                                                        },
-                                                        partition_states={
-                                                            weight_key: True,
-                                                            bias_key: True
-                                                        },
-                                                        keep_vars=keep_vars)
-        destination.update(local_state)
-
-    def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
-        assert input_.shape[-1] == self.weight.shape[-1], \
-            'Invalid shapes in Linear1D_Col forward: input={}, weight={}. Expected last dim of input {}.'.format(
-                input_.shape, self.weight.shape, self.weight.shape[-1])
-        # Set up backprop all-reduce.
-        # input_parallel = reduce_grad(input_, ParallelMode.PARALLEL_1D)
-        input_parallel = input_
-        # Matrix multiply.
-        bias = self.bias if not self.skip_bias_add else None
-        # output_parallel = F.linear(input_parallel, self.weight, bias)
-        output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, ParallelMode.PARALLEL_1D, True)
-        if self.gather_output:
-            # All-gather across the partitions.
-            output = gather_forward_split_backward(output_parallel, ParallelMode.PARALLEL_1D, dim=-1)
-        else:
-            output = output_parallel
-
-        if self.skip_bias_add:
-            return output, self.bias
-        else:
-            return output
-
-
-# @LAYERS.register_module
-class Linear1D_Row(ParallelLayer):
-    r""" Linear layer with row parallelism
-
-    Args:
-        in_features (int): size of each input sample.
-        out_features (int): size of each output sample.
-        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
-        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to False
-        weight_initializer (:class:`typing.Callable`, optional):
-            The initializer of weight, defaults to kaiming uniform initializer.
-        bias_initializer (:class:`typing.Callable`, optional):
-            The initializer of bias, defaults to xavier uniform initializer.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
-    """
-
-    def __init__(self,
-                 in_features: int,
-                 out_features: int,
-                 bias: bool = True,
-                 dtype: torch.dtype = None,
-                 parallel_input: bool = True,
-                 skip_bias_add: bool = False,
-                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
-                 stream_chunk_num: int = 1):
-        super().__init__()
-
-        self.stream_chunk_num = stream_chunk_num
-
-        # Keep input parameters
-        self.in_features = in_features
-        self.out_features = out_features
-        self.parallel_input = parallel_input
-        self.skip_bias_add = skip_bias_add
-
-        if skip_bias_add and not bias:
-            raise ValueError('cannot skip bias addition if bias is None')
-
-        # Divide the weight matrix along the last dimension.
-        # self.input_size_per_partition = divide(in_features*2, gpc.tensor_parallel_size)
-        self.input_size_per_partition = in_features
-
-        # Parameters.
-        # Initialize weight.
-        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
-        self.weight = Parameter(torch.empty(self.out_features, self.input_size_per_partition, **factory_kwargs))
-
-        if self.stream_chunk_num > 1:
-            # TODO() work for inference only
-            self.chunk_weight()
-        if bias:
-            self.bias = Parameter(torch.empty(self.out_features, **factory_kwargs))
-        else:
-            self.bias = None
-        with seed(ParallelMode.TENSOR):
-            self.reset_parameters(weight_initializer, bias_initializer)
-        self._set_tensor_parallel_attributes()
-        set_parallel_input(False)
-
-    def chunk_weight(self):
-        self.weight_list = torch.chunk(self.weight, self.stream_chunk_num, dim=0)
-
-    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
-        fan_in, fan_out = self.in_features, self.out_features
-        weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
-        if self.bias is not None:
-            bias_initializer(self.bias, fan_in=fan_in)
-            broadcast(self.bias, gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D)[0], ParallelMode.PARALLEL_1D)
-
-    def _set_tensor_parallel_attributes(self):
-        num_partition = gpc.get_world_size(ParallelMode.TENSOR)
-        set_tensor_parallel_attribute_by_partition(self.weight, num_partition)
-
-    def _load_from_global_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            # weight
-            weight = state_dict.pop(weight_key, None)
-            if weight is not None:
-                local_state[weight_key] = weight
-            # bias
-            if self.bias is not None:
-                bias = state_dict.pop(bias_key, None)
-                if bias is not None:
-                    local_state[bias_key] = bias
-
-        local_state = partition_tensor_parallel_state_dict(local_state,
-                                                           ParallelMode.PARALLEL_1D,
-                                                           dims={
-                                                               weight_key: -1,
-                                                               bias_key: 0
-                                                           },
-                                                           partition_states={
-                                                               weight_key: True,
-                                                               bias_key: False
-                                                           })
-        super()._load_from_global_state_dict(local_state, prefix, *args)
-
-    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
-        weight_key = prefix + 'weight'
-        bias_key = prefix + 'bias'
-        local_state = OrderedDict({weight_key: self.weight})
-        if self.bias is not None:
-            local_state[bias_key] = self.bias
-        local_state = gather_tensor_parallel_state_dict(local_state,
-                                                        ParallelMode.PARALLEL_1D,
-                                                        dims={
-                                                            weight_key: -1,
-                                                            bias_key: 0
-                                                        },
-                                                        partition_states={
-                                                            weight_key: True,
-                                                            bias_key: False
-                                                        },
-                                                        keep_vars=keep_vars)
-        destination.update(local_state)
-
-    def forward(self, input_: Tensor) -> Tensor:
-        # Set up backprop all-reduce.
-        if self.parallel_input:
-            assert input_.shape[-1] == self.weight.shape[-1], \
-                'Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.'.format(
-                input_.shape, self.weight.shape, self.weight.shape[-1])
-            input_ = input_
-        else:
-            assert divide(input_.shape[-1], gpc.tensor_parallel_size) == self.weight.shape[-1], \
-                'Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.'.format(
-                input_.shape, self.weight.shape, self.weight.shape[-1] * gpc.tensor_parallel_size)
-            input_ = split_forward_gather_backward(input_, ParallelMode.PARALLEL_1D, dim=-1)
-
-        if self.stream_chunk_num > 1:
-            if self.training:
-                raise RuntimeError("use stream_chunk_num=1 in Linear1D_Row for training!")
-            with torch.no_grad():
-                output_parallel_list = [None for i in range(self.stream_chunk_num)]
-                handle_list = []
-                for i in range(self.stream_chunk_num):
-                    output_parallel_list[i] = F.linear(input_, self.weight_list[i])
-                    handle = torch.distributed.all_reduce(output_parallel_list[i],
-                                                          group=gpc.get_group(ParallelMode.PARALLEL_1D),
-                                                          async_op=True)
-                    handle_list.append(handle)
-                    # output_parallel_list[i] = reduce_input(output_parallel_list[i], ParallelMode.PARALLEL_1D)
-                for handle in handle_list:
-                    handle.wait()
-                output = torch.cat(output_parallel_list, dim=-1)
-        else:
-            output_parallel = F.linear(input_, self.weight)
-            # output_parallel = linear_with_async_comm(input_, self.weight, None, ParallelMode.PARALLEL_1D, False)
-            output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
-        if not self.skip_bias_add:
-            if self.bias is not None:
-                output = output + self.bias
-            return output
-        else:
-            return output, self.bias
-
-
-# @LAYERS.register_module
-class Embedding1D(ParallelLayer):
-    r"""Embedding for 1D parallelism.
-
-    Args:
-        num_embeddings (int): number of embeddings.
-        embedding_dim (int): dimension of embedding.
-        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
-            therefore, the embedding vector at padding_idx is not updated during training,
-            i.e. it remains as a fixed “pad”, defaults to None.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        weight_initializer (:class:`typing.Callable`, optional):
-            he initializer of weight, defaults to normal initializer.
-
-    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
-    ::
-
-        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
-                    renormalized to have norm max_norm. Note: this will modify weight in-place.
-        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
-        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
-                    of frequency of the words in the mini-batch. Default False.
-        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
-
-    More details about ``args`` and ``kwargs`` could be found in
-    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
-    """
-
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 padding_idx: int = None,
-                 dtype: torch.dtype = None,
-                 weight_initializer: Callable = init.normal_(),
-                 *args,
-                 **kwargs):
-        super().__init__()
-
-        self.num_embeddings = num_embeddings
-        self.embed_dim = embedding_dim
-        embed_dim_per_partition = divide(embedding_dim, gpc.tensor_parallel_size)
-
-        self.padding_idx = padding_idx
-        self.embed_args = args
-        self.embed_kwargs = kwargs
-
-        self.weight = Parameter(
-            torch.empty((num_embeddings, embed_dim_per_partition), device=get_current_device(), dtype=dtype))
-
-        self.reset_parameters(weight_initializer)
-        self._set_tensor_parallel_attributes()
-        set_parallel_input(False)
-
-    def _set_tensor_parallel_attributes(self):
-        set_tensor_parallel_attribute_by_partition(self.weight, gpc.tensor_parallel_size)
-
-    def reset_parameters(self, weight_initializer) -> None:
-        with seed(ParallelMode.TENSOR):
-            fan_in, fan_out = self.num_embeddings, self.embed_dim
-            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
-            self._fill_padding_idx_with_zero()
-
-    def _fill_padding_idx_with_zero(self) -> None:
-        if self.padding_idx is not None:
-            with torch.no_grad():
-                self.weight[self.padding_idx].fill_(0)
-
-    def _load_from_global_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        weight_key = prefix + 'weight'
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            # weight
-            weight = state_dict.pop(weight_key, None)
-            if weight is not None:
-                local_state[weight_key] = weight
-
-        local_state = partition_tensor_parallel_state_dict(local_state,
-                                                           ParallelMode.PARALLEL_1D,
-                                                           dims={weight_key: -1},
-                                                           partition_states={weight_key: True})
-        super()._load_from_global_state_dict(local_state, prefix, *args)
-
-    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
-        weight_key = prefix + 'weight'
-        local_state = OrderedDict({weight_key: self.weight})
-        local_state = gather_tensor_parallel_state_dict(local_state,
-                                                        ParallelMode.PARALLEL_1D,
-                                                        dims={weight_key: -1},
-                                                        partition_states={weight_key: True},
-                                                        keep_vars=keep_vars)
-        destination.update(local_state)
-
-    def forward(self, input_: Tensor) -> Tensor:
-
-        output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs)
-
-        output = gather_forward_split_backward(output_parallel, ParallelMode.PARALLEL_1D, dim=-1)
-
-        return output
-
-
-# @LAYERS.register_module
-class VocabParallelEmbedding1D(ParallelLayer):
-    r"""Embedding parallelized in the vocabulary dimension.
-
-    Args:
-        num_embeddings (int): number of embeddings.
-        embedding_dim (int): dimension of embedding.
-        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
-            therefore, the embedding vector at padding_idx is not updated during training,
-            i.e. it remains as a fixed “pad”, defaults to None.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        weight_initializer (:class:`typing.Callable`, optional):
-            he initializer of weight, defaults to normal initializer.
-
-    The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
-    ::
-
-        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
-                    renormalized to have norm max_norm. Note: this will modify weight in-place.
-        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
-        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
-                    of frequency of the words in the mini-batch. Default False.
-        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
-
-    More details about ``args`` and ``kwargs`` could be found in
-    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
-
-    More details about initializer please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
-    """
-
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 padding_idx: int = None,
-                 dtype: torch.dtype = None,
-                 weight_initializer: Callable = init.normal_(),
-                 *args,
-                 **kwargs):
-        super().__init__()
-        self.num_embeddings = num_embeddings
-        self.embed_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.embed_args = args
-        self.embed_kwargs = kwargs
-
-        tensor_parallel_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
-        tensor_parallel_rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
-        # self.num_embeddings_per_partition = divide(num_embeddings, tensor_parallel_size)
-        self.num_embeddings_per_partition = num_embeddings
-        self.vocab_start_index = tensor_parallel_rank * self.num_embeddings_per_partition
-        self.vocab_end_index = self.vocab_start_index + self.num_embeddings_per_partition
-
-        self.weight = Parameter(
-            torch.empty((self.num_embeddings_per_partition, self.embed_dim), device=get_current_device(), dtype=dtype))
-
-        self.reset_parameters(weight_initializer)
-        self._set_tensor_parallel_attributes()
-        set_parallel_input(False)
-        env.vocab_parallel = True
-
-    def _set_tensor_parallel_attributes(self):
-        set_tensor_parallel_attribute_by_partition(self.weight, gpc.tensor_parallel_size)
-
-    def reset_parameters(self, weight_initializer) -> None:
-        with seed(ParallelMode.TENSOR):
-            fan_in, fan_out = self.num_embeddings, self.embed_dim
-            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
-            self._fill_padding_idx_with_zero()
-
-    def _fill_padding_idx_with_zero(self) -> None:
-        if self.padding_idx is not None and \
-                self.padding_idx >= self.vocab_start_index and self.padding_idx < self.vocab_end_index:
-            with torch.no_grad():
-                self.weight[self.padding_idx - self.vocab_start_index].fill_(0)
-
-    def _load_from_global_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        weight_key = prefix + 'weight'
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            # weight
-            weight = state_dict.pop(weight_key, None)
-            if weight is not None:
-                local_state[weight_key] = weight
-
-        local_state = partition_tensor_parallel_state_dict(local_state,
-                                                           ParallelMode.PARALLEL_1D,
-                                                           dims={weight_key: 0},
-                                                           partition_states={weight_key: True})
-        super()._load_from_global_state_dict(local_state, prefix, *args)
-
-    def _save_to_global_state_dict(self, destination, prefix, keep_vars):
-        weight_key = prefix + 'weight'
-        local_state = OrderedDict({weight_key: self.weight})
-        local_state = gather_tensor_parallel_state_dict(local_state,
-                                                        ParallelMode.PARALLEL_1D,
-                                                        dims={weight_key: 0},
-                                                        partition_states={weight_key: True},
-                                                        keep_vars=keep_vars)
-        destination.update(local_state)
-
-    def forward(self, input_: Tensor) -> Tensor:
-        # Build the mask.
-        input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
-        # Mask the input.
-        masked_input = input_.clone() - self.vocab_start_index
-        masked_input[input_mask] = 0
-
-        output_parallel = F.embedding(masked_input, self.weight, self.padding_idx, *self.embed_args,
-                                      **self.embed_kwargs)
-
-        # Mask the output embedding.
-        output_parallel[input_mask, :] = 0.
-        # Reduce across all the model parallel GPUs.
-        output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
-        return output
-
-
-# @LAYERS.register_module
-class Dropout1D(ParallelLayer):
-    """Dropout layer of 1D parallelism.
-
-    Args:
-        p (float, optional): probability of an element to be zeroed, defaults 0.5.
-        inplace (bool, optional): whether to do dropout in-place, default to be False.
-    """
-
-    def __init__(self, p: float = 0.5, inplace: bool = False):
-        super().__init__()
-        self.parallel_input = get_parallel_input()
-        self.p = p
-        self.inplace = inplace
-
-    def forward(self, input_: Tensor) -> Tensor:
-        if self.parallel_input:
-            with seed(ParallelMode.TENSOR):
-                output = F.dropout(input_, self.p, self.training, self.inplace)
-        else:
-            output = F.dropout(input_, self.p, self.training, self.inplace)
-        return output
-
-
-# @LAYERS.register_module
-class PatchEmbedding1D(ColossalaiModule):
-    """
-    2D Image to Patch Embedding
-
-    :param img_size: image size
-    :type img_size: int
-    :param patch_size: patch size
-    :type patch_size: int
-    :param in_chans: number of channels of input image
-    :type in_chans: int
-    :param embed_size: size of embedding
-    :type embed_size: int
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param flatten: whether to flatten output tensor, defaults to True
-    :type flatten: bool, optional
-    :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
-    :param position_embed_initializer: The initializer of position embedding, defaults to zero
-    :type position_embed_initializer: typing.Callable, optional
-    """
-
-    def __init__(self,
-                 img_size: int,
-                 patch_size: int,
-                 in_chans: int,
-                 embed_size: int,
-                 dtype: torch.dtype = None,
-                 flatten: bool = True,
-                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
-                 position_embed_initializer: Callable = init.zeros_()):
-        embed = VanillaPatchEmbedding(img_size,
-                                      patch_size,
-                                      in_chans,
-                                      embed_size,
-                                      dtype=dtype,
-                                      flatten=flatten,
-                                      weight_initializer=weight_initializer,
-                                      bias_initializer=bias_initializer,
-                                      position_embed_initializer=position_embed_initializer)
-        super().__init__(embed)
-
-    def _load_from_state_dict(self, state_dict, prefix, *args):
-        local_state = OrderedDict()
-        param_keys = [prefix + 'weight', prefix + 'bias', prefix + 'cls_token', prefix + 'pos_embed']
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            for key in param_keys:
-                param = state_dict.pop(key, None)
-                if param is not None:
-                    local_state[key] = param
-
-        local_state = broadcast_state_dict(local_state, ParallelMode.PARALLEL_1D)
-        super()._load_from_state_dict(local_state, prefix, *args)
-
-    def _save_to_state_dict(self, destination, prefix, keep_vars):
-        if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-            super()._save_to_state_dict(destination, prefix, keep_vars)
diff --git a/colossalai/shardformer/model/__init__.py b/colossalai/shardformer/model/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/colossalai/shardformer/model/modeling_bert.py b/colossalai/shardformer/model/modeling_bert.py
deleted file mode 100644
index bd07ab80c00d..000000000000
--- a/colossalai/shardformer/model/modeling_bert.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from typing import Any, Dict, List, Type
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-from transformers import BertForMaskedLM
-from transformers.models.bert.modeling_bert import MaskedLMOutput
-
-from ..layer.dist_crossentropy import applyDistCrossEntropy
-
-
-class BertForMaskedLM_(BertForMaskedLM):
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        labels=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        **kwargs,
-    ):
-        # print("[Inject OK] Injected forward method")
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-        prediction_scores = self.cls(sequence_output)
-
-        masked_lm_loss = None
-
-        if labels is not None:
-            masked_lm_loss = applyDistCrossEntropy(prediction_scores, labels)
-        # if labels is not None:
-        #     loss_fct = CrossEntropyLoss()    # -100 index = padding token
-        #     masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
-
-        if not return_dict:
-            output = (prediction_scores,) + outputs[2:]
-            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-
-        return MaskedLMOutput(
-            loss=masked_lm_loss,
-            logits=prediction_scores,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
diff --git a/colossalai/shardformer/policies/__init__.py b/colossalai/shardformer/policies/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
deleted file mode 100644
index 54cc63ba124f..000000000000
--- a/colossalai/shardformer/policies/autopolicy.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import torch.nn as nn
-
-
-def build_policies():
-    r"""
-    Build the policies for the model
-
-    Return:
-        The dict for the policies
-    """
-    auto_policy_dict = {}
-
-    from transformers import BertForMaskedLM
-
-    from .bert import BertForMaskedLMPolicy
-    auto_policy_dict[BertForMaskedLM] = BertForMaskedLMPolicy
-
-    from transformers import BertForSequenceClassification
-
-    from .bert import BertForSequenceClassificationPolicy
-    auto_policy_dict[BertForSequenceClassification] = BertForSequenceClassificationPolicy
-
-    from transformers import GPT2Model
-
-    from .gpt2 import GPT2Policy
-    auto_policy_dict[GPT2Model] = GPT2Policy
-
-    from transformers import GPT2LMHeadModel
-
-    from .gpt2 import GPT2LMHeadModelPolicy
-    auto_policy_dict[GPT2LMHeadModel] = GPT2LMHeadModelPolicy
-
-    return auto_policy_dict
-
-
-def get_autopolicy(model: nn.Module):
-    r"""
-    Return the auto policy for the model
-
-    Args:
-        model (:class:`nn.Module`): The model to get the auto policy
-
-    Return:
-        :class:`Policy`: The auto policy for the model
-    """
-    auto_policy_dict = build_policies()
-    policy = auto_policy_dict.get(model.__class__, None)
-    if policy is None:
-        raise NotImplementedError(
-            f"Auto policy for {model.__class__.__qualname__} is not implemented\n Supported models are {[i.__qualname__ for i in auto_policy_dict.keys()]}"
-        )
-    return policy
-
-
-# from transformers.models.bert.modeling_bert import BertForMaskedLM, BertForPreTraining
-# model = BertForPreTraining
-# policy = get_autopolicy(model)
-# print(policy)
diff --git a/colossalai/shardformer/policies/basepolicy.py b/colossalai/shardformer/policies/basepolicy.py
deleted file mode 100644
index 644d115a270e..000000000000
--- a/colossalai/shardformer/policies/basepolicy.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# part of code modified from https://github.com/tunib-ai/parallelformers
-
-from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Tuple, Type
-
-import torch.nn as nn
-
-
-@dataclass
-class Argument:
-    r"""
-    The argument class for the policy
-
-    Args:
-        attr_dict (Dict[str, Any]): The dict for the param setting
-        param_funcs (:class:`List[Callable]`): The list for the param functions
-    """
-    attr_dict: Dict[str, Any]
-    param_funcs: List[Callable]
-
-
-@dataclass
-class Layer:
-    r"""
-    The layer object for the policy
-
-    Args:
-        weight (str): The weight suffix of the layer
-        bias (str): The bias suffix of the layer
-        replace_layer (:class:`colosalai.nn`): The layer to replace the original layer
-        ignore (bool): Whether to ignore this layer if it is not in the model
-        reversed (bool): Whether the weight in layer is reversed, commonly the weight in `torch.nn.Linear` is [out, in],
-                        but in GPT2 `Conv1D` layer is [in, out] which is reversed.
-        n_cast (int): The number of weight will cast to, like q, k, v in attention layer, n_cast should be 3. commonly in TP, we just chunk the weight with the number of devices,
-                        but in multi-head attention, we need to chunk the weight with the number of devices * n_head, and
-                        each device should have a part of Q, K and V weight.
-    """
-    weight: str = None
-    bias: str = None
-    replace_layer: Any = None
-    ignore: bool = False
-    reversed: bool = False
-    n_cast: int = None
-
-
-@dataclass
-class Col_Layer(Layer):
-    r"""
-    Class for col shard layer in MegatronLM
-
-    Args:
-        gather_output (bool): Whether to gather the output of the layer
-    """
-    gather_output: bool = False
-
-
-@dataclass
-class Row_Layer(Layer):
-    r"""
-    Class for col shard layer in MegatronLM
-    """
-    pass
-
-
-class Policy():
-    r"""
-    The base class for all the policies
-    For each different model, it should have a different policy class, like BertPolicy for Bert Model
-    or OPTPolicy for OPT model.
-    AutoPolicy:
-        Shardformer already defined some policies for huggingface model, just set ``custom_policy`` = None
-        to use the auto policy. In shardformer autopolicy, we define a base policy for one type model,
-        like BertPolicy, and for each different Bert modle in huggingface like, BertForMaskedLM,
-        BertForSequenceClassification, etc., for each different Bert model we difine different policy class
-        and overwrite the method like ``inject_policy`` to modify the forward and backward process.
-
-    CustomPolicy:
-        If you want to define your own policy, you can set ``custom_policy`` = CustomPolicy, and overwrite
-        all the methods in ``Policy`` class. You can refer to any policy we defined like the ``BertPolicy``
-        class for the example.
-
-    """
-
-    @staticmethod
-    def argument_policy(model_config, shard_config: int) -> Dict[nn.Module, Argument]:
-        r"""
-        Return the dict for the modify policy, the key is the original layer class and the value is the
-        argument for the modify layer
-
-        Args:
-            model_config (:class:`tansformer.Config`): The config of transformer model
-            shard_config (:class:`ShardConfig`): The config for sharding model
-
-        Return:
-            Dict for the modify policy,
-            ::
-            {
-                origin layer class1 (nn.Module): Argument(
-                    attr_dict = {
-                        argument1: value1,
-                        argument2: value2,
-                        ...
-                    },
-                    param_funcs = [
-                        staticmethod1,
-                        staticmethod2,
-                        ...
-                    ]
-                ),
-                origin layer class2 (nn.Module): Argument(
-                    attr_dict = {
-                        argument1: value1,
-                        argument2: value2,
-                        ...
-                    },
-                    param_funcs = [
-                        staticmethod1,
-                        staticmethod2,
-                        ...
-                    ]
-                ),
-                ...
-            }
-
-        """
-        raise NotImplementedError
-
-    @staticmethod
-    def inject_policy() -> Tuple[nn.Module, nn.Module]:
-        r"""
-        Return the dict for the inject model
-
-        Return:
-            The injected model, key is the original model and value is the new shardmodel
-            ::
-            (OrignModel, CustomModel)
-            in `CustomModel`, we can overwrite the forward and backward process
-        """
-        return None
-
-    @staticmethod
-    def binding_policy() -> Dict:
-        r"""
-        Return the dict for the binding model
-
-        Return:
-            This method should return the binding relationship for some layers share the weight or bias,
-            the key and value is the suffix of the weight or bias of the model
-        ::
-            return {
-                "bert.embeddings.word_embeddings.weight": "cls.predictions.decoder.weight",
-            }
-        """
-        return None
-
-    @staticmethod
-    def attn_in() -> List:
-        r"""
-        Attention qkv layer
-        In this kind of method, we should return the list of ``Layer`` object, each ``Layer`` object should be
-        ``Layer`` for no slicing, ``Col_Layer`` for col slicing, ``Row_Layer`` for row slicing. And the parameters
-        in ``Layer`` object can refer to the ``Layer`` class.
-
-        Returns:
-            List[Layer]: List of layer object, each layer is the new
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def attn_out() -> List:
-        r"""
-        Attention output projection layer
-
-        Returns:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def mlp_in() -> List:
-        r"""
-        h -> 4h mlp layer
-
-        Returns:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def mlp_out() -> List:
-        r"""
-        4h -> h mlp layer
-
-        Returns:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def embedding() -> List:
-        r"""
-        Partially slice the embedding layer
-
-        Return:
-            List[Layer]: List of layer object
-        """
-        return NotImplementedError
-
-    @staticmethod
-    def unembedding() -> List:
-        r"""
-        Partially slice the embedding layer
-
-        Return:
-            List[Layer]: List of layer object
-        """
-        return None
diff --git a/colossalai/shardformer/policies/bert.py b/colossalai/shardformer/policies/bert.py
deleted file mode 100644
index 89b32f065c27..000000000000
--- a/colossalai/shardformer/policies/bert.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Any, Callable, Dict, List, Tuple, Type
-
-import torch.nn as nn
-from transformers.models.bert.modeling_bert import BertEmbeddings, BertLayer, BertLMPredictionHead
-
-import colossalai.shardformer.layer.layers as col_nn
-
-from .basepolicy import Argument, Col_Layer, Layer, Policy, Row_Layer
-
-
-class BertPolicy(Policy):
-
-    @staticmethod
-    def argument_policy(config, world_size: int) -> Dict[nn.Module, Argument]:
-        return {
-            BertLayer:
-                Argument(
-                    attr_dict={
-        # 1. shard hidden size
-                        "attention.self.all_head_size": config.hidden_size // world_size,
-                        "crossattention.self.all_head_size": config.hidden_size // world_size,
-        # 2. shard number of heads
-                        "attention.self.num_attention_heads": config.num_attention_heads // world_size,
-                        "crossattention.self.num_attention_heads": config.num_attention_heads // world_size,
-                    },
-                    param_funcs=[BertPolicy.attn_in, BertPolicy.attn_out, BertPolicy.mlp_in, BertPolicy.mlp_out]),
-            BertEmbeddings:
-                Argument(
-                    attr_dict={
-        # 1. shard vocab size
-        # "word_embeddings.num_embeddings": config.vocab_size // world_size,
-        # 2. add the size of the sliced embedding layer excluding the last slice
-                        "word_embeddings.dim_size": (config.vocab_size + world_size - 1) // world_size,
-                    },
-                    param_funcs=[
-                        BertPolicy.embedding,
-                    ]),
-            BertLMPredictionHead:
-                Argument(
-                    attr_dict={
-        # 1. shard vocab size
-        # "word_embeddings.num_embeddings": config.vocab_size // world_size,
-        # 2. add the size of the sliced embedding layer excluding the last slice
-                    },
-                    param_funcs=[
-                        BertPolicy.unembedding,
-                    ])
-        }
-
-    @staticmethod
-    def binding_policy() -> Dict:
-        return {
-            "bert.embeddings.word_embeddings.weight": "cls.predictions.decoder.weight",
-        }
-
-    @staticmethod
-    def attn_in() -> List:
-        return [
-            Col_Layer(
-                weight="attention.self.query.weight",
-                bias="attention.self.query.bias",
-                replace_layer=col_nn.Linear1D_Col,
-            ),
-            Col_Layer(
-                weight="attention.self.key.weight",
-                bias="attention.self.key.bias",
-                replace_layer=col_nn.Linear1D_Col,
-            ),
-            Col_Layer(
-                weight="attention.self.value.weight",
-                bias="attention.self.value.bias",
-                replace_layer=col_nn.Linear1D_Col,
-            ),
-            Col_Layer(
-                weight="crossattention.self.query.weight",
-                bias="crossattention.self.query.bias",
-                replace_layer=col_nn.Linear1D_Col,
-                ignore=True,
-            ),
-            Col_Layer(
-                weight="crossattention.self.key.weight",
-                bias="crossattention.self.key.bias",
-                replace_layer=col_nn.Linear1D_Col,
-                ignore=True,
-            ),
-            Col_Layer(
-                weight="crossattention.self.value.weight",
-                bias="crossattention.self.value.bias",
-                replace_layer=col_nn.Linear1D_Col,
-                ignore=True,
-            ),
-        ]
-
-    @staticmethod
-    def attn_out() -> List:
-        return [
-            Row_Layer(
-                weight="attention.output.dense.weight",
-                bias="attention.output.dense.bias",
-                replace_layer=col_nn.Linear1D_Row,
-            ),
-            Row_Layer(
-                weight="crossattention.output.dense.weight",
-                bias="crossattention.output.dense.bias",
-                replace_layer=col_nn.Linear1D_Row,
-                ignore=True,
-            ),
-        ]
-
-    @staticmethod
-    def mlp_in() -> List:
-        return [
-            Col_Layer(
-                weight="intermediate.dense.weight",
-                bias="intermediate.dense.bias",
-                replace_layer=col_nn.Linear1D_Col,
-            ),
-        ]
-
-    @staticmethod
-    def mlp_out() -> List:
-        return [
-            Row_Layer(
-                weight="output.dense.weight",
-                bias="output.dense.bias",
-                replace_layer=col_nn.Linear1D_Row,
-            ),
-        ]
-
-    @staticmethod
-    def embedding() -> List:
-        return [Col_Layer(
-            weight="word_embeddings.weight",
-            replace_layer=col_nn.VocabParallelEmbedding1D,
-        )]
-
-    @staticmethod
-    def unembedding() -> List:
-        return [
-            Col_Layer(
-                weight="decoder.weight",
-                bias="decoder.bias",
-                replace_layer=col_nn.Linear1D_Col,
-        # gather_output=True,
-            )
-        ]
-
-
-from transformers import BertForMaskedLM
-
-from colossalai.shardformer.model.modeling_bert import BertForMaskedLM_
-
-
-class BertForMaskedLMPolicy(BertPolicy):
-
-    @staticmethod
-    def inject_policy() -> Tuple[nn.Module, nn.Module]:
-        return (BertForMaskedLM, BertForMaskedLM_)
-
-
-class BertForSequenceClassificationPolicy(BertPolicy):
-
-    @staticmethod
-    def inject_policy() -> Dict:
-        return {}
-
-
-# model = BertForMaskedLM.from_pretrained("bert-base-uncased")
-# _ = BertForMaskedLMPolicy(model)
-# print(isinstance(model,list(_.inject_policy().keys())[0]))
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
deleted file mode 100644
index 44dc9c72f986..000000000000
--- a/colossalai/shardformer/policies/gpt2.py
+++ /dev/null
@@ -1,118 +0,0 @@
-from typing import Any, Callable, Dict, List, Tuple, Type
-
-import torch.nn as nn
-from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model
-
-import colossalai.shardformer.layer.layers as col_nn
-
-from .basepolicy import Argument, Col_Layer, Layer, Policy, Row_Layer
-
-
-class GPT2Policy(Policy):
-
-    @staticmethod
-    def argument_policy(config, world_size):
-        return {
-            GPT2Model:
-                Argument(attr_dict={}, param_funcs=[
-                    GPT2Policy.embedding,
-                ]),
-            GPT2Block:
-                Argument(
-                    attr_dict={
-        # 1. reduce hidden size
-                        "attn.embed_dim": config.hidden_size // world_size,
-                        "attn.split_size": config.hidden_size // world_size,
-                        "crossattention.embed_dim": config.hidden_size // world_size,
-                        "crossattention.split_size": config.hidden_size // world_size,
-        # 2. reduce number of heads
-                        "attn.num_heads": config.num_attention_heads // world_size,
-                        "crossattention.num_heads": config.num_attention_heads // world_size,
-                    },
-                    param_funcs=[
-                        GPT2Policy.attn_in,
-                        GPT2Policy.attn_out,
-                        GPT2Policy.mlp_in,
-                        GPT2Policy.mlp_out,
-                    ]),
-        }
-
-    @staticmethod
-    def attn_in() -> List:
-        return [
-            Col_Layer(weight="attn.c_attn.weight",
-                      bias="attn.c_attn.bias",
-                      n_cast=3,
-                      reversed=True,
-                      replace_layer=col_nn.Linear1D_Col),
-            Col_Layer(weight="crossattention.c_attn.weight",
-                      bias="crossattention.c_attn.bias",
-                      n_cast=2,
-                      reversed=True,
-                      ignore=True,
-                      replace_layer=col_nn.Linear1D_Col),
-            Col_Layer(weight="crossattention.q_attn.weight",
-                      bias="crossattention.q_attn.bias",
-                      reversed=True,
-                      ignore=True,
-                      replace_layer=col_nn.Linear1D_Col)
-        ]
-
-    @staticmethod
-    def attn_out() -> List:
-        return [
-            Row_Layer(weight="attn.c_proj.weight",
-                      bias="attn.c_proj.bias",
-                      reversed=True,
-                      replace_layer=col_nn.Linear1D_Row),
-            Row_Layer(weight="crossattention.c_proj.weight",
-                      bias="crossattention.c_proj.bias",
-                      reversed=True,
-                      ignore=True,
-                      replace_layer=col_nn.Linear1D_Row)
-        ]
-
-    @staticmethod
-    def mlp_in() -> List:
-        return [
-            Col_Layer(weight="mlp.c_fc.weight", bias="mlp.c_fc.bias", reversed=True, replace_layer=col_nn.Linear1D_Col),
-        ]
-
-    @staticmethod
-    def mlp_out() -> List:
-        return [
-            Row_Layer(weight="mlp.c_proj.weight",
-                      bias="mlp.c_proj.bias",
-                      reversed=True,
-                      replace_layer=col_nn.Linear1D_Row)
-        ]
-
-    @staticmethod
-    def embedding() -> List:
-        return [Col_Layer(weight="wte.weight", replace_layer=col_nn.VocabParallelEmbedding1D)]
-
-
-from transformers import GPT2LMHeadModel
-
-
-class GPT2LMHeadModelPolicy(GPT2Policy):
-
-    @staticmethod
-    def argument_policy(config, world_size):
-        base_argument = GPT2Policy.argument_policy(config, world_size)
-        argument = {
-            GPT2LMHeadModel: Argument(attr_dict={}, param_funcs=[
-                GPT2LMHeadModelPolicy.unembedding,
-            ]),
-        }
-        argument.update(base_argument)
-        return argument
-
-    @staticmethod
-    def unembedding() -> List:
-        return [
-            Col_Layer(weight="lm_head.weight",
-                      bias="lm_head.bias",
-                      replace_layer=col_nn.Linear1D_Col,
-                      gather_output=True)
-        ]
diff --git a/colossalai/shardformer/shard/__init__.py b/colossalai/shardformer/shard/__init__.py
deleted file mode 100644
index d5f70163ad57..000000000000
--- a/colossalai/shardformer/shard/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .shard_config import ShardConfig
-from .sharder import ModelSharder, shard_model
-from .slicer import Slicer
-
-__all__ = ['ShardConfig', 'ModelSharder', 'shard_model', 'Slicer']
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
deleted file mode 100644
index 4cf9162b9548..000000000000
--- a/colossalai/shardformer/shard/shard_config.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from dataclasses import dataclass
-
-__all__ = ['ShardConfig']
-
-
-@dataclass
-class ShardConfig:
-    """
-    The config for sharding the huggingface model for test
-    """
-    rank: int
-    fp16: bool = True
-    num_gpus: int = 2
-    world_size: int = 2
-    backend = "nccl"
-    verbose: str = 'simple'
-    seed: int = None
-    require_grad: bool = False
-    master_addr: str = "127.0.0.1"
-    master_port: int = 29500
diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py
deleted file mode 100644
index 1ada75e06b67..000000000000
--- a/colossalai/shardformer/shard/sharder.py
+++ /dev/null
@@ -1,266 +0,0 @@
-from typing import Any, Callable, Dict, List
-
-import torch
-import torch.nn as nn
-from transformers.pytorch_utils import Conv1D
-
-from ..policies.autopolicy import get_autopolicy
-from ..policies.basepolicy import Policy
-from ..utils.utils import getattr_, hasattr_, setattr_
-from .shard_config import ShardConfig
-from .slicer import Slicer
-
-__all__ = ['ModelSharder', 'shard_model']
-
-
-class ModelSharder(object):
-    r"""
-    Shard the original huggingface model according to the policy
-
-    Args:
-        policy (:class:`Policy`): The policy to shard the model
-        model (:class:`torch.Module`): The model to shard
-        shard_config: The setting of distributed model
-    """
-
-    def __init__(
-            self,
-            model: nn.Module,
-            policy: Policy,
-            shard_config: ShardConfig = None,    # TODO
-    ) -> None:
-        self.model = model
-        self.policy = get_autopolicy(self.model) if policy is None else policy
-        self.slicer = Slicer(shard_config)
-        self.shard_config = shard_config
-        self.model_config = self.model.config
-
-    def shard(self) -> None:
-        self.reshape_embedding()
-        self.inject_model(self.model)
-        self.replace_layer(self.model)
-        self.bind_layer(self.model)
-
-    def reshape_embedding(self,) -> None:
-        r"""
-        Reshape the Embedding layer to make the embedding dimension divisible by world_size
-        """
-        vocab_size = self.model_config.vocab_size
-        world_size = self.shard_config.world_size
-        if vocab_size % world_size != 0:
-            new_vocab_size = vocab_size + world_size - vocab_size % world_size
-            self.model.resize_token_embeddings(new_vocab_size)
-            self.model_config = self.model.config
-
-    def inject_model(
-        self,
-        model: nn.Module,
-    ) -> None:
-        r"""
-        Replace the model to policy defined model
-        Mainly modify the forward and backward to fit distributed model
-
-        e.g.
-        ::
-            BertForMaskedLM.forward -> BertForMaskedLM_.forward
-        """
-        inject_policy = self.policy.inject_policy()
-
-        if inject_policy is None:
-            return
-        org_model_cls = inject_policy[0]
-        shard_model_cls = inject_policy[1]
-
-        if model.__class__ == org_model_cls:
-            for key in shard_model_cls.__dict__.keys():
-                if hasattr(model.__class__, key):
-                    setattr(
-                        model.__class__,
-                        key,
-                        getattr(shard_model_cls, key),
-                    )
-        else:
-            raise NotImplementedError(f"{model.__class__} is not implemented so far")
-
-    def replace_layer(
-        self,
-        model: nn.Module,
-    ) -> None:
-        r"""
-        Replace the layer according to the policy, and replace the layer one by one
-
-        Args:
-            model (:class:`torch.nn.Module`): The layer to shard
-        """
-        argument_policies = self.policy.argument_policy(self.model_config, self.shard_config.world_size)
-        for argument_policy in argument_policies.items():
-            origin_layer_cls = argument_policy[0]
-            attr_dict = argument_policy[1].attr_dict
-            param_funcs = argument_policy[1].param_funcs
-            self.traverse_replace_layer(model, origin_layer_cls, attr_dict, param_funcs)
-
-    def traverse_replace_layer(
-        self,
-        layer: nn.Module,
-        origin_cls: nn.Module,
-        attr_dict: Dict[str, Any],
-        param_funcs: List[Callable],
-    ) -> None:
-        r"""
-        Reverse the replace layer operation
-
-        Args:
-            layer (:class:`torch.nn.Module`): The object of layer to shard
-            origin_cls (:class:`transformers.model`): The origin layer class
-            attr_dict (Dict): The attribute dict to modify
-            policy_cls (:class:`Policy`): The policy class
-        """
-        if layer.__class__ == origin_cls:
-            for k, v in attr_dict.items():
-                setattr_(layer, k, v, ignore=True)
-            self.shard_one_layer(layer, param_funcs)
-        for name, child in layer.named_children():
-            self.traverse_replace_layer(child, origin_cls, attr_dict, param_funcs)
-        return layer
-
-    def shard_one_layer(
-        self,
-        org_layer: nn.Module,
-        param_funcs: List[Callable],
-    ) -> None:
-        r"""
-        Shard one layer according to the policy, the layer should be the same class as the key in policy's argument_policy return dict
-
-        Args:
-            org_layer (:class:`torch.nn.Module`): The origin layer object to shard
-            param_funcs (:class:`List[typing.Callable]`): The function list to get shard information in policy class
-
-        """
-        for func in param_funcs:
-            policy_layers = func()
-            for policy_layer in policy_layers:
-                weight = None
-                bias = None
-                weight_attr = policy_layer.weight
-                bias_attr = policy_layer.bias
-                replace_layer_cls = policy_layer.replace_layer
-                ignore = policy_layer.ignore
-                n_cast = policy_layer.n_cast
-                reversed = policy_layer.reversed
-                if policy_layer.__class__.__name__ == "Col_Layer":
-                    gather_output = policy_layer.gather_output
-
-                if weight_attr is not None:
-                    if hasattr_(org_layer, weight_attr):
-                        weight = getattr_(org_layer, weight_attr)
-                    elif not ignore:
-                        raise ValueError(f"Layer {org_layer.__class__.__qualname__} has no attribute {weight_attr}")
-
-                if bias_attr is not None:
-                    if hasattr_(org_layer, bias_attr):
-                        bias = getattr_(org_layer, bias_attr)
-                    elif not ignore:
-                        raise ValueError(f"Layer {org_layer.__class__.__qualname__} has no attribute {bias_attr}")
-
-                # dont have the attribute in policy, and ignore is true
-                if weight is None and bias is None and ignore:
-                    continue
-
-                # set the sliced weight and bias to the new nn_col layer
-                assert weight is not None or bias is not None
-                layer_attr = (lambda x: x[:x.rfind(".")])(weight_attr or bias_attr)
-
-                # slice weight and bias
-                weight, bias = self.slicer.slice_weight_bias(weight, bias, policy_layer.__class__, n_cast, reversed)
-
-                # create new object to replace the origin layer
-                if replace_layer_cls is not None:
-                    if isinstance(getattr_(org_layer, layer_attr), (nn.Linear, Conv1D)):
-                        if replace_layer_cls.__name__ == "Linear1D_Row":
-                            replace_layer = replace_layer_cls(weight.shape[1],
-                                                              weight.shape[0],
-                                                              bias=False if bias is None else True)
-                        elif replace_layer_cls.__name__ == "Linear1D_Col":
-                            replace_layer = replace_layer_cls(weight.shape[0],
-                                                              weight.shape[1],
-                                                              bias=False if bias is None else True,
-                                                              gather_output=gather_output)
-                        setattr_(org_layer, layer_attr, replace_layer, ignore=ignore)
-                        self.set_param(replace_layer, weight, bias)
-                    elif isinstance(getattr_(org_layer, layer_attr), nn.Embedding):
-                        replace_layer = replace_layer_cls(weight.shape[0], weight.shape[1],
-                                                          getattr_(org_layer, f"{layer_attr}.padding_idx", ignore=True))
-                        setattr_(org_layer, layer_attr, replace_layer, ignore=ignore)
-                        self.set_param(replace_layer, weight, bias)
-                    else:
-                        raise NotImplementedError(
-                            f"Replacing {getattr_(org_layer, layer_attr).__class__} is not implemented so far")
-                # do not replace the layer object, just replace the weight and bias
-                else:
-                    self.set_param(org_layer, layer_attr, weight, bias)
-
-    def set_param(self,
-                  layer: Any,
-                  weight: torch.Tensor = None,
-                  bias: torch.Tensor = None,
-                  layer_attr: str = "") -> None:
-        r"""
-        Reset the weight and bias of the layer object
-
-        Args:
-            layer (:class:`torch.nn.Module`): The layer object
-            layer_attr (str): The attribute name of the layer
-            weight (:class:`torch.Tensor`): The weight of the layer
-            bias (:class:`torch.Tensor`): The bias of the layer
-        """
-        assert weight is not None or bias is not None
-        if weight is not None:
-            setattr_(layer, "weight" if layer_attr == "" else layer_attr + ".weight", nn.Parameter(weight.contiguous()))
-            self.set_layer_size(layer, layer_attr, weight.shape)
-        if bias is not None:
-            setattr_(layer, "bias" if layer_attr == "" else layer_attr + ".bias", nn.Parameter(bias.contiguous()))
-
-    def set_layer_size(self, layer: nn.Module, layer_attr: str, size: torch.Size) -> None:
-        r"""
-        Set the layer attribute
-
-        Args:
-            layer (:class:`torch.nn.Module`): The layer object
-            layer_attr (str): The attribute name of the layer
-            size (:class:`torch.Size`): The size of the tensor
-        """
-        # Tensor.shape[0] -> out_features, Tensor.shape[1] -> in_features
-        attrs = ["out_features", "in_features"]
-        for i, attr in enumerate(attrs):
-            if hasattr_(layer, f"{layer_attr}.{attr}"):
-                setattr_(layer, f"{layer_attr}.{attr}", size[i])
-
-    def bind_layer(self, model: nn.Module) -> None:
-        r"""
-        Bind the layer according to the binding policy
-
-        Args:
-            model (:class:`torch.nn.Module`): The shard model
-        """
-        binding_map = self.policy.binding_policy()
-        if binding_map is None:
-            return
-        for k, v in binding_map.items():
-            param = getattr_(model, k)
-            param = nn.Parameter(param)
-            setattr_(model, k, param)
-            setattr_(model, v, param)
-
-
-def shard_model(model: nn.Module, shard_config: ShardConfig = None, policy: Policy = None):
-    r"""
-    The function is used to shard the PyTorch model.
-
-    Args:
-        model (`torch.nn.Model`): the origin huggingface model
-        shard_config (`ShardConfig`): the config for distribute information
-        policy (`Policy`): the custom policy for sharding
-    """
-    sharder = ModelSharder(model=model, shard_config=shard_config, policy=policy)
-    sharder.shard()
-    return model
diff --git a/colossalai/shardformer/shard/slicer.py b/colossalai/shardformer/shard/slicer.py
deleted file mode 100644
index 6d35bd193fed..000000000000
--- a/colossalai/shardformer/shard/slicer.py
+++ /dev/null
@@ -1,161 +0,0 @@
-import torch
-
-from ..policies.basepolicy import Col_Layer, Layer, Row_Layer
-from .shard_config import ShardConfig
-
-dim_mapping = {Col_Layer: 1, Row_Layer: 0}
-
-
-class Slicer():
-
-    def __init__(
-            self,
-            shardconfig: ShardConfig    #TODO
-    ) -> None:
-        self.shardconfig = shardconfig
-
-    def slice_weight_bias(
-        self,
-        weight: torch.Tensor,
-        bias: torch.Tensor,
-        policy_layer_cls: Layer,
-        n_cast: int = None,
-        reversed: bool = False,
-    ):
-        r"""
-        Slice the weight and bias according to policy layer cls
-        ``Layer`` -> do nothing
-        ``Col_Layer`` -> slice the weight and bias along dim 1
-        ``Row_Layer`` -> slice the weight along dim 0 and do not slice bias
-
-        Args:
-            weight (:class:`torch.nn.Module`): The weight of the layer
-            bias: (:class:`torch.nn.Module`): The bias of the layer
-            policy_layer_class (:class:`Policy`): The class represent how to slice the tensor
-        """
-        if policy_layer_cls == Layer:
-            return weight, bias
-
-        dim = dim_mapping[policy_layer_cls] if not reversed else (1 - dim_mapping[policy_layer_cls])
-        # print(weight.shape, dim)
-        if policy_layer_cls == Col_Layer:
-            weight = self.slice_tensor(weight, dim, False, n_cast)
-            bias = self.slice_tensor(bias, 0, True)
-        elif policy_layer_cls == Row_Layer:
-            weight = self.slice_tensor(weight, dim, False, n_cast)
-        else:
-            raise NotImplementedError(f"The policy layer class {policy_layer_cls} is not supported")
-        if reversed:
-            weight = weight.transpose(0, 1).contiguous()
-        return weight, bias
-
-    def slice_tensor(
-        self,
-        tensor_in: torch.Tensor,
-        dim: int,
-        is_bias: bool,
-        n_cast: int = None,
-    ) -> torch.Tensor:
-        r"""
-        Slice tensor according to the config
-
-        Args:
-            tensor_in (:class:`torch.Tensor`): The tensor to slice
-            dim (int): The dimension to slice
-            is_bias (bool): Whether the tensor is bias
-        """
-        if tensor_in is None:
-            return None
-        if not is_bias:
-            return self.slice_2d(tensor_in, dim, n_cast)
-        else:
-            return self.slice_1d(tensor_in, n_cast)
-
-    def slice_2d(
-        self,
-        tensor: torch.Tensor,
-        dim: int,
-        n_cast: int = None,
-    ) -> torch.Tensor:
-        r"""
-        Slice the 2D tensor
-
-        Args:
-            tensor (:class:`torch.Tensor`): The tensor to slice
-            dim (int): The dimension to slice
-        """
-        assert dim in [0, 1], f"Only support 2D tensor, but got {dim}D tensor"
-        if dim == 0:
-            return self.slice_row(tensor, n_cast)
-        elif dim == 1:
-            return self.slice_col(tensor, n_cast)
-
-    def slice_1d(
-        self,
-        tensor: torch.Tensor,
-        n_cast: int = None,
-    ) -> torch.Tensor:
-        r"""
-        Slice the 1D tensor
-
-        Args:
-            tensor (:class:`torch.Tensor`): The tensor to slice
-
-        Returns:
-            :class:`torch.Tensor`: The sliced tensor
-        """
-        if n_cast is None:
-            return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
-        else:
-            tensor_chunks = tensor.chunk(self.shardconfig.world_size * n_cast, dim=0)
-            chunk_list = [
-                tensor_chunks[i] for i in range(self.shardconfig.rank, len(tensor_chunks), self.shardconfig.world_size)
-            ]
-            return torch.cat(chunk_list, dim=0).contiguous()
-
-    def slice_col(
-        self,
-        tensor: torch.Tensor,
-        n_cast: int = None,
-    ) -> torch.Tensor:
-        r"""
-        Slice the tensor in column
-
-        Args:
-            tensor (:class:`torch.Tensor`): The tensor to slice
-
-        Returns:
-            :class:`torch.Tensor`: The sliced tensor
-
-        """
-        if n_cast is None:
-            return tensor.chunk(self.shardconfig.world_size, dim=0)[self.shardconfig.rank].contiguous()
-        else:
-            tensor_chunks = tensor.chunk(self.shardconfig.world_size * n_cast, dim=0)
-            chunk_list = [
-                tensor_chunks[i] for i in range(self.shardconfig.rank, len(tensor_chunks), self.shardconfig.world_size)
-            ]
-            return torch.cat(chunk_list, dim=0).contiguous()
-
-    def slice_row(
-        self,
-        tensor: torch.Tensor,
-        n_cast: int = None,
-    ) -> torch.Tensor:
-        r"""
-        Slice the tensor in column
-
-        Args:
-            tensor (:class:`torch.Tensor`): The tensor to slice
-
-        Returns:
-            :class:`torch.Tensor`: The sliced tensor
-        """
-        if n_cast is None:
-            return tensor.chunk(self.shardconfig.world_size, dim=1)[self.shardconfig.rank].contiguous()
-        else:
-            tensor_chunks = tensor.chunk(self.shardconfig.world_size * n_cast, dim=1)
-            chunk_list = [
-                tensor_chunks[i] for i in range(self.shardconfig.rank, len(tensor_chunks), self.shardconfig.world_size)
-            ]
-            return torch.cat(chunk_list, dim=1).contiguous()
diff --git a/colossalai/shardformer/test/config.py b/colossalai/shardformer/test/config.py
deleted file mode 100644
index 2b80d8b3ca12..000000000000
--- a/colossalai/shardformer/test/config.py
+++ /dev/null
@@ -1 +0,0 @@
-parallel = dict(data=1, pipeline=1, tensor=dict(size=2, mode='1d'))
diff --git a/colossalai/shardformer/test/module_test.py b/colossalai/shardformer/test/module_test.py
deleted file mode 100644
index 83dc7ec6cf4a..000000000000
--- a/colossalai/shardformer/test/module_test.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import os
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-import colossalai
-from colossalai.shardformer.layer.dist_crossentropy import applyDistCrossEntropy
-from colossalai.shardformer.layer.dropout import Dropout1D
-
-
-def get_args():
-    parser = colossalai.get_default_parser()
-    parser.add_argument("--module", type=str, default='distloss')
-    return parser.parse_args()
-
-
-def test_dist_crossentropy():
-    pred = torch.randn(2, 4, 8, requires_grad=True)
-    labels = torch.randint(8, (1, 4)).repeat(2, 1)
-
-    pred_ = pred.view(-1, 8)
-    labels_ = labels.view(-1)
-    loss = F.cross_entropy(pred_, labels_)
-    loss.backward()
-    print(f"normal loss:{loss}")
-
-    pred = pred.chunk(int(os.environ['WORLD_SIZE']), -1)[int(os.environ['RANK'])]
-    loss = applyDistCrossEntropy(pred.to('cuda'), labels.to('cuda'))
-    loss.backward()
-    print(f"dist loss:{loss}")
-
-
-def test_dropout():
-    input = torch.randn(5, 4).to("cuda")
-    m = Dropout1D(p=0.2).to("cuda")
-    for i in range(2):
-        print(f"Output: {m(input)}")
-        print(torch.randn(1))
-
-
-if __name__ == '__main__':
-    args = get_args()
-    colossalai.launch_from_torch(config={})
-    if args.module == 'distloss':
-        test_dist_crossentropy()
-    elif args.module == 'dropout':
-        test_dropout()
-    else:
-        print("not implemented yet")
diff --git a/colossalai/shardformer/test/test.py b/colossalai/shardformer/test/test.py
deleted file mode 100644
index e2d5a94c782a..000000000000
--- a/colossalai/shardformer/test/test.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import os
-import random
-
-import torch
-import torch.nn as nn
-from datasets import load_dataset
-from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, GPT2LMHeadModel, get_scheduler
-
-import colossalai
-from colossalai.shardformer.shard import ShardConfig, shard_model
-from colossalai.utils import get_current_device, print_rank_0
-
-os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
-
-
-def get_args():
-    parser = colossalai.get_default_parser()
-    parser.add_argument("--mode", type=str, default='inference')
-    parser.add_argument("--save_model", action='store_true')
-    parser.add_argument("--model", type=str, default='bert-base-uncased')
-    return parser.parse_args()
-
-
-def load_data(args):
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
-    if tokenizer.pad_token is None:
-        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-        # tokenizer.pad_token_id = 0
-    datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
-    # datasets=load_dataset("yelp_review_full")
-    tokenized_datasets = datasets.map(
-        lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length"), batched=True)
-    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
-    # tokenized_datasets=tokenized_datasets.rename_column("label","labels")
-    tokenized_datasets.set_format("torch")
-
-    train_dataset = tokenized_datasets["train"]
-    test_dataset = tokenized_datasets["test"]
-
-    datacollector = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15, return_tensors="pt")
-    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=datacollector)
-    eval_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, collate_fn=datacollector)
-    return train_dataloader, eval_dataloader
-
-
-def inference(model: nn.Module, args):
-    print(model)
-    # print(model.wte.weight.shape)
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
-    if tokenizer.pad_token is None:
-        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-        tokenizer.pad_token_id = 0
-    token = "Hello, my dog is cute"
-    inputs = tokenizer(token, return_tensors="pt")
-    inputs.to("cuda")
-    model.eval()
-    model.to("cuda")
-    outputs = model(**inputs)
-    print(outputs[0])
-
-
-def train(model: nn.Module, args, num_epoch: int = 3):
-    train_dataloader, eval_dataloader = load_data(args)
-    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
-    num_training = num_epoch * len(train_dataloader)
-    progress_bar = tqdm(range(num_training))
-    lr_scheduler = get_scheduler(name="linear",
-                                 optimizer=optimizer,
-                                 num_warmup_steps=0,
-                                 num_training_steps=num_training)
-    best_test_loss = float("inf")
-    model.to("cuda")
-    model.train()
-    for epoch in range(num_epoch):
-        progress_bar.set_description(f"Rank {get_current_device()} epoch {epoch}")
-        for batch in train_dataloader:
-            optimizer.zero_grad()
-            batch = {k: v.to('cuda') for k, v in batch.items()}
-            outputs = model(**batch)
-            loss = outputs.loss
-            loss.backward()
-            optimizer.step()
-            lr_scheduler.step()
-            progress_bar.update(1)
-        train_loss = loss
-
-        loss = 0.0
-        for batch in eval_dataloader:
-            batch = {k: v.to('cuda') for k, v in batch.items()}
-            outputs = model(**batch)
-            # loss = outputs.loss
-            assert not torch.isnan(outputs.loss), f"{batch}"
-            loss += outputs.loss.item()
-            # loss = criterion(outputs.logits, batch["input_ids"])
-        test_loss = loss / len(eval_dataloader)
-        print_rank_0(f"Train Loss: {train_loss:.4f} Test Loss:{test_loss:.4f}")
-        if args.save_model and test_loss < best_test_loss:
-            best_test_loss = test_loss
-            torch.save(model.state_dict(), "./checkpoints/best_model.pth")
-
-
-if __name__ == "__main__":
-    args = get_args()
-    colossalai.launch_from_torch(config=args.config)
-    if args.model == 'bert-base-uncased':
-        model = BertForMaskedLM.from_pretrained("bert-base-uncased")
-    elif args.model == 'gpt2':
-        model = GPT2LMHeadModel.from_pretrained("gpt2")
-    else:
-        raise AttributeError("model not supported")
-    shard_config = ShardConfig(
-        rank=int(str(get_current_device()).split(':')[-1]),
-        world_size=int(os.environ['WORLD_SIZE']),
-    )
-    sharded_model = shard_model(model, shard_config)
-
-    if args.mode == "train":
-        train(sharded_model, args)
-    elif args.mode == "inference":
-        inference(sharded_model, args)
-    else:
-        raise NotImplementedError
diff --git a/colossalai/shardformer/utils/__init__.py b/colossalai/shardformer/utils/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/colossalai/shardformer/utils/utils.py b/colossalai/shardformer/utils/utils.py
deleted file mode 100644
index eb84edd88404..000000000000
--- a/colossalai/shardformer/utils/utils.py
+++ /dev/null
@@ -1,58 +0,0 @@
-def hasattr_(obj, attr: str):
-    r"""
-    Check whether the object has the multi sublevel attr
-
-    Args:
-        obj (object): The object to check
-        attr (str): The multi level attr to check
-    """
-    attrs = attr.split('.')
-    for a in attrs:
-        try:
-            obj = getattr(obj, a)
-        except AttributeError:
-            return False
-    return True
-
-
-def setattr_(obj, attr: str, value, ignore: bool = False):
-    r"""
-    Set the object's multi sublevel attr to value, if ignore, ignore when it doesn't exist
-
-    Args:
-        obj (object): The object to set
-        attr (str): The multi level attr to set
-        value (Any): The value to set
-        ignore (bool): Whether to ignore when the attr doesn't exist
-    """
-
-    attrs = attr.split('.')
-    for a in attrs[:-1]:
-        try:
-            obj = getattr(obj, a)
-        except AttributeError:
-            if ignore:
-                return
-            raise AttributeError(f"Object {obj} has no attribute {attr}")
-    setattr(obj, attrs[-1], value)
-
-
-def getattr_(obj, attr: str, ignore: bool = None):
-    r"""
-    Get the object's multi sublevel attr
-
-    Args:
-        obj (object): The object to set
-        attr (str): The multi level attr to set
-        ignore (bool): Whether to ignore when the attr doesn't exist
-    """
-
-    attrs = attr.split('.')
-    for a in attrs:
-        try:
-            obj = getattr(obj, a)
-        except AttributeError:
-            if ignore:
-                return None
-            raise AttributeError(f"Object {obj} has no attribute {attr}")
-    return obj
diff --git a/colossalai/tensor/comm_spec.py b/colossalai/tensor/comm_spec.py
index dd873c852936..af38d2a502c2 100644
--- a/colossalai/tensor/comm_spec.py
+++ b/colossalai/tensor/comm_spec.py
@@ -16,66 +16,69 @@ def _all_gather(tensor, comm_spec):
     '''
     Implement all gather operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
-    process_group = process_groups[comm_spec.logical_process_axis]
-
-    tensor_list = [
-        torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device)
-        for _ in range(comm_spec.device_mesh.mesh_shape[comm_spec.logical_process_axis])
-    ]
-    # without this contiguous operation, the all gather may get some unexpected results.
-    tensor = tensor.contiguous()
-    dist.all_gather(tensor_list, tensor, group=process_group)
-    output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
-    return output
+    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            tensor_list = [
+                torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device)
+                for _ in range(comm_spec.device_mesh.mesh_shape[comm_spec.logical_process_axis])
+            ]
+            # without this contiguous operation, the all gather may get some unexpected results.
+            tensor = tensor.contiguous()
+            dist.all_gather(tensor_list, tensor, group=process_group)
+            output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
+            return output
 
 
 def _split(tensor, comm_spec):
     '''
     Implement shard operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
-    process_group = process_groups[comm_spec.logical_process_axis]
-
-    dim = comm_spec.shard_dim
-    length = tensor.shape[comm_spec.shard_dim] // dist.get_world_size(process_group)
-    start = length * dist.get_rank(process_group)
-    output = torch.narrow(tensor, dim, start, length).contiguous()
-    return output
+    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, _ in process_groups_list:
+        if dist.get_rank() in rank_list:
+            dim = comm_spec.shard_dim
+            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
+            start = length * rank_list.index(dist.get_rank())
+            output = torch.narrow(tensor, dim, start, length).contiguous()
+            return output
 
 
 def _all_to_all(tensor, comm_spec):
     '''
     Implement all to all operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
-    process_group = process_groups[comm_spec.logical_process_axis]
-    world_size = dist.get_world_size(process_group)
-
-    new_shape = list(tensor.shape)
-    new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // world_size
-    new_shape = torch.Size(new_shape)
-    output_tensor_list = [torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(world_size)]
-    dim = comm_spec.shard_dim
-    length = tensor.shape[comm_spec.shard_dim] // world_size
-    input_tensor_list = [torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(world_size)]
-    group = process_group
-    dist.all_to_all(output_tensor_list, input_tensor_list, group)
-    output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
-    return output
+    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            new_shape = list(tensor.shape)
+            new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // len(rank_list)
+            new_shape = torch.Size(new_shape)
+            output_tensor_list = [
+                torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
+            ]
+            dim = comm_spec.shard_dim
+            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
+            input_tensor_list = [
+                torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(len(rank_list))
+            ]
+            group = process_group
+            dist.all_to_all(output_tensor_list, input_tensor_list, group)
+            output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
+            return output
 
 
 def _all_reduce(tensor, comm_spec, async_op=False):
     '''
     Implement all reduce operation on device mesh based on information provided by comm_spec.
     '''
-    process_groups = comm_spec.device_mesh.get_process_group_for_all_axes()
-    process_group = process_groups[comm_spec.logical_process_axis]
-
-    if not tensor.is_contiguous():
-        tensor = tensor.contiguous()
-    dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
-    return tensor
+    process_groups_list = comm_spec.device_mesh.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            if not tensor.is_contiguous():
+                tensor = tensor.contiguous()
+            dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
+            return tensor
 
 
 def _mix_gather(tensor, comm_spec):
@@ -411,7 +414,7 @@ def __init__(self,
         self.forward_only = forward_only
         if isinstance(self.logical_process_axis, list):
             if not mix_gather:
-                self.device_mesh = self.sharding_spec.device_mesh.flatten()
+                self.device_mesh = self.sharding_spec.device_mesh.flatten_device_mesh
                 self.logical_process_axis = 0
             else:
                 self.device_meshes = self.sharding_spec.device_mesh.flatten_device_meshes
diff --git a/colossalai/tensor/d_tensor/RAEDME.md b/colossalai/tensor/d_tensor/RAEDME.md
deleted file mode 100644
index 95d866388364..000000000000
--- a/colossalai/tensor/d_tensor/RAEDME.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# 🔢 Distributed Tensor
-
-## 📚 Table of Contents
-
-- [🔢 Distributed Tensor](#-distributed-tensor)
-  - [📚 Table of Contents](#-table-of-contents)
-  - [🔗 Introduction](#-introduction)
-  - [📝 Design](#-design)
-  - [🔨 Usage](#-usage)
-  - [🎈 Progress Log](#-progress-log)
-
-## 🔗 Introduction
-
-Distributed tensor is a type of tensor that is distributed across multiple devices. It is a wrapper of PyTorch tensor, and it is used to support distributed training.
-It can represent the device topology and tensor placement over the devices in the topology. It also provides a set of APIs to manipulate the distributed tensor.
-
-## 📝 Design
-
-Our implementation is inspired by the work [Alpa](https://arxiv.org/abs/2201.12023), which unifies data parallelism and tensor parallelism as intra-op parallelism. It uses notations `S` to represent the sharded dimension and `R` to represent the replicated dimension. For example, given a 2D matrix, `[S, R]` represents the tensor is sharded over the first dimension.
-
-Each sharded dimension will have a subscript to represent its placement over the devices. Assuming we have 4 GPUs and the GPUs are arranged in a 2 x 2 manner. Let's say we have a 2D matrix like below:
-
-
-```text
-    [1,  2,  3,  4 ]
-A = [4,  5,  6,  7 ]
-    [8,  9,  10, 11]
-    [12, 13, 14, 15]
-```
-
-`[S0, R]` would mean that the first dimension is sharded over the rows in the device topology.
-
-```text
-| --------------------—————————————————————-|
-|                     |                     |
-|  [1,  2,  3,  4 ]   |  [1,  2,  3,  4 ]   |
-|  [4,  5,  6,  7 ]   |  [4,  5,  6,  7 ]   |
-|                     |                     |
-| --------------------——————————————————-----
-|                     |                     |
-|  [8,  9,  10, 11]   |  [8,  9,  10, 11]   |
-|  [12, 13, 14, 15]   |  [12, 13, 14, 15]   |
-|                     |                     |
-| --------------------——————————————————-----
-```
-
-`[S01, R]` would mean that the first dimension is sharded over both the row and column in the device topology.
-
-```text
-| --------------------—————————————————————-|
-|                     |                     |
-|  [1,  2,  3,  4 ]   |  [4,  5,  6,  7 ]   |
-|                     |                     |
-| --------------------——————————————————-----
-|                     |                     |
-|  [8,  9,  10, 11]   |  [12, 13, 14, 15]   |
-|                     |                     |
-| --------------------——————————————————-----
-```
-
-## 🔨 Usage
-
-A sample API usage is given below.
-
-```python
-import torch
-
-import colossalai
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.d_tensor import DTensor, ShardingSpec
-
-colossalai.launch_from_torch(config={})
-
-# define your device mesh
-# assume you have 4 GPUs
-physical_mesh_id = torch.arange(0, 4).reshape(1, 4)
-mesh_shape = (2, 2)
-device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-# define a tensor
-a = torch.rand(16, 32).cuda()
-
-# create sharding spec for the tensor
-# assume the sharding spec is [S0, R]
-dim_partition_dict = {0: [0]}
-sharding_spec = ShardingSpec(a.dim(), dim_partition_dict)
-
-# create a distributed tensor
-d_tensor = DTensor(a, device_mesh, sharding_spec)
-print(d_tensor)
-
-global_tensor = d_tensor.to_global()
-print(global_tensor)
-```
-
-
-## 🎈 Progress Log
-
-- [x] Support layout conversion
-- [x] Support sharding on 2D device mesh
-- [ ] Support sharding on 3D device mesh
-- [ ] Support sharding 4D device mesh
-- [ ] Support sharding info saving and offline tensor merge (we can save tensor as dtensor and gather the tensors back to the global tensor based on the sharding info in a single process in CPU, useful for distributed training checkpoint load and save.)
diff --git a/colossalai/tensor/d_tensor/__init__.py b/colossalai/tensor/d_tensor/__init__.py
index af77f4f0edfc..e69de29bb2d1 100644
--- a/colossalai/tensor/d_tensor/__init__.py
+++ b/colossalai/tensor/d_tensor/__init__.py
@@ -1,4 +0,0 @@
-from .d_tensor import DTensor
-from .sharding_spec import ShardingSpec
-
-__all__ = ['DTensor', 'ShardingSpec']
diff --git a/colossalai/tensor/d_tensor/comm_spec.py b/colossalai/tensor/d_tensor/comm_spec.py
index 79b2e3ef936a..159125fa16db 100644
--- a/colossalai/tensor/d_tensor/comm_spec.py
+++ b/colossalai/tensor/d_tensor/comm_spec.py
@@ -24,12 +24,12 @@ class CommSpec:
     '''
     Communication spec is used to record the communication action. It converts the communication spec
     to real action which will be used in runtime. It contains comm_pattern to determine the
-    communication method, process_group_dict to determine the process groups, gather_dim and shard_dim
+    communication method, process_groups_dict to determine the process groups, gather_dim and shard_dim
     to determine the buffer shape, and logical_process_axis
 
     Argument:
-        comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec.
-        process_group_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
+        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
+        process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
         gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
         shard_dim(int, Optional): The shard_dim of the tensor will be sharded.
         logical_process_axis(Union(int, List[int]), Optional): The mesh_dim to implement the communication action.
@@ -37,7 +37,7 @@ class CommSpec:
 
     def __init__(self,
                  comm_pattern: CollectiveCommPattern,
-                 process_group_dict: Dict,
+                 process_groups_dict: Dict,
                  gather_dim: int = None,
                  shard_dim: int = None,
                  logical_process_axis: int = None):
@@ -45,7 +45,7 @@ def __init__(self,
         self.gather_dim = gather_dim
         self.shard_dim = shard_dim
         self.logical_process_axis = logical_process_axis
-        self.process_group_dict = process_group_dict
+        self.process_groups_dict = process_groups_dict
 
     def __repr__(self):
         res_list = ["CommSpec:("]
@@ -92,56 +92,68 @@ def _all_gather(tensor: torch.Tensor, comm_spec: CommSpec):
     '''
     Implement all gather operation on device mesh based on information provided by comm_spec.
     '''
-    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
-    world_size = dist.get_world_size(process_group)
-    tensor_list = [torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device) for _ in range(world_size)]
-    # without this contiguous operation, the all gather may get some unexpected results.
-    tensor = tensor.contiguous()
-    dist.all_gather(tensor_list, tensor, group=process_group)
-    output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
-    return output
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            tensor_list = [
+                torch.zeros(tensor.shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
+            ]
+            # without this contiguous operation, the all gather may get some unexpected results.
+            tensor = tensor.contiguous()
+            dist.all_gather(tensor_list, tensor, group=process_group)
+            output = torch.cat(tuple(tensor_list), comm_spec.gather_dim).contiguous()
+            return output
 
 
 def _split(tensor: torch.Tensor, comm_spec: CommSpec):
     '''
     Implement shard operation on device mesh based on information provided by comm_spec.
     '''
-    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
-    dim = comm_spec.shard_dim
-    length = tensor.shape[comm_spec.shard_dim] // dist.get_world_size(process_group)
-    start = length * dist.get_rank(process_group)
-    output = torch.narrow(tensor, dim, start, length).contiguous()
-    return output
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, _ in process_groups_list:
+        if dist.get_rank() in rank_list:
+            dim = comm_spec.shard_dim
+            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
+            start = length * rank_list.index(dist.get_rank())
+            output = torch.narrow(tensor, dim, start, length).contiguous()
+            return output
 
 
 def _all_to_all(tensor: torch.Tensor, comm_spec: CommSpec):
     '''
     Implement all to all operation on device mesh based on information provided by comm_spec.
     '''
-    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
-    world_size = dist.get_world_size(process_group)
-    new_shape = list(tensor.shape)
-    new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // world_size
-    new_shape = torch.Size(new_shape)
-    output_tensor_list = [torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(world_size)]
-    dim = comm_spec.shard_dim
-    length = tensor.shape[comm_spec.shard_dim] // world_size
-    input_tensor_list = [torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(world_size)]
-    group = process_group
-    dist.all_to_all(output_tensor_list, input_tensor_list, group)
-    output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
-    return output
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            new_shape = list(tensor.shape)
+            new_shape[comm_spec.shard_dim] = new_shape[comm_spec.shard_dim] // len(rank_list)
+            new_shape = torch.Size(new_shape)
+            output_tensor_list = [
+                torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) for _ in range(len(rank_list))
+            ]
+            dim = comm_spec.shard_dim
+            length = tensor.shape[comm_spec.shard_dim] // len(rank_list)
+            input_tensor_list = [
+                torch.narrow(tensor, dim, length * i, length).contiguous() for i in range(len(rank_list))
+            ]
+            group = process_group
+            dist.all_to_all(output_tensor_list, input_tensor_list, group)
+            output = torch.cat(tuple(output_tensor_list), comm_spec.gather_dim).contiguous()
+            return output
 
 
 def _all_reduce(tensor: torch.Tensor, comm_spec: CommSpec, async_op: bool = False):
     '''
     Implement all reduce operation on device mesh based on information provided by comm_spec.
     '''
-    process_group = comm_spec.process_group_dict[comm_spec.logical_process_axis]
-    if not tensor.is_contiguous():
-        tensor = tensor.contiguous()
-    dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
-    return tensor
+    process_groups_list = comm_spec.process_groups_dict[comm_spec.logical_process_axis]
+    for rank_list, process_group in process_groups_list:
+        if dist.get_rank() in rank_list:
+            if not tensor.is_contiguous():
+                tensor = tensor.contiguous()
+            dist.all_reduce(tensor, op=ReduceOp.SUM, group=process_group, async_op=async_op)
+            return tensor
 
 
 class _ReduceGrad(torch.autograd.Function):
@@ -257,7 +269,7 @@ def symbolic(graph, input_):
     def forward(ctx, input_, comm_spec):
         output = _all_to_all(input_, comm_spec)
         comm_spec_for_backward = CommSpec(comm_pattern=comm_spec.comm_pattern,
-                                          process_group_dict=comm_spec.process_group_dict,
+                                          process_groups_dict=comm_spec.process_groups_dict,
                                           gather_dim=comm_spec.shard_dim,
                                           shard_dim=comm_spec.gather_dim,
                                           logical_process_axis=comm_spec.logical_process_axis)
diff --git a/colossalai/tensor/d_tensor/d_tensor.py b/colossalai/tensor/d_tensor/d_tensor.py
index 6bda0f4e579c..c1fe9d50a048 100644
--- a/colossalai/tensor/d_tensor/d_tensor.py
+++ b/colossalai/tensor/d_tensor/d_tensor.py
@@ -3,119 +3,55 @@
 import torch
 from torch.utils._pytree import tree_map
 
-from colossalai.device.device_mesh import DeviceMesh
-
 from .layout import Layout
 from .layout_converter import LayoutConverter, to_global
 from .sharding_spec import ShardingSpec
 
-__all__ = ['DTensor', 'distribute_tensor', 'distribute_module', 'construct_default_sharding_spec']
-
 layout_converter = LayoutConverter()
 
 
 class DTensor(torch.Tensor):
-    """
-    DTensor stands for distributed tensor. It is a subclass of `torch.Tensor` and contains meta information
-    about the tensor distribution. The meta information includes the device mesh, the sharding specification,
-    and the entire shape of the tensor.
-
-    During runtime, we will not directly use the DTensor objects for computation. Instead, we will only use the
-    `DTensor.local_tensor` for computation. The `DTensor.local_tensor` is the local tensor in the current rank.
-    In this way, all tensors involved in computation will only be native PyTorch tensors.
-
-    Example:
-        ```python
-        from colossalai.device import DeviceMesh
-
-        # define your device mesh
-        # assume you have 4 GPUs
-        physical_mesh_id = torch.arange(0, 4).reshape(1, 4)
-        mesh_shape = (2, 2)
-        device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-
-        # define a tensor
-        x = torch.rand(16, 32)
-
-        # create sharding spec for the tensor
-        # assume the sharding spec is [S, R]
-        dim_partition_dict = {
-            0: 1
-        }
-        sharding_spec = ShardingSpec(a.dim(), dim_partition_dict)
-
-        # create a distributed tensor
-        d_tensor = DTensor(x, device_mesh, sharding_spec)
-        ```
 
-    Args:
-        tensor (`torch.Tensor`): the unsharded tensor.
-        device_mesh (`DeviceMesh`): the device mesh for abstraction of the compute devices.
-        sharding_spec (`ShardingSpec`): the sharding specification which describes how the tensor will be sharded.
-    """
-
-    def __init__(self, tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec):
-        # ensure this tensor is not a DTensor
-        assert not isinstance(tensor, DTensor), 'The input tensor should not be a DTensor.'
-
-        # store meta info
-        self.local_tensor = tensor
-        self.data_type = tensor.dtype
-        self.global_shape = tensor.shape
-
-        # create distributed layout
-        dist_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=self.global_shape)
+    def __init__(self, local_tensor: torch.Tensor, dist_layout: Layout):
+        self.local_tensor = local_tensor
+        self.data_type = local_tensor.dtype
+        self.entire_shape = local_tensor.shape
         self.dist_layout = dist_layout
-
-        # shard the tensor
         self._apply_layout()
 
     @staticmethod
-    def __new__(cls, tensor, *args, **kwargs):
-        return torch.Tensor._make_subclass(cls, tensor, tensor.requires_grad)
+    def __new__(cls, local_tensor, layout):
+        return torch.Tensor._make_subclass(cls, local_tensor, local_tensor.requires_grad)
 
     def __repr__(self):
-        return f"DTensor(\n{self.to_global()}\n{self.dist_layout}"
+        return f"DTensor({self.to_global()}, {self.dist_layout})"
 
     def __str__(self):
         return self.__repr__()
 
-    def layout_convert(self, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> None:
+    def layout_convert(self, target_layout):
         '''
         Convert the layout of the tensor from source_spec to target_spec.
-        This will update the `local_tensor` and `dist_layout` in place.
-
-        Args:
-            target_layout (Layout): the target layout specification.
         '''
-        target_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=self.global_shape)
-        self.local_tensor = layout_converter.apply(tensor=self.local_tensor,
-                                                   source_layout=self.dist_layout,
-                                                   target_layout=target_layout)
+        self.local_tensor = layout_converter.apply(self.local_tensor, self.dist_layout, target_layout)
         self.dist_layout = target_layout
 
     def _apply_layout(self):
         '''
         Apply the layout to the local tensor during initializing process.
         '''
-        # layout converter requires a source and target laytout
-        # we construct the source layer for an unsharded tensor
-        # and use self.dist_layer as the targer layout for the sharded tensor
         source_spec = construct_default_sharding_spec(self.local_tensor)
         source_layout = Layout(device_mesh=self.dist_layout.device_mesh,
+                               device_type=self.dist_layout.device_type,
                                sharding_spec=source_spec,
-                               global_shape=self.global_shape)
-        self.local_tensor = layout_converter.apply(tensor=self.local_tensor,
-                                                   source_layout=source_layout,
-                                                   target_layout=self.dist_layout)
+                               entire_shape=self.entire_shape)
+        self.local_tensor = layout_converter.apply(self.local_tensor, source_layout, self.dist_layout)
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         if kwargs is None:
             kwargs = {}
 
-        # convert all DTensors to native pytorch tensors
-        # so that operations will be conducted on native tensors
         def filter_arg(arg):
             if isinstance(arg, DTensor):
                 return arg.local_tensor
@@ -124,9 +60,9 @@ def filter_arg(arg):
 
         args = tree_map(filter_arg, args)
         kwargs = tree_map(filter_arg, kwargs)
-
-        # NOTE: if we want to convert the result into DTensor, we need to infer the layout of result from the layout of input tensors
+        # if we want to convert the result into DTensor, we need to infer the layout of result from the layout of input tensors
         # and op type.
+
         return func(*args, **kwargs)
 
     @property
@@ -149,6 +85,7 @@ def to(self, *args, **kwargs):
         '''
         self.local_tensor = self.local_tensor.to(*args, **kwargs)
         self.data_type = self.local_tensor.dtype
+        self.dist_layout.device_type = self.local_tensor.device
         # TODO: update the device mesh process groups or we should just cache
         # both the cpu process groups and the cuda process groups?
         return self
@@ -161,7 +98,7 @@ def to_local(self):
 
     def to_global(self):
         '''
-        Recover the global tensor from the distributed tensor by returning a new `torch.Tensor` object.
+        Recover the global tensor from the distributed tensor.
 
         Note: This function will all_gather the local tensor to the global tensor and it
         will not change the layout of the DTensor. This function is mainly used for debugging or
@@ -170,29 +107,24 @@ def to_global(self):
         return to_global(self.local_tensor, self.dist_layout)
 
 
-def distribute_tensor(tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> DTensor:
+def distribute_tensor(local_tensor: torch.Tensor, dist_layout: Layout) -> DTensor:
     '''
     Distribute the local tensor to the distributed tensor according to the dist_layout specified.
 
     Args:
-        tensor (`torch.Tensor`): tensor to be distributed.
-        device_mesh (`DeviceMesh`): the device mesh for abstraction of the compute devices.
-        sharding_spec (`ShardingSpec`): the sharding specification which describes how the tensor will be sharded.
+        local_tensor: tensor to be distributed.
+        dist_layout: the layout specification of the distributed tensor.
 
     Returns:
         A 'DTensor' object.
     '''
-    return DTensor(tensor, device_mesh, sharding_spec)
+    return DTensor(local_tensor, dist_layout)
 
 
 def distribute_module(module: torch.nn.Module, partition_fn: Optional[callable] = None) -> torch.nn.Module:
     '''
     This function converts all the parameters in the module to DTensor(DParam).
 
-    Args:
-        module (`torch.nn.Module`): the module to be distributed.
-        partition_fn (callable): the partition function which will be used to partition the parameters.
-
     Note: This function is subject to future change as the DParam has not been implemented yet.
     '''
     for name, param in module.named_parameters():
@@ -206,11 +138,5 @@ def distribute_module(module: torch.nn.Module, partition_fn: Optional[callable]
 def construct_default_sharding_spec(tensor: torch.Tensor,) -> ShardingSpec:
     '''
     Construct the default sharding specification for the tensor.
-
-    Args:
-        tensor (`torch.Tensor`): the tensor to be sharded.
-
-    Returns:
-        A `ShardingSpec` object without any sharding specified.
     '''
     return ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={})
diff --git a/colossalai/tensor/d_tensor/layout.py b/colossalai/tensor/d_tensor/layout.py
index 2946611b4b79..ee7ef74a99ae 100644
--- a/colossalai/tensor/d_tensor/layout.py
+++ b/colossalai/tensor/d_tensor/layout.py
@@ -11,32 +11,28 @@
 
 
 class Layout:
-    """
-    Layout of a tensor refers to the tensor placement on the device mesh and how the tensor is sharded over the devices.
+    """Layout of a tensor.
 
-    Args:
-        device_mesh (`DeviceMesh`): the device mesh to store the tensor distributed.
-        sharding_spec (`ShardingSpec`): the sharding specification to describe how the tensor is sharded.
-        global_shape (`torch.Size`): the entire shape of the global tensor.
+    Attributes:
+        device_mesh: the device mesh to store the tensor distributed.
+        device_type: the type of the device mesh, e.g. 'cpu' or 'cuda'.
+        sharding_spec: the sharding specification to describe how the tensor is sharded.
+        entire_shape: the entire shape of the global tensor.
     """
 
-    def __init__(self, device_mesh: DeviceMesh, sharding_spec: ShardingSpec, global_shape: torch.Size):
+    def __init__(self, device_mesh: DeviceMesh, device_type: torch.device, sharding_spec: ShardingSpec,
+                 entire_shape: torch.Size):
         self.device_mesh = device_mesh
+        self.device_type = device_type
         self.sharding_spec = sharding_spec
-        self.global_shape = global_shape
+        self.entire_shape = entire_shape
         self._sanity_check()
 
     def __hash__(self) -> int:
         return hash(f'{self.sharding_spec}')
 
-    def get_sharded_shape_per_device(self) -> torch.Size:
-        """
-        Compute the shape of the sharded tensor on each device.
-
-        Returns:
-            `torch.Size`: the shape of the sharded tensor on each device.
-        """
-        sharded_shape = list(self.global_shape)
+    def get_sharded_shape_per_device(self):
+        sharded_shape = list(self.entire_shape)
         for dim, shard_list in self.sharding_spec.dim_partition_dict.items():
             mesh_list = [self.device_mesh.mesh_shape[mesh_dim] for mesh_dim in shard_list]
             shard_partitions = reduce(operator.mul, mesh_list, 1)
@@ -60,7 +56,7 @@ def _sanity_check(self):
 
         # make sure that the sharding for a dimension is divisible by the number of devices
         for dim, shard_list in sharding_spec.dim_partition_dict.items():
-            tensor_dim_size = self.global_shape[dim]
+            tensor_dim_size = self.entire_shape[dim]
             num_devices = 1
 
             for element in shard_list:
diff --git a/colossalai/tensor/d_tensor/layout_converter.py b/colossalai/tensor/d_tensor/layout_converter.py
index 6eff92ea6b13..cf02aac309f4 100644
--- a/colossalai/tensor/d_tensor/layout_converter.py
+++ b/colossalai/tensor/d_tensor/layout_converter.py
@@ -3,8 +3,10 @@
 from dataclasses import dataclass
 from typing import Dict, List, Tuple
 
+import numpy as np
 import torch
 
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, TrainCycleItem
 from colossalai.context.singleton_meta import SingletonMeta
 from colossalai.tensor.d_tensor.comm_spec import *
 from colossalai.tensor.d_tensor.layout import Layout
@@ -26,21 +28,13 @@ class LayoutConverterOptions:
     pass
 
 
-def to_global(distributed_tensor: "DTensor", layout: Layout) -> torch.Tensor:
-    """
-    Convert a distributed tensor to the global tensor with the given layout.
-    This function returns a native `torch.Tensor` object.
-
-
-    Args:
-        distributed_tensor (`DTensor`): the distributed tensor to be converted.
-        layout (`Layout`): the target layout specification.
-    """
+def to_global(distributed_tensor: torch.Tensor, layout: Layout) -> torch.Tensor:
     layout_converter = LayoutConverter()
     global_sharding_spec = ShardingSpec(distributed_tensor.dim(), {})
     global_layout = Layout(device_mesh=layout.device_mesh,
+                           device_type=layout.device_type,
                            sharding_spec=global_sharding_spec,
-                           global_shape=layout.global_shape)
+                           entire_shape=layout.entire_shape)
     with torch.no_grad():
         global_tensor = layout_converter.apply(distributed_tensor, layout, global_layout)
     return global_tensor
@@ -55,9 +49,6 @@ def set_layout_converting_options(options: LayoutConverterOptions):
 
 
 class LayoutConverter(metaclass=SingletonMeta):
-    """
-    LayoutConverter is a singleton class which converts the layout of a distributed tensor.
-    """
 
     def __init__(self):
         self._options = None
@@ -100,14 +91,15 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            global_shape = (4, 4, 4)
+            entire_shape = (4, 4, 4)
             dim_partition_dict = {0: [0], 1: [1]}
 
             # [S0,S1,R]
             sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
             layout = Layout(device_mesh=device_mesh,
+                            device_type=torch.device('cuda'),
                             sharding_spec=sharding_spec,
-                            global_shape=global_shape)
+                            entire_shape=entire_shape)
 
             rst_dict = layout_converter.all_gather_transform_layouts(layout)
             for layout, comm_spec in rst_dict.items():
@@ -120,12 +112,7 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
         valid_spec_dict = {}
         comm_pattern = CollectiveCommPattern.GATHER_FWD_SPLIT_BWD
         source_spec = source_layout.sharding_spec
-
-        # the key of the dict is the axis
-        # the value is the process group
-        current_rank = source_layout.device_mesh._global_rank_of_current_process
-        process_group_dict = source_layout.device_mesh._process_group_dict[current_rank]
-
+        process_groups_dict = source_layout.device_mesh.process_groups_dict
         for target_pair in source_spec.dim_partition_dict.items():
             shard_list = all_gather_simulator(target_pair)
             index = target_pair[0]
@@ -143,7 +130,7 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
             logical_process_axis = target_pair[1][-1]
             comm_spec = CommSpec(
                 comm_pattern,
-                process_group_dict=process_group_dict,
+                process_groups_dict=process_groups_dict,
                 gather_dim=gather_dim,
             # shard_dim will be used during backward
                 shard_dim=gather_dim,
@@ -154,7 +141,8 @@ def all_gather_transform_layouts(self, source_layout: Layout) -> Dict[Layout, Co
                 new_sharding_spec = ShardingSpec(source_spec.dims, dim_partition_dict=new_dim_partition_dict)
                 new_layout = Layout(device_mesh=source_layout.device_mesh,
                                     sharding_spec=new_sharding_spec,
-                                    global_shape=source_layout.global_shape)
+                                    device_type=source_layout.device_type,
+                                    entire_shape=source_layout.entire_shape)
 
                 valid_spec_dict[new_layout] = comm_spec
             except LayoutException:
@@ -179,14 +167,15 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            global_shape = (4, 4, 4)
+            entire_shape = (4, 4, 4)
             dim_partition_dict = {0: [0], 1: [1]}
 
             # [S0,S1,R]
             sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
             layout = Layout(device_mesh=device_mesh,
+                                    device_type=torch.device('cuda'),
                                     sharding_spec=sharding_spec,
-                                    global_shape=global_shape)
+                                    entire_shape=entire_shape)
             rst_dict = layout_converter.all_to_all_transform_layout(layout)
 
             for layout, comm_spec in rst_dict.items():
@@ -199,12 +188,7 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
         '''
         valid_spec_dict = {}
         comm_pattern = CollectiveCommPattern.ALL2ALL_FWD_ALL2ALL_BWD
-
-        # the key of the dict is the axis
-        # the value is the process group
-        current_rank = source_layout.device_mesh._global_rank_of_current_process
-        process_group_dict = source_layout.device_mesh._process_group_dict[current_rank]
-
+        process_groups_dict = source_layout.device_mesh.process_groups_dict
         source_spec = source_layout.sharding_spec
         tensor_dims = source_spec.dims
         for f_index in range(tensor_dims - 1):
@@ -245,7 +229,7 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
                     shard_dim = f_index
                     logical_process_axis = b_target_pair[1][-1]
                 comm_spec = CommSpec(comm_pattern,
-                                     process_group_dict=process_group_dict,
+                                     process_groups_dict,
                                      gather_dim=gather_dim,
                                      shard_dim=shard_dim,
                                      logical_process_axis=logical_process_axis)
@@ -268,7 +252,8 @@ def all_to_all_transform_layout(self, source_layout: Layout) -> Dict[Layout, Com
                     new_sharding_spec = ShardingSpec(source_spec.dims, dim_partition_dict=new_dim_partition_dict)
                     new_layout = Layout(device_mesh=source_layout.device_mesh,
                                         sharding_spec=new_sharding_spec,
-                                        global_shape=source_layout.global_shape)
+                                        device_type=source_layout.device_type,
+                                        entire_shape=source_layout.entire_shape)
                     valid_spec_dict[new_layout] = comm_spec
                 except LayoutException:
                     pass
@@ -293,15 +278,16 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            global_shape = (4, 4, 4)
+            entire_shape = (4, 4, 4)
 
             dim_partition_dict = {0: [0]}
 
             # [S0,R,R]
             sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
             layout = Layout(device_mesh=device_mesh,
+                          device_type=torch.device('cuda'),
                           sharding_spec=sharding_spec,
-                          global_shape=global_shape)
+                          entire_shape=entire_shape)
             rst_dict = layout_converter.shard_transform_layout(layout)
 
             for layout, comm_spec in rst_dict.items():
@@ -315,11 +301,7 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
         valid_spec_dict = {}
         comm_pattern = CollectiveCommPattern.SPLIT_FWD_GATHER_BWD
         source_spec = source_layout.sharding_spec
-
-        # the key of the dict is the axis
-        # the value is the process group
-        current_rank = source_layout.device_mesh._global_rank_of_current_process
-        process_group_dict = source_layout.device_mesh._process_group_dict[current_rank]
+        process_groups_dict = source_layout.device_mesh.process_groups_dict
 
         # legal sharding dims means the mesh_id is still available to use.
         legal_sharding_dims = [i for i in range(len(source_layout.device_mesh.mesh_shape))]
@@ -347,7 +329,7 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
                 shard_dim = index
                 logical_process_axis = shard_list[-1]
                 comm_spec = CommSpec(comm_pattern,
-                                     process_group_dict=process_group_dict,
+                                     process_groups_dict,
                                      gather_dim=shard_dim,
                                      shard_dim=shard_dim,
                                      logical_process_axis=logical_process_axis)
@@ -358,7 +340,8 @@ def shard_transform_layout(self, source_layout: Layout) -> Dict[Layout, CommSpec
                                                      dim_partition_dict=new_dim_partition_dict)
                     new_layout = Layout(device_mesh=source_layout.device_mesh,
                                         sharding_spec=new_sharding_spec,
-                                        global_shape=source_layout.global_shape)
+                                        device_type=source_layout.device_type,
+                                        entire_shape=source_layout.entire_shape)
                     valid_spec_dict[new_layout] = comm_spec
                 except LayoutException:
                     pass
@@ -416,7 +399,7 @@ def layout_converting(self, source_layout: Layout,
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            global_shape = (4, 4, 4)
+            entire_shape = (4, 4, 4)
 
             dim_partition_source = {1: [0, 1]}
             dim_partition_target = {0: [0, 1]}
@@ -424,14 +407,16 @@ def layout_converting(self, source_layout: Layout,
             # [R,S01,R]
             sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
             source_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_source,
-                                global_shape=global_shape)
+                                entire_shape=entire_shape)
 
             # [S01,R,R]
             sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
             target_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_target,
-                                global_shape=global_shape)
+                                entire_shape=entire_shape)
 
             transform_path, comm_action_sequence = layout_converter.layout_converting(source_layout, target_layout)
             transform_path_str = '->'.join([str(layout.sharding_spec.sharding_sequence) for layout in transform_path])
@@ -520,19 +505,21 @@ def apply(self, tensor: torch.Tensor, source_layout: Layout, target_layout: Layo
             # [[0, 1,
             #  [2, 3]]
             device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-            global_shape = (4, 4, 4)
+            entire_shape = (4, 4, 4)
 
             # [S0,R,R]
             sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
             source_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_source,
-                                global_shape=global_shape)
+                                entire_shape=entire_shape)
 
             # [R,S0,R]
             sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
             target_layout = Layout(device_mesh=device_mesh,
+                                device_type=torch.device('cuda'),
                                 sharding_spec=sharding_spec_target,
-                                global_shape=global_shape)
+                                entire_shape=entire_shape)
 
             if rank in (0, 1):
                 sharded_tensor_0 = torch.zeros(2, 1)
@@ -567,4 +554,3 @@ def apply(self, tensor: torch.Tensor, source_layout: Layout, target_layout: Layo
         for comm_spec in comm_action_sequence:
             tensor = comm_spec.covert_spec_to_action(tensor)
         return tensor
-        return tensor
diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py
index 45b05e10e297..565012b58a03 100644
--- a/colossalai/tensor/d_tensor/sharding_spec.py
+++ b/colossalai/tensor/d_tensor/sharding_spec.py
@@ -116,21 +116,21 @@ def build_difference_2d_dict(self):
 
     def dim_diff(self, other):
         '''
-        The difference between two DimSpec.
+        The difference between two _DimSpec.
 
         Argument:
-            other(DimSpec): the dim spec to compare with.
+            other(_DimSpec): the dim spec to compare with.
 
         Return:
             difference(int): the difference between two _DimSpec.
 
         Example:
-            ```python
-            dim_spec = DimSpec([0])
-            other_dim_spec = DimSpec([0, 1])
+            dim_spec = _DimSpec([0])
+            other_dim_spec = _DimSpec([0, 1])
             print(dim_spec.difference(other_dim_spec))
-            # output: 5
-            ```
+
+        Output:
+            5
         '''
         difference = self.difference_dict[(str(self), str(other))]
         return difference
@@ -142,13 +142,9 @@ class ShardingSpec:
     [R, R, S0, S1], which means
 
     Argument:
-        dim_size (int): The number of dimensions of the tensor to be sharded.
-        dim_partition_dict (Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
-            and the value of the key describe which logical axis will be sharded in that dimension. Defaults to None.
-            E.g. {0: [0, 1]} means the first dimension of the tensor will be sharded in logical axis 0 and 1.
-        sharding_sequence (List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
-            Generally, users should specify either dim_partition_dict or sharding_sequence.
-            If both are given, users must ensure that they are consistent with each other. Defaults to None.
+        dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
+            and the value of the key describe which logical axis will be sharded in that dimension.
+        sharding_sequence(List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
     '''
 
     def __init__(self,
@@ -212,7 +208,6 @@ def spec_diff(self, other):
         pair of sharding sequence.
 
         Example:
-            ```python
             dim_partition_dict = {0: [0, 1]}
             # DistSpec:
             #     shard_sequence: S01,R,R
@@ -224,8 +219,10 @@ def spec_diff(self, other):
             #     device_mesh_shape: (4, 4)
             sharding_spec_to_compare = ShardingSpec(device_mesh, entire_shape, dim_partition_dict_to_compare)
             print(sharding_spec.sharding_sequence_difference(sharding_spec_to_compare))
-            # output: 25
-            ```
+
+        Output:
+            25
+
         Argument:
             other(ShardingSpec): The ShardingSpec to compared with.
 
diff --git a/docs/sidebars.json b/docs/sidebars.json
index c3cfbbeef689..8be40e4512f9 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -64,7 +64,6 @@
         },
         "features/pipeline_parallel",
         "features/nvme_offload",
-        "features/lazy_init",
         "features/cluster_utils"
       ]
     },
diff --git a/docs/source/en/features/lazy_init.md b/docs/source/en/features/lazy_init.md
deleted file mode 100644
index 40f5da1cb84d..000000000000
--- a/docs/source/en/features/lazy_init.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# Lazy initialization
-
-Author: Hongxin Liu
-
-**Prerequisite**
-- [Booster API](../basics/booster_api.md)
-- [Booster Plugins](../basics/booster_plugins.md)
-- [Booster Checkpoint](../basics/booster_checkpoint.md)
-
-**Related discussion**
-- [Lazy initialization of model](https://github.com/hpcaitech/ColossalAI/discussions/3124)
-
-## Introduction
-
-LazyTensor allows DL framework (PyTorch) to execute operations lazily, by storing all operations related to it and reruning them when it's required to be materialized.
-
-LazyInit defers model initialization and it's based on LazyTensor.
-
-This is especially useful when we use model parallelism to train large models, in which case the model cannot fit in GPU memory. Through this, we can initialize model tensors using meta tensor and do static analysis to get shard strategy. And then materialize each tensor and apply the shard strategy. The static analysis can be omitted if the shard strategy is known in advance.
-
-## Usage
-
-You may use lazy initialization when using Gemini, tensor parallelism, pipeline parallelism, and auto-parallelism. In other cases, you may not need to use lazy initialization.
-
-Gemini is compatible with lazy initialization. You can use them together directly.
-
-```python
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin
-from colossalai.lazy import LazyInitContext
-from colossalai.nn.optimizer import HybridAdam
-from torch.nn import Linear
-import colossalai
-
-colossalai.launch_from_torch({})
-
-plugin = GeminiPlugin()
-booster = Booster(plugin=plugin)
-
-with LazyInitContext():
-    model = Linear(10, 10)
-
-optimizer = HybridAdam(model.parameters())
-model, optimizer, *_ = booster.boost(model, optimizer)
-```
-
-Note that using lazy initialization when using Gemini is not necessary but recommended. If you don't use lazy initialization, you may get OOM error when initializing the model. If you use lazy initialization, you can avoid this error.
-
-> ⚠ Lazy initialization support for tensor parallelism, pipeline parallelism, and auto-parallelism is still under development.
-
-### Load from pretrained model
-
-We should not load pretrained weight in `LazyInitContext`. If so, lazy initialization is meaningless, as the checkpoint is loaded and it takes much GPU memory. A recommended way is to initialize model from scratch in `LazyInitContext` and load pretrained weight outside `LazyInitContext` after calling `Booster.boost()`.
-
-<!--- doc-test-ignore-start -->
-```python
-with LazyInitContext():
-    model = GPT2LMHeadModel(config)
-
-optimizer = ...
-lr_scheduler = ...
-dataloader = ...
-model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader)
-
-booster.load_model(model, pretrained_path)
-```
-<!--- doc-test-ignore-end -->
-
-As booster supports both pytorch-fashion checkpoint and huggingface/transformers-fashion pretrained weight, the `pretrained_path` of the above pseudo-code can be either a checkpoint file path or a pretrained weight path. Note that it does not support loading pretrained weights from network. You should download the pretrained weight first and then use a local path.
-
-<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 lazy_init.py  -->
diff --git a/docs/source/zh-Hans/features/lazy_init.md b/docs/source/zh-Hans/features/lazy_init.md
deleted file mode 100644
index 9a3cd90caa8d..000000000000
--- a/docs/source/zh-Hans/features/lazy_init.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# 惰性初始化
-
-作者: Hongxin Liu
-
-**前置教程**
-- [Booster API](../basics/booster_api.md)
-- [Booster 插件](../basics/booster_plugins.md)
-- [Booster Checkpoint](../basics/booster_checkpoint.md)
-
-**相关讨论**
-- [模型的惰性初始化](https://github.com/hpcaitech/ColossalAI/discussions/3124)
-
-## 引言
-
-LazyTensor 允许深度学习框架 (PyTorch) 延迟执行操作，方法是存储与其相关的所有操作并在需要具体化时重新运行它们。
-
-LazyInit 基于 LazyTensor，并支持延迟模型初始化。
-
-这在我们使用模型并行来训练大型模型时特别有用，在这种情况下模型无法容纳在 GPU 内存中。通过这个，我们可以使用 Meta 张量初始化模型张量并进行静态分析以获得分片策略。然后具体化每个张量并应用分片策略。如果事先知道分片策略，则可以省略静态分析。
-
-## 用法
-
-您可以在使用 Gemini、张量并行、流水线并行和自动并行时使用惰性初始化。在其他情况下，您可能不需要使用惰性初始化。
-
-Gemini 与惰性初始化兼容。您可以直接将它们一起使用。
-
-```python
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin
-from colossalai.lazy import LazyInitContext
-from colossalai.nn.optimizer import HybridAdam
-from torch.nn import Linear
-import colossalai
-
-colossalai.launch_from_torch({})
-
-plugin = GeminiPlugin()
-booster = Booster(plugin=plugin)
-
-with LazyInitContext():
-    model = Linear(10, 10)
-
-optimizer = HybridAdam(model.parameters())
-model, optimizer, *_ = booster.boost(model, optimizer)
-```
-
-请注意，在使用 Gemini 时使用惰性初始化不是必需的，但建议使用。如果不使用惰性初始化，在初始化模型时可能会出现 OOM 错误。如果使用惰性初始化，则可以避免此错误。
-
-> ⚠ 对张量并行、流水线并行和自动并行的惰性初始化支持仍在开发中。
-
-### 从预训练模型加载
-
-我们不应该在 `LazyInitContext` 中加载预训练权重。如果这样，惰性初始化就没有意义，因为检查点已加载并且需要大量 GPU 内存。推荐的方法是在 `LazyInitContext` 中初始化模型，并在调用 `Booster.boost()` 后在 `LazyInitContext` 之外加载预训练权重。
-
-<!--- doc-test-ignore-start -->
-```python
-with LazyInitContext():
-    model = GPT2LMHeadModel(config)
-
-optimizer = ...
-lr_scheduler = ...
-dataloader = ...
-model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader)
-
-booster.load_model(model, pretrained_path)
-```
-<!--- doc-test-ignore-end -->
-
-由于 booster 同时支持 pytorch 风格的 checkpoint 和 huggingface/transformers 风格的预训练权重，上述伪代码的 `pretrained_pa​​th` 可以是 checkpoint 文件路径或预训练权重路径。请注意，它不支持从网络加载预训练权重。您应该先下载预训练的权重，然后使用本地路径。
-
-<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 lazy_init.py  -->
diff --git a/tests/test_device/test_device_mesh.py b/tests/test_device/test_device_mesh.py
index 19d41d23353f..3be057b3a98b 100644
--- a/tests/test_device/test_device_mesh.py
+++ b/tests/test_device/test_device_mesh.py
@@ -1,19 +1,20 @@
-import torch
-
 from colossalai.device.device_mesh import DeviceMesh
+import torch
 
 
 def test_device_mesh():
-    physical_mesh_id = torch.arange(0, 16)
+    physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
     mesh_shape = (4, 4)
     # [[0, 1, 2, 3],
     #  [4, 5, 6, 7],
     #  [8, 9, 10,11],
     #  [12,13,14,15]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
-    assert device_mesh.global_rank_to_local_rank(5) == [1, 1]
-    assert device_mesh.global_rank_to_local_rank(11) == [2, 3]
-    assert device_mesh.get_ranks_in_process_group(axis=1, global_rank=2) == [0, 1, 2, 3]
+    assert device_mesh.convert_map[5] == [1, 1]
+    assert device_mesh.convert_map[11] == [2, 3]
+    assert device_mesh.global_rank_to_process_groups_with_logical_rank(0)[0] == [[0, 0], [1, 0], [2, 0], [3, 0]]
+    assert device_mesh.global_rank_to_process_groups_with_logical_rank(2)[1] == [[0, 0], [0, 1], [0, 2], [0, 3]]
+    assert device_mesh.global_rank_to_process_groups_with_global_rank(2)[1] == [0, 1, 2, 3]
 
 
 if __name__ == '__main__':
diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py
index 7c6339eff67e..2b7060c4846a 100644
--- a/tests/test_device/test_init_logical_pg.py
+++ b/tests/test_device/test_init_logical_pg.py
@@ -20,12 +20,16 @@ def check_layer(rank, world_size, port):
     # [[0, 1,
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    for axis in range(len(mesh_shape)):
-        tensor = torch.ones(4).cuda()
-        pg = device_mesh.get_process_group(axis=axis)
-        dist.all_reduce(tensor, op=ReduceOp.SUM, group=pg)
-        assert tensor.equal(tensor_to_check)
+    logical_pg_dict = {0: [[0, 2], [1, 3]], 1: [[0, 1], [2, 3]]}
+    logical_process_groups = device_mesh.process_groups_dict
+
+    for mesh_dim, pgs in logical_pg_dict.items():
+        for index, pg in enumerate(pgs):
+            if rank in pg:
+                tensor = torch.ones(4).cuda()
+                group = logical_process_groups[mesh_dim][index][1]
+                dist.all_reduce(tensor, op=ReduceOp.SUM, group=group)
+                assert tensor.equal(tensor_to_check)
 
     gpc.destroy()
 
diff --git a/tests/test_lazy/lazy_init_utils.py b/tests/test_lazy/lazy_init_utils.py
index 2911012fafa8..85bfd0e27801 100644
--- a/tests/test_lazy/lazy_init_utils.py
+++ b/tests/test_lazy/lazy_init_utils.py
@@ -6,9 +6,7 @@
 import torch
 from packaging import version
 
-from colossalai.device.device_mesh import DeviceMesh
 from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor
-from colossalai.tensor.d_tensor.layout import Layout
 from colossalai.tensor.d_tensor.layout_converter import to_global
 from tests.kit.model_zoo.registry import ModelAttribute
 
@@ -83,8 +81,7 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False,
         print(f'{model.__class__.__name__} pass')
 
 
-def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.Module, device_mesh: DeviceMesh,
-                            sharding_spec_dict: dict) -> None:
+def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.Module, layout_dict: dict) -> None:
     state = model.state_dict()
     distributed_state = distributed_model.state_dict()
 
@@ -94,7 +91,6 @@ def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.
         assert n1 == n2
         t1 = t1.cuda()
         t2 = t2.cuda()
-        if n2 in sharding_spec_dict:
-            layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_dict[n2], global_shape=t1.shape)
-            t2 = to_global(t2, layout)
+        if n2 in layout_dict:
+            t2 = to_global(t2, layout_dict[n2])
         assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}'
diff --git a/tests/test_lazy/test_distribute.py b/tests/test_lazy/test_distribute.py
index efa43eab5788..d515b175a9ea 100644
--- a/tests/test_lazy/test_distribute.py
+++ b/tests/test_lazy/test_distribute.py
@@ -26,19 +26,23 @@ def find_shard_dim(shape: torch.Size) -> Optional[int]:
             return dim
 
 
-def make_sharding_spec(original_tensor: torch.Tensor) -> Layout:
+def make_layout(device_mesh: DeviceMesh, original_tensor: torch.Tensor) -> Layout:
     shard_dim = find_shard_dim(original_tensor.shape)
     dim_partition_dict = {shard_dim: [0]} if shard_dim is not None else {}
     target_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict=dim_partition_dict)
-    return target_sharding_spec
+    layout = Layout(device_mesh=device_mesh,
+                    device_type=torch.device('cuda'),
+                    sharding_spec=target_sharding_spec,
+                    entire_shape=original_tensor.shape)
+    return layout
 
 
 def _get_current_name(prefix: str, name: str) -> str:
     return f'{prefix}.{name}'.lstrip('.')
 
 
-def generate_sharding_spec_dict(model: nn.Module) -> dict:
-    sharding_spec_dict = {}
+def generate_layout_dict(model: nn.Module, device_mesh: DeviceMesh) -> dict:
+    layout_dict = {}
 
     @torch.no_grad()
     def generate_recursively(module: nn.Module, prefix: str = ''):
@@ -49,17 +53,17 @@ def generate_recursively(module: nn.Module, prefix: str = ''):
         # initialize tensors directly attached to the current module
         for name, param in module.named_parameters(recurse=False):
             if isinstance(param, LazyTensor):
-                sharding_spec = make_sharding_spec(param)
-                sharding_spec_dict[_get_current_name(prefix, name)] = sharding_spec
+                layout = make_layout(device_mesh, param)
+                layout_dict[_get_current_name(prefix, name)] = layout
 
         for name, buf in module.named_buffers(recurse=False):
             if isinstance(buf, LazyTensor):
-                sharding_spec = make_sharding_spec(buf)
-                sharding_spec_dict[_get_current_name(prefix, name)] = sharding_spec
+                layout = make_layout(device_mesh, buf)
+                layout_dict[_get_current_name(prefix, name)] = layout
 
     generate_recursively(model)
 
-    return sharding_spec_dict
+    return layout_dict
 
 
 @parameterize('subset', ['torchvision', 'diffusers', 'timm', 'transformers', 'torchaudio', 'deepfm', 'dlrm'])
@@ -81,9 +85,9 @@ def run_dist_lazy_init(subset, seed: int = 42):
         ctx = LazyInitContext()
         with ctx:
             deferred_model = model_fn()
-        sharding_spec_dict = generate_sharding_spec_dict(deferred_model)
-        ctx.distribute(deferred_model, device_mesh, sharding_spec_dict, verbose=True)
-        assert_dist_model_equal(model, deferred_model, device_mesh, sharding_spec_dict)
+        layout_dict = generate_layout_dict(deferred_model, device_mesh)
+        ctx.distribute(deferred_model, layout_dict, verbose=True)
+        assert_dist_model_equal(model, deferred_model, layout_dict)
 
 
 def run_dist(rank, world_size, port) -> None:
diff --git a/tests/test_tensor/test_dtensor/test_comm_spec.py b/tests/test_tensor/test_dtensor/test_comm_spec.py
index 0797e01e7e9d..d1f5b9299397 100644
--- a/tests/test_tensor/test_dtensor/test_comm_spec.py
+++ b/tests/test_tensor/test_dtensor/test_comm_spec.py
@@ -125,6 +125,23 @@ def check_all_reduce_bwd(process_groups_dict, rank):
     assert tensor_to_comm.equal(tensor_to_check)
 
 
+def check_all_reduce_in_flatten_device_mesh(process_groups_dict, rank):
+    # tensor to comm
+    tensor_to_comm = torch.ones(2, 2).cuda() * rank
+
+    # reduce through logical process axis 0 at flatten device mesh
+    # tensor to check
+    # tensor([[6., 6.],
+    #         [6., 6.]])
+    tensor_to_check = torch.tensor([[6, 6], [6, 6]], dtype=tensor_to_comm.dtype).cuda()
+
+    # CommSpec:(comm_pattern:all_reduce, logical_process_axis:0)
+    comm_spec = CommSpec(CollectiveCommPattern.ALLREDUCE_FWD_IDENTITY_BWD, process_groups_dict, logical_process_axis=0)
+    tensor_to_comm = comm_spec.covert_spec_to_action(tensor_to_comm)
+
+    assert tensor_to_comm.equal(tensor_to_check)
+
+
 def check_comm(rank, world_size, port):
     disable_existing_loggers()
     launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
@@ -136,22 +153,24 @@ def check_comm(rank, world_size, port):
     # [[0, 1,
     #  [2, 3]]
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    process_group_dict = device_mesh._process_group_dict[rank]
+    process_groups_dict = device_mesh.process_groups_dict
 
     # test all gather
-    check_all_gather(process_group_dict, rank)
+    check_all_gather(process_groups_dict, rank)
 
     # test shard
-    check_shard(process_group_dict, rank)
+    check_shard(process_groups_dict, rank)
 
     # test all to all
-    check_all_to_all(process_group_dict, rank)
+    check_all_to_all(process_groups_dict, rank)
 
     # test all reduce
-    check_all_reduce_fwd(process_group_dict, rank)
-    check_all_reduce_bwd(process_group_dict, rank)
+    check_all_reduce_fwd(process_groups_dict, rank)
+    check_all_reduce_bwd(process_groups_dict, rank)
 
+    flatten_process_groups_dict = device_mesh.flatten_device_mesh.process_groups_dict
+    # test all reduce in 1D flatten device mesh
+    check_all_reduce_in_flatten_device_mesh(flatten_process_groups_dict, rank)
     gpc.destroy()
 
 
diff --git a/tests/test_tensor/test_dtensor/test_dtensor.py b/tests/test_tensor/test_dtensor/test_dtensor.py
index 50a3bfb15c38..3ca369acbf87 100644
--- a/tests/test_tensor/test_dtensor/test_dtensor.py
+++ b/tests/test_tensor/test_dtensor/test_dtensor.py
@@ -31,9 +31,13 @@ def check_dtensor(rank, world_size, port):
 
     device_mesh = DeviceMesh(torch.Tensor([0, 1, 2, 3]), (2, 2), init_process_group=True)
     target_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict={0: [0]})
-    d_tensor = DTensor(original_tensor, device_mesh, target_sharding_spec)
+    layout = Layout(device_mesh=device_mesh,
+                    device_type=torch.device('cuda'),
+                    sharding_spec=target_sharding_spec,
+                    entire_shape=original_tensor.shape)
+    d_tensor = DTensor(original_tensor, layout)
 
-    assert d_tensor.global_shape == original_tensor.shape
+    assert d_tensor.entire_shape == original_tensor.shape
     assert d_tensor.data_type == original_tensor.dtype
 
     if rank in (0, 1):
@@ -53,7 +57,12 @@ def check_dtensor(rank, world_size, port):
         raise ValueError(f'rank {rank} is not in the device mesh')
 
     new_sharding_spec = ShardingSpec(dim_size=original_tensor.dim(), dim_partition_dict={0: [0, 1]})
-    d_tensor.layout_convert(device_mesh, new_sharding_spec)
+    new_layout = Layout(device_mesh=device_mesh,
+                        device_type=torch.device('cuda'),
+                        sharding_spec=new_sharding_spec,
+                        entire_shape=original_tensor.shape)
+
+    d_tensor.layout_convert(new_layout)
 
     if rank == 0:
         assert d_tensor.local_tensor.equal(original_tensor.narrow(0, 0, 1))
@@ -66,7 +75,7 @@ def check_dtensor(rank, world_size, port):
     else:
         raise ValueError(f'rank {rank} is not in the device mesh')
 
-    dtensor_from_local = distribute_tensor(original_tensor, device_mesh, new_sharding_spec)
+    dtensor_from_local = distribute_tensor(original_tensor, new_layout)
 
     if rank == 0:
         assert dtensor_from_local.local_tensor.equal(original_tensor.narrow(0, 0, 1))
diff --git a/tests/test_tensor/test_dtensor/test_layout_converter.py b/tests/test_tensor/test_dtensor/test_layout_converter.py
index 6608e4787273..5f56decb5e5d 100644
--- a/tests/test_tensor/test_dtensor/test_layout_converter.py
+++ b/tests/test_tensor/test_dtensor/test_layout_converter.py
@@ -12,9 +12,9 @@
 from colossalai.tensor.d_tensor.sharding_spec import DimSpec, ShardingSpec
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
-global_shape = torch.Size((64, 32, 16))
+entire_shape = torch.Size((64, 32, 16))
 layout_converter = LayoutConverter()
-physical_mesh_id = torch.arange(0, 4)
+physical_mesh_id = torch.arange(0, 4).reshape(2, 2)
 mesh_shape = (2, 2)
 
 
@@ -30,7 +30,10 @@ def check_one_step_transform(rank, world_size, port):
     #     shard_sequence: S0,S1,R
     #     device_mesh_shape: (2, 2)
     sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
-    layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=global_shape)
+    layout = Layout(device_mesh=device_mesh,
+                    device_type=torch.device('cuda'),
+                    sharding_spec=sharding_spec,
+                    entire_shape=entire_shape)
 
     rst_dict = layout_converter.all_gather_transform_layouts(layout)
 
@@ -46,7 +49,10 @@ def check_one_step_transform(rank, world_size, port):
     #     shard_sequence: S0,S1,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_all2all = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict_all2all)
-    layout_all2all = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_all2all, global_shape=global_shape)
+    layout_all2all = Layout(device_mesh=device_mesh,
+                            device_type=torch.device('cuda'),
+                            sharding_spec=sharding_spec_all2all,
+                            entire_shape=entire_shape)
 
     rst_dict_all2all = layout_converter.all_to_all_transform_layout(layout_all2all)
 
@@ -65,7 +71,10 @@ def check_one_step_transform(rank, world_size, port):
     #     shard_sequence: S0,R,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_shard = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_shard)
-    shard_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_shard, global_shape=global_shape)
+    shard_layout = Layout(device_mesh=device_mesh,
+                          device_type=torch.device('cuda'),
+                          sharding_spec=sharding_spec_shard,
+                          entire_shape=entire_shape)
 
     rst_dict_shard = layout_converter.shard_transform_layout(shard_layout)
 
@@ -91,13 +100,19 @@ def check_layout_converting(rank, world_size, port):
     #     shard_sequence: R,S01,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
-    source_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_source, global_shape=global_shape)
+    source_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_source,
+                           entire_shape=entire_shape)
 
     # DistSpec:
     #     shard_sequence: S01,R,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
-    target_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_target, global_shape=global_shape)
+    target_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_target,
+                           entire_shape=entire_shape)
 
     transform_path, comm_action_sequence = layout_converter.layout_converting(source_layout, target_layout)
 
@@ -144,15 +159,21 @@ def check_layout_converting_apply(rank, world_size, port):
     #     shard_sequence: R,S01,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_source = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_source)
-    source_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_source, global_shape=global_shape)
+    source_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_source,
+                           entire_shape=entire_shape)
 
     # DistSpec:
     #     shard_sequence: S01,R,R
     #     device_mesh_shape: (4, 4)
     sharding_spec_target = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_target)
-    target_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_target, global_shape=global_shape)
+    target_layout = Layout(device_mesh=device_mesh,
+                           device_type=torch.device('cuda'),
+                           sharding_spec=sharding_spec_target,
+                           entire_shape=entire_shape)
 
-    original_tensor = torch.rand(global_shape).cuda()
+    original_tensor = torch.rand(entire_shape).cuda()
 
     # tensor_to_apply: [R, S01, R]
     tensor_to_apply = original_tensor.narrow(1, rank * 8, 8)
diff --git a/tests/test_tensor/test_shape_consistency.py b/tests/test_tensor/test_shape_consistency.py
index 859eef051256..6fe9ee292cd0 100644
--- a/tests/test_tensor/test_shape_consistency.py
+++ b/tests/test_tensor/test_shape_consistency.py
@@ -1,10 +1,9 @@
+from colossalai.tensor.shape_consistency import ShapeConsistencyManager, CollectiveCommPattern
 import torch
-
+from colossalai.tensor.sharding_spec import _DimSpec, ShardingSpec
 from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.shape_consistency import CollectiveCommPattern, ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
 
-physical_mesh_id = torch.arange(0, 16)
+physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
 mesh_shape = (4, 4)
 # [[0, 1, 2, 3],
 #  [4, 5, 6, 7],
diff --git a/tests/test_tensor/test_sharded_linear.py b/tests/test_tensor/test_sharded_linear.py
index 9bd9805e9b8f..d66d4fec14d1 100644
--- a/tests/test_tensor/test_sharded_linear.py
+++ b/tests/test_tensor/test_sharded_linear.py
@@ -26,7 +26,7 @@ def run_dist(rank, world_size, port):
     # the mesh is in the following topo
     # [[0, 1],
     #  [2, 3]]
-    physical_mesh_id = torch.arange(0, 4)
+    physical_mesh_id = torch.arange(0, 4).reshape(2, 2)
     mesh_shape = (2, 2)
     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
     row_id = rank // 2
diff --git a/tests/test_tensor/test_sharding_spec.py b/tests/test_tensor/test_sharding_spec.py
index 5007c4141849..909c84ef0f0e 100644
--- a/tests/test_tensor/test_sharding_spec.py
+++ b/tests/test_tensor/test_sharding_spec.py
@@ -5,7 +5,7 @@
 
 
 def test_sharding_spec():
-    physical_mesh_id = torch.arange(0, 16)
+    physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
     mesh_shape = (4, 4)
     # [[0, 1, 2, 3],
     #  [4, 5, 6, 7],

From e61ffc77c61df999b900ac1961d0329f0b544924 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Fri, 9 Jun 2023 09:49:41 +0800
Subject: [PATCH 48/52] fix typo tests/ (#3936)

---
 tests/kit/model_zoo/registry.py                           | 4 ++--
 .../test_node_handler/test_batch_norm_handler.py          | 2 +-
 .../test_node_handler/test_output_handler.py              | 8 ++++----
 tests/test_tensor/test_dtensor/test_layout_converter.py   | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index 7470327a65b6..6cc4c8ef370d 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -2,7 +2,7 @@
 from dataclasses import dataclass
 from typing import Callable
 
-__all__ = ['ModelZooRegistry', 'ModelAttributem', 'model_zoo']
+__all__ = ['ModelZooRegistry', 'ModelAttribute', 'model_zoo']
 
 
 @dataclass
@@ -37,7 +37,7 @@ def register(self,
         >>> model_zoo = ModelZooRegistry()
         >>> model_zoo.register('resnet18', resnet18, resnet18_data_gen)
         >>> # Run the model
-        >>> data = resnresnet18_data_gen() # do not input any argument
+        >>> data = resnet18_data_gen() # do not input any argument
         >>> model = resnet18() # do not input any argument
         >>> out = model(**data)
 
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py
index b47b3508ad1b..c3ceef4c7adf 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py
@@ -27,7 +27,7 @@ def check_bn_module_handler(rank, world_size, port):
     # the index of bn node in computation graph
     node_index = 1
     # the total number of bn strategies without sync bn mode
-    # TODO: add sync bn stategies after related passes ready
+    # TODO: add sync bn strategies after related passes ready
     strategy_number = 4
     numerical_test_for_node_strategy(model=model,
                                      device_mesh=device_mesh,
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py
index 5259455d2179..1703d5ded2f2 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py
@@ -43,14 +43,14 @@ def test_output_handler(output_option):
     output_strategies_vector = StrategiesVector(output_node)
 
     # build handler
-    otuput_handler = OutputHandler(node=output_node,
+    output_handler = OutputHandler(node=output_node,
                                    device_mesh=device_mesh,
                                    strategies_vector=output_strategies_vector,
                                    output_option=output_option)
 
-    otuput_handler.register_strategy(compute_resharding_cost=False)
+    output_handler.register_strategy(compute_resharding_cost=False)
     # check operation data mapping
-    mapping = otuput_handler.get_operation_data_mapping()
+    mapping = output_handler.get_operation_data_mapping()
 
     for name, op_data in mapping.items():
         op_data: OperationData
@@ -59,7 +59,7 @@ def test_output_handler(output_option):
 
     assert mapping['output'].name == "output"
     assert mapping['output'].type == OperationDataType.OUTPUT
-    strategy_name_list = [val.name for val in otuput_handler.strategies_vector]
+    strategy_name_list = [val.name for val in output_handler.strategies_vector]
     if output_option == 'distributed':
         assert "Distributed Output" in strategy_name_list
     else:
diff --git a/tests/test_tensor/test_dtensor/test_layout_converter.py b/tests/test_tensor/test_dtensor/test_layout_converter.py
index 5f56decb5e5d..5c3da5f2b9ff 100644
--- a/tests/test_tensor/test_dtensor/test_layout_converter.py
+++ b/tests/test_tensor/test_dtensor/test_layout_converter.py
@@ -137,7 +137,7 @@ def check_layout_converting(rank, world_size, port):
     assert comm_action_sequence[2].shard_dim == 0
     assert comm_action_sequence[2].logical_process_axis == 1
 
-    # checkout chached_spec_pairs_transform_path
+    # checkout cached_spec_pairs_transform_path
     assert layout_converter.cached_solution[('[R, S01, R]', '[S01, R, R]')][0] == transform_path
     assert layout_converter.cached_solution[('[R, S01, R]', '[S01, R, R]')][1] == comm_action_sequence
 

From 1aadeedeea4e5f2ee6304e10abe38ceb7beda33f Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Fri, 9 Jun 2023 10:30:50 +0800
Subject: [PATCH 49/52] fix typo .github/workflows/scripts/ (#3946)

---
 .../generate_leaderboard_and_send_to_lark.py       | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
index 16b8957c1d88..d8f6c8fe309e 100644
--- a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
+++ b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
@@ -38,7 +38,7 @@ def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title:
 
 def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, int]:
     """
-    Retrive the issue/PR comments made by our members in the last 7 days.
+    Retrieve the issue/PR comments made by our members in the last 7 days.
 
     Args:
         github_token (str): GitHub access token for API calls
@@ -89,7 +89,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
 
 def get_discussion_comments(github_token, since) -> Dict[str, int]:
     """
-    Retrive the discussion comments made by our members in the last 7 days.
+    Retrieve the discussion comments made by our members in the last 7 days.
     This is only available via the GitHub GraphQL API.
 
     Args:
@@ -194,7 +194,7 @@ def _call_graphql_api(query):
 
                 discussion_updated_at = datetime.strptime(discussion['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
                 # check if the updatedAt is within the last 7 days
-                # if yes, add it to dicussion_numbers
+                # if yes, add it to discussion_numbers
                 if discussion_updated_at > since:
                     if discussion['authorAssociation'] != 'MEMBER':
                         discussion_numbers.append(discussion['number'])
@@ -207,14 +207,14 @@ def _call_graphql_api(query):
             # update cursor
             cursor = edges[-1]['cursor']
 
-    # get the dicussion comments and replies made by our member
+    # get the discussion comments and replies made by our member
     user_engagement_count = {}
-    for dicussion_number in discussion_numbers:
+    for discussion_number in discussion_numbers:
         cursor = None
         num_per_request = 10
 
         while True:
-            query = _generate_comment_reply_count_for_discussion(dicussion_number, num_per_request, cursor)
+            query = _generate_comment_reply_count_for_discussion(discussion_number, num_per_request, cursor)
             data = _call_graphql_api(query)
 
             # get the comments
@@ -249,7 +249,7 @@ def _call_graphql_api(query):
                             reply = reply_edge['node']
                             if reply['authorAssociation'] == 'MEMBER':
                                 # check if the updatedAt is within the last 7 days
-                                # if yes, add it to dicussion_numbers
+                                # if yes, add it to discussion_numbers
                                 reply_updated_at = datetime.strptime(reply['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
                                 if reply_updated_at > since:
                                     member_name = reply['author']['login']

From b3ab7fbabf3b72805403b82eeb79a6155d72004f Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <eddiezhang@pku.edu.cn>
Date: Mon, 12 Jun 2023 15:02:27 +0800
Subject: [PATCH 50/52] [example] update ViT example using booster api (#3940)

---
 examples/images/vit/README.md                |  65 ++-----
 examples/images/vit/args.py                  | 124 +++++++++++++
 examples/images/vit/configs/vit_1d_tp2.py    |  32 ----
 examples/images/vit/configs/vit_1d_tp2_ci.py |  32 ----
 examples/images/vit/data.py                  |  32 ++++
 examples/images/vit/requirements.txt         |   6 +-
 examples/images/vit/run.sh                   |  15 --
 examples/images/vit/run_benchmark.sh         |  27 +++
 examples/images/vit/run_demo.sh              |  44 +++++
 examples/images/vit/test_ci.sh               |  24 ++-
 examples/images/vit/test_vit.py              | 160 -----------------
 examples/images/vit/train.py                 | 174 ------------------
 examples/images/vit/vit.py                   |  95 ----------
 examples/images/vit/vit_benchmark.py         | 129 ++++++++++++++
 examples/images/vit/vit_train_demo.py        | 177 +++++++++++++++++++
 examples/language/opt/opt_benchmark.py       |  19 +-
 examples/language/opt/opt_train_demo.py      |  15 +-
 17 files changed, 577 insertions(+), 593 deletions(-)
 create mode 100644 examples/images/vit/args.py
 delete mode 100644 examples/images/vit/configs/vit_1d_tp2.py
 delete mode 100644 examples/images/vit/configs/vit_1d_tp2_ci.py
 create mode 100644 examples/images/vit/data.py
 delete mode 100644 examples/images/vit/run.sh
 create mode 100644 examples/images/vit/run_benchmark.sh
 create mode 100644 examples/images/vit/run_demo.sh
 delete mode 100644 examples/images/vit/test_vit.py
 delete mode 100644 examples/images/vit/train.py
 delete mode 100644 examples/images/vit/vit.py
 create mode 100644 examples/images/vit/vit_benchmark.py
 create mode 100644 examples/images/vit/vit_train_demo.py

diff --git a/examples/images/vit/README.md b/examples/images/vit/README.md
index 4423d85d19e0..7c4147b76457 100644
--- a/examples/images/vit/README.md
+++ b/examples/images/vit/README.md
@@ -1,61 +1,28 @@
-# Vision Transformer with ColoTensor
+## Overview
 
-# Overview
+Vision Transformer is a class of Transformer models tailored for computer vision tasks. It was first proposed in the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) and achieved SOTA results on various tasks at that time.
 
-In this example, we will run Vision Transformer with ColoTensor.
+In our example, we are using pretrained weights of ViT loaded from HuggingFace.
+We adapt the ViT training code to ColossalAI by leveraging the [Booster API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin, LowLevelZeroPlugin, and GeminiPlugin.
 
-We use model **ViTForImageClassification** from Hugging Face [Link](https://huggingface.co/docs/transformers/model_doc/vit) for unit test.
-You can change world size or decide whether use DDP in our code.
+## Run Demo
 
-We use model **vision_transformer** from timm [Link](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) for training example.
-
-(2022/6/28) The default configuration now supports 2DP+2TP with gradient accumulation and checkpoint support. Zero is not supported at present.
-
-# Requirement
-
-Install colossalai version >= 0.1.11
-
-## Unit test
-To run unit test, you should install pytest, transformers with:
-```shell
-pip install pytest transformers
+By running the following script:
+```bash
+bash run_demo.sh
 ```
+You will finetune a [ViT-base](https://huggingface.co/google/vit-base-patch16-224) model on this [dataset](https://huggingface.co/datasets/beans), with more than 8000 images of bean leaves. This dataset is for an image classification task and there are 3 labels: ['angular_leaf_spot', 'bean_rust', 'healthy'].
 
-## Training example
-To run training example with ViT-S, you should install **NVIDIA DALI** from [Link](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html) for dataloader support.
-You also need to install timm and titans for model/dataloader support with:
-```shell
-pip install timm titans
-```
+The script can be modified if you want to try another set of hyperparameters or change to another ViT model with a different size.
 
-### Data preparation
-You can download the ImageNet dataset from the [ImageNet official website](https://www.image-net.org/download.php). You should get the raw images after downloading the dataset. As we use **NVIDIA DALI** to read data, we use the TFRecords dataset instead of raw Imagenet dataset. This offers better speedup to IO. If you don't have TFRecords dataset, follow [imagenet-tools](https://github.com/ver217/imagenet-tools) to build one.
+The demo code refers to this [blog](https://huggingface.co/blog/fine-tune-vit).
 
-Before you start training, you need to set the environment variable `DATA` so that the script knows where to fetch the data for DALI dataloader.
-```shell
-export DATA=/path/to/ILSVRC2012
-```
 
 
-# How to run
+## Run Benchmark
 
-## Unit test
-In your terminal
-```shell
-pytest test_vit.py
+You can run a benchmark for the ViT model by running the following script:
+```bash
+bash run_benchmark.sh
 ```
-
-This will evaluate models with different **world_size** and **use_ddp**.
-
-## Training example
-Modify the settings in run.sh according to your environment.
-For example, if you set `--nproc_per_node=8` in `run.sh` and `TP_WORLD_SIZE=2` in your config file,
-data parallel size will be automatically calculated as 4.
-Thus, the parallel strategy is set to 4DP+2TP.
-
-Then in your terminal
-```shell
-sh run.sh
-```
-
-This will start ViT-S training with ImageNet.
+The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your own set of hyperparameters for testing.
\ No newline at end of file
diff --git a/examples/images/vit/args.py b/examples/images/vit/args.py
new file mode 100644
index 000000000000..e4a873a9eb52
--- /dev/null
+++ b/examples/images/vit/args.py
@@ -0,0 +1,124 @@
+from colossalai import get_default_parser
+
+def parse_demo_args():
+
+    parser = get_default_parser()
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="google/vit-base-patch16-224",
+        help="Path to pretrained model or model identifier from huggingface.co/models."
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        default="./output_model.bin",
+        help="The path of your saved model after finetuning."
+    )
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'."
+    )
+    parser.add_argument(
+        "--num_epoch",
+        type=int,
+        default=3,
+        help="Number of epochs."
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=32,
+        help="Batch size (per dp group) for the training dataloader."
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=3e-4,
+        help="Initial learning rate (after the potential warmup period) to use."
+    )
+    parser.add_argument(
+        "--warmup_ratio",
+        type=float,
+        default=0.3,
+        help="Ratio of warmup steps against total training steps."
+    )
+    parser.add_argument(
+        "--weight_decay", 
+        type=float, 
+        default=0.1, 
+        help="Weight decay to use."
+    )
+    parser.add_argument(
+        "--seed", 
+        type=int, 
+        default=42, 
+        help="A seed for reproducible training."
+    )
+
+    args = parser.parse_args()
+    return args
+
+def parse_benchmark_args():
+
+    parser = get_default_parser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        default="google/vit-base-patch16-224",
+        help="Path to a pretrained model or model identifier from huggingface.co/models."
+    )
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'."
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per dp group) for the training dataloader."
+    )
+    parser.add_argument(
+        "--num_labels",
+        type=int,
+        default=10,
+        help="Number of labels for classification."
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use."
+    )
+    parser.add_argument(
+        "--weight_decay", 
+        type=float, 
+        default=0.0, 
+        help="Weight decay to use."
+    )
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=20,
+        help="Total number of training steps to perform."
+    )
+    parser.add_argument(
+        "--seed", 
+        type=int, 
+        default=42, 
+        help="A seed for reproducible training."
+    )
+    parser.add_argument(
+        "--mem_cap", 
+        type=int, 
+        default=0, 
+        help="Limit on the usage of space for each GPU (in GB)."
+    )
+    args = parser.parse_args()
+
+    return args
\ No newline at end of file
diff --git a/examples/images/vit/configs/vit_1d_tp2.py b/examples/images/vit/configs/vit_1d_tp2.py
deleted file mode 100644
index fbf399f2e50d..000000000000
--- a/examples/images/vit/configs/vit_1d_tp2.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from colossalai.amp import AMP_TYPE
-
-# hyperparameters
-# BATCH_SIZE is as per GPU
-# global batch size = BATCH_SIZE x data parallel size
-BATCH_SIZE = 256
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 300
-WARMUP_EPOCHS = 32
-
-# model config
-IMG_SIZE = 224
-PATCH_SIZE = 16
-HIDDEN_SIZE = 384
-DEPTH = 12
-NUM_HEADS = 6
-MLP_RATIO = 4
-NUM_CLASSES = 1000
-CHECKPOINT = False
-SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
-
-USE_DDP = True
-TP_WORLD_SIZE = 2
-TP_TYPE = 'row'
-parallel = dict(tensor=dict(mode="1d", size=TP_WORLD_SIZE),)
-
-fp16 = dict(mode=AMP_TYPE.NAIVE)
-clip_grad_norm = 1.0
-gradient_accumulation = 8
-
-LOG_PATH = "./log"
diff --git a/examples/images/vit/configs/vit_1d_tp2_ci.py b/examples/images/vit/configs/vit_1d_tp2_ci.py
deleted file mode 100644
index e491e4ada45e..000000000000
--- a/examples/images/vit/configs/vit_1d_tp2_ci.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from colossalai.amp import AMP_TYPE
-
-# hyperparameters
-# BATCH_SIZE is as per GPU
-# global batch size = BATCH_SIZE x data parallel size
-BATCH_SIZE = 8
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 3
-WARMUP_EPOCHS = 1
-
-# model config
-IMG_SIZE = 224
-PATCH_SIZE = 16
-HIDDEN_SIZE = 32
-DEPTH = 2
-NUM_HEADS = 4
-MLP_RATIO = 4
-NUM_CLASSES = 10
-CHECKPOINT = False
-SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
-
-USE_DDP = True
-TP_WORLD_SIZE = 2
-TP_TYPE = 'row'
-parallel = dict(tensor=dict(mode="1d", size=TP_WORLD_SIZE),)
-
-fp16 = dict(mode=AMP_TYPE.NAIVE)
-clip_grad_norm = 1.0
-gradient_accumulation = 2
-
-LOG_PATH = "./log_ci"
diff --git a/examples/images/vit/data.py b/examples/images/vit/data.py
new file mode 100644
index 000000000000..00fde707b173
--- /dev/null
+++ b/examples/images/vit/data.py
@@ -0,0 +1,32 @@
+import torch
+from torch.utils.data import Dataset
+from datasets import load_dataset
+
+class BeansDataset(Dataset):
+    
+    def __init__(self, image_processor, split='train'):
+
+        super().__init__()
+        self.image_processor = image_processor
+        self.ds = load_dataset('beans')[split]
+        self.label_names = self.ds.features['labels'].names
+        self.num_labels = len(self.label_names)
+        self.inputs = []
+        for example in self.ds:
+            self.inputs.append(self.process_example(example))
+    
+    def __len__(self):
+        return len(self.inputs)
+
+    def __getitem__(self, idx):
+        return self.inputs[idx]
+    
+    def process_example(self, example):
+        input = self.image_processor(example['image'], return_tensors='pt')
+        input['labels'] = example['labels']
+        return input
+    
+
+def beans_collator(batch):
+    return {'pixel_values': torch.cat([data['pixel_values'] for data in batch], dim=0),
+            'labels': torch.tensor([data['labels'] for data in batch], dtype=torch.int64)}
diff --git a/examples/images/vit/requirements.txt b/examples/images/vit/requirements.txt
index 1f69794ebe70..edad87ca380f 100644
--- a/examples/images/vit/requirements.txt
+++ b/examples/images/vit/requirements.txt
@@ -1,8 +1,6 @@
 colossalai >= 0.1.12
 torch >= 1.8.1
 numpy>=1.24.1
-timm>=0.6.12
-titans>=0.0.7
 tqdm>=4.61.2
-transformers>=4.25.1
-nvidia-dali-cuda110>=1.8.0 --extra-index-url https://developer.download.nvidia.com/compute/redist
+transformers>=4.20.0
+datasets
\ No newline at end of file
diff --git a/examples/images/vit/run.sh b/examples/images/vit/run.sh
deleted file mode 100644
index 84fe58f11a6a..000000000000
--- a/examples/images/vit/run.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-export DATA=/data/scratch/imagenet/tf_records
-export OMP_NUM_THREADS=4
-
-# resume
-# CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \
-# --nproc_per_node 4 train.py \
-# --config configs/vit_1d_tp2.py \
-# --resume_from checkpoint/epoch_10 \
-# --master_port 29598 | tee ./out 2>&1
-
-# train
-CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \
---nproc_per_node 4 train.py \
---config configs/vit_1d_tp2.py \
---master_port 29598 | tee ./out 2>&1
diff --git a/examples/images/vit/run_benchmark.sh b/examples/images/vit/run_benchmark.sh
new file mode 100644
index 000000000000..2487bf81ee2b
--- /dev/null
+++ b/examples/images/vit/run_benchmark.sh
@@ -0,0 +1,27 @@
+set -xe
+pip install -r requirements.txt
+
+export BS=8
+export MEMCAP=0
+export GPUNUM=1
+
+for BS in 8 32 128
+do
+for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
+do
+for GPUNUM in 1 4
+do
+
+MODEL_PATH="google/vit-base-patch16-224"
+torchrun \
+  --standalone \
+  --nproc_per_node ${GPUNUM} \
+  vit_benchmark.py \
+  --model_name_or_path ${MODEL_PATH} \
+  --mem_cap ${MEMCAP} \
+  --plugin ${PLUGIN} \
+  --batch_size ${BS}
+  
+done
+done
+done
diff --git a/examples/images/vit/run_demo.sh b/examples/images/vit/run_demo.sh
new file mode 100644
index 000000000000..2d140dd6e423
--- /dev/null
+++ b/examples/images/vit/run_demo.sh
@@ -0,0 +1,44 @@
+set -xe
+pip install -r requirements.txt
+
+# model name or path
+MODEL="google/vit-base-patch16-224"
+
+# path for saving model
+OUTPUT_PATH="./output_model.bin"
+
+# plugin(training strategy)
+# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"
+PLUGIN="gemini"
+
+# number of gpus to use
+GPUNUM=4
+
+# batch size per gpu
+BS=16
+
+# learning rate
+LR="2e-4"
+
+# number of epochs
+EPOCH=3
+
+# weight decay
+WEIGHT_DECAY=0.05
+
+# ratio of warmup steps
+WARMUP_RATIO=0.3
+
+# run the script for demo
+torchrun \
+  --standalone \
+  --nproc_per_node ${GPUNUM} \
+  vit_train_demo.py \
+  --model_name_or_path ${MODEL} \
+  --output_path ${OUTPUT_PATH} \
+  --plugin ${PLUGIN} \
+  --batch_size ${BS} \
+  --num_epoch ${EPOCH} \
+  --learning_rate ${LR} \
+  --weight_decay ${WEIGHT_DECAY} \
+  --warmup_ratio ${WARMUP_RATIO}
diff --git a/examples/images/vit/test_ci.sh b/examples/images/vit/test_ci.sh
index 41d25ee23521..8606015c0397 100644
--- a/examples/images/vit/test_ci.sh
+++ b/examples/images/vit/test_ci.sh
@@ -1,9 +1,19 @@
-export OMP_NUM_THREADS=4
-
+set -xe
 pip install -r requirements.txt
 
-# train
-colossalai run \
---nproc_per_node 4 train.py \
---config configs/vit_1d_tp2_ci.py \
---dummy_data
+BS=8
+for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
+do
+for GPUNUM in 1 4
+do
+
+torchrun \
+  --standalone \
+  --nproc_per_node ${GPUNUM} \
+  vit_benchmark.py \
+  --model_name_or_path "google/vit-base-patch16-224" \
+  --plugin ${PLUGIN} \
+  --batch_size ${BS}
+
+done
+done
diff --git a/examples/images/vit/test_vit.py b/examples/images/vit/test_vit.py
deleted file mode 100644
index c0ae35bca871..000000000000
--- a/examples/images/vit/test_vit.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import os
-import random
-
-import numpy as np
-import pytest
-import torch
-from torch.nn.parallel import DistributedDataParallel as DDP
-from vit import get_training_components
-
-import colossalai
-from colossalai.context import ParallelMode
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.nn.parallel.data_parallel import ColoDDP
-from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, ProcessGroup, ShardSpec
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.utils.cuda import get_current_device
-from colossalai.zero import ColoInitContext
-
-
-def set_seed(seed):
-    random.seed(seed)
-    os.environ['PYTHONHASHSEED'] = str(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.backends.cudnn.deterministic = True
-
-
-def tensor_equal(A, B):
-    return torch.allclose(A, B, rtol=1e-3, atol=1e-1)
-
-
-def tensor_shard_equal(tensor: torch.Tensor, shard: torch.Tensor):
-    assert tensor.ndim == shard.ndim
-    if tensor.shape == shard.shape:
-        return tensor_equal(tensor, shard)
-    else:
-        dims_not_eq = torch.nonzero(torch.tensor(tensor.shape) != torch.tensor(shard.shape))
-        if dims_not_eq.numel() == 1:
-            # 1D shard
-            dim = dims_not_eq.item()
-            world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
-            rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
-            return tensor_equal(tensor.chunk(world_size, dim)[rank], shard)
-        else:
-            raise
-
-
-# Only for all Linear, it's 1d_row split because Linear will be transposed when calculating.
-# But for other layers, it's 1d_col split.
-# Layernorm is not supported for now.
-# patch_embeddings.projection has nn.Conv2d
-# https://github.com/huggingface/transformers/blob/dcb08b99f44919425f8ba9be9ddcc041af8ec25e/src/transformers/models/vit/modeling_vit.py#L182
-def init_1d_row_for_linear_weight_spec(model, world_size: int):
-    pg = ProcessGroup(tp_degree=world_size)
-    spec = (ShardSpec([-1], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
-    with DistSpecManager.no_grad():
-        for n, p in model.named_parameters():
-            if 'weight' in n and 'layernorm' not in n and 'embeddings.patch_embeddings.projection.weight' not in n:
-                p.set_process_group(pg)
-                p.set_tensor_spec(*spec)
-
-
-# Similarly, it's col split for Linear but row split for others.
-def init_1d_col_for_linear_weight_bias_spec(model, world_size: int):
-    pg = ProcessGroup(tp_degree=world_size)
-    spec = (ShardSpec([0], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
-    with DistSpecManager.no_grad():
-        for n, p in model.named_parameters():
-            if ('weight' in n
-                    or 'bias' in n) and 'layernorm' not in n and 'embeddings.patch_embeddings.projection' not in n:
-                p.set_process_group(pg)
-                p.set_tensor_spec(*spec)
-
-
-def check_param_equal(model, torch_model):
-    for p, torch_p in zip(model.parameters(), torch_model.parameters()):
-        assert tensor_shard_equal(torch_p, p)
-
-
-def check_grad_equal(model, torch_model):
-    for p, torch_p in zip(model.parameters(), torch_model.parameters()):
-        if (torch_p.grad.shape == p.grad.shape):
-            assert torch.allclose(torch_p.grad, p.grad, rtol=1e-3, atol=2.0) == True
-        else:
-            dims_not_eq = torch.nonzero(torch.tensor(torch_p.grad.shape) != torch.tensor(p.grad.shape))
-            dim = dims_not_eq.item()
-            world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
-            rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
-            assert torch.allclose(torch_p.grad.chunk(world_size, dim)[rank], p.grad, rtol=1e-3, atol=2.0) == True
-
-
-def run_vit(init_spec_func, use_ddp):
-    model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_training_components()
-    with ColoInitContext(device=get_current_device()):
-        model = model_builder()
-    model = model.cuda()
-    torch_model = model_builder().cuda()
-    if use_ddp:
-        model = ColoDDP(model)
-        torch_model = DDP(torch_model,
-                          device_ids=[gpc.get_global_rank()],
-                          process_group=gpc.get_group(ParallelMode.DATA))
-    for torch_p, p in zip(torch_model.parameters(), model.parameters()):
-        torch_p.data.copy_(p)
-
-    world_size = torch.distributed.get_world_size()
-    init_spec_func(model, world_size)
-
-    check_param_equal(model, torch_model)
-    model.train()
-    torch_model.train()
-    set_seed(gpc.get_local_rank(ParallelMode.DATA))
-
-    optimizer = optimizer_class(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
-    torch_optimizer = optimizer_class(torch_model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
-
-    for i, image_dict in enumerate(train_dataloader):
-        if use_ddp:
-            model.zero_grad()
-        else:
-            optimizer.zero_grad()
-        logits = model(image_dict['pixel_values'])
-        torch_logits = torch_model(image_dict['pixel_values'])
-        assert tensor_equal(torch_logits.logits, logits.logits)
-        loss = criterion(logits.logits, image_dict['label'])
-        torch_loss = criterion(torch_logits.logits, image_dict['label'])
-        if use_ddp:
-            model.backward(loss)
-        else:
-            loss.backward()
-        torch_loss.backward()
-        check_grad_equal(model, torch_model)
-        optimizer.step()
-        torch_optimizer.step()
-        check_param_equal(model, torch_model)
-        break
-
-
-def run_dist(rank, world_size, port, use_ddp):
-    if use_ddp and world_size == 1:
-        return
-    tp_world_size = world_size // 2 if use_ddp else world_size
-    config = dict(parallel=dict(tensor=dict(mode="1d", size=tp_world_size),))
-    colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    run_vit(init_1d_row_for_linear_weight_spec, use_ddp)
-    run_vit(init_1d_col_for_linear_weight_bias_spec, use_ddp)
-
-
-@pytest.mark.dist
-@pytest.mark.parametrize('world_size', [1, 4])
-@pytest.mark.parametrize('use_ddp', [False, True])
-@rerun_if_address_is_in_use()
-def test_vit(world_size, use_ddp):
-    spawn(run_dist, world_size, use_ddp=use_ddp)
-
-
-if __name__ == '__main__':
-    test_vit(1, False)
diff --git a/examples/images/vit/train.py b/examples/images/vit/train.py
deleted file mode 100644
index b42cf2bedc6b..000000000000
--- a/examples/images/vit/train.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import os
-
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import torch.nn.functional as F
-from timm.models.vision_transformer import _create_vision_transformer
-from titans.dataloader.imagenet import build_dali_imagenet
-from tqdm import tqdm
-from vit import DummyDataLoader
-
-import colossalai
-from colossalai.core import global_context as gpc
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn import CrossEntropyLoss
-from colossalai.nn._ops import *
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.nn.parallel.data_parallel import ColoDDP
-from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, ProcessGroup, ShardSpec
-from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
-
-
-def init_1d_row_for_linear_weight_spec(model, world_size: int):
-    pg = ProcessGroup(tp_degree=world_size)
-    spec = (ShardSpec([-1], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
-    with DistSpecManager.no_grad():
-        for n, p in model.named_parameters():
-            if 'weight' in n and 'norm' not in n and 'patch_embed.proj.weight' not in n:
-                p.set_process_group(pg)
-                p.set_tensor_spec(*spec)
-
-
-# Similarly, it's col split for Linear but row split for others.
-def init_1d_col_for_linear_weight_bias_spec(model, world_size: int):
-    pg = ProcessGroup(tp_degree=world_size)
-    spec = (ShardSpec([0], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
-    with DistSpecManager.no_grad():
-        for n, p in model.named_parameters():
-            if ('weight' in n or 'bias' in n) and 'norm' not in n and ('patch_embed.proj.weight' not in n
-                                                                       and 'patch_embed.proj.bias' not in n):
-                p.set_process_group(pg)
-                p.set_tensor_spec(*spec)
-
-
-def init_spec_func(model, tp_type):
-    world_size = torch.distributed.get_world_size()
-    if tp_type == 'row':
-        init_1d_row_for_linear_weight_spec(model, world_size)
-    elif tp_type == 'col':
-        init_1d_col_for_linear_weight_bias_spec(model, world_size)
-    else:
-        raise NotImplemented
-
-
-def train_imagenet():
-
-    parser = colossalai.get_default_parser()
-    parser.add_argument('--resume_from', default=False, action='store_true')
-    parser.add_argument('--dummy_data', default=False, action='store_true')
-
-    args = parser.parse_args()
-    colossalai.launch_from_torch(config=args.config)
-    use_ddp = gpc.config.USE_DDP
-
-    disable_existing_loggers()
-
-    logger = get_dist_logger()
-    if hasattr(gpc.config, 'LOG_PATH'):
-        if gpc.get_global_rank() == 0:
-            log_path = gpc.config.LOG_PATH
-            if not os.path.exists(log_path):
-                os.mkdir(log_path)
-            logger.log_to_file(log_path)
-
-    logger.info('Build data loader', ranks=[0])
-    if not args.dummy_data:
-        root = os.environ['DATA']
-        train_dataloader, test_dataloader = build_dali_imagenet(root,
-                                                                train_batch_size=gpc.config.BATCH_SIZE,
-                                                                test_batch_size=gpc.config.BATCH_SIZE)
-    else:
-        train_dataloader = DummyDataLoader(length=10,
-                                           batch_size=gpc.config.BATCH_SIZE,
-                                           category=gpc.config.NUM_CLASSES,
-                                           image_size=gpc.config.IMG_SIZE,
-                                           return_dict=False)
-        test_dataloader = DummyDataLoader(length=5,
-                                          batch_size=gpc.config.BATCH_SIZE,
-                                          category=gpc.config.NUM_CLASSES,
-                                          image_size=gpc.config.IMG_SIZE,
-                                          return_dict=False)
-
-    logger.info('Build model', ranks=[0])
-
-    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
-                        patch_size=gpc.config.PATCH_SIZE,
-                        embed_dim=gpc.config.HIDDEN_SIZE,
-                        depth=gpc.config.DEPTH,
-                        num_heads=gpc.config.NUM_HEADS,
-                        mlp_ratio=gpc.config.MLP_RATIO,
-                        num_classes=gpc.config.NUM_CLASSES,
-                        drop_rate=0.1,
-                        attn_drop_rate=0.1,
-                        weight_init='jax')
-
-    with ColoInitContext(device=get_current_device()):
-        model = _create_vision_transformer('vit_small_patch16_224', pretrained=False, **model_kwargs)
-    init_spec_func(model, gpc.config.TP_TYPE)
-
-    world_size = torch.distributed.get_world_size()
-    model = ColoDDP(module=model, process_group=ProcessGroup(tp_degree=world_size))
-    logger.info('Build criterion, optimizer, lr_scheduler', ranks=[0])
-    optimizer = HybridAdam(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
-
-    criterion = CrossEntropyLoss()
-    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
-                                           total_steps=gpc.config.NUM_EPOCHS,
-                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
-
-    start_epoch = 0
-    if args.resume_from:
-        load_model = torch.load(args.resume_from + '_model.pth')
-        start_epoch = load_model['epoch']
-        model.load_state_dict(load_model['model'])
-        load_optim = torch.load(args.resume_from + '_optim_rank_{}.pth'.format(dist.get_rank()))
-        optimizer.load_state_dict(load_optim['optim'])
-
-    for epoch in range(start_epoch, gpc.config.NUM_EPOCHS):
-        model.train()
-        for index, (x, y) in tqdm(enumerate(train_dataloader), total=len(train_dataloader), leave=False):
-            x, y = x.cuda(), y.cuda()
-            output = model(x)
-            loss = criterion(output, y)
-            loss = loss / gpc.config.gradient_accumulation
-            if use_ddp:
-                model.backward(loss)
-            else:
-                loss.backward()
-            if (index + 1) % gpc.config.gradient_accumulation == 0:
-                optimizer.step()
-                if use_ddp:
-                    model.zero_grad()
-                else:
-                    optimizer.zero_grad()
-
-        logger.info(
-            f"Finish Train Epoch [{epoch+1}/{gpc.config.NUM_EPOCHS}] loss: {loss.item():.3f} lr: {optimizer.state_dict()['param_groups'][0]['lr']}",
-            ranks=[0])
-
-        model.eval()
-        test_loss = 0
-        correct = 0
-        test_sum = 0
-        with torch.no_grad():
-            for index, (x, y) in tqdm(enumerate(test_dataloader), total=len(test_dataloader), leave=False):
-                x, y = x.cuda(), y.cuda()
-                output = model(x)
-                test_loss += F.cross_entropy(output, y, reduction='sum').item()
-                pred = output.argmax(dim=1, keepdim=True)
-                correct += pred.eq(y.view_as(pred)).sum().item()
-                test_sum += y.size(0)
-
-        test_loss /= test_sum
-        logger.info(
-            f"Finish Test Epoch [{epoch+1}/{gpc.config.NUM_EPOCHS}] loss: {test_loss:.3f} Accuracy: [{correct}/{test_sum}]({correct/test_sum:.3f})",
-            ranks=[0])
-
-        lr_scheduler.step()
-
-
-if __name__ == '__main__':
-    train_imagenet()
diff --git a/examples/images/vit/vit.py b/examples/images/vit/vit.py
deleted file mode 100644
index f22e8ea90cec..000000000000
--- a/examples/images/vit/vit.py
+++ /dev/null
@@ -1,95 +0,0 @@
-from abc import ABC, abstractmethod
-
-import torch
-import torch.nn as nn
-from transformers import ViTConfig, ViTForImageClassification
-
-from colossalai.utils.cuda import get_current_device
-
-
-class DummyDataGenerator(ABC):
-
-    def __init__(self, length=10):
-        self.length = length
-
-    @abstractmethod
-    def generate(self):
-        pass
-
-    def __iter__(self):
-        self.step = 0
-        return self
-
-    def __next__(self):
-        if self.step < self.length:
-            self.step += 1
-            return self.generate()
-        else:
-            raise StopIteration
-
-    def __len__(self):
-        return self.length
-
-
-class DummyDataLoader(DummyDataGenerator):
-
-    def __init__(self, length=10, batch_size=4, channel=3, category=8, image_size=224, return_dict=True):
-        super().__init__(length)
-        self.batch_size = batch_size
-        self.channel = channel
-        self.category = category
-        self.image_size = image_size
-        self.return_dict = return_dict
-
-    def generate(self):
-        image_dict = {}
-        image_dict['pixel_values'] = torch.rand(
-            self.batch_size, self.channel, self.image_size, self.image_size, device=get_current_device()) * 2 - 1
-        image_dict['label'] = torch.randint(self.category, (self.batch_size,),
-                                            dtype=torch.int64,
-                                            device=get_current_device())
-        if not self.return_dict:
-            return image_dict['pixel_values'], image_dict['label']
-        return image_dict
-
-
-class ViTCVModel(nn.Module):
-
-    def __init__(self,
-                 hidden_size=768,
-                 num_hidden_layers=12,
-                 num_attention_heads=12,
-                 image_size=224,
-                 patch_size=16,
-                 num_channels=3,
-                 num_labels=8,
-                 checkpoint=False):
-        super().__init__()
-        self.checkpoint = checkpoint
-        self.model = ViTForImageClassification(
-            ViTConfig(hidden_size=hidden_size,
-                      num_hidden_layers=num_hidden_layers,
-                      num_attention_heads=num_attention_heads,
-                      image_size=image_size,
-                      patch_size=patch_size,
-                      num_channels=num_channels,
-                      num_labels=num_labels))
-        if checkpoint:
-            self.model.gradient_checkpointing_enable()
-
-    def forward(self, pixel_values):
-        return self.model(pixel_values=pixel_values)
-
-
-def vit_base_s(checkpoint=True):
-    return ViTCVModel(checkpoint=checkpoint)
-
-
-def vit_base_micro(checkpoint=True):
-    return ViTCVModel(hidden_size=32, num_hidden_layers=2, num_attention_heads=4, checkpoint=checkpoint)
-
-
-def get_training_components():
-    trainloader = DummyDataLoader()
-    testloader = DummyDataLoader()
-    return vit_base_micro, trainloader, testloader, torch.optim.Adam, torch.nn.functional.cross_entropy
diff --git a/examples/images/vit/vit_benchmark.py b/examples/images/vit/vit_benchmark.py
new file mode 100644
index 000000000000..11d480bba65f
--- /dev/null
+++ b/examples/images/vit/vit_benchmark.py
@@ -0,0 +1,129 @@
+import time
+
+import torch
+import transformers
+from transformers import ViTConfig, ViTForImageClassification
+import tqdm
+
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.utils import get_current_device
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+
+from args import parse_benchmark_args
+
+def format_num(num: int, bytes=False):
+    """Scale a number to a readable string, e.g. 1253656 => '1.20 MB' when bytes=True."""
+    factor = 1024 if bytes else 1000
+    suffix = "B" if bytes else ""
+    for unit in ["", " K", " M", " G", " T", " P"]:
+        if num < factor:
+            return f"{num:.2f}{unit}{suffix}"
+        num /= factor
+
+
+def get_data(batch_size, num_labels, num_channels=3, height=224, width=224):
+    pixel_values = torch.randn(batch_size, num_channels, height, width, device=torch.cuda.current_device(), dtype=torch.float)
+    labels = torch.randint(0, num_labels, (batch_size, ), device=torch.cuda.current_device(), dtype=torch.int64)
+    return pixel_values, labels
+
+
+def colo_memory_cap(size_in_GB):
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    cuda_capacity = colo_device_memory_capacity(get_current_device())
+    if size_in_GB * (1024**3) < cuda_capacity:
+        colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
+        print(f"Limiting GPU memory usage to {size_in_GB} GB")
+
+
+def main():
+
+    args = parse_benchmark_args()
+
+    # Launch ColossalAI
+    colossalai.launch_from_torch(config={}, seed=args.seed)
+    coordinator = DistCoordinator()
+    world_size = coordinator.world_size
+
+    # Manage loggers
+    disable_existing_loggers()
+    logger = get_dist_logger()
+    if coordinator.is_master():
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+    
+    # Whether to set limit on memory capacity
+    if args.mem_cap > 0:
+        colo_memory_cap(args.mem_cap)
+    
+    # Build ViT model
+    config = ViTConfig.from_pretrained(args.model_name_or_path)
+    model = ViTForImageClassification(config)
+    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
+
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+
+    # Set plugin
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(device=get_current_device(),
+                        placement_policy='cpu',
+                        pin_memory=True,
+                        strict_ddp_mode=True,
+                        initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+    logger.info(f"Set plugin as {args.plugin}", ranks=[0])
+
+    # Set optimizer
+    optimizer = HybridAdam(model.parameters(), lr=(args.learning_rate * world_size))
+
+    # Set booster
+    booster = Booster(plugin=plugin, **booster_kwargs)
+    model, optimizer, _, _, _ = booster.boost(model, optimizer)
+    
+
+    # Start training.
+    logger.info(f"Start testing", ranks=[0])
+    progress_bar = tqdm.tqdm(total=args.max_train_steps, desc="Training Step", disable=not coordinator.is_master())
+    
+    torch.cuda.synchronize()
+    model.train()
+    start_time = time.time()
+   
+    for _ in range(args.max_train_steps):
+
+        pixel_values, labels = get_data(args.batch_size, args.num_labels, 3, 224, 224)
+        optimizer.zero_grad()
+        outputs = model(pixel_values=pixel_values, labels=labels)
+        loss = outputs['loss']
+        booster.backward(loss, optimizer)
+        optimizer.step()
+
+        torch.cuda.synchronize()
+        progress_bar.update(1)
+       
+    # Compute Statistics   
+    end_time = time.time()
+    throughput = "{:.4f}".format((world_size * args.max_train_steps * args.batch_size) / (end_time - start_time))
+    max_mem = format_num(torch.cuda.max_memory_allocated(device=torch.cuda.current_device()), bytes=True)
+    
+    logger.info(f"Testing finished, " 
+                f"batch size per gpu: {args.batch_size}, "
+                f"plugin: {args.plugin}, "
+                f"throughput: {throughput}, "
+                f"maximum memory usage per gpu: {max_mem}.",
+                ranks=[0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/images/vit/vit_train_demo.py b/examples/images/vit/vit_train_demo.py
new file mode 100644
index 000000000000..3a739f10b5d0
--- /dev/null
+++ b/examples/images/vit/vit_train_demo.py
@@ -0,0 +1,177 @@
+import torch
+import torch.distributed as dist
+import transformers
+from transformers import ViTConfig, ViTForImageClassification, ViTImageProcessor
+from tqdm import tqdm
+
+import colossalai
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.utils import get_current_device
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+
+from args import parse_demo_args
+from data import BeansDataset, beans_collator
+
+
+def move_to_cuda(batch, device):
+    return {k: v.to(device) for k, v in batch.items()}
+
+
+def train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator):
+        
+    torch.cuda.synchronize()
+    model.train()
+
+    with tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()) as pbar:
+        
+        for batch in pbar:
+
+            # Forward
+            optimizer.zero_grad()
+            batch = move_to_cuda(batch, torch.cuda.current_device())
+            outputs = model(**batch)
+            loss = outputs['loss']
+
+            # Backward
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            lr_scheduler.step()
+
+            # Print batch loss
+            pbar.set_postfix({'loss': loss.item()})
+
+
+@torch.no_grad()
+def evaluate_model(epoch, model, eval_dataloader, num_labels, coordinator):
+    
+    model.eval()
+    accum_loss = torch.zeros(1, device=get_current_device())
+    total_num = torch.zeros(1, device=get_current_device())
+    accum_correct = torch.zeros(1, device=get_current_device())
+
+    for batch in eval_dataloader:
+        batch = move_to_cuda(batch, torch.cuda.current_device())
+        outputs = model(**batch)
+        val_loss, logits = outputs[:2]
+        accum_loss += (val_loss / len(eval_dataloader))
+        if num_labels > 1:
+            preds = torch.argmax(logits, dim=1)
+        elif num_labels == 1:
+            preds = logits.squeeze()
+
+        labels = batch["labels"]
+        total_num += batch["labels"].shape[0]
+        accum_correct += (torch.sum(preds == labels))
+
+    dist.all_reduce(accum_loss)
+    dist.all_reduce(total_num)
+    dist.all_reduce(accum_correct)
+    avg_loss = "{:.4f}".format(accum_loss.item())
+    accuracy = "{:.4f}".format(accum_correct.item() / total_num.item())
+    if coordinator.is_master():
+        print(f"Evaluation result for epoch {epoch + 1}: \
+                average_loss={avg_loss}, \
+                accuracy={accuracy}.")
+        
+    
+   
+
+def main():
+
+    args = parse_demo_args()
+
+    # Launch ColossalAI
+    colossalai.launch_from_torch(config={}, seed=args.seed)
+    coordinator = DistCoordinator()
+    world_size = coordinator.world_size
+
+    # Manage loggers
+    disable_existing_loggers()
+    logger = get_dist_logger()
+    if coordinator.is_master():
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    # Prepare Dataset
+    image_processor = ViTImageProcessor.from_pretrained(args.model_name_or_path)
+    train_dataset = BeansDataset(image_processor, split='train')
+    eval_dataset = BeansDataset(image_processor, split='validation')
+
+
+    # Load pretrained ViT model
+    config = ViTConfig.from_pretrained(args.model_name_or_path)
+    config.num_labels = train_dataset.num_labels
+    config.id2label = {str(i): c for i, c in enumerate(train_dataset.label_names)}
+    config.label2id = {c: str(i) for i, c in enumerate(train_dataset.label_names)}
+    model = ViTForImageClassification.from_pretrained(args.model_name_or_path, 
+                                                      config=config, 
+                                                      ignore_mismatched_sizes=True)
+    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
+
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+
+    # Set plugin
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(device=get_current_device(),
+                        placement_policy='cpu',
+                        pin_memory=True,
+                        strict_ddp_mode=True,
+                        initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+    logger.info(f"Set plugin as {args.plugin}", ranks=[0])
+
+    # Prepare dataloader
+    train_dataloader = plugin.prepare_dataloader(train_dataset,
+                                        batch_size=args.batch_size,
+                                        shuffle=True,
+                                        drop_last=True,
+                                        collate_fn=beans_collator)
+    eval_dataloader = plugin.prepare_dataloader(eval_dataset,
+                                        batch_size=args.batch_size,
+                                        shuffle=True,
+                                        drop_last=True,
+                                        collate_fn=beans_collator)
+
+    # Set optimizer
+    optimizer = HybridAdam(model.parameters(), lr=(args.learning_rate * world_size), weight_decay=args.weight_decay)
+
+    # Set lr scheduler
+    total_steps = len(train_dataloader) * args.num_epoch
+    num_warmup_steps = int(args.warmup_ratio * total_steps)
+    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
+                                           total_steps=(len(train_dataloader) * args.num_epoch),
+                                           warmup_steps=num_warmup_steps)
+
+    # Set booster
+    booster = Booster(plugin=plugin, **booster_kwargs)
+    model, optimizer, _, train_dataloader, lr_scheduler = booster.boost(model=model, 
+                                                                  optimizer=optimizer, 
+                                                                  dataloader=train_dataloader, 
+                                                                  lr_scheduler=lr_scheduler)
+    
+    # Finetuning
+    logger.info(f"Start finetuning", ranks=[0])
+    for epoch in range(args.num_epoch):
+        train_epoch(epoch, model, optimizer, lr_scheduler, train_dataloader, booster, coordinator)
+        evaluate_model(epoch, model, eval_dataloader, eval_dataset.num_labels, coordinator)
+    logger.info(f"Finish finetuning", ranks=[0])
+
+    # Save the finetuned model
+    booster.save_model(model, args.output_path)
+    logger.info(f"Saving model checkpoint to {args.output_path}", ranks=[0])
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/language/opt/opt_benchmark.py b/examples/language/opt/opt_benchmark.py
index da2be4055fa3..2d69036b50c6 100755
--- a/examples/language/opt/opt_benchmark.py
+++ b/examples/language/opt/opt_benchmark.py
@@ -67,17 +67,8 @@ def main():
         colo_memory_cap(args.mem_cap)
     
     # Build OPT model
-    # Initialize the model under ColoInitContext if using GeminiPlugin
     config = AutoConfig.from_pretrained(args.model_name_or_path)
-    if args.plugin == 'gemini':
-        shard_pg = ProcessGroup(tp_degree=world_size)
-        default_dist_spec = ShardSpec([-1], [world_size])
-        with ColoInitContext(device='cpu',
-                            default_dist_spec=default_dist_spec,
-                            default_pg=shard_pg):
-            model = OPTForCausalLM(config)
-    else:
-        model = OPTForCausalLM(config)
+    model = OPTForCausalLM(config=config)
     logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
 
     # Enable gradient checkpointing
@@ -91,10 +82,10 @@ def main():
         plugin = TorchDDPPlugin()
     elif args.plugin == 'gemini':
         plugin = GeminiPlugin(device=get_current_device(),
-                        placement_policy='cpu',
-                        pin_memory=True,
-                        strict_ddp_mode=True,
-                        initial_scale=2**5)
+                             placement_policy='cpu',
+                             pin_memory=True,
+                             strict_ddp_mode=True,
+                             initial_scale=2**5)
     elif args.plugin == 'low_level_zero':
         plugin = LowLevelZeroPlugin(initial_scale=2**5)
     logger.info(f"Set plugin as {args.plugin}", ranks=[0])
diff --git a/examples/language/opt/opt_train_demo.py b/examples/language/opt/opt_train_demo.py
index bb2eb52ce560..fa7feca9c9a9 100644
--- a/examples/language/opt/opt_train_demo.py
+++ b/examples/language/opt/opt_train_demo.py
@@ -74,17 +74,8 @@ def main():
         transformers.utils.logging.set_verbosity_error()
     
     # Build OPT model
-    # Initialize the model under ColoInitContext if using GeminiPlugin
     config = AutoConfig.from_pretrained(args.model_name_or_path)
-    if args.plugin == 'gemini':
-        shard_pg = ProcessGroup(tp_degree=world_size)
-        default_dist_spec = ShardSpec([-1], [world_size])
-        with ColoInitContext(device='cpu',
-                            default_dist_spec=default_dist_spec,
-                            default_pg=shard_pg):
-            model = OPTForCausalLM(config)
-    else:
-        model = OPTForCausalLM(config)
+    model = OPTForCausalLM.from_pretrained(args.model_name_or_path, config=config)
     logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
 
     # Enable gradient checkpointing
@@ -116,7 +107,9 @@ def main():
                                            collate_fn=netflix_collator)
     
     # Set optimizer
-    optimizer = HybridAdam(model.parameters(), lr=(args.learning_rate * world_size))
+    optimizer = HybridAdam(model.parameters(), 
+                           lr=(args.learning_rate * world_size),
+                           weight_decay=args.weight_decay)
 
     # Set lr scheduler
     total_steps = len(dataloader) * args.num_epoch

From 9d02590c9a64d12bc31866f35bf9b51a4084963f Mon Sep 17 00:00:00 2001
From: Wenhao Chen <cwher@outlook.com>
Date: Tue, 13 Jun 2023 13:31:56 +0800
Subject: [PATCH 51/52] [chat] refactor actor class (#3968)

* refactor: separate log_probs fn from Actor forward fn

* refactor: separate generate fn from Actor class

* feat: update unwrap_model and get_base_model
* unwrap_model returns model not wrapped by Strategy
* get_base_model returns HF model for Actor, Critic and RewardModel

* feat: simplify Strategy.prepare

* style: remove get_base_model method of Actor

* perf: tokenize text in batches

* refactor: move calc_action_log_probs to utils of model

* test: update test with new forward fn

* style: rename forward fn args

* fix: do not unwrap model in save_model fn of naive strategy

* test: add gemini test for train_prompts

* fix: fix _set_default_generate_kwargs
---
 .../Chat/coati/dataset/prompt_dataset.py      | 16 +++---
 .../Chat/coati/dataset/sft_dataset.py         | 30 +++++------
 .../Chat/coati/experience_maker/naive.py      | 12 +++--
 .../Chat/coati/models/base/__init__.py        | 14 ++---
 applications/Chat/coati/models/base/actor.py  | 53 +++++--------------
 applications/Chat/coati/models/generation.py  | 41 ++++++++++++--
 applications/Chat/coati/models/utils.py       | 19 +++++++
 applications/Chat/coati/trainer/ppo.py        | 23 ++++----
 .../Chat/coati/trainer/strategies/base.py     | 21 +++-----
 .../coati/trainer/strategies/colossalai.py    | 19 +++----
 .../Chat/coati/trainer/strategies/ddp.py      |  4 +-
 .../Chat/coati/trainer/strategies/naive.py    |  7 ++-
 applications/Chat/examples/test_ci.sh         |  8 +++
 applications/Chat/tests/test_checkpoint.py    |  4 +-
 14 files changed, 151 insertions(+), 120 deletions(-)

diff --git a/applications/Chat/coati/dataset/prompt_dataset.py b/applications/Chat/coati/dataset/prompt_dataset.py
index 5858052c836a..0bdcbbc5928e 100644
--- a/applications/Chat/coati/dataset/prompt_dataset.py
+++ b/applications/Chat/coati/dataset/prompt_dataset.py
@@ -35,14 +35,14 @@ def __init__(self,
             logger.info(f"Limiting dataset to {max_datasets_size} examples.")
             list_data_dict = list_data_dict[:max_datasets_size]
 
-        for data_dict in list_data_dict:
-            token = tokenizer(data_dict["instruction"],
-                              return_tensors='pt',
-                              max_length=max_length,
-                              padding='max_length',
-                              truncation=True)
-            for k, tensor in token.items():
-                self.keyed_prompt[k].extend(tensor.to(torch.cuda.current_device()).unbind())
+        instructions = [data_dict["instruction"] for data_dict in list_data_dict]
+        tokens = tokenizer(instructions,
+                           return_tensors='pt',
+                           max_length=max_length,
+                           padding='max_length',
+                           truncation=True)
+        for k, tensor in tokens.items():
+            self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()
 
     def __len__(self):
         return len(self.keyed_prompt["input_ids"])
diff --git a/applications/Chat/coati/dataset/sft_dataset.py b/applications/Chat/coati/dataset/sft_dataset.py
index 3e2453468bbc..3702d00cc609 100644
--- a/applications/Chat/coati/dataset/sft_dataset.py
+++ b/applications/Chat/coati/dataset/sft_dataset.py
@@ -74,21 +74,18 @@ def __getitem__(self, idx):
         return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
 
 
-def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer, max_length: int) -> Dict:
+def _tokenize_fn(strings: Sequence[str],
+                 tokenizer: transformers.PreTrainedTokenizer,
+                 max_length: int
+                 ) -> Dict[str, torch.Tensor]:
     """Tokenize a list of strings."""
-    tokenized_list = [
-        tokenizer(
-            text,
-            return_tensors="pt",
-            padding="longest",
-            max_length=max_length,
-            truncation=True,
-        ) for text in strings
-    ]
-    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
-    input_ids_lens = labels_lens = [
-        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
-    ]
+    tokenized_list = tokenizer(
+        strings, return_tensors="pt", padding="longest",
+        max_length=max_length, truncation=True
+    )
+    input_ids = labels = tokenized_list["input_ids"]
+    input_ids_lens = labels_lens = \
+        tokenized_list["input_ids"].ne(tokenizer.pad_token_id).sum(dim=-1)
     return dict(
         input_ids=input_ids,
         labels=labels,
@@ -105,7 +102,10 @@ def preprocess(
 ) -> Dict:
     """Preprocess the data by tokenizing."""
     examples = [s + t for s, t in zip(sources, targets)]
-    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer, max_length) for strings in (examples, sources)]
+    examples_tokenized, sources_tokenized = [
+        _tokenize_fn(strings, tokenizer, max_length)
+        for strings in (examples, sources)
+    ]
     input_ids = examples_tokenized["input_ids"]
     labels = copy.deepcopy(input_ids)
     for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index 94546eeb28e7..e5bb029e63d0 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -1,5 +1,6 @@
 import torch
-from coati.models.utils import compute_reward, normalize
+from coati.models.generation import generate_with_actor
+from coati.models.utils import calc_action_log_probs, compute_reward, normalize
 
 from .base import Experience, ExperienceMaker
 
@@ -16,13 +17,16 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
         self.initial_model.eval()
         self.reward_model.eval()
 
-        sequences, attention_mask, action_mask = self.actor.generate(input_ids,
+        sequences, attention_mask, action_mask = generate_with_actor(self.actor,
+                                                                     input_ids,
                                                                      return_action_mask=True,
                                                                      **generate_kwargs)
         num_actions = action_mask.size(1)
 
-        action_log_probs = self.actor(sequences, num_actions, attention_mask)
-        base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask)
+        actor_output = self.actor(sequences, attention_mask)
+        action_log_probs = calc_action_log_probs(actor_output, sequences, num_actions)
+        base_model_output = self.initial_model(sequences, attention_mask)
+        base_action_log_probs = calc_action_log_probs(base_model_output, sequences, num_actions)
         value = self.critic(sequences, action_mask, attention_mask)
         r = self.reward_model(sequences, attention_mask)
         reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
diff --git a/applications/Chat/coati/models/base/__init__.py b/applications/Chat/coati/models/base/__init__.py
index fe4152f2b760..c5f748a0c85a 100644
--- a/applications/Chat/coati/models/base/__init__.py
+++ b/applications/Chat/coati/models/base/__init__.py
@@ -1,3 +1,5 @@
+from typing import Union
+
 import torch.nn as nn
 
 from .actor import Actor
@@ -5,10 +7,10 @@
 from .reward_model import RewardModel
 
 
-def get_base_model(model: nn.Module) -> nn.Module:
+def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
     """Get the base model of our wrapper classes.
-    For Actor, it's base model is ``actor.model`` and it's usually a ``transformers.PreTrainedModel``.
-    For Critic and RewardModel, it's base model is itself.
+    For Actor, Critic and RewardModel, return ``model.model``,
+    which is usually a ``transformers.PreTrainedModel``.
 
     Args:
         model (nn.Module): model to get base model from
@@ -16,9 +18,9 @@ def get_base_model(model: nn.Module) -> nn.Module:
     Returns:
         nn.Module: the base model
     """
-    if isinstance(model, Actor):
-        return model.get_base_model()
-    return model
+    assert isinstance(model, (Actor, Critic, RewardModel)), \
+        f'Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first.'
+    return model.model
 
 
 __all__ = ['Actor', 'Critic', 'RewardModel', 'get_base_model']
diff --git a/applications/Chat/coati/models/base/actor.py b/applications/Chat/coati/models/base/actor.py
index 71fbf7bbae7d..2034d5cc81d4 100644
--- a/applications/Chat/coati/models/base/actor.py
+++ b/applications/Chat/coati/models/base/actor.py
@@ -1,12 +1,9 @@
-from typing import Optional, Tuple, Union
+from typing import Optional
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
-from ..generation import generate
 from ..lora import LoRAModule
-from ..utils import log_probs_from_logits
 
 
 class Actor(LoRAModule):
@@ -24,42 +21,16 @@ def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str =
         self.model = model
         self.convert_to_lora()
 
-    @torch.no_grad()
-    def generate(
-        self,
-        input_ids: torch.Tensor,
-        return_action_mask: bool = True,
-        **kwargs
-    ) -> Union[Tuple[torch.LongTensor, torch.LongTensor], Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]:
-        sequences = generate(self.model, input_ids, **kwargs)
-        attention_mask = None
-        pad_token_id = kwargs.get('pad_token_id', None)
-        if pad_token_id is not None:
-            attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
-        if not return_action_mask:
-            return sequences, attention_mask, None
-        input_len = input_ids.size(1)
-        eos_token_id = kwargs.get('eos_token_id', None)
-        if eos_token_id is None:
-            action_mask = torch.ones_like(sequences, dtype=torch.bool)
-        else:
-            # left padding may be applied, only mask action
-            action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
-            action_mask = F.pad(action_mask, (1 + input_len, -1), value=True)    # include eos token and input
-        action_mask[:, :input_len] = False
-        action_mask = action_mask[:, 1:]
-        return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len):]
-
     def forward(self,
-                sequences: torch.LongTensor,
-                num_actions: int,
-                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Returns action log probs
+                input_ids: torch.LongTensor,
+                attention_mask: Optional[torch.Tensor] = None,
+                **model_kwargs,  # HACK: `generate` method may pass more kwargs
+                ) -> torch.Tensor:
+        """Returns model output.
         """
-        output = self.model(sequences, attention_mask=attention_mask)
-        logits = output['logits']
-        log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
-        return log_probs[:, -num_actions:]
-
-    def get_base_model(self):
-        return self.model
+        output = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            **model_kwargs
+        )
+        return output
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index f57c9458a271..0156e2284e52 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -1,8 +1,10 @@
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Tuple, Union
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+import torch.nn.functional as F
+
 
 try:
     from transformers.generation_logits_process import (
@@ -55,9 +57,8 @@ def sample(model: nn.Module,
     unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
 
     for _ in range(input_ids.size(1), max_length):
-        model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {
-            'input_ids': input_ids
-        }
+        model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) \
+            if prepare_inputs_fn is not None else {'input_ids': input_ids}
         outputs = model(**model_inputs)
 
         next_token_logits = outputs['logits'][:, -1, :]
@@ -144,3 +145,35 @@ def generate(model: nn.Module,
         raise NotImplementedError
     else:
         raise ValueError("Unsupported generation mode")
+
+
+@torch.no_grad()
+def generate_with_actor(actor_model: nn.Module,
+                        input_ids: torch.Tensor,
+                        return_action_mask: bool = True,
+                        **kwargs
+                        ) -> Union[Tuple[torch.LongTensor, torch.LongTensor],
+                                   Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]:
+    """Generate token sequence with actor model. Refer to `generate` for more details.
+    """
+    # generate sequences
+    sequences = generate(actor_model, input_ids, **kwargs)
+
+    # calculate auxiliary tensors
+    attention_mask = None
+    pad_token_id = kwargs.get('pad_token_id', None)
+    if pad_token_id is not None:
+        attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
+    if not return_action_mask:
+        return sequences, attention_mask, None
+    input_len = input_ids.size(1)
+    eos_token_id = kwargs.get('eos_token_id', None)
+    if eos_token_id is None:
+        action_mask = torch.ones_like(sequences, dtype=torch.bool)
+    else:
+        # left padding may be applied, only mask action
+        action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
+        action_mask = F.pad(action_mask, (1 + input_len, -1), value=True)    # include eos token and input
+    action_mask[:, :input_len] = False
+    action_mask = action_mask[:, 1:]
+    return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len):]
diff --git a/applications/Chat/coati/models/utils.py b/applications/Chat/coati/models/utils.py
index 0ff13181fcd2..b9f15f894a1f 100644
--- a/applications/Chat/coati/models/utils.py
+++ b/applications/Chat/coati/models/utils.py
@@ -46,6 +46,25 @@ def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.T
     return log_probs_labels.squeeze(-1)
 
 
+def calc_action_log_probs(output: torch.Tensor,
+                          sequences: torch.LongTensor,
+                          num_actions: int
+                          ) -> torch.Tensor:
+    """Calculate action log probs.
+
+    Args:
+        output (torch.Tensor): Output of Actor.forward; its ``'logits'`` entry is read.
+        sequences (torch.LongTensor): Input sequences.
+        num_actions (int): Number of actions.
+
+    Returns:
+        torch.Tensor: Action log probs.
+    """
+    logits = output['logits']
+    log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
+    return log_probs[:, -num_actions:]
+
+
 def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor:
     tensor = tensor * mask
     tensor = tensor.sum(dim=dim)
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index fe5ae48d9c2f..e2e44e62533e 100644
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -3,8 +3,9 @@
 import torch
 import torch.nn as nn
 from coati.experience_maker import Experience, NaiveExperienceMaker
-from coati.models.base import Actor, Critic
+from coati.models.base import Actor, Critic, get_base_model
 from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss
+from coati.models.utils import calc_action_log_probs
 from coati.replay_buffer import NaiveReplayBuffer
 from torch import Tensor
 from torch.optim import Optimizer
@@ -165,7 +166,8 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
         self.critic.train()
         # policy loss
         num_actions = experience.action_mask.size(1)
-        action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask)
+        actor_output = self.actor(experience.sequences, attention_mask=experience.attention_mask)
+        action_log_probs = calc_action_log_probs(actor_output, experience.sequences, num_actions)
         actor_loss = self.actor_loss_fn(action_log_probs,
                                         experience.action_log_probs,
                                         experience.advantages,
@@ -175,8 +177,8 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
         if self.ptx_coef != 0:
             batch = next(iter(self.pretrain_dataloader))
             batch = to_device(batch, self.device)
-            ptx_log_probs = self.actor.get_base_model()(batch['input_ids'],
-                                                        attention_mask=batch['attention_mask'])['logits']
+            ptx_log_probs = self.actor(batch['input_ids'],
+                                       attention_mask=batch['attention_mask'])['logits']
             ptx_loss = self.ptx_loss_fn(ptx_log_probs, batch['labels'])
             actor_loss = ptx_loss * self.ptx_coef + actor_loss * (1 - self.ptx_coef)
 
@@ -200,14 +202,15 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
         return {'reward': experience.reward.mean().item()}
 
 
-def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> None:
-    origin_model = strategy.unwrap_model(actor)
+def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> Dict:
+    unwrapped_model = strategy.unwrap_model(actor)    # strip the strategy wrapper first
+    hf_model = get_base_model(unwrapped_model)    # then reach the underlying huggingface model
     new_kwargs = {**generate_kwargs}
     # use huggingface models method directly
-    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
-        new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
+    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(hf_model, 'prepare_inputs_for_generation'):
+        new_kwargs['prepare_inputs_fn'] = hf_model.prepare_inputs_for_generation
 
-    if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(origin_model, '_update_model_kwargs_for_generation'):
-        new_kwargs['update_model_kwargs_fn'] = origin_model._update_model_kwargs_for_generation
+    if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(hf_model, '_update_model_kwargs_for_generation'):
+        new_kwargs['update_model_kwargs_fn'] = hf_model._update_model_kwargs_for_generation
 
     return new_kwargs
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index bd30422022ae..06f81f21ab26 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -4,7 +4,6 @@
 
 import torch
 import torch.nn as nn
-from coati.models.base import Actor, get_base_model
 from coati.replay_buffer import ReplayBuffer
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
@@ -69,21 +68,16 @@ def prepare(
             Union[List[ModelOrModelOptimPair], ModelOrModelOptimPair]: Models or model-optimizer-pairs in the original order.
         """
 
-        def prepare_model(model: nn.Module):
-            if isinstance(model, Actor):
-                return Actor(self.setup_model(model.get_base_model()))
-            return self.setup_model(model)
-
         rets = []
         for arg in models_or_model_optim_pairs:
             if isinstance(arg, tuple):
                 assert len(arg) == 2, f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"'
                 model, optimizer = arg
-                model = prepare_model(model)
-                optimizer = self.setup_optimizer(optimizer, get_base_model(model))
+                model = self.setup_model(model)
+                optimizer = self.setup_optimizer(optimizer, model)
                 rets.append((model, optimizer))
             elif isinstance(arg, nn.Module):
-                rets.append(prepare_model(arg))
+                rets.append(self.setup_model(arg))    # bare module: `model` is only bound in the tuple branch
             else:
                 raise RuntimeError(f'Expect model or (model, optimizer) pair, got {type(arg)}')
 
@@ -93,16 +87,15 @@ def prepare_model(model: nn.Module):
 
     @staticmethod
     def unwrap_model(model: nn.Module) -> nn.Module:
-        """Get the unwrapped model from a wrapped model. Useful for getting original huggingface model.
-        For Actor, it will unwrap `actor.model`.
+        """Get the unwrapped model from a wrapped model made by Strategy.prepare.
 
         Args:
             model (nn.Module): the model to unwrap
 
         Returns:
-            nn.Module: the original model (usually a huggingface model)
+            nn.Module: the original model
         """
-        return get_base_model(model)
+        return model
 
     @abstractmethod
     def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None:
@@ -133,4 +126,4 @@ def save_pretrained(self,
 
     @abstractmethod
     def get_model_state_dict_shard(self, model: nn.Module, **config):
-        pass
\ No newline at end of file
+        pass
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 88268b677eb2..fafd0918deaf 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -5,7 +5,6 @@
 import torch.distributed as dist
 import torch.nn as nn
 import torch.optim as optim
-from coati.models.base import get_base_model
 from torch.optim import Optimizer
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
@@ -153,14 +152,13 @@ def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
     def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None:
         if only_rank0 and dist.get_rank() != 0 and self.stage != 3:
             return
-        base_model = get_base_model(model)
         if self.stage == 3:
-            assert isinstance(base_model, ZeroDDP)
+            assert isinstance(model, ZeroDDP)
             # for stage 3, state_dict() method should be called on every rank
-            state_dict = base_model.state_dict(only_rank_0=only_rank0)
+            state_dict = model.state_dict(only_rank_0=only_rank0)
         else:
             # only_rank0 is false or rank == 0
-            state_dict = base_model.state_dict()
+            state_dict = model.state_dict()
         if only_rank0 and dist.get_rank() != 0:
             return
         torch.save(state_dict, path)
@@ -172,11 +170,10 @@ def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = Fal
         torch.save(optimizer.state_dict(), path)
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        base_model: Union[nn.Module, ZeroDDP] = get_base_model(model)
         if self.stage == 3:
-            assert isinstance(base_model, ZeroDDP)
-            return base_model.module
-        return base_model
+            assert isinstance(model, ZeroDDP)
+            return model.module
+        return model
 
     def save_pretrained(self,
                         model: nn.Module,
@@ -196,5 +193,5 @@ def get_model_state_dict_shard(self, model: nn.Module, **config):
             #     if isinstance(module, LoraLinear):
             #         module.merge_weights = True
             #         module.eval()
-            base_model: ZeroDDP = get_base_model(model)
-            yield from base_model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
+            assert isinstance(model, ZeroDDP)
+            yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index a1fecb36373f..713d7b90c6f0 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -69,8 +69,8 @@ def setup_sampler(self, dataset) -> DistributedSampler:
         return DistributedSampler(dataset, dist.get_world_size(), dist.get_rank())
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        base_model: DDP = super().unwrap_model(model)
-        return base_model.module
+        assert isinstance(model, DDP)
+        return model.module
 
     def save_pretrained(self,
                         model: nn.Module,
diff --git a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py
index 972deebeaa0d..202c480e06d9 100644
--- a/applications/Chat/coati/trainer/strategies/naive.py
+++ b/applications/Chat/coati/trainer/strategies/naive.py
@@ -58,14 +58,13 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False
                           collate_fn=replay_buffer.collate_fn)
 
     def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None:
-        base_model = get_base_model(model)
-        state_dict = base_model.state_dict()
+        state_dict = model.state_dict()
         torch.save(state_dict, path)
 
     def load_model(self, model: nn.Module, path: str, map_location: Any = None, strict: bool = True) -> None:
-        base_model = get_base_model(model)
+        unwrapped_model = self.unwrap_model(model)
         state_dict = torch.load(path, map_location=map_location)
-        base_model.load_state_dict(state_dict, strict=strict)
+        unwrapped_model.load_state_dict(state_dict, strict=strict)
 
     def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
         torch.save(optimizer.state_dict(), path)
diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh
index 2fa6c6052f8d..ac3a9b507864 100755
--- a/applications/Chat/examples/test_ci.sh
+++ b/applications/Chat/examples/test_ci.sh
@@ -121,6 +121,14 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_datas
          --rm_pretrain 'gpt2' \
          --rm_path ${BASE}/rm_ckpt_gpt.pt \
          --save_path ${BASE}/actor_checkpoint_prompts.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+         --strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+         --pretrain 'gpt2' --model gpt2 \
+         --rm_pretrain 'gpt2' \
+         --rm_path ${BASE}/rm_ckpt_gpt.pt \
+         --save_path ${BASE}/actor_checkpoint_prompts.pt
 rm -rf ${BASE}/rm_ckpt_gpt.pt
 
 rm -rf ${BASE}/actor_checkpoint_prompts.pt
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index 4c05a3431699..d93a5c94d8ea 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -6,6 +6,7 @@
 import torch
 import torch.distributed as dist
 from coati.models.gpt import GPTActor
+from coati.models.utils import calc_action_log_probs
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 
@@ -43,7 +44,8 @@ def run_test_checkpoint(strategy):
     def run_step():
         data = get_data(BATCH_SIZE)
         action_mask = torch.ones_like(data['attention_mask'], dtype=torch.bool)
-        action_log_probs = actor(data['input_ids'], action_mask.size(1), data['attention_mask'])
+        actor_output = actor(data['input_ids'], data['attention_mask'])
+        action_log_probs = calc_action_log_probs(actor_output, data['input_ids'], action_mask.size(1))
         loss = action_log_probs.sum()
         strategy.backward(loss, actor, actor_optim)
         strategy.optimizer_step(actor_optim)

From 2925f473991875681c62bb8f7d47a0b3bc1a3044 Mon Sep 17 00:00:00 2001
From: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Date: Tue, 13 Jun 2023 15:12:29 +0800
Subject: [PATCH 52/52] [evaluate] support gpt evaluation with reference
 (#3972)

Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
---
 applications/Chat/evaluate/README.md          |  11 +-
 .../Chat/evaluate/config/config_cn.json       |   2 +-
 .../Chat/evaluate/config/config_en.json       |   2 +-
 applications/Chat/evaluate/eval.py            |  11 +-
 applications/Chat/evaluate/evaluator.py       |  13 +-
 applications/Chat/evaluate/gpt_evaluate.py    | 197 ++++++++++++++++--
 .../evaluation_prompt_cn.json                 |   4 +-
 .../evaluation_prompt_en.json                 |  26 +--
 8 files changed, 219 insertions(+), 47 deletions(-)

diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md
index 077193b63ce0..e4a50b11d41f 100644
--- a/applications/Chat/evaluate/README.md
+++ b/applications/Chat/evaluate/README.md
@@ -17,6 +17,7 @@ The whole evaluation pipeline consists of three methods:
 1. `GPT Evaluation`: evaluates model predictions using GPT models.
    * Compare the performance of two different models (battle).
    * Rate the model according to pre-defined metrics using prompting design.
+   * Rate the model according to pre-defined metrics with an additional reference answer using prompting design.
 2. `Automatic Evaluation`: evaluates model predictions using automatic metrics.
 3. `UniEval`: evaluates model predictions using UniEval models(English only).
 
@@ -66,7 +67,7 @@ GPT evaluation uses GPT models to evaluate the prediction of different models an
 |       切题<br/>(Relevance)       | 切题(1-5)：答案内容是否切题，不答非所问，并且严格遵照题目要求。</br></br>Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic. | 1. 阅读题目，确定题目所问的问题是什么，以及需要回答哪些方面的问题。<br/> 2. 阅读答案，确认答案是否直接回答了题目所问的问题。<br/> 3. 检查答案是否严格遵照了题目的要求，包括答题方式、答题长度、答题格式等等。<br/> 4. 根据以上因素综合评估答案的切题程度，并给出一个1到5的分数，其中5表示答案非常切题，而1表示答案完全没有切题。</br></br>1. Read the question to determine what the question asks and what aspects of the question need to be answered.<br>2. Read the answers to make sure that they directly answer the question asked.<br>3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.<br>4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all. |
 |      创意性<br/>(Creativity)       | 创意性(1-5)：某些头脑风暴问题可能需要答案具有创意，提出新的思路。</br></br>Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas. | 1. 仔细阅读所提供的头脑风暴问题，确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验，判断所提供的答案是否可行。如果答案不可行，则创意性评分可能会受到影响。<br/> 3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠，但仍然可以被认为是有创意的，只要它提供了新的角度或方法来解决问题。<br/> 4. 根据答案的创意性，给出一个1到5的评分。如果答案缺乏创意，则应给出一个较低的评分。如果答案具有创意并提供了新的思路，应给出一个较高的评分。</br></br>1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.<br>2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected.<br>3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem.<br>4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given. |
 |     实用性<br/>(Practicality)      | 实用性(1-5)：某些头脑风暴问题可能需要答案提出实用的建议或解决方法。</br></br>Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions. | 1. 仔细阅读所提供的头脑风暴问题，确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验，判断所提供的答案是否可行。如果答案不可行，则实用性评分可能会受到影响。<br/> 3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好，但如果无法实现或应用，则实用性评分可能会受到影响。<br/> 4. 根据答案的实用性，给出一个1到5的评分。如果答案缺乏实用性，则应给出一个较低的评分。如果答案提出了实用的建议或解决方法，并且可以很好地解决问题，则应给出一个较高的评分。</br></br>1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.<br>2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected.<br>3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected.<br>4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given. |
-|      正确性<br/>(Correctness)      | 正确性(1-5)：答案应该符合常识、生活实际等等。 </br></br> Correctness (1-5): The answer should be in line with common sense, life experience, etc. | 1. 仔细阅读所提供的头脑风暴问题，确保你理解问题的要点和背景。<br/> 2. 根据你的知识和经验，判断所提供的答案是否可行。如果答案不可行，则正确性评分可能会受到影响。<br/> 3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处，则正确性评分可能会受到影响。<br/> 4. 根据答案的正确性，给出一个1到5的评分。如果答案存在明显的错误或不合理之处，则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等，则应给出一个较高的评分。</br></br>1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.<br>2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the correctness score may be affected.<br>3. Consider whether the information provided in the answer is correct, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the correctness score may be affected.<br>4. Give a score of 1 to 5 depending on the correctness of the answer. If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is correct, consistent with common sense, real life, etc. |
+|      正确性<br/>(Correctness)      | 正确性(1-5)：答案是否正确。</br></br> Correctness (1-5): whether the answer is correct or not. | 1. 仔细阅读题目，尝试自己回答该问题。<br/>2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的，则可以将正确性得分为5分。如果答案是部分正确的，则可以给予适当的得分，例如2分、3分或4分。如果答案完全不正确，则只得1分。<br/><br/>1. Read the question carefully and try to answer the question yourself. <br/>2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded. |
 |      自然<br/>(Naturalness)      | 自然(1-5)：答案是否自然，并且符合问题给定的身份。</br></br>Naturalness (1-5): whether the answer is natural and fits the identity given by the question. | 1. 阅读题目，确定题目提供的身份信息。<br/> 2. 检查答案内容是否符合题目给定的身份。<br/> 3. 根据以上因素，对该回答的自然性进行打分，分数从1到5，其中1表示不自然，5表示非常自然，并符合问题给定的身份。</br></br>1. Read the question and determine the identity information provided in the question.<br>2. Check whether the content of the answer matches the identity given in the question.<br>3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question. |
 |     参与感<br/>(Engagingness)      | 参与感(1-5)：答案是否对前面的对话内容做出了恰当的反应，是否理解对话的语境和背景。</br></br>Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation. | 1. 阅读题目，确定对话的语境和背景。<br/> 2. 检查答案是否充分理解对话的语境和背景，能否自然地融入到对话中而不显得突兀。<br/> 3. 根据以上因素，对该回答的参与感进行打分，分数从1到5，其中1表示没有参与感，5表示非常有参与感，并且恰当地理解了对话的语境和背景。</br></br>1. Read the questions to determine the context and background of the dialogue.<br>2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt.<br>3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation. |
 |    合理性<br/>(Reasonableness)     | 合理性(1-5)：答案是否能够与前面的对话内容形成逻辑上的衔接，是否符合常理，能否在这个上下文中合理存在。</br></br>Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context. | 1. 阅读题目，确定对话的主题以及问题期望的回答方向。<br/> 2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接，是否符合常理，能否在这个上下文中合理存在。<br/> 3. 根据以上因素，对该回答的合理性进行打分，分数从1到5，其中1表示不合理，5表示非常合理，并且能够与前面的对话内容形成逻辑上的衔接，并符合常理。</br></br>1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go.<br>2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context.<br>3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense. |
@@ -76,7 +77,7 @@ GPT evaluation uses GPT models to evaluate the prediction of different models an
 
 GPT models evaluate the quality of model predictions based on the given prompt words and gives a score between 1-5.
 
-> **NOTE 1:**  Even for the same metric, the details of its prompt words and CoT(Chain-of-Thought) can differ based on which category you want to evaluate. For example, prompt words for metric `correctness` showed here is "The answer should be in line with common sense, life experience, etc."(this is for category `brainstorming`), but for category `extraction`, prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT(Chain-of-Thought) in `prompt/evaluation_prompt`.
+> **NOTE 1:**  Even for the same metric, the details of its prompt words and CoT(Chain-of-Thought) can differ based on which category you want to evaluate. For example, the prompt words for metric `correctness` shown here are "Whether the answer is correct or not." (this is for category `classification`), but for category `extraction`, prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT(Chain-of-Thought) in `prompt/evaluation_prompt`.
 
 > **NOTE 2:** To add customized metrics, you can refer to [FAQ](#faq).
 
@@ -249,7 +250,7 @@ The following is an example of a Chinese config file. The configuration file can
     },
     "category": {
         "brainstorming": {
-            "GPT": ["relevance", "creativity", "practicality", "correctness"],
+            "GPT": ["relevance", "creativity", "practicality", "reasonableness"],
             "Metrics": ["Distinct"],
             "UniEval": ["summarization-fluency", "data2text-naturalness", "data2text-informativeness"]
         },
@@ -313,6 +314,8 @@ python eval.py \
     --openai_key "your openai key" \
 ```
 
+If you want GPT evaluation with reference, you can add an argument `--gpt_with_reference`.
+
 ## FAQ
 
 <details><summary><b>How can I add a new GPT evaluation metric?</b></summary>
@@ -354,7 +357,7 @@ if task == 'data2text':
 - [x] Add evaluation for English capability
 - [x] Support UniEval
 - [x] Support GPT-4 evaluation
-- [ ] Support GPT evaluation with reference in the prompt
+- [x] Support GPT evaluation with reference
 
 ## Citations
 
diff --git a/applications/Chat/evaluate/config/config_cn.json b/applications/Chat/evaluate/config/config_cn.json
index cf647f79bbf8..dffb66f6c3be 100644
--- a/applications/Chat/evaluate/config/config_cn.json
+++ b/applications/Chat/evaluate/config/config_cn.json
@@ -7,7 +7,7 @@
         "relevance",
         "creativity",
         "practicality",
-        "correctness"
+        "reasonableness"
       ],
       "Metrics": [
         "Distinct"
diff --git a/applications/Chat/evaluate/config/config_en.json b/applications/Chat/evaluate/config/config_en.json
index 014c61d93a54..5238bd19f67e 100644
--- a/applications/Chat/evaluate/config/config_en.json
+++ b/applications/Chat/evaluate/config/config_en.json
@@ -12,7 +12,7 @@
         "relevance",
         "creativity",
         "practicality",
-        "correctness"
+        "reasonableness"
       ],
       "Metrics": [
         "Distinct"
diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py
index 180ef438cc43..e3fe0e9e091b 100644
--- a/applications/Chat/evaluate/eval.py
+++ b/applications/Chat/evaluate/eval.py
@@ -38,9 +38,14 @@ def main(args):
             raise Exception(
                 "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
 
+        if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
+            raise Exception(
+                "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
+            )
+
         # initialize evaluator
         evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
-                              config["language"], config.get("path_for_UniEval", None))
+                              config["language"], config.get("path_for_UniEval", None), args.gpt_with_reference)
         if len(args.model_name_list) == 2:
             answers1 = jload(args.answer_file_list[0])
             answers2 = jload(args.answer_file_list[1])
@@ -92,6 +97,10 @@ def main(args):
                         default="gpt-3.5-turbo",
                         choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
                         help='which GPT model to use for evaluation')
+    parser.add_argument('--gpt_with_reference',
+                        default=False,
+                        action="store_true",
+                        help='whether to include reference answer in gpt evaluation')
     parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
     parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
     args = parser.parse_args()
diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py
index 6bb8cdb29431..3dd5fd6f2f23 100644
--- a/applications/Chat/evaluate/evaluator.py
+++ b/applications/Chat/evaluate/evaluator.py
@@ -16,13 +16,14 @@ class Evaluator(object):
     """
 
     def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
-                 gpt_model: str, language: str, path_for_UniEval: Dict[str, str]) -> None:
+                 gpt_model: str, language: str, path_for_UniEval: Dict[str, str], gpt_with_reference: bool) -> None:
         self.params = params
         self.battle_prompt = battle_prompt
         self.gpt_evaluation_prompt = gpt_evaluation_prompt
         self.gpt_model = gpt_model
         self.language = language
         self.path_for_UniEval = path_for_UniEval
+        self.gpt_with_reference = gpt_with_reference
         self.automatic_metric_stats = dict()
         self.unieval_metric_stats = dict()
         self.gpt_evaluation_results = dict()
@@ -157,8 +158,14 @@ def switch(metric, language):
                 print(f"No prompt for category {category}! Use prompt for category general now.")
                 prompt = self.gpt_evaluation_prompt["general"]
 
-            self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
-                                                                          category_metrics, category, self.gpt_model)
+            self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(
+                answers_per_category[category],
+                prompt,
+                category_metrics,
+                category,
+                self.gpt_model,
+                self.language,
+                references=targets_per_category[category] if self.gpt_with_reference else None)
 
     def save(self, path: str, model_name_list: List[str]) -> None:
         """
diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/Chat/evaluate/gpt_evaluate.py
index 6702526ac5e6..012f41ab0c41 100644
--- a/applications/Chat/evaluate/gpt_evaluate.py
+++ b/applications/Chat/evaluate/gpt_evaluate.py
@@ -13,6 +13,23 @@
 import tqdm
 from utils import jdump, jload
 
+ref_step_template = {
+    "en":
+        "Now please compare the answer with the {adjective} answer, determine whether the answer is able to achieve the same level of {metric}.\n\n",
+    "cn":
+        "请比较答案与上面的{adjective}答案，确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n"
+}
+
+ref_answer_template_general = {
+    "en": "\nAn example answer with good quality is as follows:\n\n{answer}\n\n",
+    "cn": "\n一个优质的示例答案如下：\n\n{answer}\n\n"
+}
+
+ref_answer_template_correctness = {
+    "en": "\nA correct answer is as follows:\n\n{answer}\n\n",
+    "cn": "\n标准答案如下：\n\n{answer}\n\n"
+}
+
 
 def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
     """
@@ -233,18 +250,125 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa
     print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
 
 
+def reference_template(metric: str, language: str, reference: Dict[str, Any]) -> str:
+    """
+    Build the prompt text (reference answer + comparison step) for GPT evaluation with reference.
+
+    The wording is looked up per language in the module-level template dicts.
+
+    Args:
+        metric: name of the metric being scored; "correctness" (case-insensitive) selects the "correct answer" wording.
+        language: key into the template dicts ("en" or "cn"); "en" selects English wording, otherwise Chinese.
+        reference: the reference sample; its "target" is used as the answer, or its "output" when "target" is empty.
+
+    Returns:
+        The reference-answer text followed by the comparison step and the scoring cue.
+    """
+
+    step_to_add = ref_step_template[language]
+
+    for_the_given_answer = "{metric} (1-5) (directly give the score for the given answer):" if language == "en" else "{metric} (1-5) (直接对给定答案打分)"
+
+    # adjective is used to describe the word "answer" in the prompt.
+    adjective = "example" if language == "en" else "示例"
+    answer_to_add = ref_answer_template_general[language]
+
+    # Only for correctness do we present the reference as "a correct answer", so the adjective becomes "correct".
+    # For every other metric the reference is presented as "an example answer with good quality".
+    if metric.lower() == "correctness":
+        adjective = "correct" if language == "en" else "标准"
+        answer_to_add = ref_answer_template_correctness[language]
+
+    answer_to_add = answer_to_add.format(answer=reference["target"] if reference["target"] else reference["output"])
+    step_to_add = step_to_add.format(metric=metric.lower(),
+                                     adjective=adjective) + for_the_given_answer.format(metric=metric)
+
+    return answer_to_add + step_to_add
+
+
+def fill_in_message(role: str, content: str) -> Dict[str, str]:
+    """
+    Wrap a role and a text into one chat-completion message dict.
+
+    Args:
+        role: the role of the author of this message (callers in this file pass "user" or "assistant").
+        content: the contents of the message.
+
+    Returns:
+        A dict with the keys "role" and "content".
+    """
+
+    return {"role": role, "content": content}
+
+
+def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens: int = 1, turns=2) -> Dict[str, Any]:
+    """
+    Do multi-turn chat completion, sending one request per turn.
+
+    When turns == 1, it is a one-turn conversation for normal GPT evaluation.
+    When turns == 2, it is a two-turn conversation which is used for GPT evaluation with reference answers.
+
+    Args:
+        user_messages: the user message for each turn, in order; must contain exactly `turns` items.
+        model: the model used to evaluate answers.
+        max_tokens: the maximum number of tokens to generate for each turn's completion.
+        turns: the number of turns for conversation.
+
+    Returns:
+        The API response of the last turn; raises Exception if len(user_messages) != turns.
+    """
+
+    if len(user_messages) != turns:
+        raise Exception("The length of user messages should be equal to the turn number!")
+
+    assistant_responses = []
+
+    for i in range(turns):
+        messages_to_send = []
+
+        for j in range(i):
+            messages_to_send.append(fill_in_message("user", user_messages[j]))
+            messages_to_send.append(
+                fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"]))
+
+        # Every earlier user/assistant pair is replayed above as history, then the current user message is added,
+        # so each request holds one more user message than assistant messages and the API is expected to respond.
+        messages_to_send.append(fill_in_message("user", user_messages[i]))
+
+        response = openai.ChatCompletion.create(
+            model=model,
+            messages=messages_to_send,
+            temperature=0,
+            max_tokens=max_tokens,
+        )
+
+        # Avoid exceeding rate limits.
+        # You can comment out this line if your request doesn't contain many tokens.
+        time.sleep(1)
+
+        assistant_responses.append(response)
+
+    return assistant_responses[-1]
+
+
 def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
                                         inst: Dict[str, Any],
                                         metrics: List[str],
+                                        language: str,
+                                        reference: Dict[str, Any] = None,
                                         model: str = "gpt-3.5-turbo",
                                         max_tokens: int = 2048) -> Dict[str, Any]:
     """
     Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
 
+    Temperature is set to 0 to make the model more deterministic.
+
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
         inst: the instruction that is needed to be evaluated.
         metrics: the metrics for evaluation.
+        language: language used to change the CoT(add one more step about comparing the given answer and reference) if reference is not None.
+        reference: the reference answer.
         model: the model used to evaluate answers.
         max_tokens: the maximum number of tokens to generate in the chat completion.
 
@@ -254,7 +378,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
 
     MAX_API_RETRY = 3
 
-    question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
+    question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
     answer = inst["output"]
     inst["evaluation"] = {}
 
@@ -265,28 +389,34 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
             )
         for i in range(MAX_API_RETRY):
             try:
-                response = openai.ChatCompletion.create(
-                    model=model,
-                    messages=[
-                        {
-                            "role":
-                                "user",
-                            "content":
-                                prompt["prompt"].format(
-                                    question=question,
-                                    answer=answer,
-                                    metric=prompt["metrics"][metric],
-                                    steps=prompt["CoT"][metric],
-                                ),
-                        },
-                    ],
-                    temperature=0,
-                    max_tokens=max_tokens,
+                prompt_reference = "" if reference is None else reference_template(metric, language, reference)
+
+                prompt_1st_round = prompt["prompt"].format(
+                    question=question,
+                    answer=answer,
+                    metric=prompt["metrics"][metric],
+                    steps=prompt["CoT"][metric],
                 )
+
+                if prompt_reference:
+                    # Do a 2-round conversation
+                    response = multiturn_chat_completion([prompt_1st_round, prompt_reference],
+                                                         model,
+                                                         max_tokens=max_tokens,
+                                                         turns=2)
+                else:
+                    response = multiturn_chat_completion([prompt_1st_round], model, max_tokens=max_tokens, turns=1)
+
                 inst["evaluation"][metric] = {
                     "response": response["choices"][0]["message"]["content"],
                     "logprobs": None,
                 }
+
+                # Prevent exceeding rate limits, since multiple workers send requests concurrently.
+                # Note that this slows down the evaluation process.
+                # You can comment out this line if your requests don't contain many tokens.
+                time.sleep(len(metrics) * 0.5)
+
                 break
             except Exception as e:
                 print(e)
@@ -305,6 +435,8 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
     Use completion model(text-davinci-003) to evaluate one model answer.
     Only completion models can return log probabilities.
 
+    Temperature is set to 0 to make the model more deterministic.
+
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
         inst: the instruction that is needed to be evaluated.
@@ -317,7 +449,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
 
     MAX_API_RETRY = 3
 
-    question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
+    question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
     answer = inst["output"]
     inst["evaluation"] = {}
 
@@ -344,6 +476,12 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
                     "response": response["choices"][0]["text"],
                     "logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
                 }
+
+                # Prevent exceeding rate limits, since multiple workers send requests concurrently.
+                # Note that this slows down the evaluation process.
+                # You can comment out this line if your requests don't contain many tokens.
+                time.sleep(len(metrics) * 0.5)
+
                 break
             except Exception as e:
                 print(e)
@@ -354,7 +492,13 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
     return inst
 
 
-def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
+def evaluate(answers: List[Dict],
+             prompt: Dict[str, Any],
+             metrics: List[str],
+             category: str,
+             model: str,
+             language: str,
+             references: List[Dict] = None) -> List[Dict]:
     """
     Use GPT models to evaluate model answers and save evaluation results.
 
@@ -364,6 +508,8 @@ def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], ca
         metrics: metrics for GPT evaluation.
         category: the category of the model answers for evaluation.
         model: the specific GPT model used to evaluate answers.
+        language: language used in GPT evaluation
+        references: references for GPT evaluation
 
     Returns:
         Evaluations of the given answers.
@@ -378,12 +524,19 @@ def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], ca
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
         futures = []
-        for inst in answers:
+        for idx, inst in enumerate(answers):
             # Completion models can return log probabilities.
             if model == "text-davinci-003":
                 future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
             else:
-                future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
+                future = executor.submit(get_gpt_evaluation_without_logprobs,
+                                         prompt,
+                                         inst,
+                                         metrics,
+                                         language,
+                                         reference=None if references is None else references[idx],
+                                         model=model,
+                                         max_tokens=1)
 
             futures.append(future)
 
diff --git a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json
index ee6caae32091..783f453cafdb 100644
--- a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json
+++ b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json
@@ -7,14 +7,14 @@
       "relevance": "切题(1-5)：答案内容是否切题，不答非所问，并且严格遵照题目要求。",
       "creativity": "创意性(1-5)：某些头脑风暴问题可能需要答案具有创意，提出新的思路。",
       "practicality": "实用性(1-5)：某些头脑风暴问题可能需要答案提出实用的建议或解决方法。",
-      "correctness": "正确性(1-5)：答案应该符合常识、生活实际等等。"
+      "reasonableness": "合理性(1-5)：答案应该符合常识、生活实际等等。"
     },
     "CoT": {
       "language organization": "1. 阅读答案，并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性，能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关，并且能够传达清晰的信息。\n4. 检查答案是否连贯，是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式，使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织，并给出一个1到5的分数，其中5表示语言组织非常好，而1表示语言组织非常差。\n\n语言组织：",
       "relevance": "1. 阅读题目，确定题目所问的问题是什么，以及需要回答哪些方面的问题。\n2. 阅读答案，确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求，包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度，并给出一个1到5的分数，其中5表示答案非常切题，而1表示答案完全没有切题。\n\n切题：",
       "creativity": "1. 仔细阅读所提供的头脑风暴问题，确保你理解问题的要点和背景。\n2. 根据你的知识和经验，判断所提供的答案是否可行。如果答案不可行，则创意性评分可能会受到影响。\n3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠，但仍然可以被认为是有创意的，只要它提供了新的角度或方法来解决问题。\n4. 根据答案的创意性，给出一个1到5的评分。如果答案缺乏创意，则应给出一个较低的评分。如果答案具有创意并提供了新的思路，应给出一个较高的评分。\n\n创意性：",
       "practicality": "1. 仔细阅读所提供的头脑风暴问题，确保你理解问题的要点和背景。\n2. 根据你的知识和经验，判断所提供的答案是否可行。如果答案不可行，则实用性评分可能会受到影响。\n3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好，但如果无法实现或应用，则实用性评分可能会受到影响。\n4. 根据答案的实用性，给出一个1到5的评分。如果答案缺乏实用性，则应给出一个较低的评分。如果答案提出了实用的建议或解决方法，并且可以很好地解决问题，则应给出一个较高的评分。\n\n实用性：",
-      "correctness": "1. 仔细阅读所提供的头脑风暴问题，确保你理解问题的要点和背景。\n2. 根据你的知识和经验，判断所提供的答案是否可行。如果答案不可行，则正确性评分可能会受到影响。\n3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处，则正确性评分可能会受到影响。\n4. 根据答案的正确性，给出一个1到5的评分。如果答案存在明显的错误或不合理之处，则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等，则应给出一个较高的评分。\n\n正确性："
+      "reasonableness": "1. 仔细阅读所提供的头脑风暴问题，确保你理解问题的要点和背景。\n2. 根据你的知识和经验，判断所提供的答案是否可行。如果答案不可行，则合理性评分可能会受到影响。\n3. 考虑答案中所提供的信息是否合理、符合常识、生活实际等等。如果答案中存在明显的不合理之处，则合理性评分可能会受到影响。\n4. 根据答案的合理性，给出一个1到5的评分。如果答案存在明显的不合理之处，则应给出一个较低的评分。如果答案合理、符合常识、生活实际等等，则应给出一个较高的评分。\n\n合理性："
     },
     "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下：\n\n{question}\n\n答案如下：\n\n{answer}\n\n评分的指标如下：\n\n{metric}\n\n请你遵照以下的评分步骤：\n\n{steps}"
   },
diff --git a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json
index 0b2053746af2..2285b639427c 100644
--- a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json
+++ b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json
@@ -7,14 +7,14 @@
       "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
       "creativity": "Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas.",
       "practicality": "Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions.",
-      "correctness": "Correctness (1-5): The answer should be in line with common sense, life experience, etc."
+      "reasonableness": "Reasonableness (1-5): The answer should be in line with common sense, life experience, etc."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "creativity": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected.\n3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem.\n4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given.\n\nCreativity:",
       "practicality": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected.\n3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected.\n4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given.\n\nPracticality:",
-      "correctness": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the correctness score may be affected.\n3. Consider whether the information provided in the answer is correct, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the correctness score may be affected.\n4. Give a score of 1 to 5 depending on the correctness of the answer. If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is correct, consistent with common sense, real life, etc.\n\nCorrectness:"
+      "reasonableness": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the reasonableness score may be affected.\n3. Consider whether the information provided in the answer is reasonable, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the reasonableness score may be affected.\n4. Give a score of 1 to 5 depending on the reasonableness of the answer. If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is reasonable, consistent with common sense, real life, etc.\n\nReasonableness:"
     },
     "prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
   },
@@ -29,7 +29,7 @@
       "reasonableness": "Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "naturalness": "1. Read the question and determine the identity information provided in the question.\n2. Check whether the content of the answer matches the identity given in the question.\n3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question.\n\nNaturalness:",
       "engagingness": "1. Read the questions to determine the context and background of the dialogue.\n2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt.\n3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation.\n\nEngagingness:",
@@ -46,7 +46,7 @@
       "correctness": "Correctness (1-5): whether the answer is correct or not."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
     },
@@ -61,7 +61,7 @@
       "correctness": "Correctness (1-5): whether the answer is correct or not."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "correctness": "1. Read the question carefully and try to answer the question by yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
     },
@@ -76,7 +76,7 @@
       "correctness": "correctness (1-5): Answers should extract the required information accurately and should not contain any incorrect or misleading information."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "correctness": "1. Read the questions carefully and identify the information that needs to be extracted from the material.\n2. Read the answer carefully and make sure it covers all the information that needs to be extracted.\n3. Use the material provided to verify the correctness of the response. If the response is inaccurate or contains incorrect or misleading information, a high score cannot be given.\n4. Check that the answer contains all the information required to be extracted and do not leave out any important details.\n5. Give a score between 1 and 5 based on the correctness and completeness of the response, with a score of 5 indicating a very accurate and complete response and a score of 1 indicating that the response barely extracts the required information.\n\nCorrectness:"
     },
@@ -91,7 +91,7 @@
       "diversity": "Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "diversity": "1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response.\n2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid.\n3. Check the creativity and imagination of the response to see if the response is engaging to read on.\n4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic.\n5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a raw response or a question that is off-topic.\n\nDiversity:"
     },
@@ -106,7 +106,7 @@
       "correctness": "Correctness (1-5): whether the answer is correct or not."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
     },
@@ -121,7 +121,7 @@
       "correctness": "Correctness (1-5): whether the answer is correct or not."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
     },
@@ -137,7 +137,7 @@
       "creativity": "Creativity (1-5): The answers to the role-play questions need to be somewhat creative, but at the same time they need to adhere to the setting of the role."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "fidelity": "1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality.\n2. Read the question's request and confirm the details that need to be taken into account when answering the request.\n3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role.\n4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request.\n\nFidelity:",
       "creativity": "1. Read the question carefully to understand how the character is set up and represented in the question, including career, background, perspective, and personality.\n2. Evaluate whether the answer has unique ideas and suggestions that bring new ideas and insights to the questioner.\n3. Compare the creativity in the response to the setting of the persona and assess whether the response adheres to the setting and essential characteristics of the persona.\n4. Evaluate the quality of the responses in general and combine the results of the above assessment to give a creativity score ranging from 1 to 5, where a score of 1 indicates that the response lacks creativity and a score of 5 indicates that the response has unique ideas and suggestions and is able to adhere to the set-up of the persona.\n\nCreativity:"
@@ -154,7 +154,7 @@
       "conciseness": "Conciseness (1-5): answers should be concise and without redundant content."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "correctness": "1. Read the material given in the question carefully to understand its content and main points.\n2. Assess whether the answer accurately summarizes the key points of the source material.\n3. assess whether the response contains all the key information in the source material.\n4. Based on the above steps, give a score of 1-5, where 1 means that the response does not accurately summarize the main points of the material and 5 means that the response completely accurately summarizes the main points of the material.\n\nCorrectness:",
       "conciseness": "1. Read the title and extract the main points of the material.\n2. Read the summary and note the main ideas and messages in it.\n3. Assess the length of the summary. A concise summary should usually convey key information within a few sentences or paragraphs, rather than lengthy paragraphs or essays.\n4. Check that the summary does not contain information that is not relevant to the main ideas or that is redundant.\n5. Make sure that the summary covers the key information in the material and that no important details have been omitted.\n6. Rate the summary on a scale of 1-5, where 5 means the summary is concise and free of redundancy, and 1 means the summary is lengthy or contains unnecessary information that is difficult to understand or remember. Based on your judgment, assign the appropriate score.\n\nConciseness:"
@@ -170,7 +170,7 @@
       "correctness": "Correctness (1-5): whether the answer is correct or not."
     },
     "CoT": {
-      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:",
+      "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
       "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
       "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
     },