diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index 407f630e2469..47c80fc9a9fe 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -32,14 +32,14 @@ jobs:
       - name: Install ColossalAI and ChatGPT
         run: |
-          pip install -v .
-          cd applications/ChatGPT
+          pip install -e .
+          cd applications/Chat
           pip install -v .
           pip install -r requirements-test.txt
       - name: Execute Unit Testing
         run: |
-          cd applications/ChatGPT
+          cd applications/Chat
           rm -rf ~/.cache/colossalai
           pytest tests/
         env:
diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index dea562c4d2ad..2a9c916d45c9 100644
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -243,6 +243,7 @@ from coati.trainer import SFTTrainer
 model = LlamaLM(pretrained=args.pretrain)
 tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
 
+(model, optim) = strategy.prepare((model, optim))
 trainer = SFTTrainer(model=model,
                      strategy=strategy,
                      optim=optim,
@@ -250,11 +251,15 @@ trainer = SFTTrainer(model=model,
                      eval_dataloader=eval_dataloader,
                      batch_size=args.batch_size,
                      max_epochs=args.max_epochs,
-                     accimulation_steps = args.accimulation_steps
+                     accumulation_steps = args.accumulation_steps
                      )
 
 trainer.fit()
 
-trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer)
+# this saves in pytorch format
+strategy.save_model(model, args.save_path, only_rank0=True)
+
+# this saves in HF format. ColossalAI strategy with stage-3 doesn't support this method
+strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=tokenizer)
 ```
 
@@ -263,7 +268,7 @@ trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer)
 
 Here are some examples that can allow you to train a 7B model on a single or multiple consumer-grade GPUs.
 
-If you only have a single 24G GPU, you can use the following script. `batch_size` and `lora_rank` are the most important parameters to successfully train the model.
+If you only have a single 24G GPU, you can use the following script. `batch_size`, `lora_rank` and `grad_checkpoint` are the most important parameters to successfully train the model.
 ```
 torchrun --standalone --nproc_per_node=1 train_sft.py \
     --pretrain "/path/to/LLaMa-7B/" \
@@ -273,11 +278,12 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 1 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
     --lora_rank 16 \
+    --grad_checkpoint
 ```
 
 `colossalai_gemini` strategy can enable a single 24G GPU to train the whole model without using LoRA if you have sufficient CPU memory. You can use the following script.
@@ -290,10 +296,11 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 1 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
+    --grad_checkpoint
 ```
 
 If you have 4x32 GB GPUs, you can even train the whole 7B model using our `colossalai_zero2_cpu` strategy! The script is given as follows.
@@ -306,10 +313,11 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 1 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
+    --grad_checkpoint
 ```
 
diff --git a/applications/Chat/benchmarks/README.md b/applications/Chat/benchmarks/README.md
index b4e28ba1d764..bc8ad8ba9816 100644
--- a/applications/Chat/benchmarks/README.md
+++ b/applications/Chat/benchmarks/README.md
@@ -1,70 +1,5 @@
 # Benchmarks
 
-## Benchmark GPT on dummy prompt data
-
-We provide various GPT models (string in parentheses is the corresponding model name used in this script):
-
-- GPT2-S (s)
-- GPT2-M (m)
-- GPT2-L (l)
-- GPT2-XL (xl)
-- GPT2-4B (4b)
-- GPT2-6B (6b)
-- GPT2-8B (8b)
-- GPT2-10B (10b)
-- GPT2-12B (12b)
-- GPT2-15B (15b)
-- GPT2-18B (18b)
-- GPT2-20B (20b)
-- GPT2-24B (24b)
-- GPT2-28B (28b)
-- GPT2-32B (32b)
-- GPT2-36B (36b)
-- GPT2-40B (40b)
-- GPT3 (175b)
-
-We also provide various training strategies:
-
-- ddp: torch DDP
-- colossalai_gemini: ColossalAI GeminiDDP with `placement_policy="cuda"`, like zero3
-- colossalai_gemini_cpu: ColossalAI GeminiDDP with `placement_policy="cpu"`, like zero3-offload
-- colossalai_zero2: ColossalAI zero2
-- colossalai_zero2_cpu: ColossalAI zero2-offload
-- colossalai_zero1: ColossalAI zero1
-- colossalai_zero1_cpu: ColossalAI zero1-offload
-
-We only support `torchrun` to launch now. E.g.
-
-```shell
-# run GPT2-S on single-node single-GPU with min batch size
-torchrun --standalone --nproc_per_node 1 benchmark_gpt_dummy.py --model s --strategy ddp --experience_batch_size 1 --train_batch_size 1
-# run GPT2-XL on single-node 4-GPU
-torchrun --standalone --nproc_per_node 4 benchmark_gpt_dummy.py --model xl --strategy colossalai_zero2
-# run GPT3 on 8-node 8-GPU
-torchrun --nnodes 8 --nproc_per_node 8 \
-    --rdzv_id=$JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR \
-    benchmark_gpt_dummy.py --model 175b --strategy colossalai_gemini
-```
-
-> ⚠ Batch sizes in CLI args and outputed throughput/TFLOPS are all values of per GPU.
-
-In this benchmark, we assume the model architectures/sizes of actor and critic are the same for simplicity. But in practice, to reduce training cost, we may use a smaller critic.
-
-We also provide a simple shell script to run a set of benchmarks. But it only supports benchmark on single node. However, it's easy to run on multi-nodes by modifying launch command in this script.
-
-Usage:
-
-```shell
-# run for GPUS=(1 2 4 8) x strategy=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu") x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
-./benchmark_gpt_dummy.sh
-# run for GPUS=2 x strategy=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu") x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
-./benchmark_gpt_dummy.sh 2
-# run for GPUS=2 x strategy=ddp x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
-./benchmark_gpt_dummy.sh 2 ddp
-# run for GPUS=2 x strategy=ddp x model=l x batch_size=(1 2 4 8 16 32 64 128 256)
-./benchmark_gpt_dummy.sh 2 ddp l
-```
-
 ## Benchmark OPT with LoRA on dummy prompt data
 
 We provide various OPT models (string in parentheses is the corresponding model name used in this script):
@@ -80,15 +15,21 @@ We provide various OPT models (string in parentheses is the corresponding model
 - OPT-10B (10b)
 - OPT-13B (13b)
 
+We also provide various training strategies:
+
+- ddp: torch DDP
+- colossalai_gemini: ColossalAI GeminiDDP with `placement_policy="cuda"`, like zero3
+- colossalai_gemini_cpu: ColossalAI GeminiDDP with `placement_policy="cpu"`, like zero3-offload
+- colossalai_zero2: ColossalAI zero2
+- colossalai_zero2_cpu: ColossalAI zero2-offload
+- colossalai_zero1: ColossalAI zero1
+- colossalai_zero1_cpu: ColossalAI zero1-offload
+
 We only support `torchrun` to launch now. E.g.
 
 ```shell
 # run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
-torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py --model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
-# run OPT-350M with lora_rank=4 on single-node 4-GPU
-torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py --model 350m --strategy colossalai_zero2 --lora_rank 4
+torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py --model 125m --critic_model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
+# run Actor (OPT-1.3B) and Critic (OPT-350M) with lora_rank=4 on single-node 4-GPU
+torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py --model 1.3b --critic_model 350m --strategy colossalai_zero2 --lora_rank 4
 ```
-
-> ⚠ Batch sizes in CLI args and outputed throughput/TFLOPS are all values of per GPU.
-
-In this benchmark, we assume the model architectures/sizes of actor and critic are the same for simplicity. But in practice, to reduce training cost, we may use a smaller critic.
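The strategy names listed in the benchmark README map to strategy objects inside the benchmark scripts. Below is a minimal sketch of that mapping: the `build_strategy` helper is illustrative (not part of the repo), the constructor arguments are copied from the dispatch in the removed `benchmark_gpt_dummy.py`, and the sketch assumes the `coati` package from this application is installed; the dispatch in `benchmark_opt_lora_dummy.py` may differ in details.

```python
# Sketch only: turn a --strategy flag into a strategy object, mirroring the
# dispatch at the top of main() in benchmark_gpt_dummy.py (deleted below).
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy


def build_strategy(name: str):
    if name == 'ddp':
        return DDPStrategy()
    if name == 'colossalai_gemini':
        return ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
    if name == 'colossalai_gemini_cpu':
        return ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
    if name == 'colossalai_zero2':
        return ColossalAIStrategy(stage=2, placement_policy='cuda')
    if name == 'colossalai_zero2_cpu':
        return ColossalAIStrategy(stage=2, placement_policy='cpu')
    if name == 'colossalai_zero1':
        return ColossalAIStrategy(stage=1, placement_policy='cuda')
    if name == 'colossalai_zero1_cpu':
        return ColossalAIStrategy(stage=1, placement_policy='cpu')
    raise ValueError(f'Unsupported strategy "{name}"')
```

In the benchmark scripts this dispatch runs before any model is built, so model initialization can happen inside `strategy.model_init_context()`.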
diff --git a/applications/Chat/benchmarks/benchmark_gpt_dummy.py b/applications/Chat/benchmarks/benchmark_gpt_dummy.py deleted file mode 100644 index e41ef239d378..000000000000 --- a/applications/Chat/benchmarks/benchmark_gpt_dummy.py +++ /dev/null @@ -1,186 +0,0 @@ -import argparse -from copy import deepcopy - -import torch -import torch.distributed as dist -import torch.nn as nn -from coati.models.base import RewardModel -from coati.models.gpt import GPTActor, GPTCritic -from coati.trainer import PPOTrainer -from coati.trainer.callbacks import PerformanceEvaluator -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy -from torch.optim import Adam -from transformers.models.gpt2.configuration_gpt2 import GPT2Config -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - - -def get_model_numel(model: nn.Module, strategy: Strategy) -> int: - numel = sum(p.numel() for p in model.parameters()) - if isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3 and strategy.shard_init: - numel *= dist.get_world_size() - return numel - - -def preprocess_batch(samples) -> dict: - input_ids = torch.stack(samples) - attention_mask = torch.ones_like(input_ids, dtype=torch.long) - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -def print_rank_0(*args, **kwargs) -> None: - if dist.get_rank() == 0: - print(*args, **kwargs) - - -def print_model_numel(model_dict: dict) -> None: - B = 1024**3 - M = 1024**2 - K = 1024 - outputs = '' - for name, numel in model_dict.items(): - outputs += f'{name}: ' - if numel >= B: - outputs += f'{numel / B:.2f} B\n' - elif numel >= M: - outputs += f'{numel / M:.2f} M\n' - elif numel >= K: - outputs += f'{numel / K:.2f} K\n' - else: - outputs += f'{numel}\n' - print_rank_0(outputs) - - -def get_gpt_config(model_name: str) -> GPT2Config: - model_map = { - 's': GPT2Config(), - 'm': GPT2Config(n_embd=1024, n_layer=24, n_head=16), - 'l': GPT2Config(n_embd=1280, n_layer=36, n_head=20), - 'xl': GPT2Config(n_embd=1600, n_layer=48, n_head=25), - '2b': GPT2Config(n_embd=2048, n_layer=40, n_head=16), - '4b': GPT2Config(n_embd=2304, n_layer=64, n_head=16), - '6b': GPT2Config(n_embd=4096, n_layer=30, n_head=16), - '8b': GPT2Config(n_embd=4096, n_layer=40, n_head=16), - '10b': GPT2Config(n_embd=4096, n_layer=50, n_head=16), - '12b': GPT2Config(n_embd=4096, n_layer=60, n_head=16), - '15b': GPT2Config(n_embd=4096, n_layer=78, n_head=16), - '18b': GPT2Config(n_embd=4096, n_layer=90, n_head=16), - '20b': GPT2Config(n_embd=8192, n_layer=25, n_head=16), - '24b': GPT2Config(n_embd=8192, n_layer=30, n_head=16), - '28b': GPT2Config(n_embd=8192, n_layer=35, n_head=16), - '32b': GPT2Config(n_embd=8192, n_layer=40, n_head=16), - '36b': GPT2Config(n_embd=8192, n_layer=45, n_head=16), - '40b': GPT2Config(n_embd=8192, n_layer=50, n_head=16), - '175b': GPT2Config(n_positions=2048, n_embd=12288, n_layer=96, n_head=96), - } - try: - return model_map[model_name] - except KeyError: - raise ValueError(f'Unknown model "{model_name}"') - - -def main(args): - if args.strategy == 'ddp': - strategy = DDPStrategy() - elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) - elif args.strategy == 'colossalai_gemini_cpu': - strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) - elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') - elif 
args.strategy == 'colossalai_zero2_cpu': - strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') - elif args.strategy == 'colossalai_zero1': - strategy = ColossalAIStrategy(stage=1, placement_policy='cuda') - elif args.strategy == 'colossalai_zero1_cpu': - strategy = ColossalAIStrategy(stage=1, placement_policy='cpu') - else: - raise ValueError(f'Unsupported strategy "{args.strategy}"') - - model_config = get_gpt_config(args.model) - - with strategy.model_init_context(): - actor = GPTActor(config=model_config).cuda() - critic = GPTCritic(config=model_config).cuda() - - initial_model = deepcopy(actor).cuda() - reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda() - - actor_numel = get_model_numel(actor, strategy) - critic_numel = get_model_numel(critic, strategy) - initial_model_numel = get_model_numel(initial_model, strategy) - reward_model_numel = get_model_numel(reward_model, strategy) - print_model_numel({ - 'Actor': actor_numel, - 'Critic': critic_numel, - 'Initial model': initial_model_numel, - 'Reward model': reward_model_numel - }) - performance_evaluator = PerformanceEvaluator(actor_numel, - critic_numel, - initial_model_numel, - reward_model_numel, - enable_grad_checkpoint=False, - ignore_episodes=1) - - if args.strategy.startswith('colossalai'): - actor_optim = HybridAdam(actor.parameters(), lr=5e-6) - critic_optim = HybridAdam(critic.parameters(), lr=5e-6) - else: - actor_optim = Adam(actor.parameters(), lr=5e-6) - critic_optim = Adam(critic.parameters(), lr=5e-6) - - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - - (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare( - (actor, actor_optim), (critic, critic_optim), reward_model, initial_model) - - trainer = PPOTrainer(strategy, - actor, - critic, - reward_model, - initial_model, - actor_optim, - critic_optim, - max_epochs=args.max_epochs, - train_batch_size=args.train_batch_size, - experience_batch_size=args.experience_batch_size, - tokenizer=preprocess_batch, - max_length=512, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - callbacks=[performance_evaluator]) - - random_prompts = torch.randint(tokenizer.vocab_size, (1000, 1, 400), device=torch.cuda.current_device()) - random_attention_mask = torch.randint(1, (1000, 1, 400), device=torch.cuda.current_device()).to(torch.bool) - random_pretrain = [{'input_ids':random_prompts[i], 'labels':random_prompts[i], 'attention_mask':random_attention_mask[i]} for i in range(1000)] - trainer.fit(random_prompts, random_pretrain, - num_episodes=args.num_episodes, - max_timesteps=args.max_timesteps, - update_timesteps=args.update_timesteps) - - print_rank_0(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--model', default='s') - parser.add_argument('--strategy', - choices=[ - 'ddp', 'colossalai_gemini', 'colossalai_gemini_cpu', 'colossalai_zero2', - 'colossalai_zero2_cpu', 'colossalai_zero1', 'colossalai_zero1_cpu' - ], - default='ddp') - parser.add_argument('--num_episodes', type=int, default=3) - parser.add_argument('--max_timesteps', type=int, default=8) - parser.add_argument('--update_timesteps', type=int, default=8) - parser.add_argument('--max_epochs', type=int, default=3) - parser.add_argument('--train_batch_size', type=int, default=8) - 
parser.add_argument('--experience_batch_size', type=int, default=8) - args = parser.parse_args() - main(args) diff --git a/applications/Chat/benchmarks/benchmark_gpt_dummy.sh b/applications/Chat/benchmarks/benchmark_gpt_dummy.sh deleted file mode 100755 index d70f8872570a..000000000000 --- a/applications/Chat/benchmarks/benchmark_gpt_dummy.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash -# Usage: $0 -set -xu - -BASE=$(realpath $(dirname $0)) - - -PY_SCRIPT=${BASE}/benchmark_gpt_dummy.py -export OMP_NUM_THREADS=8 - -function tune_batch_size() { - # we found when experience batch size is equal to train batch size - # peak CUDA memory usage of making experience phase is less than or equal to that of training phase - # thus, experience batch size can be larger than or equal to train batch size - for bs in 1 2 4 8 16 32 64 128 256; do - torchrun --standalone --nproc_per_node $1 $PY_SCRIPT --model $2 --strategy $3 --experience_batch_size $bs --train_batch_size $bs || return 1 - done -} - -if [ $# -eq 0 ]; then - num_gpus=(1 2 4 8) -else - num_gpus=($1) -fi - -if [ $# -le 1 ]; then - strategies=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu") -else - strategies=($2) -fi - -if [ $# -le 2 ]; then - models=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") -else - models=($3) -fi - - -for num_gpu in ${num_gpus[@]}; do - for strategy in ${strategies[@]}; do - for model in ${models[@]}; do - tune_batch_size $num_gpu $model $strategy || break - done - done -done diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py index c79435ec63c5..a991e8558aee 100644 --- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py +++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py @@ -10,6 +10,7 @@ from coati.trainer.callbacks import PerformanceEvaluator from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy from torch.optim import Adam +from torch.utils.data import DataLoader from transformers import AutoTokenizer from transformers.models.opt.configuration_opt import OPTConfig @@ -92,13 +93,13 @@ def main(args): torch.cuda.set_per_process_memory_fraction(args.cuda_mem_frac) model_config = get_gpt_config(args.model) - + critic_config = get_gpt_config(args.critic_model) with strategy.model_init_context(): actor = OPTActor(config=model_config, lora_rank=args.lora_rank).cuda() - critic = OPTCritic(config=model_config, lora_rank=args.lora_rank).cuda() + critic = OPTCritic(config=critic_config, lora_rank=args.lora_rank).cuda() - initial_model = deepcopy(actor).cuda() - reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda() + initial_model = deepcopy(actor).cuda().half() + reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda().half() actor_numel = get_model_numel(actor, strategy) critic_numel = get_model_numel(critic, strategy) @@ -127,8 +128,7 @@ def main(args): tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m') tokenizer.pad_token = tokenizer.eos_token - (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare( - (actor, actor_optim), (critic, critic_optim), reward_model, initial_model) + (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim)) trainer = PPOTrainer(strategy, actor, @@ -137,22 +137,27 @@ def main(args): initial_model, actor_optim, critic_optim, + ptx_coef=0, max_epochs=args.max_epochs, 
train_batch_size=args.train_batch_size, - experience_batch_size=args.experience_batch_size, - tokenizer=preprocess_batch, + offload_inference_models=args.offload_inference_models, max_length=512, do_sample=True, temperature=1.0, top_k=50, + use_cache=True, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, callbacks=[performance_evaluator]) - random_prompts = torch.randint(tokenizer.vocab_size, (1000, 1, 400), device=torch.cuda.current_device()) - random_attention_mask = torch.randint(1, (1000, 1, 400), device=torch.cuda.current_device()).to(torch.bool) - random_pretrain = [{'input_ids':random_prompts[i], 'labels':random_prompts[i], 'attention_mask':random_attention_mask[i]} for i in range(1000)] - trainer.fit(random_prompts, random_pretrain, + random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device()) + dataloader = DataLoader(random_prompts, + batch_size=args.experience_batch_size, + shuffle=True, + collate_fn=preprocess_batch) + + trainer.fit(dataloader, + None, num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) @@ -163,6 +168,7 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--model', default='125m') + parser.add_argument('--critic_model', default='125m') parser.add_argument('--strategy', choices=[ 'ddp', 'colossalai_gemini', 'colossalai_gemini_cpu', 'colossalai_zero2', @@ -172,10 +178,11 @@ def main(args): parser.add_argument('--num_episodes', type=int, default=3) parser.add_argument('--max_timesteps', type=int, default=8) parser.add_argument('--update_timesteps', type=int, default=8) - parser.add_argument('--max_epochs', type=int, default=3) + parser.add_argument('--max_epochs', type=int, default=1) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=4) + parser.add_argument('--lora_rank', type=int, default=0) parser.add_argument('--cuda_mem_frac', type=float, default=1.0) + parser.add_argument('--offload_inference_models', action='store_true', default=False) args = parser.parse_args() main(args) diff --git a/applications/Chat/coati/dataset/prompt_dataset.py b/applications/Chat/coati/dataset/prompt_dataset.py index 4367a2c6f3ce..f8ab2346c4b7 100644 --- a/applications/Chat/coati/dataset/prompt_dataset.py +++ b/applications/Chat/coati/dataset/prompt_dataset.py @@ -1,5 +1,6 @@ import copy import random +from collections import defaultdict from dataclasses import dataclass, field from typing import Callable, Dict, Sequence @@ -19,9 +20,13 @@ class PromptDataset(Dataset): """Dataset for supervised fine-tuning.""" - def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, max_datasets_size: int = None): + def __init__(self, + data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + max_datasets_size: int = None, + max_length: int = 96): super(PromptDataset, self).__init__() - self.prompt = [] + self.keyed_prompt = defaultdict(list) logger.info("Loading data...") list_data_dict = jload(data_path) logger.info(f"Loaded {len(list_data_dict)} examples.") @@ -33,14 +38,14 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, for data_dict in list_data_dict: token = tokenizer(data_dict["instruction"], return_tensors='pt', - max_length=96, + max_length=max_length, padding='max_length', truncation=True) - for idx in token['input_ids']: 
- self.prompt.append(idx.to(torch.cuda.current_device())) + for k, tensor in token.items(): + self.keyed_prompt[k].extend(tensor.to(torch.cuda.current_device()).unbind()) def __len__(self): - return len(self.prompt) + return len(self.keyed_prompt) def __getitem__(self, i) -> Dict[str, torch.Tensor]: - return self.prompt[i] + return {k: v[i] for k, v in self.keyed_prompt.items()} diff --git a/applications/Chat/coati/models/__init__.py b/applications/Chat/coati/models/__init__.py index 7489b2e87ca0..709bc5ac0948 100644 --- a/applications/Chat/coati/models/__init__.py +++ b/applications/Chat/coati/models/__init__.py @@ -1,4 +1,8 @@ from .base import Actor, Critic, RewardModel +from .lora import LoRAModule, convert_to_lora_module from .loss import LogExpLoss, LogSigLoss, PolicyLoss, PPOPtxActorLoss, ValueLoss -__all__ = ['Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'PPOPtxActorLoss', 'LogSigLoss', 'LogExpLoss'] +__all__ = [ + 'Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'PPOPtxActorLoss', 'LogSigLoss', 'LogExpLoss', + 'LoRAModule', 'convert_to_lora_module' +] diff --git a/applications/Chat/coati/models/base/__init__.py b/applications/Chat/coati/models/base/__init__.py index 7cf82309af7b..fe4152f2b760 100644 --- a/applications/Chat/coati/models/base/__init__.py +++ b/applications/Chat/coati/models/base/__init__.py @@ -1,6 +1,24 @@ +import torch.nn as nn + from .actor import Actor from .critic import Critic -from .lm import LM from .reward_model import RewardModel -__all__ = ['Actor', 'Critic', 'RewardModel', 'LM'] + +def get_base_model(model: nn.Module) -> nn.Module: + """Get the base model of our wrapper classes. + For Actor, it's base model is ``actor.model`` and it's usually a ``transformers.PreTrainedModel``. + For Critic and RewardModel, it's base model is itself. + + Args: + model (nn.Module): model to get base model from + + Returns: + nn.Module: the base model + """ + if isinstance(model, Actor): + return model.get_base_model() + return model + + +__all__ = ['Actor', 'Critic', 'RewardModel', 'get_base_model'] diff --git a/applications/Chat/coati/models/base/lm.py b/applications/Chat/coati/models/base/lm.py deleted file mode 100644 index e32ba4253369..000000000000 --- a/applications/Chat/coati/models/base/lm.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..generation import generate -from .actor import Actor - - -class LM(Actor): - """ - Language model base class. - - Args: - model (nn.Module): Language Model. - lora_rank (int): LoRA rank. - lora_train_bias (str): LoRA bias training mode. 
- """ - - def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = 'none') -> None: - super().__init__(model=model, lora_rank=lora_rank, lora_train_bias=lora_train_bias) - - def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor: - """Returns output log probs - """ - output = self.model(sequences, attention_mask=attention_mask) - logits = output['logits'] - log_probs = F.log_softmax(logits, dim=-1) - return log_probs diff --git a/applications/Chat/coati/models/bloom/__init__.py b/applications/Chat/coati/models/bloom/__init__.py index 39dfe036a2f2..d0e7f7b1ef94 100644 --- a/applications/Chat/coati/models/bloom/__init__.py +++ b/applications/Chat/coati/models/bloom/__init__.py @@ -1,6 +1,5 @@ from .bloom_actor import BLOOMActor from .bloom_critic import BLOOMCritic -from .bloom_lm import BLOOMLM from .bloom_rm import BLOOMRM -__all__ = ['BLOOMActor', 'BLOOMCritic', 'BLOOMRM', 'BLOOMLM'] +__all__ = ['BLOOMActor', 'BLOOMCritic', 'BLOOMRM'] diff --git a/applications/Chat/coati/models/bloom/bloom_lm.py b/applications/Chat/coati/models/bloom/bloom_lm.py deleted file mode 100644 index e4184fcd0d9c..000000000000 --- a/applications/Chat/coati/models/bloom/bloom_lm.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Optional - -import torch -from transformers import BloomConfig, BloomForCausalLM, BloomModel - -from ..base import LM - - -class BLOOMLM(LM): - """ - BLOOM language model. - - Args: - pretrained (str): Pretrained model name or path. - config (BloomConfig): Model config. - checkpoint (bool): Enable gradient checkpointing. - lora_rank (int): LoRA rank. - lora_train_bias (str): LoRA bias training mode. - """ - - def __init__(self, - pretrained: str = None, - config: Optional[BloomConfig] = None, - checkpoint: bool = False, - lora_rank: int = 0, - lora_train_bias: str = 'none') -> None: - if pretrained is not None: - model = BloomForCausalLM.from_pretrained(pretrained) - elif config is not None: - model = BloomForCausalLM(config) - else: - model = BloomForCausalLM(BloomConfig()) - if checkpoint: - model.gradient_checkpointing_enable() - super().__init__(model, lora_rank, lora_train_bias) - - def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): - return self.model(input_ids, attention_mask=attention_mask, labels=labels, **kwargs) diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py index eb30c36d0f84..f57c9458a271 100644 --- a/applications/Chat/coati/models/generation.py +++ b/applications/Chat/coati/models/generation.py @@ -76,7 +76,7 @@ def sample(model: nn.Module, # update generated ids, model inputs for next step input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) if update_model_kwargs_fn is not None: - model_kwargs = update_model_kwargs_fn(outputs, **model_kwargs) + model_kwargs = update_model_kwargs_fn(outputs, model_kwargs) # if eos_token was found in one sentence, set sentence to finished if eos_token_id is not None: diff --git a/applications/Chat/coati/models/generation_utils.py b/applications/Chat/coati/models/generation_utils.py deleted file mode 100644 index c7bc1b383fb9..000000000000 --- a/applications/Chat/coati/models/generation_utils.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Optional - -import torch - - -def gpt_prepare_inputs_fn(input_ids: torch.Tensor, past: Optional[torch.Tensor] = None, **kwargs) -> dict: - token_type_ids = kwargs.get("token_type_ids", None) - # only last token for inputs_ids if 
past is defined in kwargs - if past: - input_ids = input_ids[:, -1].unsqueeze(-1) - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -1].unsqueeze(-1) - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past: - position_ids = position_ids[:, -1].unsqueeze(-1) - else: - position_ids = None - return { - "input_ids": input_ids, - "past_key_values": past, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - - -def update_model_kwargs_fn(outputs: dict, **model_kwargs) -> dict: - if "past_key_values" in outputs: - model_kwargs["past"] = outputs["past_key_values"] - else: - model_kwargs["past"] = None - - # update token_type_ids with last value - if "token_type_ids" in model_kwargs: - token_type_ids = model_kwargs["token_type_ids"] - model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1) - - return model_kwargs - - -def opt_prepare_inputs_fn(input_ids: torch.Tensor, - past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - **kwargs) -> dict: - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past: - input_ids = input_ids[:, -1:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past, - "use_cache": use_cache, - } - - -def bloom_prepare_inputs_fn(input_ids: torch.Tensor, - past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - **kwargs) -> dict: - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past: - input_ids = input_ids[:, -1:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past, - "use_cache": use_cache, - } diff --git a/applications/Chat/coati/models/gpt/__init__.py b/applications/Chat/coati/models/gpt/__init__.py index 9dc68e37544f..63dc5ab0f5ea 100644 --- a/applications/Chat/coati/models/gpt/__init__.py +++ b/applications/Chat/coati/models/gpt/__init__.py @@ -1,6 +1,5 @@ from .gpt_actor import GPTActor from .gpt_critic import GPTCritic -from .gpt_lm import GPTLM from .gpt_rm import GPTRM -__all__ = ['GPTActor', 'GPTCritic', 'GPTRM', 'GPTLM'] +__all__ = ['GPTActor', 'GPTCritic', 'GPTRM'] diff --git a/applications/Chat/coati/models/gpt/gpt_lm.py b/applications/Chat/coati/models/gpt/gpt_lm.py deleted file mode 100644 index c558d7e9ea8d..000000000000 --- a/applications/Chat/coati/models/gpt/gpt_lm.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Optional - -from transformers.models.gpt2.configuration_gpt2 import GPT2Config -from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel - -from ..base import LM - - -class GPTLM(LM): - """ - GPT language model. - - Args: - pretrained (str): Pretrained model name or path. - config (GPT2Config): Model config. - checkpoint (bool): Enable gradient checkpointing. - lora_rank (int): Rank of the LoRa layer. - lora_train_bias (str): Bias training strategy for the LoRa layer. - """ - - def __init__(self, - pretrained: Optional[str] = None, - config: Optional[GPT2Config] = None, - checkpoint: bool = False, - lora_rank: int = 0, - lora_train_bias: str = 'none') -> None: - if pretrained is not None: - model = GPT2LMHeadModel.from_pretrained(pretrained) - elif config is not None: - model = GPT2LMHeadModel(config) - else: - model = GPT2LMHeadModel(GPT2Config()) - if checkpoint: - model.gradient_checkpointing_enable() - super().__init__(model, lora_rank, lora_train_bias) - - def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): - return self.model(input_ids, attention_mask=attention_mask, labels=labels, **kwargs) diff --git a/applications/Chat/coati/models/llama/__init__.py b/applications/Chat/coati/models/llama/__init__.py index 0d4dada3c9f1..9b2a024afdb2 100644 --- a/applications/Chat/coati/models/llama/__init__.py +++ b/applications/Chat/coati/models/llama/__init__.py @@ -1,6 +1,5 @@ from .llama_actor import LlamaActor from .llama_critic import LlamaCritic -from .llama_lm import LlamaLM from .llama_rm import LlamaRM -__all__ = ['LlamaActor', 'LlamaCritic', 'LlamaRM', 'LlamaLM'] +__all__ = ['LlamaActor', 'LlamaCritic', 'LlamaRM'] diff --git a/applications/Chat/coati/models/llama/llama_lm.py b/applications/Chat/coati/models/llama/llama_lm.py deleted file mode 100644 index 181910fb13eb..000000000000 --- a/applications/Chat/coati/models/llama/llama_lm.py +++ /dev/null @@ -1,40 +0,0 @@ -from typing import Optional - -from transformers import LlamaConfig, LlamaForCausalLM - -from ..base import LM - - -class LlamaLM(LM): - """ - Llama language model. - - Args: - pretrained (str): Pretrained model name or path. - config (LlamaConfig): Model config. - checkpoint (bool): Enable gradient checkpointing. - lora_rank (int): LoRA rank. - lora_train_bias (str): LoRA bias training mode. 
- """ - - def __init__(self, - pretrained: Optional[str] = None, - config: Optional[LlamaConfig] = None, - checkpoint: bool = False, - lora_rank: int = 0, - lora_train_bias: str = 'none') -> None: - - if pretrained is not None: - model = LlamaForCausalLM.from_pretrained(pretrained) - elif config is not None: - model = LlamaForCausalLM(config) - else: - model = LlamaForCausalLM(LlamaConfig()) - - if checkpoint: - model.gradient_checkpointing_enable() - - super().__init__(model, lora_rank, lora_train_bias) - - def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): - return self.model(input_ids, attention_mask=attention_mask, labels=labels, **kwargs) diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py index 7f6eb73262fa..0533a60dc532 100644 --- a/applications/Chat/coati/models/lora.py +++ b/applications/Chat/coati/models/lora.py @@ -106,6 +106,23 @@ def convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None: convert_to_lora_recursively(child, lora_rank) +def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = 'none') -> nn.Module: + """Convert a torch.nn.Module to a LoRA module. + + Args: + module (nn.Module): The module to convert. + lora_rank (int): LoRA rank. + + Returns: + nn.Module: The converted module. + """ + if lora_rank <= 0: + return module + convert_to_lora_recursively(module, lora_rank) + lora.mark_only_lora_as_trainable(module, lora_train_bias) + return module + + class LoRAModule(nn.Module): """A LoRA module base class. All derived classes should call `convert_to_lora()` at the bottom of `__init__()`. This class will convert all torch.nn.Linear layer to LoraLinear layer. @@ -123,7 +140,4 @@ def __init__(self, lora_rank: int = 0, lora_train_bias: str = 'none') -> None: self.lora_train_bias = lora_train_bias def convert_to_lora(self) -> None: - if self.lora_rank <= 0: - return - convert_to_lora_recursively(self, self.lora_rank) - lora.mark_only_lora_as_trainable(self, self.lora_train_bias) + convert_to_lora_module(self, self.lora_rank, self.lora_train_bias) diff --git a/applications/Chat/coati/models/opt/__init__.py b/applications/Chat/coati/models/opt/__init__.py index 3d7a8adbf82e..334f4df0032a 100644 --- a/applications/Chat/coati/models/opt/__init__.py +++ b/applications/Chat/coati/models/opt/__init__.py @@ -1,6 +1,5 @@ from .opt_actor import OPTActor from .opt_critic import OPTCritic -from .opt_lm import OPTLM from .opt_rm import OPTRM -__all__ = ['OPTActor', 'OPTCritic', 'OPTRM', 'OPTLM'] +__all__ = ['OPTActor', 'OPTCritic', 'OPTRM'] diff --git a/applications/Chat/coati/models/opt/opt_lm.py b/applications/Chat/coati/models/opt/opt_lm.py deleted file mode 100644 index 47afae847f13..000000000000 --- a/applications/Chat/coati/models/opt/opt_lm.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Optional - -from transformers.models.opt.configuration_opt import OPTConfig -from transformers.models.opt.modeling_opt import OPTForCausalLM - -from ..base import LM - - -class OPTLM(LM): - """ - OPT language model. - - Args: - pretrained (str): Pretrained model name or path. - config (OPTConfig): Model config. - checkpoint (bool): Enable gradient checkpointing. - lora_rank (int): Rank of the low-rank approximation. - lora_train_bias (str): LoRA bias training mode. 
- """ - - def __init__(self, - pretrained: Optional[str] = None, - config: Optional[OPTConfig] = None, - checkpoint: bool = False, - lora_rank: int = 0, - lora_train_bias: str = 'none') -> None: - if pretrained is not None: - model = OPTForCausalLM.from_pretrained(pretrained) - elif config is not None: - model = OPTForCausalLM(config) - else: - model = OPTForCausalLM(OPTConfig()) - if checkpoint: - model.gradient_checkpointing_enable() - super().__init__(model, lora_rank, lora_train_bias) - - def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): - return self.model(input_ids, attention_mask=attention_mask, labels=labels, **kwargs) diff --git a/applications/Chat/coati/trainer/base.py b/applications/Chat/coati/trainer/base.py index d676799496dd..ac3a878be884 100644 --- a/applications/Chat/coati/trainer/base.py +++ b/applications/Chat/coati/trainer/base.py @@ -15,7 +15,6 @@ class Trainer(ABC): Args: strategy (Strategy):the strategy to use for training max_epochs (int, defaults to 1): the number of epochs of training process - tokenizer (Callable, optional): the tokenizer to use for tokenizing the input dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader callbacks (List[Callback], defaults to []): the callbacks to call during training process generate_kwargs (dict, optional): the kwargs to use while model generating @@ -24,14 +23,12 @@ class Trainer(ABC): def __init__(self, strategy: Strategy, max_epochs: int = 1, - tokenizer: Optional[Callable[[Any], dict]] = None, dataloader_pin_memory: bool = True, callbacks: List[Callback] = [], **generate_kwargs) -> None: super().__init__() self.strategy = strategy self.max_epochs = max_epochs - self.tokenizer = tokenizer self.generate_kwargs = generate_kwargs self.dataloader_pin_memory = dataloader_pin_memory self.callbacks = callbacks diff --git a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py b/applications/Chat/coati/trainer/callbacks/performance_evaluator.py index 5ca44a52d6e7..925455444597 100644 --- a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py +++ b/applications/Chat/coati/trainer/callbacks/performance_evaluator.py @@ -19,6 +19,14 @@ def print_rank_0(*args, **kwargs) -> None: print(*args, **kwargs) +def divide(x: float, y: float) -> float: + if y == 0: + return float('inf') + elif y == float('inf'): + return float('nan') + return x / y + + @torch.no_grad() def all_reduce_mean(x: float, world_size: int) -> float: if world_size == 1: @@ -29,6 +37,24 @@ def all_reduce_mean(x: float, world_size: int) -> float: return tensor.item() +class Timer: + + def __init__(self) -> None: + self.start_time: Optional[float] = None + self.duration: float = 0. + + def start(self) -> None: + self.start_time = time() + + def end(self) -> None: + assert self.start_time is not None + self.duration += time() - self.start_time + self.start_time = None + + def reset(self) -> None: + self.duration = 0. + + class PerformanceEvaluator(Callback): """ Callback for valuate the performance of the model. @@ -58,27 +84,34 @@ def __init__(self, self.ignore_episodes = ignore_episodes self.disable: bool = False - self.make_experience_duration: float = 0. - self.make_experience_start_time: Optional[float] = None + self.overall_timer = Timer() + self.make_experience_timer = Timer() + self.learn_timer = Timer() self.make_experience_num_samples: int = 0 self.make_experience_flop: int = 0 - self.learn_duration: float = 0. 
- self.learn_start_time: Optional[float] = None self.learn_num_samples: int = 0 self.learn_flop: int = 0 def on_episode_start(self, episode: int) -> None: self.disable = self.ignore_episodes > 0 and episode < self.ignore_episodes + if self.disable: + return + self.overall_timer.start() + + def on_episode_end(self, episode: int) -> None: + if self.disable: + return + self.overall_timer.end() def on_make_experience_start(self) -> None: if self.disable: return - self.make_experience_start_time = time() + self.make_experience_timer.start() def on_make_experience_end(self, experience: Experience) -> None: if self.disable: return - self.make_experience_duration += time() - self.make_experience_start_time + self.make_experience_timer.end() batch_size, seq_len = experience.sequences.shape @@ -101,12 +134,12 @@ def on_make_experience_end(self, experience: Experience) -> None: def on_learn_batch_start(self) -> None: if self.disable: return - self.learn_start_time = time() + self.learn_timer.start() def on_learn_batch_end(self, metrics: dict, experience: Experience) -> None: if self.disable: return - self.learn_duration += time() - self.learn_start_time + self.learn_timer.end() batch_size, seq_len = experience.sequences.shape @@ -118,16 +151,33 @@ def on_learn_batch_end(self, metrics: dict, experience: Experience) -> None: self.learn_flop += self.critic_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint)) def on_fit_end(self) -> None: - avg_make_experience_duration = all_reduce_mean(self.make_experience_duration, self.world_size) - avg_learn_duration = all_reduce_mean(self.learn_duration, self.world_size) + avg_make_experience_duration = all_reduce_mean(self.make_experience_timer.duration, self.world_size) + avg_learn_duration = all_reduce_mean(self.learn_timer.duration, self.world_size) + avg_overall_duration = all_reduce_mean(self.overall_timer.duration, self.world_size) - avg_make_experience_throughput = self.make_experience_num_samples / (avg_make_experience_duration + 1e-12) + avg_make_experience_throughput = self.make_experience_num_samples * \ + self.world_size / (avg_make_experience_duration + 1e-12) avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12) - avg_learn_throughput = self.learn_num_samples / (avg_learn_duration + 1e-12) + avg_learn_throughput = self.learn_num_samples * self.world_size / (avg_learn_duration + 1e-12) avg_learn_tflops = self.learn_flop / 1e12 / (avg_learn_duration + 1e-12) + num_effective_samples = min(self.learn_num_samples, self.make_experience_num_samples) * self.world_size + + avg_overall_throughput = num_effective_samples / (avg_overall_duration + 1e-12) + + overall_time_per_sample = divide(1, avg_overall_throughput) + make_experience_time_per_sample = divide(avg_make_experience_duration, num_effective_samples) + learn_time_per_sample = divide(avg_learn_duration, num_effective_samples) + print_rank_0( - f'Making experience throughput: {avg_make_experience_throughput:.3f} samples/sec, TFLOPS: {avg_make_experience_tflops:.3f}' + f'Performance summary:\n' + + f'Generate {self.make_experience_num_samples * self.world_size} samples, throughput: {avg_make_experience_throughput:.2f} samples/s, TFLOPS per GPU: {avg_make_experience_tflops:.2f}\n' + + + f'Train {self.learn_num_samples * self.world_size} samples, throughput: {avg_learn_throughput:.2f} samples/s, TFLOPS per GPU: {avg_learn_tflops:.2f}\n' + + f'Overall throughput: {avg_overall_throughput:.2f} samples/s\n' + + f'Overall time per sample: 
{overall_time_per_sample:.2f} s\n' + + f'Make experience time per sample: {make_experience_time_per_sample:.2f} s, {make_experience_time_per_sample/overall_time_per_sample*100:.2f}%\n' + + + f'Learn time per sample: {learn_time_per_sample:.2f} s, {learn_time_per_sample/overall_time_per_sample*100:.2f}%' ) - print_rank_0(f'Learning throughput: {avg_learn_throughput:.3f} samples/sec, TFLOPS: {avg_learn_tflops:.3f}') diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py index cf752549501f..fe5ae48d9c2f 100644 --- a/applications/Chat/coati/trainer/ppo.py +++ b/applications/Chat/coati/trainer/ppo.py @@ -4,19 +4,20 @@ import torch.nn as nn from coati.experience_maker import Experience, NaiveExperienceMaker from coati.models.base import Actor, Critic -from coati.models.generation_utils import update_model_kwargs_fn -from coati.models.loss import PolicyLoss, ValueLoss +from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss from coati.replay_buffer import NaiveReplayBuffer from torch import Tensor from torch.optim import Optimizer from torch.utils.data import DistributedSampler -from transformers.tokenization_utils_base import PreTrainedTokenizerBase from tqdm import tqdm +from transformers.tokenization_utils_base import PreTrainedTokenizerBase + +from colossalai.utils import get_current_device from .base import Trainer from .callbacks import Callback from .strategies import Strategy -from .utils import is_rank_0 +from .utils import is_rank_0, to_device class PPOTrainer(Trainer): @@ -39,11 +40,10 @@ class PPOTrainer(Trainer): vf_coef (float, defaults to 1.0): the coefficient of value loss ptx_coef (float, defaults to 0.9): the coefficient of ptx loss value_clip (float, defaults to 0.4): the clip coefficient of value loss - experience_batch_size (int, defaults to 8): the batch size to use for experience generation max_epochs (int, defaults to 1): the number of epochs of training process - tokenizer (Callable, optional): the tokenizer to use for tokenizing the input sample_replay_buffer (bool, defaults to False): whether to sample from replay buffer dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader + offload_inference_models (bool, defaults to True): whether to offload inference models to cpu during training process callbacks (List[Callback], defaults to []): the callbacks to call during training process generate_kwargs (dict, optional): the kwargs to use while model generating """ @@ -64,22 +64,21 @@ def __init__(self, eps_clip: float = 0.2, vf_coef: float = 1.0, value_clip: float = 0.4, - experience_batch_size: int = 8, max_epochs: int = 1, - tokenizer: Optional[Callable[[Any], dict]] = None, sample_replay_buffer: bool = False, dataloader_pin_memory: bool = True, + offload_inference_models: bool = True, callbacks: List[Callback] = [], **generate_kwargs) -> None: experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef) replay_buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload) generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor) - super().__init__(strategy, max_epochs, tokenizer, dataloader_pin_memory, callbacks, **generate_kwargs) + super().__init__(strategy, max_epochs, dataloader_pin_memory, callbacks, **generate_kwargs) self.experience_maker = experience_maker self.replay_buffer = replay_buffer - self.experience_batch_size = experience_batch_size self.sample_replay_buffer = sample_replay_buffer + 
self.offload_inference_models = offload_inference_models self.actor = actor self.critic = critic @@ -87,11 +86,13 @@ def __init__(self, self.actor_loss_fn = PolicyLoss(eps_clip) self.critic_loss_fn = ValueLoss(value_clip) self.vf_coef = vf_coef - self.ptx_loss_fn = nn.CrossEntropyLoss(ignore_index=-100) + self.ptx_loss_fn = GPTLMLoss() self.ptx_coef = ptx_coef self.actor_optim = actor_optim self.critic_optim = critic_optim + self.device = get_current_device() + def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience: if isinstance(inputs, Tensor): return self.experience_maker.make_experience(inputs, **self.generate_kwargs) @@ -100,23 +101,15 @@ def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experien else: raise ValueError(f'Unsupported input type "{type(inputs)}"') - def _sample_prompts(self, prompts) -> list: - indices = list(range(len(prompts))) - sampled_indices = self.strategy.experience_sampler.choice( - indices, self.experience_batch_size, replace=False) - return [prompts[i] for i in sampled_indices] - def _learn(self): # replay buffer may be empty at first, we should rebuild at each training if not self.sample_replay_buffer: - dataloader = self.strategy.setup_dataloader( - self.replay_buffer, self.dataloader_pin_memory) - device = torch.cuda.current_device() + dataloader = self.strategy.setup_dataloader(self.replay_buffer, self.dataloader_pin_memory) if self.sample_replay_buffer: - pbar = tqdm(range(self.max_epochs), desc='Train epoch', - disable=not is_rank_0()) + pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0()) for _ in pbar: experience = self.replay_buffer.sample() + experience.to_device(self.device) metrics = self.training_step(experience) pbar.set_postfix(metrics) else: @@ -124,11 +117,10 @@ def _learn(self): self._on_learn_epoch_start(epoch) if isinstance(dataloader.sampler, DistributedSampler): dataloader.sampler.set_epoch(epoch) - pbar = tqdm( - dataloader, desc=f'Train epoch [{epoch+1}/{self.max_epochs}]', disable=not is_rank_0()) + pbar = tqdm(dataloader, desc=f'Train epoch [{epoch+1}/{self.max_epochs}]', disable=not is_rank_0()) for experience in pbar: self._on_learn_batch_start() - experience.to_device(device) + experience.to_device(self.device) metrics = self.training_step(experience) self._on_learn_batch_end(metrics, experience) pbar.set_postfix(metrics) @@ -152,16 +144,17 @@ def fit(self, time += 1 prompts = next(iter(self.prompt_dataloader)) self._on_make_experience_start() - self.experience_maker.initial_model.to( - torch.cuda.current_device()) - self.experience_maker.reward_model.to( - torch.cuda.current_device()) + if self.offload_inference_models: + # TODO(ver217): this may be controlled by strategy if they are prepared by strategy + self.experience_maker.initial_model.to(self.device) + self.experience_maker.reward_model.to(self.device) experience = self._make_experience(prompts) self._on_make_experience_end(experience) self.replay_buffer.append(experience) if time % update_timesteps == 0: - self.experience_maker.initial_model.to('cpu') - self.experience_maker.reward_model.to('cpu') + if self.offload_inference_models: + self.experience_maker.initial_model.to('cpu') + self.experience_maker.reward_model.to('cpu') self._learn() self.replay_buffer.clear() self._on_episode_end(episode) @@ -181,11 +174,10 @@ def training_step(self, experience: Experience) -> Dict[str, float]: # ptx loss if self.ptx_coef != 0: batch = next(iter(self.pretrain_dataloader)) - ptx = 
batch['input_ids'].to(torch.cuda.current_device()) - label = batch['labels'].to(torch.cuda.current_device())[:, 1:] - attention_mask = batch['attention_mask'].to(torch.cuda.current_device()) - ptx_log_probs = self.actor.get_base_model()(ptx, attention_mask=attention_mask)['logits'][..., :-1, :] - ptx_loss = self.ptx_loss_fn(ptx_log_probs.view(-1, ptx_log_probs.size(-1)), label.view(-1)) + batch = to_device(batch, self.device) + ptx_log_probs = self.actor.get_base_model()(batch['input_ids'], + attention_mask=batch['attention_mask'])['logits'] + ptx_loss = self.ptx_loss_fn(ptx_log_probs, batch['labels']) actor_loss = ptx_loss * self.ptx_coef + actor_loss * (1 - self.ptx_coef) self.strategy.backward(actor_loss, self.actor, self.actor_optim) @@ -206,19 +198,16 @@ def training_step(self, experience: Experience) -> Dict[str, float]: self.critic_optim.zero_grad() return {'reward': experience.reward.mean().item()} - - def save_model(self, path: str, only_rank0: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: - self.strategy.save_model(model=self.actor, path=path, only_rank0=only_rank0, tokenizer=tokenizer) def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> None: - origin_model = strategy._unwrap_actor(actor) + origin_model = strategy.unwrap_model(actor) new_kwargs = {**generate_kwargs} # use huggingface models method directly if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'): new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation - if 'update_model_kwargs_fn' not in generate_kwargs: - new_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn + if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(origin_model, '_update_model_kwargs_for_generation'): + new_kwargs['update_model_kwargs_fn'] = origin_model._update_model_kwargs_for_generation return new_kwargs diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py index ed6720abc2af..cdae5108ab00 100644 --- a/applications/Chat/coati/trainer/rm.py +++ b/applications/Chat/coati/trainer/rm.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Optional, List +from typing import List, Optional import pandas as pd import torch @@ -9,8 +9,8 @@ from tqdm import tqdm from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from .callbacks import Callback from .base import Trainer +from .callbacks import Callback from .strategies import Strategy from .utils import is_rank_0 @@ -41,20 +41,18 @@ def __init__( train_dataloader: DataLoader, valid_dataloader: DataLoader, eval_dataloader: DataLoader, - batch_size: int = 1, max_epochs: int = 1, callbacks: List[Callback] = [], ) -> None: super().__init__(strategy, max_epochs, callbacks=callbacks) - train_sampler = None self.train_dataloader = train_dataloader self.valid_dataloader = valid_dataloader self.eval_dataloader = eval_dataloader - self.model = strategy.setup_model(model) + self.model = model self.loss_fn = loss_fn - self.optimizer = strategy.setup_optimizer(optim, self.model) + self.optimizer = optim self.scheduler = lr_scheduler.CosineAnnealingLR(self.optimizer, self.train_dataloader.__len__() // 100) def eval_acc(self, dataloader): @@ -123,9 +121,3 @@ def fit(self): epoch_bar.update() step_bar.set_postfix({'dist': dist, 'acc': acc}) step_bar.close() - - def save_model(self, - path: str, - only_rank0: bool = False, - tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: - 
self.strategy.save_model(model=self.model, path=path, only_rank0=only_rank0, tokenizer=tokenizer) diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py index 350553108e68..63fde53956cc 100644 --- a/applications/Chat/coati/trainer/sft.py +++ b/applications/Chat/coati/trainer/sft.py @@ -1,27 +1,20 @@ import math import time -from typing import Optional, List +from typing import List, Optional -import loralib as lora import torch import torch.distributed as dist import wandb -from coati.models.loss import GPTLMLoss -from torch import nn -from torch.optim import Adam, Optimizer -from torch.optim.lr_scheduler import LambdaLR +from torch.optim import Optimizer from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import get_scheduler -from colossalai.logging import get_dist_logger - -from .callbacks import Callback from .base import Trainer -from .strategies import Strategy -from .utils import is_rank_0 +from .callbacks import Callback +from .strategies import ColossalAIStrategy, Strategy +from .utils import is_rank_0, to_device class SFTTrainer(Trainer): @@ -47,22 +40,20 @@ def __init__( optim: Optimizer, train_dataloader: DataLoader, eval_dataloader: DataLoader = None, - batch_size: int = 1, max_epochs: int = 2, - accimulation_steps: int = 8, + accumulation_steps: int = 8, callbacks: List[Callback] = [], ) -> None: + if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3: + raise ValueError("Accumulation steps are not supported in stage 3 of ColossalAI") super().__init__(strategy, max_epochs, callbacks=callbacks) self.train_dataloader = train_dataloader self.eval_dataloader = eval_dataloader + self.model = model + self.optimizer = optim - self.model = strategy.setup_model(model) - if "DDP" in str(self.strategy): - self.model = self.model.module - self.optimizer = strategy.setup_optimizer(optim, self.model) - - self.accimulation_steps = accimulation_steps - num_update_steps_per_epoch = len(train_dataloader) // self.accimulation_steps + self.accumulation_steps = accumulation_steps + num_update_steps_per_epoch = len(train_dataloader) // self.accumulation_steps max_steps = math.ceil(self.max_epochs * num_update_steps_per_epoch) self.scheduler = get_scheduler("cosine", @@ -70,12 +61,13 @@ def __init__( num_warmup_steps=math.ceil(max_steps * 0.03), num_training_steps=max_steps) - def fit(self, logger, log_interval=10): - wandb.init(project="Coati", name=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) - wandb.watch(self.model) + def fit(self, logger, use_wandb: bool = False): + if use_wandb: + wandb.init(project="Coati", name=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + wandb.watch(self.model) total_loss = 0 # epoch_bar = tqdm(range(self.epochs), desc='Epochs', disable=not is_rank_0()) - step_bar = tqdm(range(len(self.train_dataloader) // self.accimulation_steps * self.max_epochs), + step_bar = tqdm(range(len(self.train_dataloader) // self.accumulation_steps * self.max_epochs), desc=f'steps', disable=not is_rank_0()) for epoch in range(self.max_epochs): @@ -85,35 +77,28 @@ def fit(self, logger, log_interval=10): self.model.train() for batch_id, batch in enumerate(self.train_dataloader): - prompt_ids = batch["input_ids"].to(torch.cuda.current_device()) - p_mask = batch["attention_mask"].to(torch.cuda.current_device()) - labels = 
batch["labels"].to(torch.cuda.current_device()) - # prompt_ids = prompt_ids.squeeze(1).cuda() - # p_mask = p_mask.squeeze(1).cuda() - # prompt_logits = self.model(prompt_ids, attention_mask=p_mask, labels=labels) - - outputs = self.model(prompt_ids, attention_mask=p_mask, labels=labels) + batch = to_device(batch, torch.cuda.current_device()) + outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"]) loss = outputs.loss - prompt_logits = outputs.logits if loss >= 2.5 and is_rank_0(): logger.warning(f"batch_id:{batch_id}, abnormal loss: {loss}") - loss = loss / self.accimulation_steps + loss = loss / self.accumulation_steps self.strategy.backward(loss, self.model, self.optimizer) total_loss += loss.item() # gradient accumulation - if (batch_id + 1) % self.accimulation_steps == 0: + if (batch_id + 1) % self.accumulation_steps == 0: self.strategy.optimizer_step(self.optimizer) self.optimizer.zero_grad() self.scheduler.step() - if is_rank_0(): + if is_rank_0() and use_wandb: wandb.log({ - "loss": total_loss / self.accimulation_steps, + "loss": total_loss / self.accumulation_steps, "lr": self.scheduler.get_last_lr()[0], "epoch": epoch, "batch_id": batch_id @@ -134,27 +119,17 @@ def fit(self, logger, log_interval=10): loss_sum = 0 num_seen = 0 for batch in self.eval_dataloader: - prompt_ids = batch["input_ids"].to(torch.cuda.current_device()) - p_mask = batch["attention_mask"].to(torch.cuda.current_device()) - labels = batch["labels"].to(torch.cuda.current_device()) - # prompt_ids = prompt_ids.squeeze(1).cuda() - # p_mask = p_mask.squeeze(1).cuda() - - outputs = self.model(prompt_ids, attention_mask=p_mask, labels=labels) + batch = to_device(batch, torch.cuda.current_device()) + outputs = self.model(batch["input_ids"], + attention_mask=batch["attention_mask"], + labels=batch["labels"]) loss = outputs.loss - # prompt_logits = outputs.logits loss_sum += loss.item() - num_seen += prompt_ids.size(0) + num_seen += batch["input_ids"].size(0) loss_mean = loss_sum / num_seen if dist.get_rank() == 0: logger.info(f'Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}') # epoch_bar.update() - - def save_model(self, - path: str, - only_rank0: bool = False, - tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: - self.strategy.save_model(model=self.model, path=path, only_rank0=only_rank0, tokenizer=tokenizer) diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py index 7d25138561ea..b1452869179e 100644 --- a/applications/Chat/coati/trainer/strategies/base.py +++ b/applications/Chat/coati/trainer/strategies/base.py @@ -2,10 +2,9 @@ from contextlib import nullcontext from typing import Any, List, Optional, Tuple, Union -import numpy as np import torch import torch.nn as nn -from coati.models.base import LM, Actor, Critic, RewardModel +from coati.models.base import Actor, get_base_model from coati.replay_buffer import ReplayBuffer from torch.optim import Optimizer from torch.utils.data import DataLoader @@ -72,8 +71,8 @@ def prepare( def prepare_model(model: nn.Module): if isinstance(model, Actor): - return Actor(self.setup_model(self._unwrap_model(model))) - return self.setup_model(self._unwrap_model(model)) + return Actor(self.setup_model(model.get_base_model())) + return self.setup_model(model) rets = [] for arg in models_or_model_optim_pairs: @@ -81,7 +80,7 @@ def prepare_model(model: nn.Module): assert len(arg) == 2, f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"' 
model, optimizer = arg model = prepare_model(model) - optimizer = self.setup_optimizer(optimizer, self._unwrap_model(model)) + optimizer = self.setup_optimizer(optimizer, get_base_model(model)) rets.append((model, optimizer)) elif isinstance(arg, nn.Module): rets.append(prepare_model(arg)) @@ -93,31 +92,20 @@ def prepare_model(model: nn.Module): return rets @staticmethod - def _unwrap_model(model: nn.Module) -> nn.Module: - """Useful for saving state dict. As actor is wrapped by Actor class again in `prepare()`, we should unwrap it before saving. + def unwrap_model(model: nn.Module) -> nn.Module: + """Get the unwrapped model from a wrapped model. Useful for getting original huggingface model. + For Actor, it will unwrap `actor.model`. Args: - model (nn.Module): an actor or a critic - """ - if isinstance(model, Actor) or isinstance(model, LM): - return model.model - return model - - @staticmethod - def _unwrap_actor(actor: Actor) -> nn.Module: - """Get `actor.model` from a wrapped (by `prepare()`) actor. Useful for getting original huggingface model. + model (nn.Module): the model to unwrap - Args: - actor (Actor): a wrapped actor + Returns: + nn.Module: the original model (usually a huggingface model) """ - return Strategy._unwrap_model(actor) + return get_base_model(model) @abstractmethod - def save_model(self, - model: nn.Module, - path: str, - only_rank0: bool = False, - tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None: pass @abstractmethod @@ -134,3 +122,11 @@ def load_optimizer(self, optimizer: Optimizer, path: str, map_location: Any = No def setup_sampler(self, dataset) -> DistributedSampler: return DistributedSampler(dataset, 1, 0) + + @abstractmethod + def save_pretrained(self, + model: nn.Module, + path: str, + only_rank0: bool = True, + tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + pass diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index ba85ba76d4b1..8aa302c77eee 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -5,10 +5,8 @@ import torch.distributed as dist import torch.nn as nn import torch.optim as optim -from coati.models.base import LM, Actor, RewardModel -from coati.models.lora import LoraLinear +from coati.models.base import get_base_model from torch.optim import Optimizer -from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils_base import PreTrainedTokenizerBase import colossalai @@ -17,9 +15,7 @@ from colossalai.tensor import ProcessGroup, ShardSpec from colossalai.utils import get_current_device from colossalai.zero import ColoInitContext, ZeroDDP, zero_model_wrapper, zero_optim_wrapper -from colossalai.zero.gemini.utils import get_static_torch_model -from .base import Strategy from .ddp import DDPStrategy logger = get_dist_logger(__name__) @@ -67,6 +63,7 @@ def __init__( placement_policy: str = 'cuda', pin_memory: bool = True, # only for stage 3 force_outputs_fp32: bool = False, # only for stage 3 + scatter_after_inference: bool = False, # only for stage 3 search_range_mb: int = 32, # only for stage 3 hidden_dim: Optional[int] = None, # only for stage 3 min_chunk_size_mb: float = 32, # only for stage 3 @@ -103,7 +100,8 @@ def __init__( strict_ddp_mode=shard_init, search_range_mb=search_range_mb, hidden_dim=hidden_dim, - 
min_chunk_size_mb=min_chunk_size_mb) + min_chunk_size_mb=min_chunk_size_mb, + scatter_after_inference=scatter_after_inference) if stage == 3: self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio) else: @@ -139,7 +137,7 @@ def setup_model(self, model: nn.Module) -> nn.Module: model = zero_model_wrapper(model, zero_stage=self.stage, gemini_config=self.gemini_config) if self.stage != 3 and self.precision == 'fp16': - model = model.half() + model = model.half().cuda() return model def setup_optimizer(self, optimizer: optim.Optimizer, model: nn.Module) -> optim.Optimizer: @@ -152,61 +150,39 @@ def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimi def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None: optimizer.step() - @staticmethod - def _unwrap_actor(actor: Actor) -> nn.Module: - model: Union[nn.Module, ZeroDDP] = Strategy._unwrap_actor(actor) - if isinstance(model, ZeroDDP): - return model.module - return model - - def _unwrap_model(self, model: Union[nn.Module, ZeroDDP]) -> nn.Module: - if isinstance(model, ZeroDDP) and self.stage == 3: - logger.info(f"model type: {type(model)}, get static torch model") - model = get_static_torch_model(model) - logger.info(f"unwrapped_model type: {type(model)}") - - return super()._unwrap_model(model) - - def save_model(self, - model: nn.Module, - path: str, - only_rank0: bool = True, - tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: - - if only_rank0 and dist.get_rank() != 0: - return None - unwrapped_model = self._unwrap_model(model) - # TODO : better way to get torch model from gemini model - # to get torch model from gemini model - - for module in unwrapped_model.modules(): - if isinstance(module, LoraLinear): - module.merge_weights = True - module.eval() - if isinstance(unwrapped_model, RewardModel): - state_dict = unwrapped_model.state_dict() - if only_rank0 and dist.get_rank() != 0: - return - torch.save(state_dict, path) + def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None: + if only_rank0 and dist.get_rank() != 0 and self.stage != 3: + return + base_model = get_base_model(model) + if self.stage == 3: + assert isinstance(base_model, ZeroDDP) + # for stage 3, state_dict() method should be called on every rank + state_dict = base_model.state_dict(only_rank_0=only_rank0) else: - try: - if isinstance(unwrapped_model, LM): - unwrapped_model = unwrapped_model.model - logger.info(f'Saving model to {path}', ranks=[0]) - unwrapped_model.save_pretrained(path) - logger.info(f'Model saved to {path} Successfully', ranks=[0]) - if tokenizer is not None: - logger.info(f'Saving tokenizer to {path}', ranks=[0]) - tokenizer.save_pretrained(path) - logger.info(f'Tokenizer saved to {path} Successfully', ranks=[0]) - except AttributeError: - state_dict = unwrapped_model.state_dict() - if only_rank0 and dist.get_rank() != 0: - return - torch.save(state_dict, path) + # only_rank0 is false or rank == 0 + state_dict = base_model.state_dict() + if only_rank0 and dist.get_rank() != 0: + return + torch.save(state_dict, path) def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None: if only_rank0: raise RuntimeError( f'Optimizer states are sharded when using ColossalAIStrategy. 
Only rank0 is not supported.') torch.save(optimizer.state_dict(), path) + + def unwrap_model(self, model: nn.Module) -> nn.Module: + base_model: Union[nn.Module, ZeroDDP] = get_base_model(model) + if self.stage == 3: + assert isinstance(base_model, ZeroDDP) + return base_model.module + return base_model + + def save_pretrained(self, + model: nn.Module, + path: str, + only_rank0: bool = True, + tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + if self.stage == 3: + raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now') + super().save_pretrained(model, path, only_rank0, tokenizer) diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py index 8a8c4b3c2f4e..7910b57878f8 100644 --- a/applications/Chat/coati/trainer/strategies/ddp.py +++ b/applications/Chat/coati/trainer/strategies/ddp.py @@ -1,21 +1,17 @@ -from typing import Optional - import os import random +from typing import Optional import numpy as np import torch import torch.distributed as dist import torch.nn as nn -from coati.models.base import LM, Actor, RewardModel -from coati.models.lora import LoraLinear from coati.replay_buffer import ReplayBuffer from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torch.utils.data import DataLoader from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from .base import Strategy from .naive import NaiveStrategy from .sampler import DistributedSampler @@ -70,37 +66,10 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False pin_memory=pin_memory, collate_fn=replay_buffer.collate_fn) - @staticmethod - def _unwrap_actor(actor: Actor) -> nn.Module: - model: DDP = Strategy._unwrap_actor(actor) - return model.module - - def save_model(self, model: nn.Module, path: str, only_rank0: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None: if only_rank0 and dist.get_rank() != 0: - return None - - for module in model.modules(): - if isinstance(module, LoraLinear): - module.merge_weights = True - module.eval() - - if isinstance(model, RewardModel): - state_dict = model.state_dict() - if only_rank0 and dist.get_rank() != 0: - return - torch.save(state_dict, path) - else: - try: - if isinstance(model, LM): - model = model.model - model.save_pretrained(path) - if tokenizer is not None: - tokenizer.save_pretrained(path) - except AttributeError: - state_dict = model.state_dict() - if only_rank0 and dist.get_rank() != 0: - return - torch.save(state_dict, path) + return + super().save_model(model, path, only_rank0) def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None: if only_rank0 and dist.get_rank() != 0: @@ -109,3 +78,16 @@ def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = Fal def setup_sampler(self, dataset) -> DistributedSampler: return DistributedSampler(dataset, dist.get_world_size(), dist.get_rank()) + + def unwrap_model(self, model: nn.Module) -> nn.Module: + base_model: DDP = super().unwrap_model(model) + return base_model.module + + def save_pretrained(self, + model: nn.Module, + path: str, + only_rank0: bool = True, + tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + if only_rank0 and dist.get_rank() != 0: + return + super().save_pretrained(model, path, only_rank0, tokenizer) diff --git 
a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py index bb47e5ab2688..4d94026ce932 100644 --- a/applications/Chat/coati/trainer/strategies/naive.py +++ b/applications/Chat/coati/trainer/strategies/naive.py @@ -3,11 +3,11 @@ import torch import torch.nn as nn import torch.optim as optim +from coati.models.base import get_base_model from coati.replay_buffer import ReplayBuffer -from coati.models.base import LM, RewardModel -from coati.models.lora import LoraLinear from torch.optim import Optimizer from torch.utils.data import DataLoader +from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils_base import PreTrainedTokenizerBase from .base import Strategy @@ -41,30 +41,15 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False pin_memory=pin_memory, collate_fn=replay_buffer.collate_fn) - def save_model(self, model: nn.Module, path: str, only_rank0: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: - for module in model.modules(): - if isinstance(module, LoraLinear): - module.merge_weights = True - module.eval() - - if isinstance(model, RewardModel): - state_dict = model.state_dict() - torch.save(state_dict, path) - else: - try: - if isinstance(model, LM): - model = model.model - model.save_pretrained(path) - if tokenizer is not None: - tokenizer.save_pretrained(path) - except AttributeError: - state_dict = model.state_dict() - torch.save(state_dict, path) + def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None: + base_model = get_base_model(model) + state_dict = base_model.state_dict() + torch.save(state_dict, path) def load_model(self, model: nn.Module, path: str, map_location: Any = None, strict: bool = True) -> None: - unwrapped_model = self._unwrap_model(model) + base_model = get_base_model(model) state_dict = torch.load(path, map_location=map_location) - unwrapped_model.load_state_dict(state_dict, strict=strict) + base_model.load_state_dict(state_dict, strict=strict) def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None: torch.save(optimizer.state_dict(), path) @@ -72,3 +57,14 @@ def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = Fal def load_optimizer(self, optimizer: Optimizer, path: str, map_location: Any = None) -> None: state_dict = torch.load(path, map_location=map_location) optimizer.load_state_dict(state_dict) + + def save_pretrained(self, + model: nn.Module, + path: str, + only_rank0: bool = True, + tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + unwrapped_model = self.unwrap_model(model) + assert isinstance(unwrapped_model, PreTrainedModel) + unwrapped_model.save_pretrained(path) + if tokenizer is not None: + tokenizer.save_pretrained(path) diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py index 1b17a0421656..9cccb5c92603 100644 --- a/applications/Chat/coati/trainer/utils.py +++ b/applications/Chat/coati/trainer/utils.py @@ -1,14 +1,19 @@ -import torch.distributed as dist -from typing import Any, Callable, Dict, List, Optional -from coati.models.bloom import BLOOMActor, BLOOMCritic -from coati.models.gpt import GPTActor, GPTCritic -from coati.models.opt import OPTActor, OPTCritic -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from typing import Any + import torch -import os +import torch.distributed as dist +from torch.utils._pytree import 
tree_map def is_rank_0() -> bool: return not dist.is_initialized() or dist.get_rank() == 0 +def to_device(x: Any, device: torch.device) -> Any: + + def _to(t: Any): + if isinstance(t, torch.Tensor): + return t.to(device) + return t + + return tree_map(_to, x) diff --git a/applications/Chat/coati/utils/tokenizer_utils.py b/applications/Chat/coati/utils/tokenizer_utils.py index 80dcc55fca3e..e0d96cfc8be2 100644 --- a/applications/Chat/coati/utils/tokenizer_utils.py +++ b/applications/Chat/coati/utils/tokenizer_utils.py @@ -16,8 +16,6 @@ import transformers -from ..models.llama.llama_lm import LlamaLM - DEFAULT_PAD_TOKEN = "[PAD]" DEFAULT_EOS_TOKEN = "" DEFAULT_BOS_TOKEN = "" @@ -62,9 +60,6 @@ def smart_tokenizer_and_embedding_resize( if tokenizer.pad_token is None: num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) - if isinstance(model, LlamaLM): - model = model.get_base_model() - model.resize_token_embeddings(len(tokenizer)) if num_new_tokens > 0: diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md index 6113dbbb1ef2..7ace4bfe6d18 100644 --- a/applications/Chat/evaluate/README.md +++ b/applications/Chat/evaluate/README.md @@ -1,26 +1,36 @@ # Evaluation -In this directory we will introduce how you can evaluate your model with GPT-4. +In this directory, we introduce how you can evaluate your model with GPT-4. ## Evaluation Pipeline -The whole evaluation process undergoes two steps. - -1. Generate answers from different models: Use `generate_gpt35_answers.py` to generate answers of GPT 3.5 and use `generate_answers.py` to generate answers of your own models. -2. Evaluate models using GPT 4: Use `evaluate.py` to evaluate model answers with GPT-4. +The whole evaluation process undergoes the following three steps: +1. Prepare the questions following the internal data structure in the data format section (described below). +2. Generate answers from different models: + * Generate answers using GPT-3.5: [`generate_gpt35_answers.py`](generate_gpt35_answers.py). + * Generate answers using your own models: [`generate_answers.py`](generate_answers.py). +3. Evaluate models using GPT-4: [`evaluate.py`](evaluate.py). ### Generate Answers +#### Generate Answers Using GPT-3.5 +You can provide your own OpenAI key to generate answers from GPT-3.5 using [`generate_gpt35_answers.py`](./generate_gpt35_answers.py). -To generate answers, you should first format [FastChat's]([FastChat/question.jsonl at main · lm-sys/FastChat (github.com)](https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/table/question.jsonl)) `question.jsonl` file. We do this formatting because we would like to add more questions later and the pipeline for generating new questions may follow that of Self-Instruct and Stanford Alpaca. An example script is given as follows. - +An example script is provided as follows: ```shell -python format_questions.py \ - --questions_path "path to FastChat's question.jsonl" \ - --save_path "path to the formatted file" \ +python generate_gpt35_answers.py \ + --dataset "path to the question dataset" \ + --answer_path "path to answer folder" \ + --num_workers 4 \ + --openai_key "your openai key" \ + --max_tokens 512 \ +``` -``` +#### Generate Answers Using our Own Model +You can also generate answers using your own models. The generation process is divided into two stages: +1. Generate answers using multiple GPUs (optional) with batch processing: [`generate_answers.py`](./generate_answers.py). +2. 
Merge multiple shards and output a single file: [`merge.py`](./merge.py). -In `generate_answers.py`, the model will generate answers in a batch way and different GPU processes will do inference on different shards of the given questions. Once all GPU process generate its answers, `merge.py` will merge different shards of answers and output a single answer file. Finally, the script will also remove the answer shards. An example script is given as follows. +An example script is given as follows: ```shell device_number=number of your devices @@ -51,21 +61,9 @@ done ``` -`generate_gpt35_answers.py` will generate answers of GPT-3.5 An example script is given as follows. - -```shell -python generate_gpt35_answers.py \ - --dataset "path to the question dataset" \ - --answer_path "path to answer folder" \ - --num_workers 4 \ - --openai_key "your openai key" \ - --max_tokens 512 \ - -``` - ### Evaluate Answers -In `evaluate.py`, GPT-4 will help review and score answers of two different models. Here `Model 1` refers to the first model you specify in the `--answer_file_list` and `Model 2` refers to the second model. The script will finally print several metrics and output corresponding JSON files. +In [`evaluate.py`](./evaluate.py), GPT-4 helps to review and score answers of two different models. Here `Model 1` refers to the first model you specify in the `--answer_file_list` and `Model 2` refers to the second model. The script shows several metrics and outputs the corresponding JSON files. The metrics include: @@ -107,16 +105,23 @@ We would like to mention that the evaluation of model answers using the GPT-3.5 ## Data Format ### Questions - -We store questions in `questions.json`. The JSON file contains one list. Each element in the list is a question record. - -A question record has the following field: - -* `category` (str): The category of the question. -* `instruction` (str): The question. -* `input` (str): This is empty if you only use [FastChat's]([FastChat/question.jsonl at main · lm-sys/FastChat (github.com)](https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/table/question.jsonl)) questions. -* `output` (str): This is empty. -* `id` (int): The question id. +The file [`questions.json`](./sample/questions.json) shows the example questions used to evaluate the performance of the model. Each question record has the following field: +* `id` (int, compulsory): The ID of the instruction / question. +* `instruction` (str, compulsory): The instruction / question for the LLM. +* `input` (str, optional): The additional context of the instruction / question. +* `output` (str, optional): The sample output of the instruction / question. +* `category` (str, compulsory): The category of the instruction / question. + +Example: +``` +{ + "id": 0, + "instruction": "Help me summarize the following short story?", + "input": "{story}", + "output": "{summarized story}", + "category": "closed qa" +} +``` ### Answers @@ -124,11 +129,11 @@ We store model answers in `{model_name}_answers.json`. The JSON file contains on An answer record has the following field: -* `category` (str): The category of the question. -* `instruction` (str): The question. -* `input` (str): This is empty if you only use [FastChat's]([FastChat/question.jsonl at main · lm-sys/FastChat (github.com)](https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/table/question.jsonl)) questions. -* `output` (str): The answer to the question. -* `id` (int): The question id. 
+* `category` (str, compulsory): The category of the instruction / question. +* `instruction` (str, compulsory): The instruction / question for the LLM. +* `input` (str, optional): The additional context of the instruction / question. +* `output` (str, compulsory): The output from the LLM. +* `id` (int, compulsory): The ID of the instruction / question. ### Results @@ -136,12 +141,12 @@ We store evaluation results in `results.json`. The JSON file contains one dictio The value has the following field: -* `model` (list): The names of the two models. -* `better` (int): The number of reviews where Model 2 receives a higher score. -* `worse` (int): The number of reviews where Model 2 receives a lower score. -* `tie` (int): The number of reviews where two models play to a tie. -* `win_rate` (float): Win rate of Model 2. -* `score` (list): Average score of the two models. +* `model` (list, compulsory): The names of the two models. +* `better` (int, compulsory): The number of reviews where Model 2 receives a higher score. +* `worse` (int, compulsory): The number of reviews where Model 2 receives a lower score. +* `tie` (int, compulsory): The number of reviews where two models play to a tie. +* `win_rate` (float, compulsory): Win rate of Model 2. +* `score` (list, compulsory): Average score of the two models. ### Better, Worse, Tie, Invalid, Review @@ -149,24 +154,20 @@ To help better compare the model answers, we store JSON files whose name ends wi A record has the following field: -* `review_id` (str): Random UUID, not in use. -* `id` (int): The question id. -* `reviewer_id` (int): A unique ID for a reviewer. Different reviewer id use different prompts. -* `metadata` (dict): It is empty. -* `review` (str): GPT-4 's review. -* `score` (list): The scores of two models. +* `review_id` (str, optional): Random UUID, not in use. +* `id` (int, compulsory): The ID of the instruction / question. +* `reviewer_id` (int, compulsory): A unique ID for a reviewer. Different reviewer IDs use different prompts. +* `metadata` (dict, optional): It is empty. +* `review` (str, optional): GPT-4's review. +* `score` (list, compulsory): The scores of two models. ### Prompts -The data format is the same with [FastChat's]([FastChat/prompt.jsonl at main · lm-sys/FastChat (github.com)](https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/table/prompt.jsonl)) prompts. +The data format is the same as [`FastChat's`](https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/table/prompt.jsonl) prompts. ### Reviewer -The data format is the same with [FastChat's]([FastChat/reviewer.jsonl at main · lm-sys/FastChat (github.com)](https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/table/reviewer.jsonl)) reviewers. -## Plan - -- [ ] Extend the questions +The data format is the same as [`FastChat's`](https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/table/reviewer.jsonl) reviewers. 
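As a quick, hedged illustration of how the result fields documented above relate to the per-review score pairs, one could aggregate them as below. This is not the logic of `evaluate.py`; the exact win-rate and tie conventions may differ, and the score values are made up.

```python
# Hypothetical aggregation of per-review scores [Model 1, Model 2] into the summary
# fields documented above; conventions in the real evaluate.py may differ.
reviews = [[7.0, 8.0], [6.5, 6.5], [9.0, 7.5], [5.0, 8.0]]

better = sum(1 for s1, s2 in reviews if s2 > s1)
worse = sum(1 for s1, s2 in reviews if s2 < s1)
tie = sum(1 for s1, s2 in reviews if s2 == s1)
win_rate = better / len(reviews)
score = [sum(s1 for s1, _ in reviews) / len(reviews),
         sum(s2 for _, s2 in reviews) / len(reviews)]

print({"model": ["model_1", "model_2"], "better": better, "worse": worse,
       "tie": tie, "win_rate": win_rate, "score": score})
```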
## Citations diff --git a/applications/Chat/evaluate/format_questions.py b/applications/Chat/evaluate/format_questions.py deleted file mode 100644 index 9b47907c34bf..000000000000 --- a/applications/Chat/evaluate/format_questions.py +++ /dev/null @@ -1,31 +0,0 @@ -import argparse -import os -import json -import copy - -from utils import jdump, get_json_list - - -def format_questions(args): - questions = get_json_list(args.questions_path) - keys=questions[0].keys() - - formatted_questions=copy.deepcopy(questions) - for i in range(len(formatted_questions)): - formatted_questions[i]['instruction']=questions[i]['text'] - formatted_questions[i]['input']="" - formatted_questions[i]['output']="" - formatted_questions[i]['id']=questions[i]['question_id'] - for key in keys: - if key=="category": - continue - del formatted_questions[i][key] - - jdump(formatted_questions, args.save_path) - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--questions_path', type=str, default='table/question.jsonl') - parser.add_argument('--save_path', type=str, default="table/questions.json") - args = parser.parse_args() - format_questions(args) \ No newline at end of file diff --git a/applications/Chat/evaluate/format_questions.sh b/applications/Chat/evaluate/format_questions.sh deleted file mode 100755 index a7568da364ad..000000000000 --- a/applications/Chat/evaluate/format_questions.sh +++ /dev/null @@ -1,3 +0,0 @@ -python format_questions.py \ - --questions_path "path to FastChat's question.jsonl" \ - --save_path "path to the formatted file" \ diff --git a/applications/Chat/evaluate/sample/questions.json b/applications/Chat/evaluate/sample/questions.json new file mode 100644 index 000000000000..e9ef9f8b1c66 --- /dev/null +++ b/applications/Chat/evaluate/sample/questions.json @@ -0,0 +1,9 @@ +[ + { + "id": 0, + "instruction": "Help me summarize the following news?", + "input": "National Commercial Bank (NCB), Saudi Arabia's largest lender by assets, agreed to buy rival Samba Financial Group for $15 billion in the biggest banking takeover this year.NCB will pay 28.45 riyals ($7.58) for each Samba share, according to a statement on Sunday, valuing it at about 55.7 billion riyals. NCB will offer 0.739 new shares for each Samba share, at the lower end of the 0.736-0.787 ratio the banks set when they signed an initial framework agreement in June.The offer is a 3.5% premium to Samba's Oct. 8 closing price of 27.50 riyals and about 24% higher than the level the shares traded at before the talks were made public. Bloomberg News first reported the merger discussions.The new bank will have total assets of more than $220 billion, creating the Gulf region's third-largest lender. The entity's $46 billion market capitalization nearly matches that of Qatar National Bank QPSC, which is still the Middle East's biggest lender with about $268 billion of assets.", + "output": "NCB to pay 28.45 riyals for each Samba share. 
Deal will create Gulf region's third-largest lender", + "category": "closed qa" + } +] \ No newline at end of file diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md index af8ded005600..3e85bfe2d170 100644 --- a/applications/Chat/examples/README.md +++ b/applications/Chat/examples/README.md @@ -62,13 +62,14 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ --save_path /path/to/Coati-7B \ --dataset /path/to/data.json \ --batch_size 4 \ - --accimulation_steps 8 \ + --accumulation_steps 8 \ --lr 2e-5 \ --max_datasets_size 512 \ --max_epochs 1 \ + --grad_checkpoint ``` ### Arg List -- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='naive' +- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' - --model: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom' - --pretrain: pretrain model, type=str, default=None - --max_datasets_size: the max size of dataset, type=int, default=None @@ -78,6 +79,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ - --batch_size: batch size while training, type=int, default=4 - --lora_rank: low-rank adaptation matrices rank, type=int, default=0 - --log_interval: how many steps to log, type=int, default=100 +- --grad_checkpoint: enable gradient checkpointing, type=bool, default=False ## Stage2 - Training reward model @@ -115,7 +117,7 @@ Model performance in [Anthropics paper](https://arxiv.org/abs/2204.05862):
We also train the reward model based on LLaMA-7B, which reaches the ACC of 72.06% after 1 epoch, performing almost the same as Anthropic's best RM. ### Arg List -- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='naive' +- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' - --model: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom' - --pretrain: pretrain model, type=str, default=None - --model_path: the path of rm model(if continue to train), type=str, default=None @@ -146,20 +148,24 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \ --pretrain "/path/to/LLaMa-7B/" \ --model 'llama' \ --strategy colossalai_zero2 \ - --prompt_path /path/to/your/prompt_dataset \ + --prompt_dataset /path/to/your/prompt_dataset \ --pretrain_dataset /path/to/your/pretrain_dataset \ --rm_pretrain /your/pretrain/rm/defination \ --rm_path /your/rm/model/path ``` + +Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild. +Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning. + ### Arg List -- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='naive' +- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' - --model: model type of actor, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom' - --pretrain: pretrain model, type=str, default=None - --rm_model: reward model type, type=str, choices=['gpt2', 'bloom', 'opt', 'llama'], default=None - --rm_pretrain: pretrain model for reward model, type=str, default=None - --rm_path: the path of rm model, type=str, default=None - --save_path: path to save the model, type=str, default='output' -- --prompt_path: path of the prompt dataset, type=str, default=None +- --prompt_dataset: path of the prompt dataset, type=str, default=None - --pretrain_dataset: path of the ptx dataset, type=str, default=None - --need_optim_ckpt: whether to save optim ckpt, type=bool, default=False - --num_episodes: num of episodes for training, type=int, default=10 @@ -250,29 +256,6 @@ class CoatiActor(Actor): super().__init__(model, lora_rank, lora_train_bias) ``` -### LM model - -``` -from ..base import LM -from transformers.models.coati import CoatiModel - -class GPTLM(LM): - - def __init__(self, - pretrained: Optional[str] = None, - checkpoint: bool = False, - lora_rank: int = 0, - lora_train_bias: str = 'none') -> None: - if pretrained is not None: - model = CoatiModel.from_pretrained(pretrained) - else: - model = build_model() # load your own model if it is not support in transformers - - super().__init__(model, lora_rank, lora_train_bias) - - def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): - return self.model(input_ids, attention_mask=attention_mask, labels=labels, **kwargs) -``` ### Reward model ``` from ..base import RewardModel diff --git 
a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py index fcc65e24478a..9bd0ebc12a83 100644 --- a/applications/Chat/examples/community/peft/train_peft_sft.py +++ b/applications/Chat/examples/community/peft/train_peft_sft.py @@ -154,7 +154,7 @@ def train(args): eval_dataloader=eval_dataloader, batch_size=args.batch_size, max_epochs=args.max_epochs, - accimulation_steps=args.accimulation_steps) + accumulation_steps=args.accumulation_steps) trainer.fit(logger=logger, log_interval=args.log_interval) @@ -183,7 +183,7 @@ def train(args): parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log") parser.add_argument('--lr', type=float, default=5e-6) - parser.add_argument('--accimulation_steps', type=int, default=8) + parser.add_argument('--accumulation_steps', type=int, default=8) parser.add_argument('--enable_peft_lora', action='store_true', default=False) parser.add_argument("--is_short_text", action='store_true', default=False) args = parser.parse_args() diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh index 32f5858a51b6..2b049163c801 100755 --- a/applications/Chat/examples/test_ci.sh +++ b/applications/Chat/examples/test_ci.sh @@ -31,16 +31,19 @@ torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'bigsci --model 'bloom' --strategy colossalai_zero2 --lora_rank 4\ --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \ --save_path ${BASE}/output +rm -rf ${BASE}/output torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'gpt2' \ --model 'gpt2' --strategy colossalai_zero2 \ --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \ --save_path ${BASE}/output +rm -rf ${BASE}/output torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'facebook/opt-350m' \ --model 'opt' --strategy colossalai_zero2 --lora_rank 4\ --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \ --save_path ${BASE}/output +rm -rf ${BASE}/output torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'gpt2' \ --model 'gpt2' --strategy ddp --lora_rank 4\ @@ -59,14 +62,14 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --pretrain 'facebook/opt-350m' --model 'opt' \ --strategy colossalai_zero2 --loss_fn 'log_sig'\ --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \ - --test True --lora_rank 4 \ + --test True --lora_rank 0 \ --save_path ${BASE}/rm_ckpt_opt.pt torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --pretrain 'gpt2' --model 'gpt2' \ --strategy colossalai_zero2 --loss_fn 'log_exp' \ --dataset 'Dahoas/rm-static' \ - --test True --lora_rank 4 \ + --test True --lora_rank 0 \ --save_path ${BASE}/rm_ckpt_gpt.pt torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ @@ -75,6 +78,7 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --dataset 'Dahoas/rm-static' \ --test True --lora_rank 4 \ --save_path ${BASE}/rm_ckpt.pt +rm -rf ${BASE}/rm_ckpt.pt torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --pretrain 'bigscience/bloom-560m' --model 'bloom' \ @@ -82,6 +86,7 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \ --test True --lora_rank 4 \ --save_path ${BASE}/rm_ckpt.pt +rm -rf 
${BASE}/rm_ckpt.pt torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --pretrain 'microsoft/deberta-v3-large' --model 'deberta' \ @@ -89,6 +94,7 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \ --test True --lora_rank 4 \ --save_path ${BASE}/rm_ckpt.pt +rm -rf ${BASE}/rm_ckpt.pt torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --pretrain 'roberta-base' --model 'roberta' \ @@ -99,7 +105,7 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ rm -rf ${BASE}/rm_ckpt.pt -torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_path $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \ +torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \ --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \ --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \ --pretrain 'facebook/opt-350m' --model opt \ @@ -108,7 +114,7 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_path --save_path ${BASE}/actor_checkpoint_prompts.pt rm -rf ${BASE}/rm_ckpt_opt.pt -torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_path $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \ +torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \ --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \ --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \ --pretrain 'gpt2' --model gpt2 \ @@ -117,4 +123,4 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_path --save_path ${BASE}/actor_checkpoint_prompts.pt rm -rf ${BASE}/rm_ckpt_gpt.pt -rm -rf ${BASE}/actor_checkpoint_prompts.pt \ No newline at end of file +rm -rf ${BASE}/actor_checkpoint_prompts.pt diff --git a/applications/Chat/examples/train_dummy.py b/applications/Chat/examples/train_dummy.py deleted file mode 100644 index 5f34c80f0892..000000000000 --- a/applications/Chat/examples/train_dummy.py +++ /dev/null @@ -1,156 +0,0 @@ -import argparse -from copy import deepcopy - -import torch -from coati.models.base import RewardModel -from coati.models.bloom import BLOOMActor, BLOOMCritic -from coati.models.gpt import GPTActor, GPTCritic -from coati.models.opt import OPTActor, OPTCritic -from coati.models.roberta import RoBERTaActor, RoBERTaCritic -from coati.trainer import PPOTrainer -from coati.trainer.callbacks import SaveCheckpoint -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast, RobertaTokenizer -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - - -def preprocess_batch(samples): - input_ids = torch.stack(samples) - attention_mask = torch.ones_like(input_ids, dtype=torch.long) - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -def main(args): - # configure strategy - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': - strategy = DDPStrategy() - elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) - elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') - else: - raise 
ValueError(f'Unsupported strategy "{args.strategy}"') - - # configure model - with strategy.model_init_context(): - if args.model == 'gpt2': - actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - critic = GPTCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - elif args.model == 'bloom': - actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - critic = BLOOMCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - elif args.model == 'opt': - actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - critic = OPTCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - elif args.model == 'roberta': - actor = RoBERTaActor(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - critic = RoBERTaCritic(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) - else: - raise ValueError(f'Unsupported model "{args.model}"') - - initial_model = deepcopy(actor).to(torch.cuda.current_device()) - reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).to(torch.cuda.current_device()) - - # configure optimizer - if args.strategy.startswith('colossalai'): - actor_optim = HybridAdam(actor.parameters(), lr=5e-6) - critic_optim = HybridAdam(critic.parameters(), lr=5e-6) - else: - actor_optim = Adam(actor.parameters(), lr=5e-6) - critic_optim = Adam(critic.parameters(), lr=5e-6) - - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - elif args.model == 'roberta': - tokenizer = RobertaTokenizer.from_pretrained("roberta-base") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare( - (actor, actor_optim), (critic, critic_optim), reward_model, initial_model) - - callbacks = [] - if args.save_ckpt_path: - ckpt_callback = SaveCheckpoint( - args.save_ckpt_path, - args.save_ckpt_interval, - strategy, - actor, - critic, - actor_optim, - critic_optim, - ) - callbacks.append(ckpt_callback) - - # configure trainer - - trainer = PPOTrainer(strategy, - actor, - critic, - reward_model, - initial_model, - actor_optim, - critic_optim, - max_epochs=args.max_epochs, - train_batch_size=args.train_batch_size, - tokenizer=preprocess_batch, - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - callbacks=callbacks) - - random_prompts = torch.randint(tokenizer.vocab_size, (1000, 1, 64), device=torch.cuda.current_device()) - random_attention_mask = torch.randint(1, (1000, 1, 64), device=torch.cuda.current_device()).to(torch.bool) - random_pretrain = [{'input_ids':random_prompts[i], 'labels':random_prompts[i], 'attention_mask':random_attention_mask[i]} for i in range(1000)] - trainer.fit(random_prompts, random_pretrain, - num_episodes=args.num_episodes, - max_timesteps=args.max_timesteps, - update_timesteps=args.update_timesteps) - - # save model checkpoint after fitting - 
trainer.save_model(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - strategy.save_optimizer(actor_optim, - 'actor_optim_checkpoint_dummy_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', type=str, default='gpt2', choices=['gpt2', 'bloom', 'opt', 'roberta']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_dummy') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=50) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - parser.add_argument('--save_ckpt_path', - type=str, - default=None, - help="path to save checkpoint, None means not to save") - parser.add_argument('--save_ckpt_interval', type=int, default=1, help="the interval of episode to save checkpoint") - args = parser.parse_args() - main(args) diff --git a/applications/Chat/examples/train_dummy.sh b/applications/Chat/examples/train_dummy.sh deleted file mode 100755 index 595da573e2b1..000000000000 --- a/applications/Chat/examples/train_dummy.sh +++ /dev/null @@ -1,18 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | head -n $n) - export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') - echo "Now CUDA_VISIBLE_DEVICES is set to:" - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -} - -set_n_least_used_CUDA_VISIBLE_DEVICES 2 - -torchrun --standalone --nproc_per_node=2 train_dummy.py --strategy colossalai_zero2 diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py index 2086ff003e34..a584991cd34e 100644 --- a/applications/Chat/examples/train_prompts.py +++ b/applications/Chat/examples/train_prompts.py @@ -71,9 +71,8 @@ def main(args): if args.rm_path is not None: reward_model.load_state_dict(state_dict) - if args.strategy != 'colossalai_gemini': - initial_model.to(torch.float16).to(torch.cuda.current_device()) - reward_model.to(torch.float16).to(torch.cuda.current_device()) + initial_model.to(torch.float16).to(torch.cuda.current_device()) + reward_model.to(torch.float16).to(torch.cuda.current_device()) with strategy.model_init_context(): if args.model == 'gpt2': @@ -140,7 +139,7 @@ def main(args): data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) - prompt_dataset = PromptDataset(tokenizer=tokenizer, data_path=args.prompt_path, max_datasets_size=16384) + prompt_dataset = PromptDataset(tokenizer=tokenizer, data_path=args.prompt_dataset, max_datasets_size=16384) if dist.is_initialized() and dist.get_world_size() > 1: prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True) else: @@ -148,9 +147,12 
@@ def main(args): prompt_dataloader = DataLoader(prompt_dataset, shuffle=(prompt_sampler is None), sampler=prompt_sampler, - batch_size=args.train_batch_size) + batch_size=args.experience_batch_size) - pretrain_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=args.pretrain_dataset, max_datasets_size=16384) + pretrain_dataset = SupervisedDataset(tokenizer=tokenizer, + data_path=args.pretrain_dataset, + max_datasets_size=16384, + max_length=args.max_input_len) if dist.is_initialized() and dist.get_world_size() > 1: pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True) else: @@ -161,12 +163,6 @@ def main(args): batch_size=args.ptx_batch_size, collate_fn=data_collator) - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.to(torch.cuda.current_device()) for k, v in batch.items()} - (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim)) # configure trainer @@ -182,9 +178,8 @@ def tokenize_fn(texts): ptx_coef=args.ptx_coef, max_epochs=args.max_epochs, train_batch_size=args.train_batch_size, - experience_batch_size=args.experience_batch_size, - tokenizer=tokenize_fn, - max_length=128, + max_length=args.max_seq_len, + use_cache=True, do_sample=True, temperature=1.0, top_k=50, @@ -199,7 +194,7 @@ def tokenize_fn(texts): update_timesteps=args.update_timesteps) # save model checkpoint after fitting - trainer.save_model(args.save_path, only_rank0=True, tokenizer=tokenizer) + strategy.save_model(actor, args.save_path, only_rank0=True) # save optimizer checkpoint on all ranks if args.need_optim_ckpt: strategy.save_optimizer(actor_optim, @@ -209,11 +204,11 @@ def tokenize_fn(texts): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--prompt_path', type=str, default=None, help='path to the prompt dataset') + parser.add_argument('--prompt_dataset', type=str, default=None, help='path to the prompt dataset') parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset') parser.add_argument('--strategy', choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive', + default='colossalai_zero2', help='strategy to use') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama', 'roberta']) parser.add_argument('--pretrain', type=str, default=None) @@ -232,5 +227,7 @@ def tokenize_fn(texts): parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") parser.add_argument('--kl_coef', type=float, default=0.1) parser.add_argument('--ptx_coef', type=float, default=0.9) + parser.add_argument('--max_input_len', type=int, default=96) + parser.add_argument('--max_seq_len', type=int, default=128) args = parser.parse_args() main(args) diff --git a/applications/Chat/examples/train_prompts.sh b/applications/Chat/examples/train_prompts.sh index 8e1ce67ecc64..7f3b2636ca32 100755 --- a/applications/Chat/examples/train_prompts.sh +++ b/applications/Chat/examples/train_prompts.sh @@ -17,4 +17,4 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 2 # torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai_zero2 -torchrun --standalone --nproc_per_node=2 train_prompts.py --prompt_path 
/path/to/data.json --strategy colossalai_zero2 +torchrun --standalone --nproc_per_node=2 train_prompts.py --prompt_dataset /path/to/data.json --strategy colossalai_zero2 diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py index 6a788a891ca6..48b12336fa67 100644 --- a/applications/Chat/examples/train_reward_model.py +++ b/applications/Chat/examples/train_reward_model.py @@ -124,11 +124,23 @@ def train(args): raise ValueError(f'Unsupported dataset "{args.dataset}"') if dist.is_initialized() and dist.get_world_size() > 1: - train_sampler = DistributedSampler(train_dataset, shuffle=True, seed=42, drop_last=True, rank=dist.get_rank(), + train_sampler = DistributedSampler(train_dataset, + shuffle=True, + seed=42, + drop_last=True, + rank=dist.get_rank(), num_replicas=dist.get_world_size()) - valid_sampler = DistributedSampler(valid_dataset, shuffle=True, seed=42, drop_last=True, rank=dist.get_rank(), + valid_sampler = DistributedSampler(valid_dataset, + shuffle=True, + seed=42, + drop_last=True, + rank=dist.get_rank(), num_replicas=dist.get_world_size()) - eval_sampler = DistributedSampler(eval_dataset, shuffle=True, seed=42, drop_last=True, rank=dist.get_rank(), + eval_sampler = DistributedSampler(eval_dataset, + shuffle=True, + seed=42, + drop_last=True, + rank=dist.get_rank(), num_replicas=dist.get_world_size()) else: train_sampler = None @@ -141,13 +153,19 @@ def train(args): batch_size=args.batch_size, pin_memory=True) - valid_dataloader = DataLoader(valid_dataset, shuffle=(valid_sampler is None), + valid_dataloader = DataLoader(valid_dataset, + shuffle=(valid_sampler is None), sampler=valid_sampler, - batch_size=args.batch_size, pin_memory=True) + batch_size=args.batch_size, + pin_memory=True) - eval_dataloader = DataLoader(eval_dataset, shuffle=(eval_sampler is None), - sampler=eval_sampler, batch_size=args.batch_size, pin_memory=True) + eval_dataloader = DataLoader(eval_dataset, + shuffle=(eval_sampler is None), + sampler=eval_sampler, + batch_size=args.batch_size, + pin_memory=True) + (model, optim) = strategy.prepare((model, optim)) trainer = RewardModelTrainer(model=model, strategy=strategy, optim=optim, @@ -155,12 +173,11 @@ def train(args): train_dataloader=train_dataloader, valid_dataloader=valid_dataloader, eval_dataloader=eval_dataloader, - batch_size=args.batch_size, max_epochs=args.max_epochs) trainer.fit() # save model checkpoint after fitting on only rank0 - trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer) + strategy.save_model(model, args.save_path, only_rank0=True) # save optimizer checkpoint on all ranks if args.need_optim_ckpt: strategy.save_optimizer(trainer.optimizer, @@ -172,7 +189,7 @@ def train(args): parser = argparse.ArgumentParser() parser.add_argument('--strategy', choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') + default='colossalai_zero2') parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'deberta', 'llama', 'roberta'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--model_path', type=str, default=None) diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py index d7502c23b5e6..da499f068b17 100644 --- a/applications/Chat/examples/train_sft.py +++ b/applications/Chat/examples/train_sft.py @@ -5,11 +5,7 @@ import torch import torch.distributed as dist from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, 
SupervisedDataset -from coati.models.base import RewardModel -from coati.models.bloom import BLOOMLM -from coati.models.gpt import GPTLM -from coati.models.llama import LlamaLM -from coati.models.opt import OPTLM +from coati.models import convert_to_lora_module from coati.trainer import SFTTrainer from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy from coati.utils import prepare_llama_tokenizer_and_embedding @@ -17,8 +13,12 @@ from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from transformers import AutoTokenizer, BloomTokenizerFast +from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM, BloomTokenizerFast, LlamaConfig, LlamaForCausalLM +from transformers.models.gpt2.configuration_gpt2 import GPT2Config +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer +from transformers.models.opt.configuration_opt import OPTConfig +from transformers.models.opt.modeling_opt import OPTForCausalLM from colossalai.logging import get_dist_logger from colossalai.nn.optimizer import HybridAdam @@ -32,6 +32,8 @@ def train(args): elif args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': + raise NotImplementedError( + 'Gemini is not supported .from_pretrained() yet. We will update this after checkpoint io is ready.') strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') elif args.strategy == 'colossalai_zero2': strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') @@ -43,16 +45,19 @@ def train(args): # configure model with strategy.model_init_context(): if args.model == 'bloom': - model = BLOOMLM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + model = convert_to_lora_module(BloomForCausalLM.from_pretrained(args.pretrain), + args.lora_rank).half().cuda() elif args.model == 'opt': - model = OPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + model = convert_to_lora_module(OPTForCausalLM.from_pretrained(args.pretrain), args.lora_rank).half().cuda() elif args.model == 'gpt2': - model = GPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + model = convert_to_lora_module(GPT2LMHeadModel.from_pretrained(args.pretrain), args.lora_rank).half().cuda() elif args.model == 'llama': - model = LlamaLM(pretrained=args.pretrain, lora_rank=args.lora_rank, - checkpoint=True).to(torch.float16).to(torch.cuda.current_device()) + model = convert_to_lora_module(LlamaForCausalLM.from_pretrained(args.pretrain), + args.lora_rank).half().cuda() else: raise ValueError(f'Unsupported model "{args.model}"') + if args.grad_checkpoint: + model.gradient_checkpointing_enable() # configure tokenizer if args.model == 'gpt2': @@ -147,19 +152,19 @@ def train(args): else: eval_dataloader = None + (model, optim) = strategy.prepare((model, optim)) trainer = SFTTrainer(model=model, strategy=strategy, optim=optim, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, - batch_size=args.batch_size, max_epochs=args.max_epochs, - accimulation_steps=args.accimulation_steps) + accumulation_steps=args.accumulation_steps) - trainer.fit(logger=logger, log_interval=args.log_interval) + trainer.fit(logger=logger, use_wandb=args.use_wandb) # save model checkpoint after fitting on only rank0 - trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer) + 
strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer) # save optimizer checkpoint on all ranks if args.need_optim_ckpt: strategy.save_optimizer(trainer.optimizer, @@ -171,7 +176,7 @@ def train(args): parser = argparse.ArgumentParser() parser.add_argument('--strategy', choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'], - default='naive') + default='colossalai_zero2') parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--dataset', type=str, default=None) @@ -184,6 +189,8 @@ def train(args): parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log") parser.add_argument('--lr', type=float, default=5e-6) - parser.add_argument('--accimulation_steps', type=int, default=8) + parser.add_argument('--accumulation_steps', type=int, default=8) + parser.add_argument('--use_wandb', default=False, action='store_true') + parser.add_argument('--grad_checkpoint', default=False, action='store_true') args = parser.parse_args() train(args) diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index 73710d1b19f8..c880f85825a7 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -6,7 +6,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ --save_path /path/to/Coati-7B \ --dataset /path/to/data.json \ --batch_size 4 \ - --accimulation_steps 8 \ + --accumulation_steps 8 \ --lr 2e-5 \ --max_datasets_size 512 \ --max_epochs 1 \ diff --git a/colossalai/_analyzer/fx/codegen.py b/colossalai/_analyzer/fx/codegen.py index b768e59004b1..41d74f2e3719 100644 --- a/colossalai/_analyzer/fx/codegen.py +++ b/colossalai/_analyzer/fx/codegen.py @@ -138,7 +138,7 @@ def emit_ckpt_func(body, delete_unused_value_func, ckpt_level=0, in_ckpt=False): - """Emit ckpt fuction in nested way + """Emit ckpt function in nested way Args: body: forward code - in recursive calls, this part will be checkpoint diff --git a/colossalai/auto_parallel/offload/region.py b/colossalai/auto_parallel/offload/region.py index 9a2f558c3145..819ffbd96eb1 100644 --- a/colossalai/auto_parallel/offload/region.py +++ b/colossalai/auto_parallel/offload/region.py @@ -111,7 +111,7 @@ def copy_grad_to_region_slice(self, param: torch.nn.Parameter, data_slice: torch Copy data slice to the memory space indexed by the input tensor in the region. Args: - param (torch.nn.Parameter): the param used to retrive meta information + param (torch.nn.Parameter): the param used to retrieve meta information data_slice (torch.Tensor): the tensor to be copied to the region """ diff --git a/colossalai/auto_parallel/offload/training_simulator.py b/colossalai/auto_parallel/offload/training_simulator.py index f277c183a912..de58023ec2d6 100644 --- a/colossalai/auto_parallel/offload/training_simulator.py +++ b/colossalai/auto_parallel/offload/training_simulator.py @@ -22,7 +22,7 @@ class TrainingSimulator(ABC): Args: region_list (List[Region]): represents the linearized DNN computing graph. - comp_power (float): the NVIDIA GPU FP16 compuing power. + comp_power (float): the NVIDIA GPU FP16 computing power. link_to_bw (Dict[str, Dict[float, float]]): communication links and the corresponding bandwidth. 
""" diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py index e1d0c627274e..08af846b221d 100644 --- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py +++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py @@ -149,7 +149,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh def _extract_target_dim(node): ''' - A helper function to etract the target dimension from size node. + A helper function to extract the target dimension from size node. There are two usages of torch.Tensor.size: 1. tensor.size() 2. tensor.size(dim) @@ -427,7 +427,7 @@ def _shard_param(param, target_sharding_spec): if target_sharding_spec.dim_partition_dict != {}: origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {}) setattr(param, 'sharding_spec', origin_sharding_spec) - # TODO: build a ColoParamter class to manager the distributed parameters + # TODO: build a ColoParameter class to manager the distributed parameters # we could use .data here, because all the operations just happen before the real training # loop, so we don't need to track these operations in the autograd graph. param = torch.nn.Parameter( diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 2cbc6c9221aa..d0a467254d72 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -287,7 +287,7 @@ def emit_code_with_chunk(body: List[str], body = _replace_new_tensor_like_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body) # new tensor body = _replace_new_tensor_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body) - # reassgin reshape size + # reassign reshape size body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"]) body[-1] = " " + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py index 08a55f9aa04a..77bc2ef17bc3 100644 --- a/colossalai/autochunk/estimate_memory.py +++ b/colossalai/autochunk/estimate_memory.py @@ -153,7 +153,7 @@ def estimate_chunk_inference_mem(self, node_list: List, chunk_infos: Dict = None Returns: act_memory_peak_log (List): peak memory of every node - act_memory_after_node_log (List): memory after excuting every node + act_memory_after_node_log (List): memory after executing every node active_node_list_log (List): active nodes of every node. active nodes refer to nodes generated but not deleted. """ diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py index 326445ee9f12..59645c80e808 100644 --- a/colossalai/autochunk/search_chunk.py +++ b/colossalai/autochunk/search_chunk.py @@ -16,7 +16,7 @@ class SearchChunk(object): This is the core class for AutoChunk. It defines the framework of the strategy of AutoChunk. - Chunks will be selected one by one utill search stops. + Chunks will be selected one by one until search stops. The chunk search is as follows: 1. 
find the peak memory node @@ -73,7 +73,7 @@ def _init_trace(self) -> None: def _find_peak_region(self, mem_peak: List) -> int: """ - find peak node, along with its neighbour nodes exceeds max mem + find peak node, along with its neighbor nodes exceeds max mem """ max_value = max(mem_peak) max_idx = mem_peak.index(max_value) @@ -118,7 +118,7 @@ def _search_max_chunk_region(self, active_node: List, peak_region: int, chunk_re chunk_region_start (int) chunk_region_end (int) """ - # check if peak node already in chunkinfo + # check if peak node already in chunk info if chunk_regions is not None: for i in chunk_regions: if i["region"][0] < peak_region[0] <= i["region"][1] or \ diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py index 16815215f52b..db25267e9b42 100644 --- a/colossalai/autochunk/trace_flow.py +++ b/colossalai/autochunk/trace_flow.py @@ -479,7 +479,7 @@ def check_region_start_end(self, start_node: Node, start_dim: int, start_idx: in # check index source align if not self.check_index_source(start_dim, start_node, start_idx, end_dim, end_node): return False - # check index copmute + # check index compute if not self.check_index_compute(start_idx, end_dim, end_node, end_idx): return False return True diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py index 307f4de326d7..c7fce4c8bee1 100644 --- a/colossalai/autochunk/trace_indice.py +++ b/colossalai/autochunk/trace_indice.py @@ -8,7 +8,7 @@ class TraceIndice(object): """ - Trace all indice infomation for every node. + Trace all indice information for every node. Indice is a logical concept. Equal dims can been treated as one indice. eg. dim(x1) = [a, b, c] @@ -153,7 +153,7 @@ def _inherit_all_indice(self, node_from: Node, node_to: Node) -> None: def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None: """ - inheirt indice from node without init + inherit indice from node without init """ if exclude == None: exclude = [] @@ -301,7 +301,7 @@ def _assign_permute_indice(self, node: Node, node_idx: int) -> None: def _assign_linear_indice(self, node: Node, node_idx: int) -> None: """ Assign indice for linear op. - 1. copy trace from input node and change last indice accroding to weight + 1. copy trace from input node and change last indice according to weight 2. mark equal for input node last indice, weight first dim and bias dim. 3. inherit input's computation, mark computation for last dim. @@ -360,7 +360,7 @@ def _assign_baddbmm_indice(self, node: Node, node_idx: int) -> None: def _assign_matmul_indice(self, node: Node, node_idx: int) -> None: """ Assign indice for matmul op. - 1. copy trace from matmul_left and change last indice accroding to matmul_right. (assert they have same length) + 1. copy trace from matmul_left and change last indice according to matmul_right. (assert they have same length) 2. mark equal for input matmul_left -1 indice and matmul_right -2 dim. 3. inherit matmul_left and matmul_right computation, mark computation for last dim. @@ -720,11 +720,11 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None: Assign indice for view and reshape op. 1. get origin shape and target shape by meta info. 2. compute the real value of -1 in target shape. - 3. determine changed dim, and assgin indice for generated dim. + 3. determine changed dim, and assign indice for generated dim. 4. log changed dim and generated dim for restore 5. inherit computation. 6. 
look into view list to see whether the view is associated with other, - if so assgin equal dim according to previous view. + if so assign equal dim according to previous view. Args: node (node) diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py index 1ad9f7f20ec1..c14e602deaf5 100644 --- a/colossalai/booster/booster.py +++ b/colossalai/booster/booster.py @@ -20,7 +20,7 @@ class Booster: """ Booster is a high-level API for training neural networks. It provides a unified interface for - training with different precisio, accelerator, and plugin. + training with different precision, accelerator, and plugin. Examples: >>> colossalai.launch(...) diff --git a/colossalai/booster/plugin/__init__.py b/colossalai/booster/plugin/__init__.py index 8e09b6cb281d..aa45bcb59ad7 100644 --- a/colossalai/booster/plugin/__init__.py +++ b/colossalai/booster/plugin/__init__.py @@ -1,5 +1,6 @@ from .gemini_plugin import GeminiPlugin +from .low_level_zero_plugin import LowLevelZeroPlugin from .plugin_base import Plugin from .torch_ddp_plugin import TorchDDPPlugin -__all__ = ['Plugin', 'TorchDDPPlugin', 'GeminiPlugin'] +__all__ = ['Plugin', 'TorchDDPPlugin', 'GeminiPlugin', 'LowLevelZeroPlugin'] diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py new file mode 100644 index 000000000000..969c430bd317 --- /dev/null +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -0,0 +1,259 @@ +import random +import warnings +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +from torch import Tensor +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler as LRScheduler +from torch.utils._pytree import tree_map +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from colossalai.checkpoint_io import CheckpointIO +from colossalai.interface import ModelWrapper, OptimizerWrapper +from colossalai.utils import get_current_device +from colossalai.zero import zero_model_wrapper, zero_optim_wrapper + +from .plugin_base import Plugin +from .torch_ddp_plugin import TorchDDPCheckpointIO + +__all__ = ['LowLevelZeroPlugin'] + + +def _convert_to_fp16(x): + if isinstance(x, torch.Tensor) and torch.is_floating_point(x): + return x.half() + return x + + +class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO): + + def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool): + """ + Save optimizer to checkpoint but only on master process. 
+ """ + # TODO(ver217): optimizer state dict is sharded + super().save_unsharded_optimizer(optimizer, checkpoint, gather_dtensor) + + +class LowLevelZeroModel(ModelWrapper): + + def __init__(self, module: nn.Module, stage: int, precision: str) -> None: + super().__init__(module) + self.convert_inputs = (precision == 'fp16') + module = zero_model_wrapper(module, zero_stage=stage) + if precision == 'fp16': + module = module.half() + module = module.to(get_current_device()) + self.module = module + + def forward(self, *args, **kwargs): + if self.convert_inputs: + args = tree_map(_convert_to_fp16, args) + kwargs = tree_map(_convert_to_fp16, kwargs) + return super().forward(*args, **kwargs) + + +class LowLevelZeroOptimizer(OptimizerWrapper): + + def __init__(self, + module: nn.Module, + optimizer: Optimizer, + zero_optim_config: dict, + optim_kwargs: dict, + verbose: bool = False) -> None: + optimizer = zero_optim_wrapper(module, + optimizer, + optim_config=zero_optim_config, + **optim_kwargs, + verbose=verbose) + super().__init__(optimizer) + + def backward(self, loss: Tensor, *args, **kwargs): + self.optim.backward(loss) + + def clip_grad_by_norm(self, + max_norm: Union[float, int], + norm_type: Union[float, int] = 2, + error_if_nonfinite: bool = False, + *args, + **kwargs) -> Tensor: + warnings.warn(f'LowLevelZero controls grad clipping by itself, so you should not use clip_grad_by_norm') + + def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None: + raise NotImplementedError('LowLevelZero does not support clip_grad_by_value') + + +class LowLevelZeroPlugin(Plugin): + """ + Plugin for low level zero. + + Example: + >>> from colossalai.booster import Booster + >>> from colossalai.booster.plugin import LowLevelZeroPlugin + >>> + >>> model, train_dataset, optimizer, criterion = ... + >>> plugin = LowLevelZeroPlugin() + + >>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8) + >>> booster = Booster(plugin=plugin) + >>> model, optimizer, train_dataloader, criterion = booster.boost(model, optimizer, train_dataloader, criterion) + + Args: + strage (int, optional): ZeRO stage. Defaults to 1. + precision (str, optional): precision. Support 'fp16' and 'fp32'. Defaults to 'fp16'. + initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32. + min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1. + growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2. + backoff_factor (float, optional): backoff_factor used by DynamicGradScaler. Defaults to 0.5. + growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000. + hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2. + max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32. + max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do + clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm. + norm_type (float, optional): norm_type used for `clip_grad_norm`. + reduce_bucket_size_in_m (int, optional): grad reduce bucket size in M. Defaults to 12. + communication_dtype (torch.dtype, optional): communication dtype. If not specified, the dtype of param will be used. Defaults to None. + overlap_communication (bool, optional): whether to overlap communication and computation. Defaults to True. 
+ cpu_offload (bool, optional): whether to offload grad, master weight and optimizer state to cpu. Defaults to False. + verbose (bool, optional): verbose mode. Debug info including grad overflow will be printed. Defaults to False. + """ + + def __init__( + self, + stage: int = 1, + precision: str = 'fp16', + initial_scale: float = 2**32, + min_scale: float = 1, + growth_factor: float = 2, + backoff_factor: float = 0.5, + growth_interval: int = 1000, + hysteresis: int = 2, + max_scale: float = 2**32, + max_norm: float = 0.0, + norm_type: float = 2.0, + reduce_bucket_size_in_m: int = 12, + communication_dtype: Optional[torch.dtype] = None, + overlap_communication: bool = True, + cpu_offload: bool = False, + verbose: bool = False, + ) -> None: + + assert dist.is_initialized( + ), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment' + assert stage in (1, 2), f'LowLevelZeroPlugin only supports stage 1/2 training' + assert precision in ('fp16', 'fp32'), f'LowLevelZeroPlugin only supports fp16/fp32 training' + + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + + self.stage = stage + self.precision = precision + self.zero_optim_config = dict(reduce_bucket_size=reduce_bucket_size_in_m * 1024 * 1024, + communication_dtype=communication_dtype, + overlap_communication=overlap_communication, + cpu_offload=cpu_offload) + self.optim_kwargs = dict(initial_scale=initial_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + min_scale=min_scale, + max_scale=max_scale, + max_norm=max_norm, + norm_type=norm_type) + self.verbose = verbose + + def support_no_sync(self) -> bool: + return False + + def control_precision(self) -> bool: + return True + + def supported_precisions(self) -> List[str]: + return ['fp16', 'fp32'] + + def control_device(self) -> bool: + return True + + def supported_devices(self) -> List[str]: + return ['cuda'] + + def prepare_train_dataloader(self, + dataset, + batch_size, + shuffle=False, + seed=1024, + drop_last=False, + pin_memory=False, + num_workers=0, + **kwargs): + r""" + Prepare a dataloader for distributed training. The dataloader will be wrapped by + `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`. + + Note: + 1. Evaluation datasets should not be passed to this function. + + Args: + dataset (`torch.utils.data.Dataset`): The dataset to be loaded. + shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False. + seed (int, optional): Random worker seed for sampling, defaults to 1024. + add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True. + drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size + is not divisible by the batch size. If False and the size of dataset is not divisible by + the batch size, then the last batch will be smaller, defaults to False. + pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False. + num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0. + kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in + `DataLoader `_. + + Returns: + :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing. 
+ """ + _kwargs = kwargs.copy() + sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle) + + # Deterministic dataloader + def seed_worker(worker_id): + worker_seed = seed + np.random.seed(worker_seed) + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + return DataLoader(dataset, + batch_size=batch_size, + sampler=sampler, + worker_init_fn=seed_worker, + drop_last=drop_last, + pin_memory=pin_memory, + num_workers=num_workers, + **_kwargs) + + def configure( + self, + model: nn.Module, + optimizer: Optimizer, + criterion: Callable = None, + dataloader: DataLoader = None, + lr_scheduler: LRScheduler = None, + ) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader]]: + + if not isinstance(model, ModelWrapper): + model = LowLevelZeroModel(model, self.stage, self.precision) + + if not isinstance(optimizer, OptimizerWrapper): + optimizer = LowLevelZeroOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs, + self.verbose) + + return model, optimizer, criterion, dataloader, lr_scheduler + + def control_checkpoint_io(self) -> bool: + return True + + def get_checkpoint_io(self) -> CheckpointIO: + return LowLevelZeroCheckpointIO() diff --git a/colossalai/checkpoint_io/checkpoint_io_base.py b/colossalai/checkpoint_io/checkpoint_io_base.py index 3f8b0b0a6b47..cb853559c48c 100644 --- a/colossalai/checkpoint_io/checkpoint_io_base.py +++ b/colossalai/checkpoint_io/checkpoint_io_base.py @@ -71,7 +71,7 @@ def load_model(self, Args: model (nn.Module): model to be loaded. - checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the + checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the mainstream model zoos such as Hugging Face and TIMM. The checkpoint path can be: 1. a file path, e.g. 'model.pt' 2. a path to a json file which defines the index to the sharded checkpoint @@ -127,7 +127,7 @@ def save_model(self, 1. a file path, e.g. 'model.pt' 2. a directory path to save the sharded checkpoint, e.g. './checkpoints/' when shard = True. shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into - multiple files. The model shards will be specificed by a `model.index.json` file. When shard = True, please ensure + multiple files. The model shards will be specified by a `model.index.json` file. When shard = True, please ensure that the checkpoint path is a directory path instead of a file path. gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True. variant (str): If specified, weights are saved in the format pytorch_model..bin. Default: None. @@ -149,7 +149,7 @@ def load_optimizer(self, optimizer: Optimizer, checkpoint: str): Args: optimizer (Optimizer): optimizer to be loaded. - checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the + checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the """ index_file_exists, index_file_path = has_index_file(checkpoint) @@ -180,7 +180,7 @@ def save_optimizer(self, 2. a path to a json file which defines the index to the sharded checkpoint for the optimizer 3. a path to a folder containing a unique .index.json file for sharded checkpoint shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into - multiple files. 
The optimizer shards will be specificed by a `optimizer.index.json` file. + multiple files. The optimizer shards will be specified by a `optimizer.index.json` file. gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True. prefix (str): prefix for the optimizer checkpoint when shard = True. Default: None. size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard is set to True. diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py index 44d7840700ef..cb3dbbc09301 100644 --- a/colossalai/cli/check/check_installation.py +++ b/colossalai/cli/check/check_installation.py @@ -76,7 +76,7 @@ def check_installation(): click.echo("") click.echo(f"Note:") click.echo( - f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment varialbe CUDA_EXT=1 is set" + f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set" ) click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime") @@ -88,7 +88,7 @@ def check_installation(): click.echo(f"Note:") click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment") click.echo( - f" - PyTorch version mistach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation" + f" - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation" ) click.echo( f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch" diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py index 6dd4d0d6608d..0200cd3c6553 100644 --- a/colossalai/communication/p2p.py +++ b/colossalai/communication/p2p.py @@ -103,10 +103,10 @@ def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = Non previous rank. recv_next (bool): boolean for whether tensor should be received from next rank. - recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defualts to None. - recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defualts to None. - prev_rank (int): the rank of the previous pipeline stage, defualts to None, - next_rank (int): the rank of the next pipeline stage, defualts to None, + recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defaults to None. + recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defaults to None. 
+ prev_rank (int): the rank of the previous pipeline stage, defaults to None, + next_rank (int): the rank of the next pipeline stage, defaults to None, dtype (torch.dtype): data type of intermediate buffers, defaults to None scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False diff --git a/colossalai/communication/p2p_v2.py b/colossalai/communication/p2p_v2.py index 4223f78d58cd..0dacd8c3c9b5 100644 --- a/colossalai/communication/p2p_v2.py +++ b/colossalai/communication/p2p_v2.py @@ -230,7 +230,7 @@ def recv_backward(next_rank: int = None) -> Any: next_rank (int, optional): The rank of the source of the tensor. Returns: - Any: The input gradient tensor or gradident tensor list. + Any: The input gradient tensor or gradient tensor list. """ if gpc.is_pipeline_last_stage(): output_tensor_grad = None diff --git a/colossalai/context/moe_context.py b/colossalai/context/moe_context.py index 1d7a883b1552..b41f4072a405 100644 --- a/colossalai/context/moe_context.py +++ b/colossalai/context/moe_context.py @@ -64,7 +64,7 @@ def setup(self, seed: int, use_kernel_optim: bool = True): from colossalai.core import global_context as gpc self.max_ep_size = gpc.config.get('max_ep_size', self.world_size) assert self.world_size % self.max_ep_size == 0, \ - "Maximum epxert parallel size must be a factor of the number of GPUs" + "Maximum expert parallel size must be a factor of the number of GPUs" self.min_dp_size = self.world_size // self.max_ep_size # Enabling kernel optimization may raise error in some cases diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py index 0cd533fdef1a..003f0cdd91b6 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/context/parallel_context.py @@ -44,7 +44,7 @@ def __init__(self): # load config from file self._config = None - # default 3D parallel args, will be overwritten during process group intialization + # default 3D parallel args, will be overwritten during process group initialization self.world_size = 1 self.data_parallel_size = 1 self.pipeline_parallel_size = 1 @@ -264,7 +264,7 @@ def _add_world_size(self, parallel_mode: ParallelMode, world_size: int): """Adds world size for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode correponding to the process group + parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode corresponding to the process group world_size (int): The world size to be added Raises: diff --git a/colossalai/context/random/seed_manager.py b/colossalai/context/random/seed_manager.py index 3c84aaafc179..956f9001200d 100644 --- a/colossalai/context/random/seed_manager.py +++ b/colossalai/context/random/seed_manager.py @@ -59,23 +59,23 @@ def set_mode(self, parallel_mode: ParallelMode): self._current_mode = parallel_mode torch.cuda.set_rng_state(self._seed_states[parallel_mode]) - def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False): + def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrite: bool = False): """Adds a seed to the seed manager for `parallel_mode`. Args: parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. seed (int): The seed to be added. 
- overwrtie (bool, optional): Whether allows to overwrite the seed that has been set already + overwrite (bool, optional): Whether to allow overwriting a seed that has already been set Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added. """ assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided' - if overwrtie is False: + if overwrite is False: assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added' elif parallel_mode in self._seed_states: - print(f"Warnning: {parallel_mode} seed has been overwritten.", flush=True) + print(f"Warning: {parallel_mode} seed has been overwritten.", flush=True) current_state = torch.cuda.get_rng_state() torch.cuda.manual_seed(seed) diff --git a/colossalai/fx/codegen/activation_checkpoint_codegen.py b/colossalai/fx/codegen/activation_checkpoint_codegen.py index 492ebf918a9c..5a72cb9ca923 100644 --- a/colossalai/fx/codegen/activation_checkpoint_codegen.py +++ b/colossalai/fx/codegen/activation_checkpoint_codegen.py @@ -305,7 +305,7 @@ def emit_ckpt_func(body, delete_unused_value_func, level=0, in_ckpt=False): - """Emit ckpt fuction in nested way + """Emit ckpt function in nested way Args: body: forward code, in recursive calls, this part will be checkpoint functions code diff --git a/colossalai/fx/passes/split_module.py b/colossalai/fx/passes/split_module.py index 9bc4bf1f5c42..5ce5b969cbde 100644 --- a/colossalai/fx/passes/split_module.py +++ b/colossalai/fx/passes/split_module.py @@ -155,7 +155,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node use_partition = partitions[use_partition_name] use_partition.outputs.setdefault(def_node.name) - # split nodes into parititons + # split nodes into partitions for node in m.graph.nodes: orig_nodes[node.name] = node @@ -198,7 +198,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node if len(sorted_partitions) != len(partitions): raise RuntimeError("cycle exists between partitions!") - # add placeholders to parititons + # add placeholders to partitions for partition_name in sorted_partitions: partition = partitions[partition_name] for input in partition.inputs: diff --git a/colossalai/kernel/cuda_native/multihead_attention.py b/colossalai/kernel/cuda_native/multihead_attention.py index 7df53731edc5..3b6470cdcbb9 100644 --- a/colossalai/kernel/cuda_native/multihead_attention.py +++ b/colossalai/kernel/cuda_native/multihead_attention.py @@ -111,7 +111,7 @@ class MultiHeadAttention(nn.Module): Arguments: hidden_size: Total dimension of hidden_size. nhead: Number of parallel attention heads. - batch_size: Batch Size for one foward + batch_size: Batch Size for one forward max_seq_len: Max length of input sequence dropout: Dropout probability norm_first: perform LayerNorms before attention diff --git a/colossalai/nn/_ops/embedding_bag.py b/colossalai/nn/_ops/embedding_bag.py index 0e8aa8fecb01..0026f579b6dc 100644 --- a/colossalai/nn/_ops/embedding_bag.py +++ b/colossalai/nn/_ops/embedding_bag.py @@ -88,7 +88,7 @@ def colo_embedding_bag(input_tensor: GeneralTensor, assert isinstance(weight, ColoTensor) input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group()) - # Handle differen parallel actions. + # Handle different parallel actions. 
if not weight.has_compute_spec(): # No Model Parallel Applied assert weight.is_replicate(), 'Invalid weight spec for native embedding op' diff --git a/colossalai/nn/layer/moe/experts.py b/colossalai/nn/layer/moe/experts.py index 2e5d9e6e79a9..56b11f4d9e08 100644 --- a/colossalai/nn/layer/moe/experts.py +++ b/colossalai/nn/layer/moe/experts.py @@ -13,7 +13,7 @@ class MoeExperts(nn.Module): - """Basic class for experts in MoE. It stores what kind of communication expersts use + """Basic class for experts in MoE. It stores what kind of communication experts use to exchange tokens, how many experts in a single GPU and parallel information such as expert parallel size, data parallel size and their distributed communication groups. """ @@ -24,7 +24,7 @@ def __init__(self, comm_name: str, num_experts: int): "This kind of communication has not been implemented yet.\n Please use Experts build function." self.comm_name = comm_name self.num_total_experts = num_experts - # Get the configuration of experts' deployment and parallel information from moe contex + # Get the configuration of experts' deployment and parallel information from moe context self.num_local_experts, self.dist_info = MOE_CONTEXT.get_info(num_experts) @@ -32,7 +32,7 @@ def __init__(self, comm_name: str, num_experts: int): class Experts(MoeExperts): """A wrapper class to create experts. It will create E experts across the moe model parallel group, where E is the number of experts. Every expert - is a instence of the class, 'expert' in initialization parameters. + is an instance of the class, 'expert' in initialization parameters. Args: expert_cls (:class:`torch.nn.Module`): The class of all experts @@ -146,15 +146,15 @@ def forward(self, inputs): # inputs [g, el, c, h] class TPExperts(MoeExperts): """Use tensor parallelism to split each expert evenly, which can deploy experts in - case that the number of experts can't be divied by maximum expert parallel size or - maximum expert parallel size can't be divied by the number of experts. + case that the number of experts can't be divided by maximum expert parallel size or + maximum expert parallel size can't be divided by the number of experts. """ def __init__(self, num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0): super().__init__("all_gather", MOE_CONTEXT.max_ep_size) assert d_ff % MOE_CONTEXT.max_ep_size == 0, \ - "d_ff should be divied by maximum expert parallel size" + "d_ff should be divisible by maximum expert parallel size" p_ff = d_ff // MOE_CONTEXT.max_ep_size diff --git a/colossalai/nn/layer/moe/layers.py b/colossalai/nn/layer/moe/layers.py index b90d1f0bfcc6..03f55d91f3a8 100644 --- a/colossalai/nn/layer/moe/layers.py +++ b/colossalai/nn/layer/moe/layers.py @@ -25,7 +25,7 @@ class MoeLayer(nn.Module): """A MoE layer, that puts its input tensor to its gate and uses the output logits to router all tokens, is mainly used to exchange all tokens for every expert across - the moe tensor group by all to all comunication. Then it will get the output of all + the moe tensor group by all to all communication. Then it will get the output of all experts and exchange the output. At last returns the output of the moe system. Args: @@ -122,7 +122,7 @@ class MoeModule(nn.Module): drop_tks (bool, optional): Whether drops tokens in evaluation use_residual (bool, optional): Makes this MoE layer a Residual MoE. More information can be found in `Microsoft paper`_. 
- residual_instance (nn.Module, optional): The instance of residual module in Resiual MoE + residual_instance (nn.Module, optional): The instance of residual module in Residual MoE expert_instance (MoeExperts, optional): The instance of experts module in MoeLayer expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given expert_args (optional): The args of expert when no instance is given diff --git a/colossalai/nn/layer/moe/routers.py b/colossalai/nn/layer/moe/routers.py index c522c655a511..c5b8390bf047 100644 --- a/colossalai/nn/layer/moe/routers.py +++ b/colossalai/nn/layer/moe/routers.py @@ -60,7 +60,7 @@ def pop_routing_loss(self) -> torch.Tensor: class Top1Router(MoeRouter): """Top1 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] - for routing usage. More deailted function can be found in the paper about Switch Transformer + for routing usage. More detailed function can be found in the paper about Switch Transformer of Google. Args: capacity_factor_train (float, optional): Capacity factor in routing of training. @@ -143,7 +143,7 @@ def forward(self, inputs: torch.Tensor, use_kernel: bool = False, ep_group: Opti class Top2Router(MoeRouter): """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] - for routing usage. More deailted function can be found in the paper about ViT-MoE. + for routing usage. More detailed function can be found in the paper about ViT-MoE. Args: capacity_factor_train (float, optional): Capacity factor in routing of training. capacity_factor_eval (float, optional): Capacity factor in routing of evaluation. diff --git a/colossalai/nn/layer/moe/utils.py b/colossalai/nn/layer/moe/utils.py index 9362347414e0..4ca8bd703386 100644 --- a/colossalai/nn/layer/moe/utils.py +++ b/colossalai/nn/layer/moe/utils.py @@ -12,7 +12,7 @@ def half(self, memory_format=None): class NormalNoiseGenerator: - """Generates a random noisy mask for logtis tensor. + """Generates a random noisy mask for logits tensor. All noise is generated from a normal distribution :math:`(0, 1 / E^2)`, where `E = the number of experts`. @@ -32,7 +32,7 @@ def __call__(self, inputs: torch.Tensor): class UniformNoiseGenerator: - """Generates a random noisy mask for logtis tensor. + """Generates a random noisy mask for logits tensor. copied from mesh tensorflow: Multiply values by a random number between :math:`1-epsilon` and :math:`1+epsilon`. Makes models more resilient to rounding errors introduced by bfloat16. diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index e96abd87ed10..406173a18c60 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -439,7 +439,7 @@ class Linear1D_Col(ParallelLayer): to all GPUs, otherwise, every GPU will have its output which is :math:`Y_i = XA_i`, defaults to False skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer, - which is preserved for kernel fusion, defaults to Fals + which is preserved for kernel fusion, defaults to False weight_initializer (:class:`typing.Callable`, optional): The initializer of weight, defaults to kaiming uniform initializer. bias_initializer (:class:`typing.Callable`, optional): @@ -578,7 +578,7 @@ class Linear1D_Row(ParallelLayer): dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False. 
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer, - which is preserved for kernel fusion, defaults to Fals + which is preserved for kernel fusion, defaults to False weight_initializer (:class:`typing.Callable`, optional): The initializer of weight, defaults to kaiming uniform initializer. bias_initializer (:class:`typing.Callable`, optional): @@ -994,11 +994,11 @@ class PatchEmbedding1D(ColossalaiModule): :type dtype: torch.dtype, optional :param flatten: whether to flatten output tensor, defaults to True :type flatten: bool, optional - :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer :type weight_initializer: typing.Callable, optional - :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer :type bias_initializer: typing.Callable, optional - :param position_embed_initializer: The intializer of position embedding, defaults to zero + :param position_embed_initializer: The initializer of position embedding, defaults to zero :type position_embed_initializer: typing.Callable, optional """ diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py index 40eefc3ec5d1..4d762076461d 100644 --- a/colossalai/tensor/colo_tensor.py +++ b/colossalai/tensor/colo_tensor.py @@ -184,7 +184,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): # we have to capture the `backward` function # and make sure that it does not in `torch._C.DisableTorchFunction()` context if func is torch.Tensor.backward: - assert len(args) == 1 # only has 1 paramter + assert len(args) == 1 # only has 1 parameter backward_tensor = torch.Tensor(args[0]) tensor_kwargs = {k: torch.Tensor(v) if torch.is_tensor(v) else v for k, v in kwargs.items()} return backward_tensor.backward(**tensor_kwargs) @@ -228,7 +228,7 @@ def redistribute(self, dist_spec: _DistSpec, pg: Optional[ProcessGroup] = None) 2. If the pg is not not None and not equal to the current process group. First, convert the tensor as replicated among the TP process group. Second, reset the process group to the new pg. - Third, conver the tensor (new replicated both among the tp process group) to the new dist_spec. + Third, convert the tensor (now replicated among the tp process group) to the new dist_spec. Args: dist_spec (_DistSpec): the new dist spec. @@ -297,7 +297,7 @@ def size_local(self, *args) -> torch.Size: def size_global(self, *args) -> torch.Size: """size_global - override the torch buildin size() + override the torch built-in size() the shape passed in must be in a replicate placement. Returns: diff --git a/colossalai/tensor/comm_spec.py b/colossalai/tensor/comm_spec.py index 0d8de1062d42..af38d2a502c2 100644 --- a/colossalai/tensor/comm_spec.py +++ b/colossalai/tensor/comm_spec.py @@ -391,7 +391,7 @@ class CommSpec: to determine the buffer shape, and logical_process_axis Argument: - comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec. + comm_pattern(CollectiveCommPattern): describes the communication method used in this spec. sharding_spec(ShardingSpec): This is sharding spec of the tensor which will join the communication action. gather_dim(int, Optional): The gather_dim of the tensor will be gathered. shard_dim(int, Optional): The shard_dim of the tensor will be sharded. 
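The `CommSpec` docstring touched above describes how a collective op, such as an all-gather along `gather_dim`, moves a tensor from one sharding layout to another. Below is a minimal, generic `torch.distributed` sketch of that gather-along-a-dimension idea; it is an illustration only, not the `CommSpec` API, and the helper name `gather_along_dim`, the equal-shard-shape assumption, and the use of the default process group are ours.

```python
import torch
import torch.distributed as dist

def gather_along_dim(local_shard: torch.Tensor, gather_dim: int, group=None) -> torch.Tensor:
    # Collect every rank's shard (assumed to have identical shapes) and
    # concatenate them along `gather_dim`, which is conceptually what an
    # all-gather style communication spec performs.
    world_size = dist.get_world_size(group)
    shards = [torch.empty_like(local_shard) for _ in range(world_size)]
    dist.all_gather(shards, local_shard, group=group)
    return torch.cat(shards, dim=gather_dim)
```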
diff --git a/colossalai/tensor/compute_spec.py b/colossalai/tensor/compute_spec.py index 73328285ee93..12f8f36bc613 100644 --- a/colossalai/tensor/compute_spec.py +++ b/colossalai/tensor/compute_spec.py @@ -10,7 +10,7 @@ class ComputePattern(Enum): class ComputeSpec(object): """ComputeSpec - The Specification for compuattion pattern + The Specification for computation pattern Args: compute_pattern (ComputePattern): an Enum instance for compute pattern. diff --git a/colossalai/tensor/d_tensor/layout.py b/colossalai/tensor/d_tensor/layout.py index 72a2694a1eaf..ee7ef74a99ae 100644 --- a/colossalai/tensor/d_tensor/layout.py +++ b/colossalai/tensor/d_tensor/layout.py @@ -14,7 +14,7 @@ class Layout: """Layout of a tensor. Attributes: - device_mesh: the device mesh to store the tensor distributedly. + device_mesh: the device mesh to store the tensor distributed. device_type: the type of the device mesh, e.g. 'cpu' or 'cuda'. sharding_spec: the sharding specification to describe how the tensor is sharded. entire_shape: the entire shape of the global tensor. diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py index 7591f760cb30..2ea0c4db89fd 100644 --- a/colossalai/tensor/d_tensor/sharding_spec.py +++ b/colossalai/tensor/d_tensor/sharding_spec.py @@ -14,7 +14,7 @@ class DimSpec: ''' - Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of + Sharding spec for single dimension of the sharded tensor describe the sharding dimension of logical device mesh and give a method to compute the difference between them. This class is used internally in ShardingSpec. @@ -143,7 +143,7 @@ class ShardingSpec: Argument: dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded, - and the value of the key decribe which logical axis will be sharded in that dimension. + and the value of the key describe which logical axis will be sharded in that dimension. sharding_sequence(List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1]. ''' diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/tensor/dist_spec_mgr.py index d5c0ce28e9fb..8657989235db 100644 --- a/colossalai/tensor/dist_spec_mgr.py +++ b/colossalai/tensor/dist_spec_mgr.py @@ -61,7 +61,7 @@ def _shard_as(tensor: torch.Tensor, old_dist_spec: _DistSpec, dist_spec: _DistSp Args: tensor (torch.Tensor): a global (replicated) tensor before shard dist_spec (_DistSpec): the distributed spec. to be sharded as. - pg (ProcessGrouo): the process group of the corresponding colotensor + pg (ProcessGroup): the process group of the corresponding colotensor Returns: torch.Tensor: a torch tensor after sharded. """ diff --git a/colossalai/tensor/distspec.py b/colossalai/tensor/distspec.py index 8dd0d8791537..3a09f1426e31 100644 --- a/colossalai/tensor/distspec.py +++ b/colossalai/tensor/distspec.py @@ -15,7 +15,7 @@ class _DistSpec: A class indicates Distributed Specification. The DistSpec is only works for the tensor parallel process groups. Because the dist spec of data parallel process group can be automatically deduced. - This is an internal data structrue. + This is an internal data structure. The API for users should be `ShardSpec` and `ReplicaSpec`. 
Args: diff --git a/colossalai/tensor/shape_consistency.py b/colossalai/tensor/shape_consistency.py index 2831b10a3c57..0a840006f086 100644 --- a/colossalai/tensor/shape_consistency.py +++ b/colossalai/tensor/shape_consistency.py @@ -73,7 +73,7 @@ def get_all_all_gather_spec(self, source_spec: ShardingSpec, orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]: ''' Get all valid sharding specs from source_spec with single all-gather operation, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. For the all-gather operation, we just care about the S dimension. Argument: @@ -145,7 +145,7 @@ def get_all_all_to_all_spec(self, source_spec: ShardingSpec, orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]: ''' Get all valid sharding specs from source_spec with single all-to-all operation, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. For the all-to-all operation, we just care about the pairs containing S dimension. Argument: diff --git a/colossalai/tensor/sharding_spec.py b/colossalai/tensor/sharding_spec.py index cdd0338850cf..bed320130ccd 100644 --- a/colossalai/tensor/sharding_spec.py +++ b/colossalai/tensor/sharding_spec.py @@ -18,7 +18,7 @@ class _DimSpec: ''' - Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of + Sharding spec for single dimension of the sharded tensor describe the sharding dimension of logical device mesh and give a method to compute the difference between them. This class is used internally in ShardingSpec. diff --git a/colossalai/tensor/utils.py b/colossalai/tensor/utils.py index 0c2ead630d59..6e30f97fef03 100644 --- a/colossalai/tensor/utils.py +++ b/colossalai/tensor/utils.py @@ -18,7 +18,7 @@ def all_gather_simulator(target_pair): Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. + and the second element describes which logical axis will be sharded in that dimension. ''' _, shard_list = target_pair new_shard_list = shard_list[:-1] @@ -36,7 +36,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair): Therefore, if the behind shard_list is not None, we just extend it to the front shard_list. Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. + and the second element describes which logical axis will be sharded in that dimension. e.g.: all-to-all(S0, S1) -> [S01, R] all-to-all(S0, R) -> [R, S0] @@ -46,7 +46,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair): Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. + and the second element describes which logical axis will be sharded in that dimension. 
''' _, f_shard_list = f_target_pair _, b_shard_list = b_target_pair diff --git a/colossalai/testing/utils.py b/colossalai/testing/utils.py index eac83e6d7bd5..6583eeb12bf4 100644 --- a/colossalai/testing/utils.py +++ b/colossalai/testing/utils.py @@ -17,10 +17,10 @@ def parameterize(argument: str, values: List[Any]) -> Callable: we want to avoid the number of distributed network initialization, we need to have this extra decorator on the function launched by torch.multiprocessing. - If a function is wrapped with this wrapper, non-paramterized arguments must be keyword arguments, - positioanl arguments are not allowed. + If a function is wrapped with this wrapper, non-parametrized arguments must be keyword arguments, + positional arguments are not allowed. - Usgae:: + Usage:: # Example 1: @parameterize('person', ['xavier', 'davis']) @@ -33,7 +33,7 @@ def say_something(person, msg): # > xavier: hello # > davis: hello - # Exampel 2: + # Example 2: @parameterize('person', ['xavier', 'davis']) @parameterize('msg', ['hello', 'bye', 'stop']) def say_something(person, msg): @@ -110,7 +110,7 @@ def test_method(): If the pattern is not None and matches the exception message, the exception will be detected for rerun max_try (int, Optional): Maximum reruns for this function. The default value is 5. - If max_try is None, it will rerun foreven if exception keeps occurings + If max_try is None, it will rerun forever if the exception keeps occurring """ def _match_lines(lines, pattern): @@ -144,7 +144,7 @@ def _run_until_success(*args, **kwargs): # Override signature # otherwise pytest.mark.parameterize will raise the following error: - # function does not use argumetn xxx + # function does not use argument xxx sig = signature(func) _run_until_success.__signature__ = sig @@ -231,7 +231,7 @@ def spawn(func, nprocs=1, **kwargs): This function is used to spawn processes for testing. Usage: - # must contians arguments rank, world_size, port + # must contain arguments rank, world_size, port def do_something(rank, world_size, port): ... 
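The corrected `parameterize` docstring above already carries its own usage examples; for completeness, here is a self-contained sketch of the stacked-decorator pattern it documents (Example 2), assuming `parameterize` is exported from `colossalai.testing` as it is used throughout the test suite.

```python
from colossalai.testing import parameterize

# Stacking two decorators enumerates the Cartesian product of the value lists,
# so calling the wrapped function once generates 2 x 3 = 6 invocations.
@parameterize('person', ['xavier', 'davis'])
@parameterize('msg', ['hello', 'bye', 'stop'])
def say_something(person, msg):
    print(f'{person}: {msg}')

say_something()
```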
diff --git a/colossalai/utils/checkpoint/module_checkpoint.py b/colossalai/utils/checkpoint/module_checkpoint.py index a109b3702577..d390da864cd3 100644 --- a/colossalai/utils/checkpoint/module_checkpoint.py +++ b/colossalai/utils/checkpoint/module_checkpoint.py @@ -89,7 +89,7 @@ def load_checkpoint(path: str, torch_load_kwargs: (dict, optional): The kwargs of torch.load inside the function load_state_dict_kwargs (dict, optional): The kwargs of load_state_dict inside the function """ - # initialize the default paramters + # initialize the default parameters if not torch_load_kwargs: torch_load_kwargs = dict() if not load_state_dict_kwargs: diff --git a/colossalai/utils/checkpoint/utils.py b/colossalai/utils/checkpoint/utils.py index 5652600ffd9b..682cd0903d5b 100644 --- a/colossalai/utils/checkpoint/utils.py +++ b/colossalai/utils/checkpoint/utils.py @@ -34,7 +34,7 @@ def gather_tensor(colo_tensor: ColoTensor) -> None: dist.barrier() if dist.get_rank() == 0: - setattr(colo_tensor, 'save_ready', True) # set saving signitrue + setattr(colo_tensor, 'save_ready', True) # set saving signature def scatter_tensor(colo_tensor: ColoTensor, dist_spec: _DistSpec) -> None: diff --git a/colossalai/utils/moe.py b/colossalai/utils/moe.py index 90783e5d9b8e..86d04c11958b 100644 --- a/colossalai/utils/moe.py +++ b/colossalai/utils/moe.py @@ -38,7 +38,7 @@ def sync_moe_model_param(model: nn.Module): param_dict = get_moe_epsize_param_dict(model) - # synchrosize the parameters whose dp_group is the whole world + # synchronize the parameters whose dp_group is the whole world if 1 in param_dict: src_rank = gpc.get_ranks_in_group(ParallelMode.DATA)[0] for param in param_dict[1]: diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index 5937ee9eff9a..75f8576ca477 100644 --- a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -74,7 +74,7 @@ def __init__(self, """ Args: device (torch.device): the device where parameters initialized are resident. Defaults to torch.device('cpu'). - dtype (torch.dtype): the dtype of parameters initialized. Defults to torch.float. + dtype (torch.dtype): the dtype of parameters initialized. Defaults to torch.float. default_pg (ProcessGroup): the default process group for all initialized parameters. default_dist_spec: the default distributed specifications. """ @@ -164,7 +164,7 @@ def post_process_colo_init_ctx(model: torch.nn.Module, model (torch.nn.module): the model device (torch.device, optional): device type of the model params. Defaults to torch.device('cpu'). dtype (torch.dtype, optional): dtype of the model params. Defaults to torch.float. - default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Inidicates a DP-only process group. + default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Indicates a DP-only process group. default_dist_spec (Any, optional): default dist spec of params. Defaults to None. Raises: diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index e151f1aefb2d..8a001b114e9a 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -1,5 +1,6 @@ import itertools from collections import OrderedDict +from contextlib import nullcontext from functools import partial from typing import Dict, Iterator, List, Optional, Union @@ -42,13 +43,14 @@ class ZeroDDP(ColoDDP): Args: module (torch.nn.Module): Module to apply ZeRO-DP. 
- gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous momery space. + gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous memory space. For more details, see the API reference of ``GeminiManager``. pin_memory (bool): Chunks on CPU Memory use pin-memory. force_outputs_fp32 (bool): If set to True, outputs will be fp32. Otherwise, outputs will be fp16. Defaults to False. strict_ddp_mode (bool): If set to True, there is no tensor sharding, each tensor is replicated. Defaults to False. Users can set it to True, when they clearly know that they only need DDP. + scatter_after_inference (bool): If set to True, the model will be scattered after inference. This will save memory but slow down the consecutive inference. """ def __init__(self, @@ -56,7 +58,8 @@ def __init__(self, gemini_manager: GeminiManager, pin_memory: bool = False, force_outputs_fp32: bool = False, - strict_ddp_mode: bool = False) -> None: + strict_ddp_mode: bool = False, + scatter_after_inference: bool = True) -> None: self.gemini_manager = gemini_manager self.chunk_manager: ChunkManager = gemini_manager.chunk_manager self.force_outputs_fp32 = force_outputs_fp32 @@ -67,6 +70,7 @@ def __init__(self, self.grads_device: Dict[torch.Tensor, torch.device] = dict() self.param2name: Dict[nn.Parameter, str] = dict() self.name2param: Dict[str, nn.Parameter] = dict() + self.scatter_after_inference = scatter_after_inference self._logger = get_dist_logger() @@ -108,8 +112,6 @@ def _post_forward(self): first_param = next(iter(chunk.tensors_info)) self.chunk_manager.move_chunk(chunk, self.grads_device[first_param]) assert self.chunk_manager.accessed_mem == 0 - # reset all recorded attributes - self.gemini_manager.reset_attributes() def forward(self, *args, **kwargs): # check whether we are in a inference mode @@ -120,17 +122,35 @@ def forward(self, *args, **kwargs): args, kwargs = _cast_float(args, torch.half), _cast_float(kwargs, torch.half) self.module.zero_grad(set_to_none=True) - self.gemini_manager.pre_iter(*args) - with ColoParamOpHookManager.use_hooks(self.param_op_hook): - outputs = self.module(*args, **kwargs) - # scatter chunks in the inference mode if not grad_flag: - self._post_forward() + outputs = self._inference_forward(*args, **kwargs) + else: + self.gemini_manager.pre_iter(*args) + with ColoParamOpHookManager.use_hooks(self.param_op_hook): + outputs = self.module(*args, **kwargs) if self.force_outputs_fp32: return _cast_float(outputs, torch.float) return outputs + def _inference_forward(self, *args, **kwargs): + """This function is only triggered for inference. 
+ """ + fwd_ctx = ColoParamOpHookManager.use_hooks(self.param_op_hook) + if not self.scatter_after_inference: + # gather all chunks + for chunk in self.chunk_manager.get_chunks(self.fp16_params): + self.chunk_manager.access_chunk(chunk) + fwd_ctx = nullcontext() + with fwd_ctx: + outputs = self.module(*args, **kwargs) + if self.scatter_after_inference: + # scatter chunks + self._post_forward() + # reset all recorded attributes + self.gemini_manager.reset_attributes() + return outputs + def _setup_grads_ptr(self): for p in self.module.parameters(): if is_ddp_ignored(p): @@ -678,13 +698,14 @@ def __init__(self, pin_memory: bool = False, force_outputs_fp32: bool = False, strict_ddp_mode: bool = False, + scatter_after_inference: bool = True, search_range_mb: int = 32, hidden_dim: Optional[int] = None, min_chunk_size_mb: float = 32, memstats: Optional[MemStats] = None, verbose: bool = False) -> None: """ - A torch.Module warpper using ZeRO-DP and Genimi. + A torch.Module wrapper using ZeRO-DP and Gemini. ZeRO is for parallel. Gemini is for memory management. WARNING: The class will modify the module inline! @@ -706,7 +727,7 @@ def __init__(self, Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte. - If the aggregate size of parameters is still samller than the minimum chunk size, + If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk. memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer. """ @@ -722,4 +743,5 @@ def __init__(self, strict_ddp_flag=strict_ddp_mode, verbose=verbose) gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats) - super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode) + super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode, + scatter_after_inference) diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py b/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py index 5115ff74da16..8f8fec64924e 100644 --- a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py +++ b/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py @@ -8,7 +8,7 @@ @OPHOOKS.register_module class ShardGradMemTracerHook(BaseOpHook): """ - A hook to process sharded param before and afther FWD and BWD operator executing. + A hook to process sharded param before and after FWD and BWD operator executing. """ def __init__(self): diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py b/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py index 80736d14085e..a2a62fb9788a 100644 --- a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py +++ b/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py @@ -8,7 +8,7 @@ @OPHOOKS.register_module class ShardParamHook(BaseOpHook): """ - A hook to process sharded param before and afther FWD and BWD operator executing. + A hook to process sharded param before and after FWD and BWD operator executing. 
""" def __init__(self): diff --git a/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py b/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py index 3b37444b0fe0..4f9ea7c6d520 100644 --- a/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py +++ b/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py @@ -53,7 +53,7 @@ def finish_iter(self): self._evict_time = 0 def adjust_layout(self) -> None: - """ Adjust the layout of statefuil tensor according to the information provided + """ Adjust the layout of stateful tensor according to the information provided by mem_stats_collector, which should belongs to a Sharded Model. """ # find stateful tensor in state COMPUTE diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/zero/legacy/init_ctx/init_context.py index f8be0ca4f3fc..a921ca0aa83a 100644 --- a/colossalai/zero/legacy/init_ctx/init_context.py +++ b/colossalai/zero/legacy/init_ctx/init_context.py @@ -97,7 +97,7 @@ def calc_fanin_fanout(tensor: torch.Tensor): """We use this function to substitute fan-in and fan-out calculation in torch.nn.init. This can help us get correct fan-in and fan-out for sharded tensor. """ - assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters" + assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters" # get correct shape of input tensor if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded: diff --git a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py b/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py index 11297bf6d62c..d663104831ce 100644 --- a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py +++ b/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py @@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy): """Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together, which will fully utilize network bandwidth. It is especially useful when sub-module contains bias, - since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small). + since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small). """ def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None): diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py index edd2cc8e68fe..b3a83b741825 100644 --- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py +++ b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py @@ -192,7 +192,7 @@ def cpu_offload(self): def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None: """ - dummy memory tracer collected infomation to a file. + dummy memory tracer collected information to a file. 
+ dummy memory tracer collected information to a file. try: # forward: model(inputs) # backward: optimizer.backward() @@ -201,7 +201,7 @@ def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> N exit(0) """ if self._use_memory_tracer: - self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0]) + self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0]) if gpc.get_global_rank() == 0: with open(filename, 'w+') as f: f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n') @@ -293,7 +293,7 @@ def _post_backward_operations(self) -> None: if not p.requires_grad: continue # Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass. - # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group. + # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all-reduce between process group. # If _require_backward_grad_sync is True, # p.grad remains the accumulated unsharded gradient from prior no-sync passes. # We also allows to interleave no-sync pass with sync passes, if desired. @@ -385,7 +385,7 @@ def _save_grad(self, param: Parameter, grad: torch.Tensor): param.colo_attr.grad_payload_reset(grad.data) # release the memory of param # we set a false None for parameter's payload - # so we can get paramter's device and dtype later in optimizer + # so we can get parameter's device and dtype later in optimizer param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype)) if param.colo_attr.is_replicated: diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py index 7ce1c056f583..be60209af434 100644 --- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py +++ b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py @@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer): growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000. hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2. max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32. - dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None. - mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None. + dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None. + mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None. ..
_PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management: https://arxiv.org/abs/2108.05818 @@ -274,7 +274,7 @@ def _register_master_weight(self): assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam' shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated if shard_flag: - # we always shard replicated paramters + # we always shard replicated parameters self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group) self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device))) if shard_flag: @@ -312,7 +312,7 @@ def _prepare_grads(self): # If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU if not p.colo_attr.offload_grad: colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device()) - # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation + # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information # If we change p.grad directly # it may raise error because of different shape/dtype/device of p.data and p.grad # We just set p.data = p.colo_attr.saved_grad.payload here @@ -333,7 +333,7 @@ def _point_param_fp16_to_master_param(self): def _copy_master_model_to_model_fp16(self): # Copy master param data (fp32) to payload of colo_attr (fp16) - # TODO() improve efficiency by gathering tensors into a chunk and transfering + # TODO() improve efficiency by gathering tensors into a chunk and transferring # a chunk. for group in self.optim.param_groups: for p in group['params']: @@ -350,7 +350,7 @@ def _copy_master_param_to_param_fp16(self, p): p.data = self.master_params[p].payload - # we need to allocate new memory for keep_not_shard paramters + # we need to allocate new memory for keep_not_shard parameters # in order to use copy, otherwise, the sizes of tensor is not compatible if p.colo_attr.data_payload.numel() != p.data.numel(): p.colo_attr.data_payload_reset( diff --git a/colossalai/zero/low_level/_utils.py b/colossalai/zero/low_level/_utils.py index 9ca2fdf5aa06..afc98e7a7f54 100644 --- a/colossalai/zero/low_level/_utils.py +++ b/colossalai/zero/low_level/_utils.py @@ -91,10 +91,18 @@ def get_grad_accumulate_object(tensor): return grad_acc_obj -def split_half_float_double(tensor_list): +def split_by_dtype(tensor_list): + """ + Splits a list of PyTorch tensors into sublists based on their data type. + + :param tensor_list: A list of PyTorch tensors. + :type tensor_list: list[torch.Tensor] + :return: A list of sublists, where each sublist contains tensors of a specific data type. 
+ :rtype: list[list[torch.Tensor]] + """ dtypes = ["torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor", "torch.cuda.BFloat16Tensor"] buckets = [] - for i, dtype in enumerate(dtypes): + for _, dtype in enumerate(dtypes): bucket = [t for t in tensor_list if t.type() == dtype] if bucket: buckets.append(bucket) diff --git a/colossalai/zero/low_level/bookkeeping/parameter_store.py b/colossalai/zero/low_level/bookkeeping/parameter_store.py index cbf708b3471f..1f3ba7cbc3bc 100644 --- a/colossalai/zero/low_level/bookkeeping/parameter_store.py +++ b/colossalai/zero/low_level/bookkeeping/parameter_store.py @@ -11,9 +11,9 @@ class ParameterStore(BaseStore): def __init__(self, torch_pg: ProcessGroup): super().__init__(torch_pg) # param partitioning data structures - self._fp16_param_to_rank = dict() - self._rank_groupid_to_fp16_param_list = dict() - self._rank_group_id_to_flat_fp16_param = dict() + self._param_to_rank = dict() + self._rank_group_id_to_param_list = dict() + self._rank_group_id_to_flat_param = dict() # param reduction data structures self._is_param_reduced = dict() @@ -29,7 +29,7 @@ def set_param_to_rank(self, tensor: Tensor, rank: int) -> None: :type rank: int """ - self._fp16_param_to_rank[tensor] = rank + self._param_to_rank[tensor] = rank def get_param_rank(self, tensor: Tensor) -> int: """ @@ -38,7 +38,7 @@ def get_param_rank(self, tensor: Tensor) -> int: :param tensor: A :class:`torch.Tensor` object :type tensor: torch.Tensor """ - return self._fp16_param_to_rank[tensor] + return self._param_to_rank[tensor] def belongs_to_current_rank(self, tensor) -> bool: """ @@ -51,29 +51,29 @@ def belongs_to_current_rank(self, tensor) -> bool: :rtype: bool """ - tensor_rank = self._fp16_param_to_rank[tensor] + tensor_rank = self._param_to_rank[tensor] return tensor_rank == self._local_rank - def add_fp16_param_list_by_rank_group(self, rank, group_id, tensor_list) -> None: - if rank not in self._rank_groupid_to_fp16_param_list: - self._rank_groupid_to_fp16_param_list[rank] = dict() + def add_param_list_by_rank_group(self, rank, group_id, tensor_list) -> None: + if rank not in self._rank_group_id_to_param_list: + self._rank_group_id_to_param_list[rank] = dict() - if group_id not in self._rank_groupid_to_fp16_param_list[rank]: - self._rank_groupid_to_fp16_param_list[rank][group_id] = [] + if group_id not in self._rank_group_id_to_param_list[rank]: + self._rank_group_id_to_param_list[rank][group_id] = [] - self._rank_groupid_to_fp16_param_list[rank][group_id].extend(tensor_list) + self._rank_group_id_to_param_list[rank][group_id].extend(tensor_list) - def get_fp16_params_by_rank_group(self, rank, group_id) -> List[Tensor]: - return self._rank_groupid_to_fp16_param_list[rank][group_id] + def get_params_by_rank_group(self, rank, group_id) -> List[Tensor]: + return self._rank_group_id_to_param_list[rank][group_id] - def add_flat_fp16_param_by_rank_group(self, rank, group_id, tensor) -> None: - if rank not in self._rank_group_id_to_flat_fp16_param: - self._rank_group_id_to_flat_fp16_param[rank] = dict() + def add_flat_param_by_rank_group(self, rank, group_id, tensor) -> None: + if rank not in self._rank_group_id_to_flat_param: + self._rank_group_id_to_flat_param[rank] = dict() - self._rank_group_id_to_flat_fp16_param[rank][group_id] = tensor + self._rank_group_id_to_flat_param[rank][group_id] = tensor - def get_flat_fp16_param_by_rank_group(self, rank, group_id) -> Tensor: - return self._rank_group_id_to_flat_fp16_param[rank][group_id] + def get_flat_param_by_rank_group(self, 
rank, group_id) -> Tensor: + return self._rank_group_id_to_flat_param[rank][group_id] def is_param_reduced(self, tensor): return self._is_param_reduced[tensor] diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index 39ade27b9d98..3e7661ecab76 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -21,7 +21,7 @@ has_inf_or_nan, reduce_tensor_dp_group, release_param_grad, - split_half_float_double, + split_by_dtype, sync_param, ) from .bookkeeping import BucketStore, GradientStore, ParameterStore, TensorBucket @@ -55,6 +55,7 @@ def __init__( # 2. contiguous gradients # 3. cpu offload # 4. support when some parameters requires_grad = False + # 5. support layer drop super(LowLevelZeroOptimizer, self).__init__(optim=optimizer) self._dtype = self.optim.param_groups[0]['params'][0].dtype self._logger = get_dist_logger() @@ -89,9 +90,10 @@ def __init__( self._mp_torch_group = gpc.get_group(mp_parallel_mode) else: raise NotImplementedError - # fp16 and fp32 params for mixed precision training - self._fp16_param_groups = dict() - self._fp32_flat_param_groups_of_current_rank = dict() + + # working and master params for mixed precision training + self._working_param_groups = dict() + self._master_flat_param_groups_of_current_rank = dict() # communication params self._overlap_communication = overlap_communication @@ -137,8 +139,8 @@ def __init__( if param.requires_grad: group_params.append(param) - # add the fp16 params to fp16_param_groups for bookkeeping - self._fp16_param_groups[group_id] = group_params + # add the working params to working_param_groups for bookkeeping + self._working_param_groups[group_id] = group_params # assign parameters to ranks # the params in the list are sorted @@ -147,7 +149,7 @@ def __init__( # store the mapping between param to rank # each param should belong to only one rank for rank, params in enumerate(params_per_rank): - self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params) + self._param_store.add_param_list_by_rank_group(rank, group_id, params) for param in params: self._param_store.set_param_to_rank(param, rank) @@ -158,33 +160,33 @@ def __init__( # flatten the reordered tensors for rank in range(self._world_size): - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + tensor_list = self._param_store.get_params_by_rank_group(rank, group_id) with torch.no_grad(): flat_tensor = flatten(tensor_list) flat_tensor = flat_tensor.data.cuda() - self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + self._param_store.add_flat_param_by_rank_group(rank, group_id, flat_tensor) # sync parameters for rank in range(self._world_size): - flat_tensor = self._param_store.get_flat_fp16_param_by_rank_group(rank, group_id) - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + flat_tensor = self._param_store.get_flat_param_by_rank_group(rank, group_id) + tensor_list = self._param_store.get_params_by_rank_group(rank, group_id) sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) - # create a copy of fp32 weights of the parameters for which this rank is responsible - fp16_flat_current_rank = self._param_store.get_flat_fp16_param_by_rank_group(self._local_rank, group_id) - fp32_flat_current_rank = fp16_flat_current_rank.float() + # create a copy of fp32 master weights of the parameters for which this rank is responsible + working_flat_current_rank = 
self._param_store.get_flat_param_by_rank_group(self._local_rank, group_id) + master_flat_current_rank = working_flat_current_rank.float() device = 'cpu' if self._cpu_offload else get_current_device() - fp32_flat_current_rank = fp32_flat_current_rank.to(device) - fp32_flat_current_rank.requires_grad = True - self._fp32_flat_param_groups_of_current_rank[group_id] = fp32_flat_current_rank + master_flat_current_rank = master_flat_current_rank.to(device) + master_flat_current_rank.requires_grad = True + self._master_flat_param_groups_of_current_rank[group_id] = master_flat_current_rank # need to replace the params in the `params` field in the optimizer # so that when the optimizer calls step(), it only updates the tensors # managed by this data parallel rank - param_group['params'] = [fp32_flat_current_rank] + param_group['params'] = [master_flat_current_rank] # set reduction state - for param in self._fp16_param_groups[group_id]: + for param in self._working_param_groups[group_id]: self._param_store.set_param_reduction_state(param, False) # intialize communication stream for @@ -208,7 +210,7 @@ def loss_scale(self): @property def num_param_groups(self): - return len(self._fp16_param_groups) + return len(self._working_param_groups) def _sanity_checks(self): assert torch.cuda.is_available(), 'CUDA is required' @@ -260,10 +262,10 @@ def _grad_handler(self, param, grad, reduce_rank): return grad def _attach_reduction_hook(self): - # we iterate over the fp16 params + # we iterate over the working params # on each param, we register a hook to its AccumulateGrad object for group_id in range(self.num_param_groups): - param_group = self._fp16_param_groups[group_id] + param_group = self._working_param_groups[group_id] for param in param_group: if param.requires_grad: # determines the reduction destionation rank @@ -314,7 +316,7 @@ def _reduce_tensor_list_with_one_dtype(self, tensor_list, bucket_size, reduce_ra self._reduce_tensor_bucket(bucket=param_bucket, reduce_rank=reduce_rank) def _reduce_grads(self, reduce_rank, grads, bucket_size): - grad_buckets_by_dtype = split_half_float_double(grads) + grad_buckets_by_dtype = split_by_dtype(grads) for tensor_list in grad_buckets_by_dtype: self._reduce_tensor_list_with_one_dtype(tensor_list=tensor_list, @@ -417,7 +419,7 @@ def zero_grad(self, set_to_none=True): :param set_to_none: Whether set the gradient to None. Default value is True. 
:type set_to_none: bool """ - for _, param_group in self._fp16_param_groups.items(): + for _, param_group in self._working_param_groups.items(): for param in param_group: if set_to_none: param.grad = None @@ -445,33 +447,33 @@ def step(self, closure=None): self.zero_grad() return - # copy the grad of fp16 param to fp32 param + # copy the grad of working param to master param single_grad_partition_groups = [] norm_groups = [] for group_id in range(self.num_param_groups): # compute norm norm_group = compute_norm(gradients=self._grad_store.get_averaged_gradients_by_group(group_id), - params=self._param_store.get_fp16_params_by_rank_group(group_id=group_id, - rank=self._local_rank), + params=self._param_store.get_params_by_rank_group(group_id=group_id, + rank=self._local_rank), dp_group=self._dp_torch_group, mp_group=self._mp_torch_group) norm_groups.append(norm_group) - # create flat gradient for the flat fp32 params - fp16_avg_grads = self._grad_store.get_averaged_gradients_by_group(group_id) - flat_fp16_avg_grads = flatten(fp16_avg_grads) + # create flat gradient for the flat fp32 master params + working_avg_grads = self._grad_store.get_averaged_gradients_by_group(group_id) + flat_working_avg_grads = flatten(working_avg_grads) - dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype - flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype) + dtype = self._master_flat_param_groups_of_current_rank[group_id].dtype + flat_master_avg_grads = flat_working_avg_grads.to(dtype) - param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape - assert param_shape == flat_fp32_avg_grads.shape, \ - f'fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}' + param_shape = self._master_flat_param_groups_of_current_rank[group_id].shape + assert param_shape == flat_master_avg_grads.shape, \ + f'fp32 param and grad have different shape {param_shape} vs {flat_master_avg_grads.shape}' - single_grad_partition_groups.append(flat_fp32_avg_grads) - device = self._fp32_flat_param_groups_of_current_rank[group_id].device - self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device) + single_grad_partition_groups.append(flat_master_avg_grads) + device = self._master_flat_param_groups_of_current_rank[group_id].device + self._master_flat_param_groups_of_current_rank[group_id].grad = flat_master_avg_grads.to(device) self._grad_store.reset_average_gradients_by_group(group_id) # unscale and clip grads @@ -480,37 +482,37 @@ def step(self, closure=None): # update the parameters self.optim.step() - # release the fp32 grad - release_param_grad(self._fp32_flat_param_groups_of_current_rank.values()) + # release the master grad + release_param_grad(self._master_flat_param_groups_of_current_rank.values()) - # update fp16 partition updated by the current rank - for group_id in range(len(self._fp16_param_groups)): - fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=self._local_rank, group_id=group_id) - fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id] - fp16_param.data.copy_(fp32_param) + # update working partition updated by the current rank + for group_id in range(len(self._working_param_groups)): + working_param = self._param_store.get_flat_param_by_rank_group(rank=self._local_rank, group_id=group_id) + master_param = self._master_flat_param_groups_of_current_rank[group_id] + working_param.data.copy_(master_param) # broadcast the updated model weights handles = [] for group_id in 
range(self.num_param_groups): for index in range(self._world_size): rank = self._dp_global_ranks[index] - fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=index, group_id=group_id) - handle = dist.broadcast(fp16_param, src=rank, group=self._dp_torch_group, async_op=True) + working_param = self._param_store.get_flat_param_by_rank_group(rank=index, group_id=group_id) + handle = dist.broadcast(working_param, src=rank, group=self._dp_torch_group, async_op=True) handles.append(handle) for handle in handles: handle.wait() - ################## - # FP16 Utilities # - ################## + ############################# + # Mixed Precision Utilities # + ############################# def _check_overflow(self): # clear previous overflow record self._found_overflow.fill_(0.0) # check for overflow - for group_id in range(len(self._fp16_param_groups)): + for group_id in range(len(self._working_param_groups)): for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id): if avg_grad is not None and has_inf_or_nan(avg_grad): self._found_overflow.fill_(1.0) @@ -553,7 +555,7 @@ def _sync_grad(self): # accumulate gradient for group_id in range(self.num_param_groups): - param_group = self._param_store.get_fp16_params_by_rank_group(self._local_rank, group_id) + param_group = self._param_store.get_params_by_rank_group(self._local_rank, group_id) avg_gradients_group = self._grad_store.get_averaged_gradients_by_group(group_id) @@ -574,8 +576,8 @@ def _reduce_grad_stage1(self): # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_communication: - for group_id in range(len(self._fp16_param_groups)): - param_group = self._fp16_param_groups[group_id] + for group_id in range(len(self._working_param_groups)): + param_group = self._working_param_groups[group_id] for param in param_group: if param.grad is not None: self._add_to_reduction_bucket(param) diff --git a/colossalai/zero/wrapper.py b/colossalai/zero/wrapper.py index 6cdb8fc59ba5..3e48f49fa305 100644 --- a/colossalai/zero/wrapper.py +++ b/colossalai/zero/wrapper.py @@ -26,7 +26,7 @@ def zero_model_wrapper(model: nn.Module, zero_stage (int, optional): The stage of ZeRO DDP. You can find more information in ZeRO's paper. https://arxiv.org/abs/1910.02054 gemini_config (dict, optional): The configuration dictionary of `GeminiDDP`. `GeminiDDP` is enabled - when the stage is set to 3. You can set the arguemnts of `GeminiDDP` in the gemini_config. + when the stage is set to 3. You can set the arguments of `GeminiDDP` in the gemini_config. Here is an example where we set the device of the model, the placement policy of Gemini, and the size of hidden dimension to help Gemini find out a unified chunk size. @@ -78,7 +78,7 @@ def zero_optim_wrapper(model: nn.Module, max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm. norm_type (float, optional): norm_type used for `clip_grad_norm`. - optim_config (dict, optinoal): The configuration used for the ZeRO optimizer. + optim_config (dict, optional): The configuration used for the ZeRO optimizer. 
Example: >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True) diff --git a/docs/source/en/Colossal-Auto/get_started/run_demo.md b/docs/source/en/Colossal-Auto/get_started/run_demo.md index 6f7a82966f20..34872e399c81 100644 --- a/docs/source/en/Colossal-Auto/get_started/run_demo.md +++ b/docs/source/en/Colossal-Auto/get_started/run_demo.md @@ -4,7 +4,7 @@ Colossal-Auto simplifies the process of deploying large-scale machine learning m ### 1. Basic usage -Colossal-Auto can be used to find a hybrid SPMD parallel strategy includes data, tensor(i.e., 1D, 2D, sequencial) for each operation. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/experiments/auto_parallel). +Colossal-Auto can be used to find a hybrid SPMD parallel strategy that includes data and tensor parallelism (i.e., 1D, 2D, sequential) for each operation. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/experiments/auto_parallel). Detailed instructions can be found in its `README.md`. ### 2. Integration with activation checkpoint diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 4889b30a6cf8..8afb6705b6ae 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -44,7 +44,7 @@ In some solutions, the [Zero-offload](https://arxiv.org/abs/2101.06840) adopted -Colossal-AI designed Gemini, just like two-stars, which manages the memory space of CPU and GPU efficiently. It can make the tensor dynamically distributed in the storage space of CPU-GPU during training, so that the model training can break through the memory wall of GPU. The memory manager consists of two parts: **MemStatsCollector (MSC)** and **StatefuleTensorMgr (STM)**. +Colossal-AI designed Gemini, just like two-stars, which manages the memory space of CPU and GPU efficiently. It can make the tensor dynamically distributed in the storage space of CPU-GPU during training, so that the model training can break through the memory wall of GPU. The memory manager consists of two parts: **MemStatsCollector (MSC)** and **StatefulTensorMgr (STM)**. We take advantage of the iterative characteristics of the deep learning network training process. We divide iterations into two stages: warmup and non-warmup. One or several iterative steps at the beginning belong to the warmup stage, and the other iterative steps belong to the non-warmup stage. In the warmup stage, we collect information for the MSC, while in the non-warmup stage, STM gets the information collected by the MSC to move the tensor, so as to minimize the CPU-GPU data movement volume. diff --git a/docs/source/en/advanced_tutorials/opt_service.md b/docs/source/en/advanced_tutorials/opt_service.md index b317de91bbdd..a43ec7fdd1fe 100644 --- a/docs/source/en/advanced_tutorials/opt_service.md +++ b/docs/source/en/advanced_tutorials/opt_service.md @@ -20,7 +20,7 @@ To launch the distributed inference service quickly, you can download the OPT-12 2. Prepare a prebuilt service image -Pull a docker image from dockerhub installed with Colossal-AI inference. +Pull a Docker image with Colossal-AI inference installed from Docker Hub.
```bash docker pull hpcaitech/energon-ai:latest diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 1f3086559939..b2438a1cf562 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -12,7 +12,7 @@ Author: Yuxuan Lou ## Introduction -In this example for ViT model, Colossal-AI provides three different parallelism techniques which acclerate model training: data parallelism, pipeline parallelism and tensor parallelism. +In this example for ViT model, Colossal-AI provides three different parallelism techniques which accelerate model training: data parallelism, pipeline parallelism and tensor parallelism. We will show you how to train ViT on CIFAR-10 dataset with these parallelism techniques. To run this example, you will need 2-4 GPUs. @@ -31,7 +31,7 @@ pip install colossalai ## Data Parallelism -Data parallism is one basic way to accelerate model training process. You can apply data parallism to training by only two steps: +Data parallism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: 1. Define a configuration file 2. Change a few lines of code in train script @@ -108,7 +108,7 @@ disable_existing_loggers() logger = get_dist_logger() ``` -After initialization, you can acess the variables in the config file by using `colossalai.core.global_context`. +After initialization, you can access the variables in the config file by using `colossalai.core.global_context`. ```python #access parameters @@ -162,7 +162,7 @@ optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1) # build loss criterion = torch.nn.CrossEntropyLoss() -# lr_scheduelr +# lr_scheduler lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS) ``` @@ -230,10 +230,10 @@ torchrun --standalone --nproc_per_node train_dp.py --config ./config ## Pipeline Parallelism -Aside from data parallelism, Colossal-AI also support pipleline parallelism. In specific, Colossal-AI uses 1F1B pipeline introduced by NVIDIA. For more details, you can view the related [documents](https://www.colossalai.org/tutorials/features/pipeline_parallel). +Aside from data parallelism, Colossal-AI also support pipeline parallelism. In specific, Colossal-AI uses 1F1B pipeline introduced by NVIDIA. For more details, you can view the related [documents](https://www.colossalai.org/tutorials/features/pipeline_parallel). ### Define your configuration file(`hybrid_parallel/configs/vit_pipeline.py`) -To apply pipleline parallel on the data parallel basis, you only need to add a **parallel dict** +To apply pipeline parallel on the data parallel basis, you only need to add a **parallel dict** ```python from colossalai.amp import AMP_TYPE @@ -250,7 +250,7 @@ clip_grad_norm = 1.0 Other configs: ```python -# hyperparameters +# hyper parameters # BATCH_SIZE is as per GPU # global batch size = BATCH_SIZE x data parallel size BATCH_SIZE = 256 @@ -276,7 +276,7 @@ Colossal-AI provides two methods to build a pipeline model from the existing mod - `colossalai.builder.build_pipeline_model_from_cfg` - `colossalai.builder.build_pipeline_model` -Besides, you can also build a pipeline model from scrath with Colossal-AI. +Besides, you can also build a pipeline model from scratch with Colossal-AI. 
```python import math from typing import Callable @@ -521,7 +521,7 @@ def build_cifar(batch_size): return train_dataloader, test_dataloader -# craete dataloaders +# create dataloaders train_dataloader , test_dataloader = build_cifar() # create loss function @@ -539,7 +539,7 @@ lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, #### Start Colossal-AI engine ```python -# intiailize +# initialize engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, @@ -615,7 +615,7 @@ TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) Ohter configs: ```python -# hyperparameters +# hyper parameters # BATCH_SIZE is as per GPU # global batch size = BATCH_SIZE x data parallel size BATCH_SIZE = 256 diff --git a/docs/source/en/basics/colotensor_concept.md b/docs/source/en/basics/colotensor_concept.md index 1b855c03b919..909c5e4d3c6f 100644 --- a/docs/source/en/basics/colotensor_concept.md +++ b/docs/source/en/basics/colotensor_concept.md @@ -42,7 +42,7 @@ Therefore, when using Distributed Spec, we only need to describe the way that th ## Compute Spec -An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.compute_spec.html#colossalai.tensor.compute_spec.ComputeSpec) describes how a Coloensor be used in DNN training. Currently, we will set the correct Compute Pattern for the ColoTensor as the parameters of the module. The specific application scenarios will be shown in the next document. +An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.compute_spec.html#colossalai.tensor.compute_spec.ComputeSpec) describes how a Colotensor be used in DNN training. Currently, we will set the correct Compute Pattern for the ColoTensor as the parameters of the module. The specific application scenarios will be shown in the next document. ## ColoParameter diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md index 39792f622aa9..bbe32ed5a3b5 100644 --- a/docs/source/en/basics/engine_trainer.md +++ b/docs/source/en/basics/engine_trainer.md @@ -172,7 +172,7 @@ In this config file, we specify that we want to use batch size 128 per GPU and r #### Step 2. Initialize Distributed Environment We need to initialize the distributed training environment. This has been introduced in the tutorial on how to -[launch Colossal-AI](./launch_colossalai.md). For this demostration, we use `launch_from_torch` and PyTorch launch utility. +[launch Colossal-AI](./launch_colossalai.md). For this demonstration, we use `launch_from_torch` and PyTorch launch utility. ```python import colossalai diff --git a/docs/source/en/concepts/colossalai_overview.md b/docs/source/en/concepts/colossalai_overview.md index d75d20196b08..38b682d49e62 100644 --- a/docs/source/en/concepts/colossalai_overview.md +++ b/docs/source/en/concepts/colossalai_overview.md @@ -6,18 +6,18 @@ Author: Shenggui Li, Siqi Mai With the development of deep learning model size, it is important to shift to a new training paradigm. The traditional training method with no parallelism and optimization became a thing of the past and new training methods are the key to make training large-scale models efficient and cost-effective. -Colossal-AI is designed to be a unfied system to provide an integrated set of training skills and utilities to the user. You can find the common training utilities such as mixed precision training and gradient accumulation. 
Besides, we provide an array of parallelism including data, tensor and pipeline parallelism. We optimize tensor parallelism with different multi-dimensional distributed matrix-matrix multiplication algorithm. We also provided different pipeline parallelism methods to allow the user to scale their model across nodes efficiently. More advanced features such as offloading can be found in this tutorial documentation in detail as well. +Colossal-AI is designed to be a unified system to provide an integrated set of training skills and utilities to the user. You can find the common training utilities such as mixed precision training and gradient accumulation. Besides, we provide an array of parallelism including data, tensor and pipeline parallelism. We optimize tensor parallelism with different multi-dimensional distributed matrix-matrix multiplication algorithm. We also provided different pipeline parallelism methods to allow the user to scale their model across nodes efficiently. More advanced features such as offloading can be found in this tutorial documentation in detail as well. ## General Usage -We aim to make Colossal-AI easy to use and non-instrusive to user code. There is a simple general workflow if you want to use Colossal-AI. +We aim to make Colossal-AI easy to use and non-intrusive to user code. There is a simple general workflow if you want to use Colossal-AI.
Workflow
-1. Prepare a configiguration file where specifies the features you want to use and your parameters. +1. Prepare a configuration file which specifies the features you want to use and your parameters. 2. Initialize distributed backend with `colossalai.launch` 3. Inject the training features into your training components (e.g. model, optimizer) with `colossalai.initialize`. 4. Run training and testing diff --git a/docs/source/en/features/1D_tensor_parallel.md b/docs/source/en/features/1D_tensor_parallel.md index 695a8f31f8c5..7577e50400e9 100644 --- a/docs/source/en/features/1D_tensor_parallel.md +++ b/docs/source/en/features/1D_tensor_parallel.md @@ -42,7 +42,7 @@ Given $P$ processors, we present the theoretical computation and memory cost, as ## Usage -To enable 1D tensor parallelism for our model, e.g. on 2 GPUs, we need to configure the parallism setting as below. +To enable 1D tensor parallelism for our model, e.g. on 2 GPUs, we need to configure the parallelism setting as below. ```python CONFIG = dict(parallel=dict( data=1, diff --git a/docs/source/en/features/2D_tensor_parallel.md b/docs/source/en/features/2D_tensor_parallel.md index 582614c2f2f4..7b6c10766099 100644 --- a/docs/source/en/features/2D_tensor_parallel.md +++ b/docs/source/en/features/2D_tensor_parallel.md @@ -60,7 +60,7 @@ Given $P=q\times q$ processors, we present the theoretical computation and memor ## Usage -To enable 2D tensor parallelism for our model, e.g. on 4 GPUs, we need to configure the parallism setting as below. +To enable 2D tensor parallelism for our model, e.g. on 4 GPUs, we need to configure the parallelism setting as below. ```python CONFIG = dict(parallel=dict( data=1, diff --git a/docs/source/en/features/2p5D_tensor_parallel.md b/docs/source/en/features/2p5D_tensor_parallel.md index 34a261ea0aa0..6076562e6dca 100644 --- a/docs/source/en/features/2p5D_tensor_parallel.md +++ b/docs/source/en/features/2p5D_tensor_parallel.md @@ -57,7 +57,7 @@ Given $P=q \times q \times d$ processors, we present the theoretical computation ## Usage -To enable 2.5D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallism setting as below. +To enable 2.5D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below. ```python CONFIG = dict(parallel=dict( data=1, diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md index d8781ee691bc..ecc209fbac8d 100644 --- a/docs/source/en/features/gradient_accumulation.md +++ b/docs/source/en/features/gradient_accumulation.md @@ -28,7 +28,7 @@ gradient_accumulation = ## Hands-on Practice We provide a [runnable example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_accumulation) -to demonstrate gradient accumulation. In this example, we set the gradinet accumulation size to be 4. You can run the script using this command: +to demonstrate gradient accumulation. In this example, we set the gradient accumulation size to be 4.
You can run the script using this command: ```shell python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 run_resnet_cifar10_with_engine.py diff --git a/docs/source/en/features/mixed_precision_training.md b/docs/source/en/features/mixed_precision_training.md index 71cb6971d346..11aa5235301a 100644 --- a/docs/source/en/features/mixed_precision_training.md +++ b/docs/source/en/features/mixed_precision_training.md @@ -101,7 +101,7 @@ you can use `colossalai.amp.convert_to_amp`. ```python from colossalai.amp import AMP_TYPE -# exmaple of using torch amp +# example of using torch amp model, optimizer, criterion = colossalai.amp.convert_to_amp(model, optimizer, criterion, @@ -220,7 +220,7 @@ The default parameters of Naive AMP: - initial_scale(int): initial scale of gradient scaler - growth_factor(int): the growth rate of loss scale - backoff_factor(float): the decrease rate of loss scale -- hysterisis(int): delay shift in dynamic loss scaling +- hysteresis(int): delay shift in dynamic loss scaling - max_scale(int): maximum loss scale allowed - verbose(bool): if set to `True`, will print debug info @@ -292,7 +292,7 @@ colossalai.launch_from_torch(config=args.config) ### Step 4. Create training components Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is -obtained from the environment varialbe `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` +obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. ```python @@ -326,7 +326,7 @@ to a path on your machine. Data will be automatically downloaded to the root pat # build loss criterion = torch.nn.CrossEntropyLoss() - # lr_scheduelr + # lr_scheduler lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS) ``` diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md index 38d2c4af904c..4374da3c9c45 100644 --- a/docs/source/en/features/nvme_offload.md +++ b/docs/source/en/features/nvme_offload.md @@ -57,7 +57,7 @@ It's compatible with all parallel methods in ColossalAI. Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`. -We should install denpendencies first: +We should install dependencies first: ```shell pip install psutil transformers @@ -99,7 +99,7 @@ class GPTLMLoss(nn.Module): shift_labels.view(-1)) ``` -And we define some utility functions, which generates random data, computes the number of paramters of a model and get memory usage of current process: +And we define some utility functions, which generates random data, computes the number of parameters of a model and get memory usage of current process: ```python def get_data(batch_size: int, seq_len: int, @@ -251,7 +251,7 @@ Time: 3.691 s Mem usage: 5298.344 MB ``` -NVME offload saves about 294 MB memory. Note that enabling `pin_memory` of Gemini can accelerate training but increase memory usage. So this result also meets our expectation. If we disable `pin_memory`, we can aslo observe a memory usage drop about 900 MB. +NVME offload saves about 294 MB memory. Note that enabling `pin_memory` of Gemini can accelerate training but increase memory usage. So this result also meets our expectation. If we disable `pin_memory`, we can also observe a memory usage drop about 900 MB. 
## API Reference diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index 6b0a9585af85..a105831a5409 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -32,11 +32,11 @@ and the first and second momentum estimates) are partitioned across the processe 3. **Shard Parameter**: The 16-bit model parameters are partitioned across the processes of a data parallel group. -4. **[Gemini](../advanced_tutorials/meet_gemini.md)**: Dynamic heterogeneous memory space manager for paramters, gradients and optimizer states. +4. **[Gemini](../advanced_tutorials/meet_gemini.md)**: Dynamic heterogeneous memory space manager for parameters, gradients and optimizer states. Besides, this article will introduce the Zero Redundancy Optimizer with chunk-based memory management. -When using ZeRO, we distributed the model by sharding the parameters. The advantage of this method is that the memory of each node is load balanced. But this approach has two significiant disadvantages. First, during communication, a temporary memory buffer needs to be allocated and released afterwards, leading to the memory fragmentation problem. Secondly, using tensor as the granularity for communication will cause the network bandwidth underutilized. Generally, the longer the transmitted message length, the higher the bandwidth utilization. +When using ZeRO, we distribute the model by sharding the parameters. The advantage of this method is that the memory of each node is load balanced. But this approach has two significant disadvantages. First, during communication, a temporary memory buffer needs to be allocated and released afterwards, leading to the memory fragmentation problem. Secondly, using tensor as the granularity for communication will cause the network bandwidth to be underutilized. Generally, the longer the transmitted message length, the higher the bandwidth utilization. Using the Chunk mechanism introduced in ColossalAI v0.1.8, we can improve the efficiency of ZeRO. We store a continuous set of parameters in initialization order into a Chunk (a chunk is a continuous memory space), and each Chunk has the same size. Organizing memory in chunks can lead to efficient use of network bandwidth between PCI-e and GPU-GPU, reduce the number of communications, and avoid potential memory fragmentation.
diff --git a/examples/images/diffusion/ldm/data/teyvat.py b/examples/images/diffusion/ldm/data/teyvat.py index 61dc29d56e7c..eb5d3ea469d4 100644 --- a/examples/images/diffusion/ldm/data/teyvat.py +++ b/examples/images/diffusion/ldm/data/teyvat.py @@ -13,7 +13,7 @@ def make_multi_folder_data(paths, caption_files=None, **kwargs): """Make a concat dataset from multiple folders - Don't suport captions yet + Don't support captions yet If paths is a list, that's ok, if it's a Dict interpret it as: k=folder v=n_times to repeat that """ diff --git a/examples/images/diffusion/main.py b/examples/images/diffusion/main.py index e31d75e0874d..713029fc677d 100644 --- a/examples/images/diffusion/main.py +++ b/examples/images/diffusion/main.py @@ -40,7 +40,7 @@ class DataLoaderX(DataLoader): # A custom data loader class that inherits from DataLoader def __iter__(self): # Overriding the __iter__ method of DataLoader to return a BackgroundGenerator - #This is to enable data laoding in the background to improve training performance + #This is to enable data loading in the background to improve training performance return BackgroundGenerator(super().__iter__()) @@ -60,7 +60,7 @@ def str2bool(v): # Create an ArgumentParser object with specifies kwargs parser = argparse.ArgumentParser(**parser_kwargs) - # Add vairous command line arguments with their default balues and descriptions + # Add various command line arguments with their default values and descriptions parser.add_argument( "-n", "--name", @@ -162,7 +162,7 @@ def str2bool(v): # A function that returns the non-default arguments between two objects def nondefault_trainer_args(opt): - # create an argument parsser + # create an argument parser parser = argparse.ArgumentParser() # add pytorch lightning trainer default arguments parser = Trainer.add_argparse_args(parser) @@ -203,7 +203,7 @@ def worker_init_fn(_): else: return np.random.seed(np.random.get_state()[1][0] + worker_id) -#Provide functionality for creating data loadedrs based on provided dataset configurations +#Provide functionality for creating data loaders based on provided dataset configurations class DataModuleFromConfig(pl.LightningDataModule): def __init__(self, @@ -255,7 +255,7 @@ def setup(self, stage=None): def _train_dataloader(self): #Check if the train dataset is iterable is_iterable_dataset = isinstance(self.datasets['train'], Txt2ImgIterableBaseDataset) - #Set the worker initialization function of the dataset isiterable or use_worker_init_fn is True + #Set the worker initialization function of the dataset is iterable or use_worker_init_fn is True if is_iterable_dataset or self.use_worker_init_fn: init_fn = worker_init_fn else: @@ -310,7 +310,7 @@ def _predict_dataloader(self, shuffle=False): class SetupCallback(Callback): - # I nitialize the callback with the necessary parameters + # Initialize the callback with the necessary parameters def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config): super().__init__() @@ -371,7 +371,7 @@ def on_fit_start(self, trainer, pl_module): # trainer.save_checkpoint(ckpt_path) -# PyTorch Lightning callback for ogging images during training and validation of a deep learning model +# PyTorch Lightning callback for logging images during training and validation of a deep learning model class ImageLogger(Callback): def __init__(self, @@ -379,10 +379,10 @@ def __init__(self, max_images, # Maximum number of images to log clamp=True, # Whether to clamp pixel values to [-1,1] increase_log_steps=True, # Whether to increase frequency 
of log steps exponentially - rescale=True, # Whetehr to rescale pixel values to [0,1] + rescale=True, # Whether to rescale pixel values to [0,1] disabled=False, # Whether to disable logging - log_on_batch_idx=False, # Whether to log on baych index instead of global step - log_first_step=False, # Whetehr to log on the first step + log_on_batch_idx=False, # Whether to log on batch index instead of global step + log_first_step=False, # Whether to log on the first step log_images_kwargs=None): # Additional keyword arguments to pass to log_images method super().__init__() self.rescale = rescale @@ -593,7 +593,7 @@ def on_train_epoch_end(self, trainer, pl_module): parser = Trainer.add_argparse_args(parser) opt, unknown = parser.parse_known_args() - # Veirfy the arguments are both specified + # Verify the arguments are both specified if opt.name and opt.resume: raise ValueError("-n/--name and -r/--resume cannot be specified both." "If you want to resume training in a new log folder, " @@ -646,7 +646,7 @@ def on_train_epoch_end(self, trainer, pl_module): # Sets the seed for the random number generator to ensure reproducibility seed_everything(opt.seed) - # Intinalize and save configuratioon using teh OmegaConf library. + # Initialize and save configuration using the OmegaConf library. try: # init and save configs configs = [OmegaConf.load(cfg) for cfg in opt.base] diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md index b067a437c764..7c117d841e24 100644 --- a/examples/images/dreambooth/README.md +++ b/examples/images/dreambooth/README.md @@ -61,7 +61,7 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \ - `INSTANCE_DIR` refers to personalized path to instance images, you might need to insert information here. - `OUTPUT_DIR` refers to local path to save the trained model, you might need to find a path with enough space. - `resolution` refers to the corresponding resolution number of your target model. Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. -- `placement` refers to the training strategy supported by Colossal AI, defult = 'cuda', which refers to loading all the parameters into cuda memory. On the other hand, 'cpu' refers to 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI. +- `placement` refers to the training strategy supported by Colossal AI, default = 'cuda', which refers to loading all the parameters into cuda memory. On the other hand, 'cpu' refers to 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI. ### Training with prior-preservation loss diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md index 10d6c2ddd5d7..47d24a4d69cb 100644 --- a/examples/language/gpt/README.md +++ b/examples/language/gpt/README.md @@ -40,7 +40,7 @@ We provide two stable solutions. One utilizes the Gemini to implement hybrid parallel strategies of Gemini, DDP/ZeRO, and Tensor Parallelism for a huggingface GPT model. The other one use [Titans](https://github.com/hpcaitech/Titans), a distributed executed model zoo maintained by ColossalAI,to implement the hybrid parallel strategies of TP + ZeRO + PP. -We recommend using Gemini to qucikly run your model in a distributed manner. +We recommend using Gemini to quickly run your model in a distributed manner. It doesn't require significant changes to the model structures, therefore you can apply it on a new model easily.
And use Titans as an advanced weapon to pursue a more extreme performance. Titans has included the some typical models, such as Vit and GPT. diff --git a/examples/language/gpt/experiments/auto_offload/README.md b/examples/language/gpt/experiments/auto_offload/README.md index a0d252119056..535aa76541cc 100644 --- a/examples/language/gpt/experiments/auto_offload/README.md +++ b/examples/language/gpt/experiments/auto_offload/README.md @@ -27,7 +27,7 @@ pip install transformers ## Dataset -For simplicity, the input data is randonly generated here. +For simplicity, the input data is randomly generated here. ## Training diff --git a/examples/language/gpt/experiments/auto_parallel/README.md b/examples/language/gpt/experiments/auto_parallel/README.md index 404c8391109e..1c8b1c35109f 100644 --- a/examples/language/gpt/experiments/auto_parallel/README.md +++ b/examples/language/gpt/experiments/auto_parallel/README.md @@ -34,7 +34,7 @@ conda install -c conda-forge coin-or-cbc ## Dataset -For simplicity, the input data is randonly generated here. +For simplicity, the input data is randomly generated here. ## Training diff --git a/examples/language/gpt/experiments/pipeline_parallel/README.md b/examples/language/gpt/experiments/pipeline_parallel/README.md index 702e3c8d6540..5af994a00665 100644 --- a/examples/language/gpt/experiments/pipeline_parallel/README.md +++ b/examples/language/gpt/experiments/pipeline_parallel/README.md @@ -27,7 +27,7 @@ pip install transformers ## Dataset -For simplicity, the input data is randonly generated here. +For simplicity, the input data is randomly generated here. ## Training diff --git a/examples/language/opt/train_gemini_opt.py b/examples/language/opt/train_gemini_opt.py index 4874f831c2ec..3614b689de26 100755 --- a/examples/language/opt/train_gemini_opt.py +++ b/examples/language/opt/train_gemini_opt.py @@ -163,7 +163,7 @@ def main(): else: init_dev = get_current_device() - # shard init prameters + # shard init parameters if args.shardinit: logger.info("Sharding initialization !", ranks=[0]) else: @@ -192,7 +192,7 @@ def main(): config=config, local_files_only=False) - # enable graident checkpointing + # enable gradient checkpointing model.gradient_checkpointing_enable() numel = sum([p.numel() for p in model.parameters()]) diff --git a/tests/components_to_test/__init__.py b/tests/components_to_test/__init__.py index 106f4e61c7e1..f29efefce4a4 100644 --- a/tests/components_to_test/__init__.py +++ b/tests/components_to_test/__init__.py @@ -9,11 +9,11 @@ resnet, simple_net, ) -from .utils import run_fwd_bwd +from .utils import run_fwd, run_fwd_bwd from . 
import albert # isort:skip __all__ = [ 'bert', 'gpt2', 'hanging_param_model', 'inline_op_model', 'nested_model', 'repeated_computed_layers', 'resnet', - 'simple_net', 'run_fwd_bwd', 'albert', 'beit' + 'simple_net', 'run_fwd_bwd', 'albert', 'beit', 'run_fwd' ] diff --git a/tests/components_to_test/utils/__init__.py b/tests/components_to_test/utils/__init__.py index f223f7d322cb..150124b58800 100644 --- a/tests/components_to_test/utils/__init__.py +++ b/tests/components_to_test/utils/__init__.py @@ -1,2 +1,2 @@ from .dummy_data_generator import DummyDataGenerator -from .executor import run_fwd_bwd +from .executor import run_fwd, run_fwd_bwd diff --git a/tests/components_to_test/utils/executor.py b/tests/components_to_test/utils/executor.py index e77152561e6c..631401e022e6 100644 --- a/tests/components_to_test/utils/executor.py +++ b/tests/components_to_test/utils/executor.py @@ -1,9 +1,9 @@ import torch -def run_fwd_bwd(model, data, label, criterion, optimizer=None) -> torch.Tensor: - """run_fwd_bwd - run fwd and bwd for the model +def run_fwd(model, data, label, criterion) -> torch.Tensor: + """run_fwd + run fwd for the model Args: model (torch.nn.Module): a PyTorch model @@ -22,6 +22,23 @@ def run_fwd_bwd(model, data, label, criterion, optimizer=None) -> torch.Tensor: loss = model(data, label) loss = loss.float() + return loss + + +def run_fwd_bwd(model, data, label, criterion, optimizer=None) -> torch.Tensor: + """run_fwd_bwd + run fwd and bwd for the model + + Args: + model (torch.nn.Module): a PyTorch model + data (torch.Tensor): input data + label (torch.Tensor): label + criterion (Optional[Callable]): a function of criterion + + Returns: + torch.Tensor: loss of fwd + """ + loss = run_fwd(model, data, label, criterion) if optimizer: optimizer.backward(loss) else: diff --git a/tests/kit/model_zoo/diffusers/diffusers.py b/tests/kit/model_zoo/diffusers/diffusers.py index 8aa3f4c6741f..204c1d7773ca 100644 --- a/tests/kit/model_zoo/diffusers/diffusers.py +++ b/tests/kit/model_zoo/diffusers/diffusers.py @@ -18,6 +18,7 @@ data_unet_fn = lambda: dict(sample=torch.randn(2, 3, 32, 32), timestep=3) identity_output = lambda x: x +clip_vision_model_output = lambda x: dict(pooler_output=x[1]) def data_clip_model(): @@ -65,7 +66,7 @@ def data_clip_vision(): model_zoo.register(name='diffusers_clip_vision_model', model_fn=partial(transformers.CLIPVisionModel, config=transformers.CLIPVisionConfig()), data_gen_fn=data_clip_vision, - output_transform_fn=identity_output) + output_transform_fn=clip_vision_model_output) model_zoo.register(name='diffusers_unet2d_model', model_fn=diffusers.UNet2DModel, diff --git a/tests/kit/model_zoo/torchaudio/torchaudio.py b/tests/kit/model_zoo/torchaudio/torchaudio.py index 74611720292f..9a244ac312c0 100644 --- a/tests/kit/model_zoo/torchaudio/torchaudio.py +++ b/tests/kit/model_zoo/torchaudio/torchaudio.py @@ -1,3 +1,5 @@ +from functools import partial + import torch import torchaudio.models as tm @@ -101,13 +103,11 @@ def tacotron_data_gen_fn(): mel_specgram_lengths=mel_specgram_lengths) -model_zoo.register( - name='torchaudio_tacotron', - model_fn=lambda: tm.Tacotron2(n_mels=N_MELS), - data_gen_fn=tacotron_data_gen_fn, - output_transform_fn=lambda outputs: dict( - spectrogram_before=outputs[0], spectrogram_after=outputs[1], stop_tokens=outputs[2], attn_weights=outputs[3]), - model_attribute=ModelAttribute(has_control_flow=True)) +model_zoo.register(name='torchaudio_tacotron', + model_fn=lambda: tm.Tacotron2(n_mels=N_MELS), + data_gen_fn=tacotron_data_gen_fn, + 
output_transform_fn=lambda outputs: dict(summed_output=sum(x.sum() for x in outputs)), + model_attribute=ModelAttribute(has_control_flow=True)) def wav2vec_data_gen_fn(): @@ -118,7 +118,7 @@ def wav2vec_data_gen_fn(): model_zoo.register(name='torchaudio_wav2vec2_base', - model_fn=tm.wav2vec2_base, + model_fn=partial(tm.wav2vec2_base, encoder_layer_drop=0.0), data_gen_fn=wav2vec_data_gen_fn, output_transform_fn=transformer_output_transform_fn, model_attribute=ModelAttribute(has_control_flow=True)) diff --git a/tests/kit/model_zoo/torchvision/torchvision.py b/tests/kit/model_zoo/torchvision/torchvision.py index 62bda93d5a75..ddc3ec24b2ff 100644 --- a/tests/kit/model_zoo/torchvision/torchvision.py +++ b/tests/kit/model_zoo/torchvision/torchvision.py @@ -36,12 +36,12 @@ def swin_s(): # special output transform fn -google_net_output_transform_fn = lambda x: dict(output=x.logits) if isinstance(x, torchvision.models.GoogLeNetOutputs - ) else dict(output=x) +google_net_output_transform_fn = lambda x: dict(output=sum(x)) if isinstance(x, torchvision.models.GoogLeNetOutputs + ) else dict(output=x) swin_s_output_output_transform_fn = lambda x: {f'output{idx}': val for idx, val in enumerate(x)} if isinstance(x, tuple) else dict(output=x) -inception_v3_output_transform_fn = lambda x: dict(output=x.logits) if isinstance(x, torchvision.models.InceptionOutputs - ) else dict(output=x) +inception_v3_output_transform_fn = lambda x: dict(output=sum(x)) if isinstance(x, torchvision.models.InceptionOutputs + ) else dict(output=x) model_zoo.register(name='torchvision_alexnet', model_fn=tm.alexnet, diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index d804c727ad3e..985d7989fc9d 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -1,4 +1,5 @@ from contextlib import nullcontext +from typing import Optional import torch import torch.distributed as dist @@ -10,11 +11,53 @@ from colossalai.nn.optimizer import HybridAdam from colossalai.tensor.colo_parameter import ColoParameter from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.utils.model.experimental import LazyInitContext from colossalai.zero import ColoInitContext from tests.kit.model_zoo import model_zoo -@parameterize('init_method', ['lazy', 'none', 'colo']) +def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]: + try: + if init_method == 'colo': + ctx = ColoInitContext() + elif init_method == 'lazy': + ctx = LazyInitContext() + else: + ctx = nullcontext() + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + with ctx: + model = model_fn() + optimizer = HybridAdam(model.parameters(), lr=1e-3) + criterion = lambda x: x.mean() + data = data_gen_fn() + + data = { + k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items() + } + + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + + for n, p in model.named_parameters(): + assert isinstance(p, ColoParameter), f'{n} is not a ColoParameter' + + output = model(**data) + output = output_transform_fn(output) + output_key = list(output.keys())[0] + loss = criterion(output[output_key]) + + booster.backward(loss, optimizer) + optimizer.step() + + except Exception as e: + return repr(e) + + +# TODO(ver217): CI does not support lazy now +# 
@parameterize('init_method', ['lazy', 'none', 'colo']) + + +@parameterize('init_method', ['none']) def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True): """check gemini plugin over model zoo @@ -25,7 +68,6 @@ def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True): if not is_support_meta and init_method == 'lazy': return - from colossalai.utils.model.experimental import LazyInitContext passed_models = [] failed_info = {} # (model_name, error) pair @@ -58,47 +100,15 @@ def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True): ]: continue - try: - if init_method == 'colo': - ctx = ColoInitContext() - elif init_method == 'lazy': - ctx = LazyInitContext() - else: - ctx = nullcontext() - plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) - booster = Booster(plugin=plugin) - with ctx: - model = model_fn() - optimizer = HybridAdam(model.parameters(), lr=1e-3) - criterion = lambda x: x.mean() - data = data_gen_fn() - - data = { - k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v - for k, v in data.items() - } - - model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) - - for n, p in model.named_parameters(): - assert isinstance(p, ColoParameter), f'{n} is not a ColoParameter' - - output = model(**data) - output = output_transform_fn(output) - output_key = list(output.keys())[0] - loss = criterion(output[output_key]) - - booster.backward(loss, optimizer) - optimizer.step() - passed_models.append(name) + err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) + torch.cuda.empty_cache() - del booster, plugin, model, optimizer, criterion, data, output, loss - except Exception as e: - failed_info[name] = e + if err is None: + passed_models.append(name) + else: + failed_info[name] = err if early_stop: - raise e - - torch.cuda.empty_cache() + break if dist.get_rank() == 0: print(f'Init method: {init_method}') @@ -140,7 +150,7 @@ def run_dist(rank, world_size, port, early_stop: bool = True): @rerun_if_address_is_in_use() def test_gemini_plugin(early_stop: bool = True): - spawn(run_dist, 2, early_stop=early_stop) + spawn(run_dist, 4, early_stop=early_stop) if __name__ == '__main__': diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py new file mode 100644 index 000000000000..e24196a14917 --- /dev/null +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -0,0 +1,122 @@ +from typing import Optional + +import torch +import torch.distributed as dist + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import LowLevelZeroPlugin +from colossalai.nn.optimizer import HybridAdam +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from tests.kit.model_zoo import model_zoo + +# These models are not compatible with AMP +_AMP_ERR_MODELS = ['timm_convit', 'dlrm', 'deepfm_interactionarch', 'deepfm_simpledeepfmnn`'] +# These models have no parameters +_LOW_LEVEL_ZERO_ERR_MODELS = ['dlrm_interactionarch'] +# These models will get stuck +_STUCK_MODELS = [ + 'diffusers_vq_model', 'transformers_albert', 'transformers_albert_for_pretraining', 'transformers_bert', + 'transformers_bert_for_pretraining', 'transformers_gpt_double_heads' +] + + +def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]: + try: + plugin = LowLevelZeroPlugin(stage=stage, 
max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + model = model_fn() + optimizer = HybridAdam(model.parameters(), lr=1e-3) + criterion = lambda x: x.mean() + data = data_gen_fn() + + data = { + k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items() + } + + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + + output = model(**data) + output = output_transform_fn(output) + output_key = list(output.keys())[0] + loss = criterion(output[output_key]) + + booster.backward(loss, optimizer) + optimizer.step() + + except Exception as e: + return repr(e) + + +@parameterize('stage', [2]) +def check_low_level_zero_plugin(stage: int, early_stop: bool = True): + """check low level zero plugin over model zoo + + Args: + stage (int), stage of low level zero plugin + early_stop (bool, optional): Whether to stop when getting the first error. Defaults to True. + """ + passed_models = [] + failed_info = {} # (model_name, error) pair + ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS + skipped_models = [] + + for name, (model_fn, data_gen_fn, output_transform_fn, _) in model_zoo.items(): + # FIXME(ver217): fix these models + if name in ignore_models: + skipped_models.append(name) + continue + err = run_fn(stage, model_fn, data_gen_fn, output_transform_fn) + torch.cuda.empty_cache() + + if err is None: + passed_models.append(name) + else: + failed_info[name] = err + if early_stop: + break + + if dist.get_rank() == 0: + print(f'Passed models({len(passed_models)}): {passed_models}\n\n') + print(f'Failed models({len(failed_info)}): {list(failed_info.keys())}\n\n') + print(f'Skipped models({len(skipped_models)}): {skipped_models}\n\n') + assert len(failed_info) == 0, '\n'.join([f'{k}: {v}' for k, v in failed_info.items()]) + + +def check_dataloader_sharding(): + plugin = LowLevelZeroPlugin() + + # create a custom dasetset with 0 to 10 + dataset = torch.utils.data.TensorDataset(torch.arange(0, 10)) + train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2) + + # get the first batch of data + batch = next(iter(train_dataloader))[0].cuda() + is_rank_0 = dist.get_rank() == 0 + + if is_rank_0: + batch_to_compare = batch.clone() + else: + batch_to_compare = batch + # pass to the rank 1 value to rank 0 + dist.broadcast(batch_to_compare, src=1) + + # compare on rank 0 + if is_rank_0: + assert not torch.equal(batch, + batch_to_compare), 'Same number was found across ranks but expected it to be different' + + +def run_dist(rank, world_size, port, early_stop: bool = True): + # init dist env + colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') + check_low_level_zero_plugin(early_stop=early_stop) + + +@rerun_if_address_is_in_use() +def test_low_level_zero_plugin(early_stop: bool = True): + spawn(run_dist, 2, early_stop=early_stop) + + +if __name__ == '__main__': + test_low_level_zero_plugin(early_stop=False) diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py index c225a1a069dd..5354eae01d40 100644 --- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py @@ -11,36 +11,37 @@ from tests.kit.model_zoo import model_zoo -def check_torch_ddp_plugin(): +def run_fn(model_fn, data_gen_fn, output_transform_fn): plugin = TorchDDPPlugin() booster = Booster(plugin=plugin) + model = model_fn() + optimizer = 
SGD(model.parameters(), lr=1e-3) + criterion = lambda x: x.mean() + data = data_gen_fn() - for name, (model_fn, data_gen_fn, output_transform_fn, _) in model_zoo.items(): - if name == 'dlrm_interactionarch': - continue + data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()} - model = model_fn() - optimizer = SGD(model.parameters(), lr=1e-3) - criterion = lambda x: x.mean() - data = data_gen_fn() + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) - data = { - k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items() - } + assert isinstance(model.module, DDP) + assert isinstance(optimizer, OptimizerWrapper) - model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + output = model(**data) + output = output_transform_fn(output) + output_key = list(output.keys())[0] + loss = criterion(output[output_key]) - assert isinstance(model.module, DDP) - assert isinstance(optimizer, OptimizerWrapper) + booster.backward(loss, optimizer) + optimizer.clip_grad_by_norm(1.0) + optimizer.step() - output = model(**data) - output = output_transform_fn(output) - output_key = list(output.keys())[0] - loss = criterion(output[output_key]) - booster.backward(loss, optimizer) - optimizer.clip_grad_by_norm(1.0) - optimizer.step() +def check_torch_ddp_plugin(): + for name, (model_fn, data_gen_fn, output_transform_fn, _) in model_zoo.items(): + if name == 'dlrm_interactionarch': + continue + run_fn(model_fn, data_gen_fn, output_transform_fn) + torch.cuda.empty_cache() def check_dataloader_sharding(): diff --git a/tests/test_zero/test_gemini/test_fwd_bwd.py b/tests/test_zero/test_gemini/test_fwd_bwd.py index 697595bc3352..f2cbb7fb77d6 100644 --- a/tests/test_zero/test_gemini/test_fwd_bwd.py +++ b/tests/test_zero/test_gemini/test_fwd_bwd.py @@ -12,7 +12,7 @@ from colossalai.zero import ColoInitContext, ZeroDDP, ZeroOptimizer from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration from colossalai.zero.gemini.gemini_mgr import GeminiManager -from tests.components_to_test import run_fwd_bwd +from tests.components_to_test import run_fwd, run_fwd_bwd from tests.components_to_test.registry import non_distributed_component_funcs from tests.test_tensor.common_utils import set_seed @@ -89,10 +89,65 @@ def exam_gpt_fwd_bwd( check_grad(model, torch_model) +@parameterize('placement_policy', ['cuda', 'cpu']) +@parameterize('keep_gather', [False, True]) +@parameterize('model_name', ['gpt2', 'bert', 'albert']) +@parameterize('scatter_after_inference', [False, True]) +def exam_gpt_inference( + placement_policy, + keep_gather, + model_name: str, + scatter_after_inference: bool = False, +): + init_device = get_current_device() + get_components_func = non_distributed_component_funcs.get_callable(model_name) + model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func() + + set_seed(42) + with ColoInitContext(device=init_device): + model = model_builder() + + set_seed(42) + torch_model = model_builder().cuda() + for torch_p, p in zip(torch_model.parameters(), model.parameters()): + torch_p.data.copy_(p.data) + + world_size = torch.distributed.get_world_size() + config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100) + config_dict[world_size]['chunk_size'] = 5000 + config_dict[world_size]['keep_gathered'] = keep_gather + chunk_manager = ChunkManager(config_dict) + gemini_manager 
= GeminiManager(placement_policy, chunk_manager) + model = ZeroDDP(model, gemini_manager, pin_memory=True, scatter_after_inference=scatter_after_inference) + + pg = ProcessGroup() + amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False, loss_scale=1) + torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3) + torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config) + torch_model = DDP(torch_model, device_ids=[pg.rank()], process_group=pg.dp_process_group()) + + set_seed(pg.dp_local_rank()) + model.eval() + torch_model.eval() + for i, (input_ids, label) in enumerate(train_dataloader): + # you can only test a single fwd + bwd. + # after bwd param is grad for Gemini, due to the chunk reuse optimization. + if i > 0: + break + with torch.no_grad(): + input_ids, label = input_ids.cuda(), label.cuda() + + torch_loss = run_fwd(torch_model, input_ids, label, criterion) + loss = run_fwd(model, input_ids, label, criterion) + + assert torch.equal(torch_loss, loss) + + def run_dist(rank, world_size, port): config = {} colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') exam_gpt_fwd_bwd() + exam_gpt_inference() @pytest.mark.dist
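The `run_fwd`/`run_fwd_bwd` split in `tests/components_to_test/utils/executor.py` lets forward-only checks such as `exam_gpt_inference` reuse the forward path without building a backward graph, while training checks keep the combined helper. Below is a minimal usage sketch, not part of the patch: the `nn.Linear` model, dummy data and `nn.MSELoss` criterion are illustrative stand-ins; only the import path and helper signatures come from the hunks above, and it is assumed that `run_fwd` applies `criterion(model(data), label)` when a criterion is given and that `run_fwd_bwd` falls back to plain `loss.backward()` when no optimizer is passed (neither branch is visible in the hunks).

```python
import torch
import torch.nn as nn

from tests.components_to_test import run_fwd, run_fwd_bwd

# illustrative stand-ins for a registered component model and its data
model = nn.Linear(8, 1).cuda()
data = torch.randn(4, 8, device='cuda')
label = torch.randn(4, 1, device='cuda')
criterion = nn.MSELoss()

# forward only, as used by the new inference comparison (exam_gpt_inference)
with torch.no_grad():
    eval_loss = run_fwd(model, data, label, criterion)

# forward + backward; without an optimizer the helper is assumed to call loss.backward()
train_loss = run_fwd_bwd(model, data, label, criterion)
```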
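The new `test_low_level_zero_plugin.py` and the refactored Gemini/DDP tests all drive a model through the same boost-and-step sequence inside a per-plugin `run_fn`. The sketch below shows that sequence outside the model zoo; the `ToyNet` module, dummy data and single spawned rank are illustrative assumptions, while the `Booster`, `LowLevelZeroPlugin`, `HybridAdam`, `booster.boost`, `booster.backward` and `spawn` calls mirror the hunks above.

```python
import torch
import torch.nn as nn

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import spawn


class ToyNet(nn.Module):
    """Illustrative stand-in for a model-zoo entry."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 8)

    def forward(self, x):
        return self.fc(x)


def run_dist(rank, world_size, port):
    # same launch call as the test's run_dist helpers
    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')

    plugin = LowLevelZeroPlugin(stage=2, max_norm=1.0, initial_scale=2**5)
    booster = Booster(plugin=plugin)

    model = ToyNet()
    optimizer = HybridAdam(model.parameters(), lr=1e-3)
    criterion = lambda out: out.mean()
    data = {'x': torch.rand(4, 8).cuda()}

    # boost wraps the model, optimizer and criterion for the chosen plugin
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

    output = model(**data)
    loss = criterion(output)
    booster.backward(loss, optimizer)
    optimizer.step()


if __name__ == '__main__':
    spawn(run_dist, 1)
```

In the tests, the per-plugin `run_fn` helpers wrap exactly this sequence in a try/except and return the error string, so the model-zoo sweep can keep going and report all failures at the end instead of aborting on the first one.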