diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index f40f4cc86d1b..3fad7e36f14c 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -43,10 +43,18 @@ I will provide the details of each workflow below.
 
 | Workflow Name          | File name                  | Description                                                                                                                                         |
 | ---------------------- | -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Build on PR`          | `build_on_pr.yml`          | This workflow is triggered when a PR changes essential files. It will run all the unit tests in the repository with 4 GPUs.                         |
+| `Build on PR`          | `build_on_pr.yml`          | This workflow is triggered when a PR changes essential files, or when a branch is created or deleted. It will run all the unit tests in the repository with 4 GPUs. |
 | `Build on Schedule`    | `build_on_schedule.yml`    | This workflow will run the unit tests everyday with 8 GPUs. The result is sent to Lark.                                                             |
 | `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results when `Build` is done.                                                             |
 
+To reduce the average time of the unit tests on PRs, the `Build on PR` workflow manages a testmon cache:
+
+1. When creating a new branch, it copies `cache/main/.testmondata*` to `cache/<branch>/`.
+2. When creating a new PR or changing the base branch of a PR, it copies `cache/<base_branch>/.testmondata*` to `cache/_pull/<pr_number>/`.
+3. When running unit tests for each PR, it restores the testmon cache from `cache/_pull/<pr_number>/`. After the test, it stores the cache back to `cache/_pull/<pr_number>/`.
+4. When a PR is closed, if it's merged, it copies `cache/_pull/<pr_number>/.testmondata*` to `cache/<base_branch>/`. Otherwise, it just removes `cache/_pull/<pr_number>/`.
+5. When a branch is deleted, it removes `cache/<branch>/`.
+
 ### Example Test
 
 | Workflow Name | File name | Description |
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index a5a17d176c9d..8b2253e57cfb 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -2,7 +2,7 @@ name: Build on PR
 on:
   pull_request:
-    types: [synchronize, opened, reopened]
+    types: [synchronize, opened, reopened, ready_for_review, closed, edited]
     branches:
       - "main"
       - "develop"
@@ -18,11 +18,63 @@ on:
       - "!tests/**.md" # ignore doc change
       - "pytest.ini" # test config change
       - "setup.py" # install command change
+  create:
+  delete:
 
 jobs:
+  prepare_cache:
+    name: Prepare testmon cache
+    if: |
+      github.event_name == 'create' &&
+      github.event.ref_type == 'branch' &&
+      github.event.repository.full_name == 'hpcaitech/ColossalAI'
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --rm
+    timeout-minutes: 5
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Copy testmon cache
+        run: | # branch name may contain slash, we need to replace it with space
+          export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
+          if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
+            [ ! 
-z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ] && cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}" + fi + env: + MAIN_BRANCH: ${{ github.event.master_branch }} + + prepare_cache_for_pr: + name: Prepare testmon cache for PR + if: | + github.event_name == 'pull_request' && + (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --rm + timeout-minutes: 5 + defaults: + run: + shell: bash + steps: + - name: Copy testmon cache + run: | # branch name may contain slash, we need to replace it with space + export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") + if [ -d "/github/home/testmon_cache/${BASE}" ]; then + [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER} + fi + env: + PR_NUMBER: ${{ github.event.number }} + detect: name: Detect file change if: | + github.event_name == 'pull_request' && + (github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'ready_for_review') && github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' outputs: @@ -135,9 +187,11 @@ jobs: - name: Restore Testmon Cache run: | - if [ -d /github/home/testmon_cache ]; then - [ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata* /__w/ColossalAI/ColossalAI/ + if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then + [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/ fi + env: + PR_NUMBER: ${{ github.event.number }} - name: Execute Unit Testing run: | @@ -149,8 +203,10 @@ jobs: - name: Store Testmon Cache run: | - [ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache - cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/ + mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} + cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/ + env: + PR_NUMBER: ${{ github.event.number }} - name: Collate artifact env: @@ -188,3 +244,54 @@ jobs: with: name: report path: report/ + + store_cache: + name: Store testmon cache for PR + if: | + github.event_name == 'pull_request' && + github.event.action == 'closed' && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --rm + timeout-minutes: 5 + defaults: + run: + shell: bash + steps: + - name: Store testmon cache if possible + if: github.event.pull_request.merged == true + run: | # branch name may contain slash, we need to replace it with space + export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") + if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then + [ ! 
-z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/" + fi + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + + - name: Remove testmon cache + run: | + rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + + remove_cache: + name: Remove testmon cache + if: | + github.event_name == 'delete' && + github.event.ref_type == 'branch' && + github.event.repository.full_name == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --rm + timeout-minutes: 5 + defaults: + run: + shell: bash + steps: + - name: Remove testmon cache + run: | # branch name may contain slash, we need to replace it with space + export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /") + rm -rf "/github/home/testmon_cache/${BASE}" diff --git a/.github/workflows/release_docker_after_publish.yml b/.github/workflows/release_docker_after_publish.yml index 22698ca192ed..6c8df9730b0d 100644 --- a/.github/workflows/release_docker_after_publish.yml +++ b/.github/workflows/release_docker_after_publish.yml @@ -23,8 +23,11 @@ jobs: run: | version=$(cat version.txt) tag=hpcaitech/colossalai:$version + latest=hpcaitech/colossalai:latest docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 --build-arg VERSION=v${version} -t $tag ./docker + docker tag $tag $latest echo "tag=${tag}" >> $GITHUB_OUTPUT + echo "latest=${latest}" >> $GITHUB_OUTPUT - name: Log in to Docker Hub uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 @@ -36,6 +39,7 @@ jobs: id: docker-push run: | docker push ${{ steps.build.outputs.tag }} + docker push ${{ steps.build.outputs.latest }} notify: name: Notify Lark via webhook diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index 9d9d3a007851..129bf7ed3270 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -20,7 +20,7 @@ jobs: runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat + options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb timeout-minutes: 30 defaults: run: diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py new file mode 100644 index 000000000000..9e8f36cefc4f --- /dev/null +++ b/applications/Chat/benchmarks/ray/1mmt_dummy.py @@ -0,0 +1,178 @@ +import argparse +import os +import socket +from functools import partial + +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_receivers_per_sender, + get_reward_model_from_args, + get_strategy_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig, AutoTokenizer +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, 
socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_maker = { + 'local_rank': '0', + 'rank': '0', + 'world_size': '1', + 'master_port': maker_port, + 'master_addr': master_addr + } + + # configure tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.pretrain) + tokenizer.pad_token = tokenizer.eos_token + + def model_fn(): + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain) + actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.critic_model, + config=critic_cfg).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_ref = ExperienceMakerHolder.options(name="maker0", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + kl_coef=0.1, + debug=args.debug, + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda() + critic = get_critic_from_args(args.critic_model, + config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=[ + f'maker{x}' for x in get_receivers_per_sender(i, args.num_trainers, 1, allow_idle_sender=True) + ], + strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + ) for i, env_info_trainer in enumerate(env_info_trainers) + ] + + dataset_size = args.experience_batch_size * 4 + + def data_gen_fn(): + input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device()) + attn_mask = torch.ones_like(input_ids) + return {'input_ids': input_ids, 'attention_mask': attn_mask} + + def build_dataloader(size): + dataset = [data_gen_fn() for _ in 
range(size)] + dataloader = DataLoader(dataset, batch_size=args.experience_batch_size) + return dataloader + + # uncomment this function if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + wait_tasks.append( + experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size), + num_steps=args.experience_steps)) + + total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size) + for trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py new file mode 100644 index 000000000000..46a0062893b8 --- /dev/null +++ b/applications/Chat/benchmarks/ray/mmmt_dummy.py @@ -0,0 +1,189 @@ +import argparse +import os +import socket +from functools import partial + +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_receivers_per_sender, + get_reward_model_from_args, + get_strategy_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig, AutoTokenizer +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 
'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_makers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_makers), + 'master_port': maker_port, + 'master_addr': master_addr + } for rank in range(args.num_makers)] + + # configure tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.pretrain) + tokenizer.pad_token = tokenizer.eos_token + + def model_fn(): + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain) + actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.critic_model, + config=critic_cfg).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_refs = [ + ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[ + f'trainer{x}' + for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False) + ], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + kl_coef=0.1, + debug=args.debug, + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + for i, env_info_maker in enumerate(env_info_makers) + ] + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda() + critic = get_critic_from_args(args.critic_model, + config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=[ + f"maker{x}" + for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True) + ], + strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + ) + for i, env_info_trainer in enumerate(env_info_trainers) + ] + + dataset_size = args.experience_batch_size * 4 + + def data_gen_fn(): + input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device()) + attn_mask = torch.ones_like(input_ids) + return {'input_ids': input_ids, 'attention_mask': attn_mask} + + def build_dataloader(size): + dataset = [data_gen_fn() for _ in range(size)] + dataloader = 
DataLoader(dataset, batch_size=args.experience_batch_size) + return dataloader + + # uncomment this function if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + for experience_holder_ref in experience_holder_refs: + wait_tasks.append( + experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size), + num_steps=args.experience_steps)) + + total_steps = args.experience_batch_size * args.experience_steps * \ + args.num_makers // (args.num_trainers * args.train_batch_size) + for trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--num_makers', type=int, default=1) + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py index 0533a60dc532..2a9059e6901e 100644 --- a/applications/Chat/coati/models/lora.py +++ b/applications/Chat/coati/models/lora.py @@ -61,7 +61,13 @@ def T(w): if self.merge_weights and self.merged: # Make sure that the weights are not merged if self.r > 0: - self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling + if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"): + # FIXME(csric): temporary fix + self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features))) + self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r))) + self.reset_parameters() + else: + self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling self.merged = False def eval(self): diff --git a/applications/Chat/coati/quant/__init__.py b/applications/Chat/coati/quant/__init__.py new file mode 100644 index 000000000000..a65a78d07bb8 --- /dev/null +++ b/applications/Chat/coati/quant/__init__.py @@ -0,0 +1,7 @@ +from .llama_gptq import load_quant as llama_load_quant +from .utils import low_resource_init + +__all__ = [ + 'llama_load_quant', + 'low_resource_init', +] 
diff --git a/applications/Chat/coati/quant/llama_gptq/__init__.py b/applications/Chat/coati/quant/llama_gptq/__init__.py new file mode 100644 index 000000000000..51c8d6316290 --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/__init__.py @@ -0,0 +1,5 @@ +from .loader import load_quant + +__all__ = [ + 'load_quant', +] diff --git a/applications/Chat/coati/quant/llama_gptq/loader.py b/applications/Chat/coati/quant/llama_gptq/loader.py new file mode 100644 index 000000000000..5353dc8a2ea3 --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/loader.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from .model_utils import find_layers +from .quant import make_quant + + +def load_quant(model: nn.Module, checkpoint: str, wbits: int, groupsize: int): + model = model.eval() + layers = find_layers(model) + + # ignore lm head + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + + make_quant(model, layers, wbits, groupsize) + + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + + return model diff --git a/applications/Chat/coati/quant/llama_gptq/model_utils.py b/applications/Chat/coati/quant/llama_gptq/model_utils.py new file mode 100644 index 000000000000..62db171abb52 --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/model_utils.py @@ -0,0 +1,13 @@ +# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/modelutils.py + +import torch +import torch.nn as nn + + +def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): + if type(module) in layers: + return {name: module} + res = {} + for name1, child in module.named_children(): + res.update(find_layers(child, layers=layers, name=name + '.' 
+ name1 if name != '' else name1)) + return res diff --git a/applications/Chat/coati/quant/llama_gptq/quant.py b/applications/Chat/coati/quant/llama_gptq/quant.py new file mode 100644 index 000000000000..f7d5b7ce4bd8 --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/quant.py @@ -0,0 +1,283 @@ +# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/quant.py + +import math + +import numpy as np +import torch +import torch.nn as nn + + +def quantize(x, scale, zero, maxq): + q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) + return scale * (q - zero) + + +class Quantizer(nn.Module): + + def __init__(self, shape=1): + super(Quantizer, self).__init__() + self.register_buffer('maxq', torch.tensor(0)) + self.register_buffer('scale', torch.zeros(shape)) + self.register_buffer('zero', torch.zeros(shape)) + + def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8): + self.maxq = torch.tensor(2**bits - 1) + self.perchannel = perchannel + self.sym = sym + self.mse = mse + self.norm = norm + self.grid = grid + self.maxshrink = maxshrink + + def find_params(self, x, weight=False): + dev = x.device + self.maxq = self.maxq.to(dev) + + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + else: + if len(shape) == 4: + x = x.permute([1, 0, 2, 3]) + x = x.flatten(1) + if len(shape) == 3: + x = x.reshape((-1, shape[-1])).t() + if len(shape) == 2: + x = x.t() + else: + x = x.flatten().unsqueeze(0) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if self.mse: + best = torch.full([x.shape[0]], float('inf'), device=dev) + for i in range(int(self.maxshrink * self.grid)): + p = 1 - i / self.grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / self.maxq + zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero + q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q -= x + q.abs_() + q.pow_(self.norm) + err = torch.sum(q, 1) + tmp = err < best + if torch.any(tmp): + best[tmp] = err[tmp] + self.scale[tmp] = scale1[tmp] + self.zero[tmp] = zero1[tmp] + if not self.perchannel: + if weight: + tmp = shape[0] + else: + tmp = shape[1] if len(shape) != 3 else shape[2] + self.scale = self.scale.repeat(tmp) + self.zero = self.zero.repeat(tmp) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + return + if len(shape) == 4: + self.scale = self.scale.reshape((1, -1, 1, 1)) + self.zero = self.zero.reshape((1, -1, 1, 1)) + if len(shape) == 3: + self.scale = self.scale.reshape((1, 1, -1)) + self.zero = self.zero.reshape((1, 1, -1)) + if len(shape) == 2: + self.scale = self.scale.unsqueeze(0) + self.zero = self.zero.unsqueeze(0) + + def quantize(self, x): + if self.ready(): + return quantize(x, self.scale, self.zero, self.maxq) + return x + + def enabled(self): + return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) + + +try: + import quant_cuda +except: + print('CUDA extension not installed.') + +# Assumes layer is perfectly divisible into 256 
* 256 blocks + + +class QuantLinear(nn.Module): + + def __init__(self, bits, groupsize, infeatures, outfeatures): + super().__init__() + if bits not in [2, 3, 4, 8]: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + self.infeatures = infeatures + self.outfeatures = outfeatures + self.bits = bits + if groupsize != -1 and groupsize < 32 and groupsize != int(math.pow(2, int(math.log2(groupsize)))): + raise NotImplementedError("groupsize supports powers of 2 greater than 32. (e.g. : 32,64,128,etc)") + groupsize = groupsize if groupsize != -1 else infeatures + self.groupsize = groupsize + self.register_buffer( + 'qzeros', torch.zeros((math.ceil(infeatures / groupsize), outfeatures // 256 * (bits * 8)), + dtype=torch.int)) + self.register_buffer('scales', torch.zeros((math.ceil(infeatures / groupsize), outfeatures))) + self.register_buffer('bias', torch.zeros(outfeatures)) + self.register_buffer('qweight', torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int)) + self._initialized_quant_state = False + + def pack(self, linear, scales, zeros): + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone() + if linear.bias is not None: + self.bias = linear.bias.clone() + + intweight = [] + for idx in range(self.infeatures): + g_idx = idx // self.groupsize + intweight.append( + torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, + None]) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros((intweight.shape[0] // 256 * (self.bits * 8), intweight.shape[1]), dtype=np.uint32) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + elif self.bits == 3: + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i)) + i += 10 + qweight[row] |= intweight[i] << 30 + row += 1 + qweight[row] |= (intweight[i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 1) + i += 10 + qweight[row] |= intweight[i] << 31 + row += 1 + qweight[row] |= (intweight[i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 2) + i += 10 + row += 1 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (self.bits * 8)), dtype=np.uint32) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + elif self.bits == 3: + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i)) + i += 10 + qzeros[:, col] |= zeros[:, i] << 30 + col += 1 + qzeros[:, col] |= (zeros[:, i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1) + i += 10 + qzeros[:, col] |= zeros[:, i] << 31 + col += 1 + qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2) + i += 10 + col += 1 + else: + raise NotImplementedError("Only 2,3,4,8 bits are 
supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + intermediate_dtype = torch.float32 + + if not self._initialized_quant_state: + # Do we even have a bias? Check for at least one non-zero element. + if self.bias is not None and bool(torch.any(self.bias != 0)): + # Then make sure it's the right type. + self.bias.data = self.bias.data.to(intermediate_dtype) + else: + self.bias = None + + outshape = list(x.shape) + outshape[-1] = self.outfeatures + x = x.reshape(-1, x.shape[-1]) + if self.bias is None: + y = torch.zeros(x.shape[0], outshape[-1], dtype=intermediate_dtype, device=x.device) + else: + y = self.bias.clone().repeat(x.shape[0], 1) + + output_dtype = x.dtype + x = x.to(intermediate_dtype) + if self.bits == 2: + quant_cuda.vecquant2matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + elif self.bits == 3: + quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + elif self.bits == 4: + quant_cuda.vecquant4matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + elif self.bits == 8: + quant_cuda.vecquant8matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + y = y.to(output_dtype) + return y.reshape(outshape) + + +def make_quant(module, names, bits, groupsize, name=''): + if isinstance(module, QuantLinear): + return + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' + attr if name != '' else attr + if name1 in names: + setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features)) + for name1, child in module.named_children(): + make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) diff --git a/applications/Chat/coati/quant/utils.py b/applications/Chat/coati/quant/utils.py new file mode 100644 index 000000000000..01b8cff0add1 --- /dev/null +++ b/applications/Chat/coati/quant/utils.py @@ -0,0 +1,28 @@ +from contextlib import contextmanager + +import torch + + +def _noop(*args, **kwargs): + pass + + +@contextmanager +def low_resource_init(): + """This context manager disables weight initialization and sets the default float dtype to half. + """ + old_kaiming_uniform_ = torch.nn.init.kaiming_uniform_ + old_uniform_ = torch.nn.init.uniform_ + old_normal_ = torch.nn.init.normal_ + dtype = torch.get_default_dtype() + try: + torch.nn.init.kaiming_uniform_ = _noop + torch.nn.init.uniform_ = _noop + torch.nn.init.normal_ = _noop + torch.set_default_dtype(torch.half) + yield + finally: + torch.nn.init.kaiming_uniform_ = old_kaiming_uniform_ + torch.nn.init.uniform_ = old_uniform_ + torch.nn.init.normal_ = old_normal_ + torch.set_default_dtype(dtype) diff --git a/applications/Chat/coati/ray/README.md b/applications/Chat/coati/ray/README.md new file mode 100644 index 000000000000..228155a6855b --- /dev/null +++ b/applications/Chat/coati/ray/README.md @@ -0,0 +1,160 @@ +# Distributed PPO Training on Stage 3 + +## Detach Experience Makers and Trainers + +We can completely separate the trainers and makers. + +

+*(figure: detached experience makers and trainers; arrows (1), (2.1), (2.2) show the data flow described below)*
+
+
+- The experience maker performs inference, produces experience, and remotely delivers it to the trainer (1).
+- The trainer consumes experience to train models, and periodically transmits new model parameters to the maker (2.1, 2.2).
+- An experience buffer is used to overlap transmission and computation.
+
+In this manner, each node works continuously without model idle time, and different optimization strategies can be applied to inference and training to meet the needs of speed or storage. It is also helpful for scalability.
+
+`DetachedPPOTrainer` and `ExperienceMakerHolder` are Ray Actors (not to be confused with the `Actor` model), representing the Trainer and the Experience Maker in the diagram above, respectively.
+
+[More about Ray Core](https://docs.ray.io/en/latest/ray-core/walkthrough.html)
+
+## Usage
+
+See examples at `ColossalAI/applications/Chat/examples/ray`.
+
+### Setup Makers
+
+- define makers' environment variables:
+
+  ```python
+  env_info_makers = [{
+      'local_rank': '0',
+      'rank': str(rank),
+      'world_size': str(num_makers),
+      'master_port': maker_port,
+      'master_addr': master_addr
+  } for rank in range(num_makers)]
+  ```
+- define maker models:
+  ```python
+  def model_fn():
+      actor = get_actor_from_args(...)
+      critic = get_critic_from_args(...)
+      reward_model = get_reward_model_from_args(...)
+      initial_model = get_actor_from_args(...)
+      return actor, critic, reward_model, initial_model
+  ```
+- set experience_holder_refs:
+
+  ```python
+  experience_holder_refs = [
+      ExperienceMakerHolder.options(
+          name=f"maker{i}",
+          num_gpus=1,
+          max_concurrency=2
+      ).remote(
+          detached_trainer_name_list=[f"trainer{x}" for x in target_trainers(...)],
+          model_fn=model_fn,
+          ...)
+      for i, env_info_maker in enumerate(env_info_makers)
+  ]
+  ```
+  The names in `detached_trainer_name_list` refer to the target trainers that the maker should send experience to.
+  A trainer's name is set in the same way, by `.options(name=...)`; see below.
+
+### Setup Trainers
+
+- define trainers' environment variables:
+  ```python
+  env_info_trainers = [{
+      'local_rank': '0',
+      'rank': str(rank),
+      'world_size': str(num_trainers),
+      'master_port': trainer_port,
+      'master_addr': master_addr
+  } for rank in range(num_trainers)]
+  ```
+- define trainer models:
+
+  ```python
+  def trainer_model_fn():
+      actor = get_actor_from_args(...)
+      critic = get_critic_from_args(...)
+      return actor, critic
+  ```
+- set trainer_refs:
+  ```python
+  trainer_refs = [
+      DetachedPPOTrainer.options(
+          name=f"trainer{i}",
+          num_gpus=1,
+          max_concurrency=2
+      ).remote(
+          experience_maker_holder_name_list=[f"maker{x}" for x in target_makers(...)],
+          model_fn=trainer_model_fn,
+          ...)
+      for i, env_info_trainer in enumerate(env_info_trainers)
+  ]
+  ```
+  The names in `experience_maker_holder_name_list` refer to the target makers that the trainer should send updated models to.
+  By setting `detached_trainer_name_list` and `experience_maker_holder_name_list`, we can customize the transmission graph, as shown in the sketch below.
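+As a concrete reference, here is a minimal sketch of how the benchmark scripts in this PR derive both name lists with the `get_receivers_per_sender` helper from `coati.ray.utils` (the sender indices `i` and `j` and the counts `num_makers`/`num_trainers` are assumed to be defined by the caller):
+
+```python
+from coati.ray.utils import get_receivers_per_sender
+
+# maker i pushes experience to these trainers
+detached_trainer_name_list = [
+    f'trainer{x}'
+    for x in get_receivers_per_sender(i, num_makers, num_trainers, allow_idle_sender=False)
+]
+# trainer j sends updated model weights back to these makers
+experience_maker_holder_name_list = [
+    f'maker{x}'
+    for x in get_receivers_per_sender(j, num_trainers, num_makers, allow_idle_sender=True)
+]
+```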
+
+### Launch Jobs
+- define data_loader:
+  ```python
+  def data_loader_fn():
+      return torch.utils.data.DataLoader(dataset=dataset)
+  ```
+- launch makers:
+  ```python
+  wait_tasks = []
+  for experience_holder_ref in experience_holder_refs:
+      wait_tasks.append(
+          experience_holder_ref.workingloop.remote(data_loader_fn,
+                                                   num_steps=experience_steps))
+  ```
+
+- launch trainers:
+  ```python
+  for trainer_ref in trainer_refs:
+      wait_tasks.append(trainer_ref.fit.remote(total_steps, update_steps, train_epochs))
+  ```
+
+- wait for completion:
+  ```python
+  ray.get(wait_tasks)
+  ```
+
+## Flexible Structure
+
+We can deploy different strategies to makers and trainers. Here are some examples.
+
+### 2 Makers 1 Trainer
+*(diagram omitted)*
+
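+A rough sketch of this topology (an illustration only; it assumes the assignment behavior of `get_receivers_per_sender` from `coati.ray.utils`):
+
+```python
+from coati.ray.utils import get_receivers_per_sender
+
+num_makers, num_trainers = 2, 1
+for i in range(num_makers):
+    # trainer indices that maker i should deliver experience to;
+    # with a single trainer, both makers are expected to target trainer0
+    targets = get_receivers_per_sender(i, num_makers, num_trainers, allow_idle_sender=False)
+    print(f"maker{i} -> {[f'trainer{x}' for x in targets]}")
+```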
+
+### 2 Makers 2 Trainer
+*(diagram omitted)*
+
+
+### Maker Inference Quantization
+*(diagram omitted)*
+
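+A sketch of how the benchmark scripts in this PR build a GPTQ-quantized initial model for the maker (the config and checkpoint paths are placeholders; 4-bit weights with group size 128):
+
+```python
+from coati.quant import llama_load_quant, low_resource_init
+from coati.ray.utils import get_actor_from_args
+from transformers import AutoConfig
+from transformers.modeling_utils import no_init_weights
+
+actor_cfg = AutoConfig.from_pretrained('<llama-pretrain-path>')
+# skip weight initialization and default to fp16 while building the skeleton
+with low_resource_init(), no_init_weights():
+    initial_model = get_actor_from_args('llama', config=actor_cfg)
+# replace the Linear layers with QuantLinear and load the 4-bit checkpoint
+initial_model.model = llama_load_quant(initial_model.model, '<quant-ckpt-path>',
+                                       4, 128).cuda().requires_grad_(False)
+```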
+
+### Tensor Parallel
+*(diagram omitted)*
+
+ +## TODO + +- [ ] Support LoRA +- [ ] Support TP & PP diff --git a/applications/Chat/coati/ray/__init__.py b/applications/Chat/coati/ray/__init__.py index 5802c05bc03f..e69de29bb2d1 100644 --- a/applications/Chat/coati/ray/__init__.py +++ b/applications/Chat/coati/ray/__init__.py @@ -1,2 +0,0 @@ -from .src.detached_replay_buffer import DetachedReplayBuffer -from .src.detached_trainer_ppo import DetachedPPOTrainer diff --git a/applications/Chat/coati/ray/callbacks/__init__.py b/applications/Chat/coati/ray/callbacks/__init__.py new file mode 100644 index 000000000000..5f5e488f383e --- /dev/null +++ b/applications/Chat/coati/ray/callbacks/__init__.py @@ -0,0 +1,9 @@ +from .base import MakerCallback, TrainerCallback +from .performance_evaluator import ExperienceMakerPerformanceEvaluator, TrainerPerformanceEvaluator + +__all__ = [ + "TrainerCallback", + "MakerCallback", + "ExperienceMakerPerformanceEvaluator", + "TrainerPerformanceEvaluator", +] diff --git a/applications/Chat/coati/ray/callbacks/base.py b/applications/Chat/coati/ray/callbacks/base.py new file mode 100644 index 000000000000..3306150a41ff --- /dev/null +++ b/applications/Chat/coati/ray/callbacks/base.py @@ -0,0 +1,66 @@ +from abc import ABC + +from coati.experience_maker import Experience + + +class TrainerCallback(ABC): + """ + Base callback class. It defines the interface for callbacks. + """ + + def on_fit_start(self) -> None: + pass + + def on_fit_end(self) -> None: + pass + + def on_episode_start(self, episode: int) -> None: + pass + + def on_episode_end(self, episode: int) -> None: + pass + + def on_epoch_start(self, epoch: int) -> None: + pass + + def on_epoch_end(self, epoch: int) -> None: + pass + + def on_batch_start(self) -> None: + pass + + def on_batch_end(self, metrics: dict, experience: Experience) -> None: + pass + + def on_update_start(self) -> None: + pass + + def on_update_end(self) -> None: + pass + + +class MakerCallback(ABC): + + def on_loop_start(self) -> None: + pass + + def on_loop_end(self) -> None: + pass + + def on_make_experience_start(self) -> None: + pass + + def on_make_experience_end(self, experience: Experience) -> None: + pass + + def on_send_start(self) -> None: + pass + + def on_send_end(self) -> None: + pass + + def on_batch_start(self) -> None: + pass + + def on_batch_end(self) -> None: + pass diff --git a/applications/Chat/coati/ray/callbacks/performance_evaluator.py b/applications/Chat/coati/ray/callbacks/performance_evaluator.py new file mode 100644 index 000000000000..cd3517609e7a --- /dev/null +++ b/applications/Chat/coati/ray/callbacks/performance_evaluator.py @@ -0,0 +1,212 @@ +from time import time +from typing import Optional + +import torch +import torch.distributed as dist +from coati.experience_maker import Experience + +from .base import MakerCallback, TrainerCallback + + +def get_world_size() -> int: + if dist.is_initialized(): + return dist.get_world_size() + return 1 + + +def print_rank_0(*args, **kwargs) -> None: + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +@torch.no_grad() +def all_reduce_mean(x: float, world_size: int) -> float: + if world_size == 1: + return x + tensor = torch.tensor([x], device=torch.cuda.current_device()) + dist.all_reduce(tensor) + tensor = tensor / world_size + return tensor.item() + + +class Timer: + + def __init__(self) -> None: + self.start_time: Optional[float] = None + self.duration: float = 0. 
+ + def start(self) -> None: + self.start_time = time() + + def end(self) -> None: + self.duration += time() - self.start_time + + def reset(self) -> None: + self.duration = 0. + + +class ExperienceMakerPerformanceEvaluator(MakerCallback): + + def __init__(self, actor_num_params: int, critic_num_params: int, initial_model_num_params: int, + reward_model_num_params: int) -> None: + super().__init__() + self.world_size = get_world_size() + self.actor_num_params = actor_num_params + self.critic_num_params = critic_num_params + self.initial_model_num_params = initial_model_num_params + self.reward_model_num_params = reward_model_num_params + + self.batch_timer = Timer() + self.send_timer = Timer() + self.make_experience_timer = Timer() + self.total_samples: int = 0 + self.make_experience_flop: int = 0 + + print_rank_0( + f'ExperienceMaker actor: {actor_num_params/1024**3:.2f}B, critic: {critic_num_params/1024**3:.2f}B, initial model: {initial_model_num_params/1024**3:.2f}B, reward model: {reward_model_num_params/1024**3:.2f}B, world size: {self.world_size}' + ) + + def on_make_experience_start(self) -> None: + self.make_experience_timer.start() + + def on_make_experience_end(self, experience: Experience) -> None: + self.make_experience_timer.end() + + batch_size, seq_len = experience.sequences.shape + + self.total_samples += batch_size + + # actor generate + num_actions = experience.action_mask.size(1) + input_len = seq_len - num_actions + total_seq_len = (input_len + seq_len - 1) * num_actions / 2 + self.make_experience_flop += self.actor_num_params * batch_size * total_seq_len * 2 + # actor forward + self.make_experience_flop += self.actor_num_params * batch_size * seq_len * 2 + # critic forward + self.make_experience_flop += self.critic_num_params * batch_size * seq_len * 2 + # initial model forward + self.make_experience_flop += self.initial_model_num_params * batch_size * seq_len * 2 + # reward model forward + self.make_experience_flop += self.reward_model_num_params * batch_size * seq_len * 2 + + def on_send_start(self) -> None: + self.send_timer.start() + + def on_send_end(self) -> None: + self.send_timer.end() + + def on_batch_start(self) -> None: + self.batch_timer.start() + + def on_batch_end(self) -> None: + self.batch_timer.end() + + def on_loop_end(self) -> None: + avg_make_experience_duration = all_reduce_mean(self.make_experience_timer.duration, self.world_size) + avg_overall_duration = all_reduce_mean(self.batch_timer.duration, self.world_size) + avg_send_duration = all_reduce_mean(self.send_timer.duration, self.world_size) + + avg_throughput = self.total_samples * self.world_size / (avg_overall_duration + 1e-12) + avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12) + avg_time_per_sample = (avg_overall_duration + 1e-12) / (self.total_samples * self.world_size) + avg_make_experience_time_per_sample = (avg_make_experience_duration + 1e-12) / \ + (self.total_samples * self.world_size) + avg_send_time_per_sample = (avg_send_duration + 1e-12) / (self.total_samples * self.world_size) + + print_rank_0( + 'Making Experience Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' + + f'TFLOPS per GPU: {avg_make_experience_tflops:.3f}\n' + + f'Sample time (overall): {avg_time_per_sample:.3f} s\n' + + f'Sample time (make experience): {avg_make_experience_time_per_sample:.3f} s, {avg_make_experience_time_per_sample/avg_time_per_sample*100:.2f}%\n' + + + f'Sample time (send): {avg_send_time_per_sample:.3f} s, 
{avg_send_time_per_sample/avg_time_per_sample*100:.2f}%\n' + ) + + +class TrainerPerformanceEvaluator(TrainerCallback): + + def __init__(self, + actor_num_params: int, + critic_num_params: int, + enable_grad_checkpoint: bool = False, + ignore_first_episodes: int = 1) -> None: + super().__init__() + self.world_size = get_world_size() + self.actor_num_params = actor_num_params + self.critic_num_params = critic_num_params + self.enable_grad_checkpoint = enable_grad_checkpoint + self.ignore_first_episodes = ignore_first_episodes + self.ignore_this_episode = False + + self.episode_timer = Timer() + self.batch_timer = Timer() + self.update_timer = Timer() + self.total_samples: int = 0 + self.learn_flop: int = 0 + + print_rank_0( + f'Trainer actor: {self.actor_num_params/1024**3:.2f}B, critic: {self.critic_num_params/1024**3:.2f}B, world size: {self.world_size}' + ) + + def on_episode_start(self, episodes: int) -> None: + self.ignore_this_episode = episodes < self.ignore_first_episodes + if self.ignore_this_episode: + return + self.episode_timer.start() + + def on_episode_end(self, episodes: int) -> None: + if self.ignore_this_episode: + return + self.episode_timer.end() + + def on_batch_start(self) -> None: + if self.ignore_this_episode: + return + self.batch_timer.start() + + def on_batch_end(self, metrics: dict, experience: Experience) -> None: + if self.ignore_this_episode: + return + self.batch_timer.end() + + batch_size, seq_len = experience.sequences.shape + + self.total_samples += batch_size + + # actor forward-backward, 3 means forward(1) + backward(2) + self.learn_flop += self.actor_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint)) + # critic forward-backward + self.learn_flop += self.critic_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint)) + + def on_update_start(self) -> None: + if self.ignore_this_episode: + return + self.update_timer.start() + + def on_update_end(self) -> None: + if self.ignore_this_episode: + return + self.update_timer.end() + + def on_fit_end(self) -> None: + if self.total_samples == 0: + print_rank_0('No samples are collected, skip trainer performance evaluation') + return + avg_train_duration = all_reduce_mean(self.batch_timer.duration, self.world_size) + avg_update_duration = all_reduce_mean(self.update_timer.duration, self.world_size) + avg_episode_duration = all_reduce_mean(self.episode_timer.duration, self.world_size) + + avg_throughput = self.total_samples * self.world_size / (avg_episode_duration + 1e-12) + avg_learn_tflops = self.learn_flop / 1e12 / (avg_train_duration + 1e-12) + avg_time_per_sample = (avg_episode_duration + 1e-12) / (self.total_samples * self.world_size) + avg_train_time_per_sample = (avg_train_duration + 1e-12) / (self.total_samples * self.world_size) + avg_update_time_per_sample = (avg_update_duration + 1e-12) / (self.total_samples * self.world_size) + + print_rank_0( + 'Learning Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' + + f'TFLOPS per GPU: {avg_learn_tflops:.3f}\n' + f'Sample time (overall): {avg_time_per_sample:.3f} s\n' + + f'Sample time (train): {avg_train_time_per_sample:.3f} s, {avg_train_time_per_sample/avg_time_per_sample*100:.2f}%\n' + + + f'Sample time (update): {avg_update_time_per_sample:.3f} s, {avg_update_time_per_sample/avg_time_per_sample*100:.2f}%\n' + ) diff --git a/applications/Chat/coati/ray/src/detached_replay_buffer.py b/applications/Chat/coati/ray/detached_replay_buffer.py similarity index 62% rename from 
applications/Chat/coati/ray/src/detached_replay_buffer.py rename to applications/Chat/coati/ray/detached_replay_buffer.py index 18c8db388e88..2f765281178a 100644 --- a/applications/Chat/coati/ray/src/detached_replay_buffer.py +++ b/applications/Chat/coati/ray/detached_replay_buffer.py @@ -1,22 +1,24 @@ -import torch +import asyncio +import copy import random -from typing import List, Any -# from torch.multiprocessing import Queue -from ray.util.queue import Queue +from threading import Lock +from typing import Any, List + import ray -import asyncio +import torch from coati.experience_maker.base import Experience -from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch from coati.replay_buffer import ReplayBuffer -from threading import Lock -import copy +from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch +# from torch.multiprocessing import Queue +from ray.util.queue import Queue + class DetachedReplayBuffer: ''' - Detached replay buffer. Share Experience across workers on the same node. - Therefore a trainer node is expected to have only one instance. + Detached replay buffer. Share Experience across workers on the same node. + Therefore a trainer node is expected to have only one instance. It is ExperienceMakerHolder's duty to call append(exp) method, remotely. - + Args: sample_batch_size: Batch size when sampling. Exp won't enqueue until they formed a batch. tp_world_size: Number of workers in the same tp group @@ -24,31 +26,25 @@ class DetachedReplayBuffer: cpu_offload: Whether to offload experience to cpu when sampling. Defaults to True. ''' - def __init__(self, sample_batch_size: int, tp_world_size: int = 1, limit : int = 0, cpu_offload: bool = True) -> None: - self.cpu_offload = cpu_offload + def __init__(self, sample_batch_size: int, limit: int = 0) -> None: self.sample_batch_size = sample_batch_size self.limit = limit - self.items = Queue(self.limit, actor_options={"num_cpus":1}) - self.batch_collector : List[BufferItem] = [] + self.items = Queue(self.limit, actor_options={"num_cpus": 1}) + self.batch_collector: List[BufferItem] = [] + @torch.no_grad() + def append(self, experience: Experience) -> None: ''' - Workers in the same tp group share this buffer and need same sample for one step. - Therefore a held_sample should be returned tp_world_size times before it could be dropped. - worker_state records whether a worker got the held_sample + Expected to be called remotely. ''' - self.tp_world_size = tp_world_size - self.worker_state = [False] * self.tp_world_size - self.held_sample = None - self._worker_state_lock = Lock() + items = split_experience_batch(experience) + self.extend(items) @torch.no_grad() - def append(self, experience: Experience) -> None: + def extend(self, items: List[BufferItem]) -> None: ''' Expected to be called remotely. 
''' - if self.cpu_offload: - experience.to_device(torch.device('cpu')) - items = split_experience_batch(experience) self.batch_collector.extend(items) while len(self.batch_collector) >= self.sample_batch_size: items = self.batch_collector[:self.sample_batch_size] @@ -62,19 +58,10 @@ def clear(self) -> None: self.items = Queue(self.limit) self.worker_state = [False] * self.tp_world_size self.batch_collector = [] - + @torch.no_grad() - def sample(self, worker_rank = 0, to_device = "cpu") -> Experience: - self._worker_state_lock.acquire() - if not any(self.worker_state): - self.held_sample = self._sample_and_erase() - self.worker_state[worker_rank] = True - if all(self.worker_state): - self.worker_state = [False] * self.tp_world_size - ret = self.held_sample - else: - ret = copy.deepcopy(self.held_sample) - self._worker_state_lock.release() + def sample(self, worker_rank=0, to_device="cpu") -> Experience: + ret = self._sample_and_erase() ret.to_device(to_device) return ret @@ -85,4 +72,4 @@ def _sample_and_erase(self) -> Experience: def get_length(self) -> int: ret = self.items.qsize() - return ret \ No newline at end of file + return ret diff --git a/applications/Chat/coati/ray/detached_trainer_base.py b/applications/Chat/coati/ray/detached_trainer_base.py new file mode 100644 index 000000000000..ac2d35e9da19 --- /dev/null +++ b/applications/Chat/coati/ray/detached_trainer_base.py @@ -0,0 +1,179 @@ +import os +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, Iterable, List, Optional, Union + +import ray +import torch +from coati.experience_maker import Experience +from coati.replay_buffer.utils import BufferItem +from torch.utils.data import DataLoader +from tqdm import tqdm + +from .callbacks import TrainerCallback +from .detached_replay_buffer import DetachedReplayBuffer +from .utils import is_rank_0 + + +class DetachedTrainer(ABC): + ''' + Base class for detached rlhf trainers. + 'detach' means that the experience maker is detached compared to a normal Trainer. + Please set name attribute during init: + >>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote() + So an ExperienceMakerHolder can reach the detached_replay_buffer by Actor's name. 
+ Args: + detached_strategy (DetachedStrategy): the strategy to use for training + detached_replay_buffer_ref (ObjectRef[DetachedReplayBuffer]): the replay buffer to use for training + data_loader_pin_memory (bool, defaults to True): whether to pin memory for data loader + callbacks (List[Callback], defaults to []): the callbacks to call during training process + generate_kwargs (dict, optional): the kwargs to use while model generating + + ''' + + def __init__(self, + experience_maker_holder_name_list: List[str], + train_batch_size: int = 8, + buffer_limit: int = 0, + dataloader_pin_memory: bool = True, + callbacks: List[TrainerCallback] = [], + debug: bool = False) -> None: + super().__init__() + self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit) + self.dataloader_pin_memory = dataloader_pin_memory + self.callbacks = callbacks + self.target_holder_name_list = experience_maker_holder_name_list + self.target_holder_list = [] + self._is_target_holder_initialized = False + self._debug = debug + + def update_target_holder_list(self): + # as the length of target_holder_list may be zero, we need to check it by a bool flag + if not self._is_target_holder_initialized: + for name in self.target_holder_name_list: + self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) + self._is_target_holder_initialized = True + + @abstractmethod + def _update_remote_makers(self, fully_update: bool = False, **kwargs): + pass + + def sync_models_to_remote_makers(self, **kwargs): + self._update_remote_makers(fully_update=True, **kwargs) + + @abstractmethod + def training_step(self, experience: Experience) -> Dict[str, Any]: + pass + + def _learn(self, update_steps: int, train_epochs: int) -> None: + data = [] + # warmup + pbar = tqdm(range(update_steps), desc=f'Train epoch [1/{train_epochs}]', disable=not is_rank_0()) + self._on_epoch_start(0) + self._learn_epoch(pbar, data) + self._on_epoch_end(0) + # item is already a batch + dataloader = DataLoader(data, + batch_size=1, + shuffle=True, + pin_memory=self.dataloader_pin_memory, + collate_fn=lambda x: x[0]) + for epoch in range(1, train_epochs): + pbar = tqdm(dataloader, desc=f'Train epoch [{epoch + 1}/{train_epochs}]', disable=not is_rank_0()) + self._on_epoch_start(epoch) + self._learn_epoch(pbar, data) + self._on_epoch_end(epoch) + + def _learn_epoch(self, pbar: tqdm, data: List[Experience]) -> None: + is_warmup = len(data) == 0 + for x in pbar: + if self._debug: + print("[trainer] training step") + # sample a batch and then train to avoid waiting + experience = x if not is_warmup else self._buffer_sample() + experience.to_device(torch.cuda.current_device()) + self._on_batch_start() + metrics = self.training_step(experience) + self._on_batch_end(metrics, experience) + + if self._debug: + print("[trainer] step over") + experience.to_device("cpu") + if is_warmup: + data.append(experience) + pbar.set_postfix(metrics) + + def fit(self, total_steps: int, update_steps: int, train_epochs: int = 1) -> None: + self._on_fit_start() + for i in tqdm(range(total_steps // update_steps), desc='Trainer', disable=not is_rank_0()): + self._on_episode_start(i) + self._learn(update_steps, train_epochs) + self._on_update_start() + self._update_remote_makers() + self._on_update_end() + self._on_episode_end(i) + self._on_fit_end() + + @ray.method(concurrency_group="buffer_length") + def buffer_get_length(self): + # called by ExperienceMakerHolder + if self._debug: + print("[trainer] telling length") + return 
self.detached_replay_buffer.get_length() + + @ray.method(concurrency_group="buffer_append") + def buffer_append(self, experience: Experience): + # called by ExperienceMakerHolder + if self._debug: + print(f"[trainer] receiving exp.") + self.detached_replay_buffer.append(experience) + + @ray.method(concurrency_group="buffer_append") + def buffer_extend(self, items: List[BufferItem]): + # called by ExperienceMakerHolder + if self._debug: + print(f"[trainer] receiving exp.") + self.detached_replay_buffer.extend(items) + + @ray.method(concurrency_group="buffer_sample") + def _buffer_sample(self): + return self.detached_replay_buffer.sample() + + def _on_fit_start(self) -> None: + for callback in self.callbacks: + callback.on_fit_start() + + def _on_fit_end(self) -> None: + for callback in self.callbacks: + callback.on_fit_end() + + def _on_episode_start(self, episode: int) -> None: + for callback in self.callbacks: + callback.on_episode_start(episode) + + def _on_episode_end(self, episode: int) -> None: + for callback in self.callbacks: + callback.on_episode_end(episode) + + def _on_epoch_start(self, epoch: int) -> None: + for callback in self.callbacks: + callback.on_epoch_start(epoch) + + def _on_epoch_end(self, epoch: int) -> None: + for callback in self.callbacks: + callback.on_epoch_end(epoch) + + def _on_batch_start(self) -> None: + for callback in self.callbacks: + callback.on_batch_start() + + def _on_batch_end(self, metrics: dict, experience: Experience) -> None: + for callback in self.callbacks: + callback.on_batch_end(metrics, experience) + + def _on_update_start(self) -> None: + for callback in self.callbacks: + callback.on_update_start() + + def _on_update_end(self) -> None: + for callback in self.callbacks: + callback.on_update_end() diff --git a/applications/Chat/coati/ray/src/detached_trainer_ppo.py b/applications/Chat/coati/ray/detached_trainer_ppo.py similarity index 55% rename from applications/Chat/coati/ray/src/detached_trainer_ppo.py rename to applications/Chat/coati/ray/detached_trainer_ppo.py index 838e82d07f4a..5f0032716f93 100644 --- a/applications/Chat/coati/ray/src/detached_trainer_ppo.py +++ b/applications/Chat/coati/ray/detached_trainer_ppo.py @@ -1,24 +1,38 @@ -from typing import Any, Callable, Dict, List, Optional -import torch -from torch.optim import Adam +from typing import Any, Callable, Dict, List, Optional, Tuple +import ray +import torch from coati.experience_maker import Experience, NaiveExperienceMaker from coati.models.base import Actor, Critic -from coati.models.generation_utils import update_model_kwargs_fn from coati.models.loss import PolicyLoss, ValueLoss -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy from coati.trainer.callbacks import Callback +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy +from torch.optim import Adam from colossalai.nn.optimizer import HybridAdam -import ray - - -from .utils import is_rank_0, get_cuda_actor_critic_from_args, get_strategy_from_args, set_dist_env +from .callbacks import TrainerCallback, TrainerPerformanceEvaluator from .detached_trainer_base import DetachedTrainer - - -@ray.remote(concurrency_groups={"buffer_length": 1, "buffer_append":1, "buffer_sample":1,"model_io": 1, "compute": 1}) +from .lora_constructor import LoRAConstructor +from .utils import ( + get_actor_from_args, + get_critic_from_args, + get_model_numel, + get_rank, + get_strategy_from_args, + is_rank_0, + set_dist_env, + state_dict_to, +) + + 
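+# Each @ray.method of this actor is pinned to one of the concurrency groups declared
+# below; a group of size 1 serializes the calls inside it, so buffer reads/writes,
+# model-weight I/O and training compute are each kept in order while still being
+# able to interleave with one another across groups.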
+@ray.remote(concurrency_groups={ + "buffer_length": 1, + "buffer_append": 1, + "buffer_sample": 1, + "model_io": 1, + "compute": 1 +}) class DetachedPPOTrainer(DetachedTrainer): ''' Detached Trainer for PPO algorithm @@ -40,86 +54,102 @@ class DetachedPPOTrainer(DetachedTrainer): generate_kwargs (dict, optional): the kwargs to use while model generating ''' - def __init__(self, - experience_maker_holder_name_list: List[str], - strategy: str, - model: str, - env_info: Dict[str, str] = None, - pretrained: str = None, - lora_rank: int = 0, - train_batch_size: int = 8, - buffer_limit: int = 0, - buffer_cpu_offload: bool = True, - eps_clip: float = 0.2, - value_clip: float = 0.4, - experience_batch_size: int = 8, - max_epochs: int = 10, - dataloader_pin_memory: bool = True, - callbacks: List[Callback] = [], - **generate_kwargs) -> None: + def __init__( + self, + experience_maker_holder_name_list: List[str], + strategy_fn: Callable[[], Strategy], + model_fn: Callable[[], Tuple[Actor, Critic]], + env_info: Dict[str, str] = None, + train_batch_size: int = 8, + buffer_limit: int = 0, + eps_clip: float = 0.2, + value_clip: float = 0.4, + dataloader_pin_memory: bool = True, + callbacks: List[TrainerCallback] = [], + eval_performance: bool = False, + debug: bool = False, + update_lora_weights: bool = False, + ) -> None: # set environment variables if env_info: set_dist_env(env_info=env_info) # configure strategy - self.strategy = get_strategy_from_args(strategy) + self.strategy = strategy_fn() # configure models, loss and optimizers with self.strategy.model_init_context(): - self.actor, self.critic = get_cuda_actor_critic_from_args(model, pretrained, lora_rank) + self.actor, self.critic = model_fn() - if strategy != 'colossalai_gemini': - self.actor.to(torch.float16).to(torch.cuda.current_device()) - self.critic.to(torch.float16).to(torch.cuda.current_device()) + if eval_performance: + actor_numel = get_model_numel(self.actor) + critic_numel = get_model_numel(self.critic) + evaluator = TrainerPerformanceEvaluator(actor_numel, critic_numel) + callbacks = callbacks + [evaluator] - if strategy.startswith('colossalai'): - self.actor_optim = HybridAdam(self.actor.parameters(), lr=5e-6) - self.critic_optim = HybridAdam(self.critic.parameters(), lr=5e-6) + if isinstance(self.strategy, ColossalAIStrategy): + self.actor_optim = HybridAdam(self.actor.parameters(), lr=1e-7) + self.critic_optim = HybridAdam(self.critic.parameters(), lr=1e-7) else: - self.actor_optim = Adam(self.actor.parameters(), lr=5e-6) - self.critic_optim = Adam(self.critic.parameters(), lr=5e-6) + self.actor_optim = Adam(self.actor.parameters(), lr=1e-7) + self.critic_optim = Adam(self.critic.parameters(), lr=1e-7) (self.actor, self.actor_optim), (self.critic, self.critic_optim) = \ self.strategy.prepare((self.actor, self.actor_optim), (self.critic, self.critic_optim)) - generate_kwargs = _set_default_generate_kwargs(self.strategy, generate_kwargs, self.actor) + # configure trainer self.actor_loss_fn = PolicyLoss(eps_clip) self.critic_loss_fn = ValueLoss(value_clip) super().__init__(experience_maker_holder_name_list, train_batch_size=train_batch_size, buffer_limit=buffer_limit, - buffer_cpu_offload=buffer_cpu_offload, - experience_batch_size=experience_batch_size, - max_epochs=max_epochs, dataloader_pin_memory=dataloader_pin_memory, callbacks=callbacks, - **generate_kwargs) + debug=debug) + if self._debug: + print(f'[trainer{get_rank()}] will send state dict to {experience_maker_holder_name_list}') + + self._update_lora_weights = 
update_lora_weights @ray.method(concurrency_group="model_io") - def _update_remote_makers(self): + @torch.no_grad() + def _update_remote_makers(self, fully_update: bool = False, **config): # TODO: balance duties - if is_rank_0(): - self.update_target_holder_list(self.target_holder_name_list) + if not fully_update: + config['requires_grad_only'] = True + self.update_target_holder_list() + # mark start, ensure order + tasks = [] + for target_holder in self.target_holder_list: + tasks.append(target_holder.update_experience_maker.remote(chunk_start=True, fully_update=fully_update)) + ray.get(tasks) + # sending loop + tasks = [] + + for state_dict_shard in self._get_model_state_dict_shard(self.actor, fully_update=fully_update, **config): for target_holder in self.target_holder_list: - # TODO: reduce malloc - with torch.no_grad(): - ray.get(target_holder.update_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic())) - - @ray.method(concurrency_group="model_io") - def initialize_remote_makers(self): - # TODO: balance duties - if is_rank_0(): - self.update_target_holder_list(self.target_holder_name_list) + tasks.append( + target_holder.update_experience_maker.remote( + new_actor_state_dict=state_dict_shard, + new_actor_lora_config_dict=self._get_model_lora_config_dict(self.actor), + fully_update=fully_update)) + # sending loop + for state_dict_shard in self._get_model_state_dict_shard(self.critic, fully_update=fully_update, **config): for target_holder in self.target_holder_list: - # TODO: reduce malloc - with torch.no_grad(): - ray.get(target_holder.initialize_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic())) + tasks.append( + target_holder.update_experience_maker.remote( + new_critic_state_dict=state_dict_shard, + new_critic_lora_config_dict=self._get_model_lora_config_dict(self.critic), + fully_update=fully_update)) + ray.get(tasks) + # mark end + for target_holder in self.target_holder_list: + target_holder.update_experience_maker.remote(chunk_end=True, fully_update=fully_update) @ray.method(concurrency_group="compute") def training_step(self, experience: Experience) -> Dict[str, float]: self.actor.train() self.critic.train() - experience.to_device(torch.cuda.current_device()) num_actions = experience.action_mask.size(1) action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask) actor_loss = self.actor_loss_fn(action_log_probs, @@ -155,38 +185,16 @@ def strategy_save_actor_optim(self, path: str, only_rank0: bool = False) -> None def strategy_save_critic_optim(self, path: str, only_rank0: bool = False) -> None: self.strategy.save_optimizer(self.critic_optim, path, only_rank0) - def _get_unwrapped_actor(self): - if False: - pass - elif isinstance(self.strategy, ColossalAIStrategy): - ret = Actor(self.strategy._unwrap_model(self.actor)) - return ret - elif isinstance(self.strategy, DDPStrategy): - return Actor(self.strategy._unwrap_actor(self.actor)) - elif isinstance(self.strategy, NaiveStrategy): - return self.actor - - def _get_unwrapped_critic(self): - if False: - pass - elif isinstance(self.strategy, ColossalAIStrategy): - ret = self.strategy._unwrap_model(self.critic) - return ret - elif isinstance(self.strategy, DDPStrategy): - return self.critic.module - elif isinstance(self.strategy, NaiveStrategy): - return self.critic - - -def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> None: - origin_model = strategy._unwrap_actor(actor) - 
new_kwargs = {**generate_kwargs} - # use huggingface models method directly - if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'): - new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation - - if 'update_model_kwargs_fn' not in generate_kwargs: - new_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn - - return new_kwargs - \ No newline at end of file + def _get_model_state_dict_shard(self, model: torch.nn.Module, fully_update=False, **config): + for state_dict in self.strategy.get_model_state_dict_shard(model, **config): + if not self._update_lora_weights or fully_update: + yield state_dict_to(state_dict) + else: + state_dict_lora, _ = LoRAConstructor.filter_state_dict_lora(state_dict) + yield state_dict_to(state_dict_lora) + + def _get_model_lora_config_dict(self, model: torch.nn.Module): + if not self._update_lora_weights: + return None + unwrapped_model = self.strategy.unwrap_model(model) + return LoRAConstructor.extract_lora_config(unwrapped_model) diff --git a/applications/Chat/coati/ray/example/1m1t.py b/applications/Chat/coati/ray/example/1m1t.py deleted file mode 100644 index a6527370505b..000000000000 --- a/applications/Chat/coati/ray/example/1m1t.py +++ /dev/null @@ -1,153 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - -def get_free_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('', 0)) - return s.getsockname()[1] - - -def get_local_ip(): - with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: - s.connect(('8.8.8.8', 80)) - return s.getsockname()[0] - -def main(args): - master_addr = str(get_local_ip()) - # trainer_env_info - trainer_port = str(get_free_port()) - env_info_trainer = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '1', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - - # maker_env_info - maker_port = str(get_free_port()) - env_info_maker = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '1', - 'master_port' : maker_port, - 'master_addr' : master_addr} - - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1"], - strategy=args.trainer_strategy, - model=args.model, - env_info = env_info_trainer, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - 
max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # configure Experience Maker - experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1"], - strategy=args.maker_strategy, - env_info = env_info_maker, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. - ray.get(trainer_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3 # +3 for fault tolerance - maker_done_ref = experience_holder_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_done_ref, maker_done_ref]) - - # save model checkpoint after fitting - trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - ray.init(namespace=os.environ["RAY_NAMESPACE"]) - main(args) diff --git a/applications/Chat/coati/ray/example/1m1t.sh b/applications/Chat/coati/ray/example/1m1t.sh deleted file mode 100644 index f7c5054c800e..000000000000 --- a/applications/Chat/coati/ray/example/1m1t.sh +++ /dev/null @@ -1,23 +0,0 @@ 
-set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | head -n $n) - export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') - echo "Now CUDA_VISIBLE_DEVICES is set to:" - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -} - -set_n_least_used_CUDA_VISIBLE_DEVICES 2 - -export RAY_NAMESPACE="admin" - -python 1m1t.py "/path/to/prompts.csv" \ - --trainer_strategy colossalai_zero2 --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \ - --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \ - --max_epochs 10 --debug diff --git a/applications/Chat/coati/ray/example/1m2t.py b/applications/Chat/coati/ray/example/1m2t.py deleted file mode 100644 index 3883c364a8e0..000000000000 --- a/applications/Chat/coati/ray/example/1m2t.py +++ /dev/null @@ -1,186 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - - -def get_free_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('', 0)) - return s.getsockname()[1] - - -def get_local_ip(): - with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: - s.connect(('8.8.8.8', 80)) - return s.getsockname()[0] - -def main(args): - master_addr = str(get_local_ip()) - # trainer_env_info - trainer_port = str(get_free_port()) - env_info_trainer_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - env_info_trainer_2 = {'local_rank' : '0', - 'rank' : '1', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - # maker_env_info - maker_port = str(get_free_port()) - env_info_maker_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : maker_port, - 'master_addr' : master_addr} - print([env_info_trainer_1, - env_info_trainer_2, - env_info_maker_1]) - ray.init(dashboard_port = 1145) - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_1, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - 
experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_2, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug= args.debug, - ) - - # configure Experience Maker - experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1", "trainer2"], - strategy=args.maker_strategy, - env_info=env_info_maker_1, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. - # TODO: balance duty - ray.get(trainer_1_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs * 2 + 3 # +3 for fault tolerance - maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref]) - # save model checkpoint after fitting - trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], 
- default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - main(args) diff --git a/applications/Chat/coati/ray/example/1m2t.sh b/applications/Chat/coati/ray/example/1m2t.sh deleted file mode 100644 index 669f4141026c..000000000000 --- a/applications/Chat/coati/ray/example/1m2t.sh +++ /dev/null @@ -1,23 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | head -n $n) - export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') - echo "Now CUDA_VISIBLE_DEVICES is set to:" - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -} - -set_n_least_used_CUDA_VISIBLE_DEVICES 2 - -export RAY_NAMESPACE="admin" - -python 1m2t.py "/path/to/prompts.csv" --model gpt2 \ - --maker_strategy naive --trainer_strategy ddp --lora_rank 2 \ - --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \ - --max_epochs 10 #--debug \ No newline at end of file diff --git a/applications/Chat/coati/ray/example/2m1t.py b/applications/Chat/coati/ray/example/2m1t.py deleted file mode 100644 index b655de1ab1fa..000000000000 --- a/applications/Chat/coati/ray/example/2m1t.py +++ /dev/null @@ -1,140 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - - -def main(args): - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1", "maker2"], - strategy=args.trainer_strategy, - model=args.model, - pretrained=args.pretrain, - 
lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # configure Experience Maker - experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1"], - strategy=args.maker_strategy, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1"], - strategy=args.maker_strategy, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. - ray.get(trainer_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs // 2 + 3 # +3 for fault tolerance - maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_done_ref, maker_1_done_ref, maker_2_done_ref]) - - # save model checkpoint after fitting - trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, 
default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - ray.init(namespace=os.environ["RAY_NAMESPACE"]) - main(args) diff --git a/applications/Chat/coati/ray/example/2m1t.sh b/applications/Chat/coati/ray/example/2m1t.sh deleted file mode 100644 index a207d4118d60..000000000000 --- a/applications/Chat/coati/ray/example/2m1t.sh +++ /dev/null @@ -1,23 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | head -n $n) - export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') - echo "Now CUDA_VISIBLE_DEVICES is set to:" - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -} - -set_n_least_used_CUDA_VISIBLE_DEVICES 3 - -export RAY_NAMESPACE="admin" - -python 2m1t.py "/path/to/prompts.csv" \ - --trainer_strategy naive --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \ - --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \ - --max_epochs 10 # --debug diff --git a/applications/Chat/coati/ray/example/2m2t.py b/applications/Chat/coati/ray/example/2m2t.py deleted file mode 100644 index 435c71915fc2..000000000000 --- a/applications/Chat/coati/ray/example/2m2t.py +++ /dev/null @@ -1,209 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - - -def get_free_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('', 0)) - return s.getsockname()[1] - - -def get_local_ip(): - with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: - s.connect(('8.8.8.8', 80)) - return s.getsockname()[0] - -def main(args): - master_addr = str(get_local_ip()) - # trainer_env_info - trainer_port = str(get_free_port()) - env_info_trainer_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - env_info_trainer_2 = {'local_rank' : '0', - 'rank' : '1', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - # maker_env_info - maker_port = str(get_free_port()) - env_info_maker_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : maker_port, - 'master_addr' : master_addr} - env_info_maker_2 = {'local_rank' : '0', - 'rank' : '1', - 'world_size' : '2', - 'master_port': maker_port, - 'master_addr' : master_addr} - print([env_info_trainer_1, - env_info_trainer_2, - env_info_maker_1, - env_info_maker_2]) - ray.init() - # configure tokenizer - if args.model == 'gpt2': - tokenizer = 
GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1", "maker2"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_1, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1", "maker2"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_2, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # configure Experience Maker - experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1", "trainer2"], - strategy=args.maker_strategy, - env_info=env_info_maker_1, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1", "trainer2"], - strategy=args.maker_strategy, - env_info=env_info_maker_2, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. 
- # TODO: balance duty - ray.get(trainer_1_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3 # +3 for fault tolerance - maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref, maker_2_done_ref]) - # save model checkpoint after fitting - trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - main(args) diff --git a/applications/Chat/coati/ray/example/2m2t.sh b/applications/Chat/coati/ray/example/2m2t.sh deleted file mode 100644 index fb4024766c54..000000000000 --- a/applications/Chat/coati/ray/example/2m2t.sh +++ /dev/null @@ -1,23 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' 
\
-        | head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-
-export RAY_NAMESPACE="admin"
-
-python 2m2t.py "path/to/prompts.csv" \
-    --maker_strategy naive --trainer_strategy colossalai_zero2 --lora_rank 2 \
-    --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
-    --max_epochs 10 --debug
\ No newline at end of file
diff --git a/applications/Chat/coati/ray/experience_maker_holder.py b/applications/Chat/coati/ray/experience_maker_holder.py
new file mode 100644
index 000000000000..8551ef1eacef
--- /dev/null
+++ b/applications/Chat/coati/ray/experience_maker_holder.py
@@ -0,0 +1,271 @@
+import os
+import time
+import tracemalloc
+from copy import deepcopy
+from threading import Lock
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+import ray
+import torch
+import torch.nn as nn
+from coati.experience_maker import Experience, ExperienceMaker, NaiveExperienceMaker
+from coati.models.base import Actor, Critic, RewardModel
+from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
+from coati.trainer.callbacks import Callback
+from coati.trainer.strategies import Strategy
+from coati.trainer.strategies.sampler import DistributedSampler
+from ray.exceptions import GetTimeoutError
+from torch import Tensor
+from tqdm import tqdm
+
+from .callbacks import ExperienceMakerPerformanceEvaluator, MakerCallback
+from .utils import (get_model_numel,
+                    get_rank,
+                    get_world_size,
+                    is_rank_0,
+                    set_dist_env,
+                    state_dict_to)
+from .lora_constructor import LoRAConstructor
+
+@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1})
+class ExperienceMakerHolder:
+    '''
+    Args:
+        detached_trainer_name_list: str list used to get the Ray actor handles of the target trainers
+        strategy_fn: a factory that returns the Strategy used by this holder
+        model_fn: a factory that returns the (actor, critic, reward_model, initial_model) tuple
+        kl_coef: the coefficient of the KL divergence loss
+        sync_models_from_trainers: whether to sync models from trainers. If True, you must call sync_models_to_remote_makers() in the trainers to sync models.
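+
+    A holder is usually spawned as a named Ray actor so trainers can reach it by name
+    (an illustrative sketch; strategy_fn and model_fn stand for user-provided factories):
+    >>> maker = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
+    ...     detached_trainer_name_list=["trainer1"], strategy_fn=..., model_fn=..., ...)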
+ ''' + + def __init__( + self, + detached_trainer_name_list: List[str], + strategy_fn: Callable[[], Strategy], + # a function returns (actor, critic, reward_model, initial_model) + model_fn: Callable[[], Tuple[Actor, Critic, RewardModel, Actor]], + env_info: Dict[str, str] = None, + sync_models_from_trainers: bool = False, + buffer_cpu_offload: bool = True, + kl_coef: float = 0.1, + callbacks: List[MakerCallback] = [], + eval_performance: bool = False, + debug: bool = False, + update_lora_weights: bool = False, + **generate_kwargs): + # set environment variables + if env_info: + set_dist_env(env_info=env_info) + self.target_trainer_list = [] + assert len(detached_trainer_name_list) > 0 + self._detached_trainer_name_list = detached_trainer_name_list + self.strategy = strategy_fn() + self.buffer_cpu_offload = buffer_cpu_offload + self.kl_coef = kl_coef + # init models + with self.strategy.model_init_context(): + actor, critic, reward_model, initial_model = model_fn() + self.generate_kwargs = _set_default_generate_kwargs(generate_kwargs, actor) + if eval_performance: + actor_numel = get_model_numel(actor) + critic_numel = get_model_numel(critic) + initial_model_numel = get_model_numel(initial_model) + reward_model_numel = get_model_numel(reward_model) + evaluator = ExperienceMakerPerformanceEvaluator(actor_numel, critic_numel, initial_model_numel, + reward_model_numel) + callbacks = callbacks + [evaluator] + + actor, critic, reward_model, initial_model = self.strategy.prepare(actor, critic, reward_model, initial_model) + self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef) + self.callbacks = callbacks + + self._model_visit_lock = Lock() + + self._is_fully_initialized = not sync_models_from_trainers + + self._debug = debug + self._update_lora_weights = update_lora_weights + if self._update_lora_weights: + self.actor_lora_constructor = LoRAConstructor() + self.critic_lora_constructor = LoRAConstructor() + + self.target_auto_balance = False + + self._target_idx = 0 + + if self._debug: + print(f'[maker{get_rank()}] will send items to {self._detached_trainer_name_list}') + if not self._is_fully_initialized: + print(f'[maker{get_rank()}] Waiting for INIT') + + def _get_ready(self): + while not self._fully_initialized(): + time.sleep(1.0) + + def _fully_initialized(self): + return self._is_fully_initialized + + def _init_target_trainer_list(self): + if len(self.target_trainer_list) > 0: + return + for name in self._detached_trainer_name_list: + self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) + + # copy from ../trainer/base.py + @ray.method(concurrency_group="compute") + def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience: + if isinstance(inputs, Tensor): + return self.experience_maker.make_experience(inputs, **self.generate_kwargs) + elif isinstance(inputs, dict): + return self.experience_maker.make_experience(**inputs, **self.generate_kwargs) + else: + raise ValueError(f'Unsupported input type "{type(inputs)}"') + + @ray.method(concurrency_group="experience_io") + def _send_items(self, experience: Experience) -> None: + self._init_target_trainer_list() + items = split_experience_batch(experience) + items_per_trainer = [[] for _ in range(len(self.target_trainer_list))] + for item in items: + items_per_trainer[self._target_idx].append(item) + self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list) + for i, target_trainer in 
enumerate(self.target_trainer_list):
+            if len(items_per_trainer[i]) > 0:
+                target_trainer.buffer_extend.remote(items_per_trainer[i])
+
+    def _inference_step(self, batch) -> None:
+        self._on_batch_start()
+        with self._model_visit_lock:
+            self._on_make_experience_start()
+            experience = self._make_experience(batch)
+            self._on_make_experience_end(experience)
+        self._on_send_start()
+        if self.buffer_cpu_offload:
+            experience.to_device('cpu')
+        self._send_items(experience)
+        self._on_send_end()
+        self._on_batch_end()
+
+    def workingloop(self, dataloader_fn: Callable[[], Iterable], num_epochs: int = 1, num_steps: int = 0):
+        """Working loop of the experience maker.
+
+        Args:
+            dataloader_fn (Callable[[], Iterable]): A function that returns a dataloader.
+            num_epochs (int, optional): Iterate over the dataloader for this number of epochs. Defaults to 1.
+            num_steps (int, optional): Iterate over the dataloader for this number of steps. If this value > 0, num_epochs is ignored. Defaults to 0.
+        """
+        self._get_ready()
+        self._on_loop_start()
+        dataloader = dataloader_fn()
+        if num_steps > 0:
+            # num_epochs is ignored; restart the dataloader whenever it is exhausted
+            it = iter(dataloader)
+            for _ in tqdm(range(num_steps), desc='ExperienceMaker', disable=not is_rank_0()):
+                try:
+                    batch = next(it)
+                except StopIteration:
+                    it = iter(dataloader)
+                    batch = next(it)
+                self._inference_step(batch)
+        else:
+            with tqdm(total=num_epochs * len(dataloader), desc='ExperienceMaker', disable=not is_rank_0()) as pbar:
+                for _ in range(num_epochs):
+                    for batch in dataloader:
+                        self._inference_step(batch)
+                        pbar.update()
+        self._on_loop_end()
+
+    @ray.method(concurrency_group="model_io")
+    def update_experience_maker(self,
+                                new_actor_state_dict: Dict[str, Any] = None,
+                                new_actor_lora_config_dict: Dict[str, Any] = None,
+                                new_critic_state_dict: Dict[str, Any] = None,
+                                new_critic_lora_config_dict: Dict[str, Any] = None,
+                                fully_update: bool = False,
+                                chunk_start: bool = None,
+                                chunk_end: bool = None):
+        '''
+        Called by the trainer.
+        chunk_start: set True on the first call of a chunked update, before any state_dict shard is sent.
+        chunk_end: set True on the last call, after all state_dict shards have been sent.
+        fully_update: set True to push the full model weights, e.g. when initializing the remote makers.
+
+        TODO: integrate load_state_dict with the model-sharding strategy
+        '''
+        _watch_memory = self._debug
+        if chunk_start:
+            if self._debug:
+                print("[maker] UPDATE ")
+            if _watch_memory:
+                tracemalloc.start()
+            self._model_visit_lock.acquire()
+
+        with torch.no_grad():
+            if new_actor_state_dict is not None:
+                if not self._update_lora_weights or fully_update:
+                    self.experience_maker.actor.model.load_state_dict(new_actor_state_dict, strict=False)
+                else:
+                    new_actor_state_dict = state_dict_to(new_actor_state_dict, device=torch.cuda.current_device())
+                    state_dict_increase = self.actor_lora_constructor.reconstruct_increase(new_actor_state_dict, new_actor_lora_config_dict)
+                    self.actor_lora_constructor.load_state_dict_increase(self.experience_maker.actor.model, state_dict_increase)
+            if new_critic_state_dict is not None:
+                if not self._update_lora_weights or fully_update:
+                    self.experience_maker.critic.load_state_dict(new_critic_state_dict, strict=False)
+                else:
+                    new_critic_state_dict = state_dict_to(new_critic_state_dict, device=torch.cuda.current_device())
+                    state_dict_increase = self.critic_lora_constructor.reconstruct_increase(new_critic_state_dict, new_critic_lora_config_dict)
+                    self.critic_lora_constructor.load_state_dict_increase(self.experience_maker.critic, state_dict_increase)
+
+        # the lock must be released only after both actor and critic have been updated
+        if chunk_end:
+            self._model_visit_lock.release()
+            if _watch_memory:
+                current, peak = tracemalloc.get_traced_memory()
+                print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
+                tracemalloc.stop()
+            if fully_update:
+                self._is_fully_initialized = True
+
+    def _on_make_experience_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_start()
+
+    def _on_make_experience_end(self, experience: Experience) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_end(experience)
+
+    def _on_loop_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_loop_start()
+
+    def _on_loop_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_loop_end()
+
+    def _on_send_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_send_start()
+
+    def _on_send_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_send_end()
+
+    def _on_batch_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_start()
+
+    def _on_batch_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_end()
+
+
+def _set_default_generate_kwargs(generate_kwargs: dict, actor: Actor) -> dict:
+    origin_model = actor.model
+    new_kwargs = {**generate_kwargs}
+    # use the HuggingFace model's own generation helpers when available
+    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
+        new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
+
+    if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(origin_model, '_update_model_kwargs_for_generation'):
+        new_kwargs['update_model_kwargs_fn'] = origin_model._update_model_kwargs_for_generation
+
+    return new_kwargs
diff --git a/applications/Chat/coati/ray/lora_constructor.py b/applications/Chat/coati/ray/lora_constructor.py
new file mode 100644
index 000000000000..599a58248728
--- /dev/null
+++ b/applications/Chat/coati/ray/lora_constructor.py
@@ -0,0 +1,122 @@
+from typing import Any, Callable, Dict, List, Optional
+from collections import OrderedDict
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from loralib.layers import LoRALayer
+from coati.models.lora import LoraLinear
+
+
+@dataclass
+class LoRAConfig:
+    r: int = 0
+    lora_alpha: int = 1
+    lora_dropout: float = 0
+    fan_in_fan_out: bool = False
+
+
+class LoRAConstructor:
+    '''
+    Tools for reconstructing model weights from a remotely sent LoRA state dict
+    (transferring only the LoRA data costs much less!).
+    Usage:
+        Step 1 (Sender):
+            filter_state_dict_lora()
+
+        Step 2 (Sender, Optional):
+            extract_lora_config()
+
+        Step 3 (Sender):
+            send state_dict_lora and lora_config_dict
+
+        Step 4 (Receiver):
+            reconstruct_increase()
+
+        Step 5 (Receiver):
+            load_state_dict_increase()
+
+    '''
+
+    def __init__(self):
+        self.lora_config_dict = None
+
+    def register_lora_config(self, lora_config_dict: Dict[str, Any]):
+        self.lora_config_dict = lora_config_dict
+
+    def reconstruct_increase(self, state_dict_lora: Dict[str, Any], lora_config_dict: Dict[str, Any]):
+        '''
+        xxx.lora_A, xxx.lora_B -->> xxx.weight
+        Note: the xxx.weight entries produced here are weight *increments*, not full weights.
+        '''
+        if lora_config_dict is not None:
+            self.register_lora_config(lora_config_dict)
+
+        state_dict_increase = OrderedDict()
+        config_iter = iter(self.lora_config_dict.items())
+        lora_A, lora_B, layer_prefix = None, None, None
+        for k, v in state_dict_lora.items():
+            if k.rpartition('.')[-1] == 'lora_A':
+                lora_A = v
+                layer_prefix = k.rpartition('.')[0]
+            elif k.rpartition('.')[-1] == 'lora_B':
+                assert layer_prefix == k.rpartition('.')[0], "unmatched (lora_A, lora_B) pair"
+                layer_prefix_2, config = next(config_iter)
+                assert layer_prefix_2 == layer_prefix, "unmatched (state_dict, config_dict) pair"
+                lora_B = v
+                weight_data_increase = self._compute(lora_A, lora_B, config)
+                state_dict_increase[layer_prefix + '.weight'] = weight_data_increase
+                lora_A, lora_B, layer_prefix = None, None, None
+            else:
+                raise ValueError('unexpected key')
+        return state_dict_increase
+
+    def _compute(self, lora_A, lora_B, config=LoRAConfig()):
+        def T(w):
+            return w.T if config.fan_in_fan_out else w
+        if config.r > 0:
+            scaling = config.lora_alpha / config.r
+            weight_data_increase = T(lora_B @ lora_A) * scaling
+            return weight_data_increase
+        return 0
+
+    def load_state_dict_increase(self, model: nn.Module, state_dict_increase: Dict[str, Any]):
+        '''
+        The final reconstruction step
+        '''
+        # naive approach: add each increment onto the current weight and load the result
+        model.load_state_dict({k: v + model.state_dict()[k] for k, v in state_dict_increase.items()}, strict=False)
+
+    @staticmethod
+    def filter_state_dict_lora(state_dict: Dict[str, Any], keep_non_lora=False):
+        '''
+        if keep_non_lora, also return the non-LoRA state_dict
+        '''
+        state_dict_lora = OrderedDict()
+        state_dict_non_lora = OrderedDict()
+        for k, v in state_dict.items():
+            if 'lora_A' in k or 'lora_B' in k:
+                state_dict_lora[k] = v
+            elif keep_non_lora:
+                state_dict_non_lora[k] = v
+        if keep_non_lora:
+            return state_dict_lora, state_dict_non_lora
+        else:
+            return state_dict_lora, None
+
+    @staticmethod
+    def extract_lora_config(model: nn.Module) -> Dict[str, LoRAConfig]:
+        '''
+        collect the LoRA config of every LoraLinear module;
+ return OrderedDict(): name -> LoRAConfig + ''' + lora_config_dict = OrderedDict() + + for name, child in model.named_modules(): + if isinstance(child, LoraLinear): + lora_config_dict[name] = LoRAConfig(r=child.r, + lora_alpha=child.lora_alpha, + lora_dropout=child.lora_dropout, + fan_in_fan_out=child.fan_in_fan_out) + + return lora_config_dict diff --git a/applications/Chat/coati/ray/src/__init__.py b/applications/Chat/coati/ray/src/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/applications/Chat/coati/ray/src/detached_trainer_base.py b/applications/Chat/coati/ray/src/detached_trainer_base.py deleted file mode 100644 index f1ed1ec71499..000000000000 --- a/applications/Chat/coati/ray/src/detached_trainer_base.py +++ /dev/null @@ -1,121 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Union -from tqdm import tqdm -from coati.trainer.callbacks import Callback -from coati.experience_maker import Experience -import ray -import os - -from .detached_replay_buffer import DetachedReplayBuffer -from .utils import is_rank_0 - -class DetachedTrainer(ABC): - ''' - Base class for detached rlhf trainers. - 'detach' means that the experience maker is detached compared to a normal Trainer. - Please set name attribute during init: - >>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote() - So an ExperienceMakerHolder can reach the detached_replay_buffer by Actor's name. - Args: - detached_strategy (DetachedStrategy): the strategy to use for training - detached_replay_buffer_ref (ObjectRef[DetachedReplayBuffer]): the replay buffer to use for training - experience_batch_size (int, defaults to 8): the batch size to use for experience generation - max_epochs (int, defaults to 1): the number of epochs of training process - data_loader_pin_memory (bool, defaults to True): whether to pin memory for data loader - callbacks (List[Callback], defaults to []): the callbacks to call during training process - generate_kwargs (dict, optional): the kwargs to use while model generating - ''' - - def __init__(self, - experience_maker_holder_name_list: List[str], - train_batch_size: int = 8, - buffer_limit: int = 0, - buffer_cpu_offload: bool = True, - experience_batch_size: int = 8, - max_epochs: int = 1, - dataloader_pin_memory: bool = True, - callbacks: List[Callback] = [], - **generate_kwargs) -> None: - super().__init__() - self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit, cpu_offload=buffer_cpu_offload) - self.experience_batch_size = experience_batch_size - self.max_epochs = max_epochs - self.dataloader_pin_memory = dataloader_pin_memory - self.callbacks = callbacks - self.generate_kwargs = generate_kwargs - self.target_holder_name_list = experience_maker_holder_name_list - self.target_holder_list = [] - - def update_target_holder_list(self, experience_maker_holder_name_list): - self.target_holder_name_list = experience_maker_holder_name_list - self.target_holder_list = [] - for name in self.target_holder_name_list: - self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) - - @abstractmethod - def _update_remote_makers(self): - pass - - @abstractmethod - def training_step(self, experience: Experience) -> Dict[str, Any]: - pass - - def _learn(self): - pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0()) - for _ in pbar: - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - 
print("[trainer] sampling exp") - experience = self._buffer_sample() - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[trainer] training step") - metrics = self.training_step(experience) - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[trainer] step over") - pbar.set_postfix(metrics) - - def fit(self, num_episodes: int = 50000, max_timesteps: int = 500, update_timesteps: int = 5000) -> None: - self._on_fit_start() - for episode in range(num_episodes): - self._on_episode_start(episode) - for timestep in tqdm(range(max_timesteps // update_timesteps), - desc=f'Episode [{episode+1}/{num_episodes}]', - disable=not is_rank_0()): - self._learn() - self._update_remote_makers() - self._on_episode_end(episode) - self._on_fit_end() - - @ray.method(concurrency_group="buffer_length") - def buffer_get_length(self): - # called by ExperienceMakerHolder - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[trainer] telling length") - return self.detached_replay_buffer.get_length() - - @ray.method(concurrency_group="buffer_append") - def buffer_append(self, experience: Experience): - # called by ExperienceMakerHolder - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - # print(f"[trainer] receiving exp. Current buffer length: {self.detached_replay_buffer.get_length()}") - print(f"[trainer] receiving exp.") - self.detached_replay_buffer.append(experience) - - @ray.method(concurrency_group="buffer_sample") - def _buffer_sample(self): - return self.detached_replay_buffer.sample() - - def _on_fit_start(self) -> None: - for callback in self.callbacks: - callback.on_fit_start() - - def _on_fit_end(self) -> None: - for callback in self.callbacks: - callback.on_fit_end() - - def _on_episode_start(self, episode: int) -> None: - for callback in self.callbacks: - callback.on_episode_start(episode) - - def _on_episode_end(self, episode: int) -> None: - for callback in self.callbacks: - callback.on_episode_end(episode) diff --git a/applications/Chat/coati/ray/src/experience_maker_holder.py b/applications/Chat/coati/ray/src/experience_maker_holder.py deleted file mode 100644 index 0ae4e3125b70..000000000000 --- a/applications/Chat/coati/ray/src/experience_maker_holder.py +++ /dev/null @@ -1,172 +0,0 @@ -import torch -from typing import Any, Callable, Dict, List, Optional, Union -import ray -from ray.exceptions import GetTimeoutError -from torch import Tensor -import torch.nn as nn -from coati.models.base import Actor, Critic, RewardModel -from coati.trainer.strategies.sampler import DistributedSampler -from coati.trainer.strategies import Strategy -from coati.experience_maker import NaiveExperienceMaker, Experience, ExperienceMaker - -from copy import deepcopy -from threading import Lock -import time -import os - - -from .utils import is_rank_0, get_strategy_from_args, set_dist_env - - -@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1}) -class ExperienceMakerHolder: - ''' - Args: - detached_trainer_name_list: str list to get ray actor handles - strategy: - experience_batch_size: batch size of generated experience - kl_coef: the coefficient of kl divergence loss - ''' - - def __init__(self, - detached_trainer_name_list: List[str], - strategy: str, - env_info: Dict[str, str] = None, - experience_batch_size: int = 8, - kl_coef: float = 0.1, - **generate_kwargs): - # set environment variables - if env_info: - set_dist_env(env_info=env_info) - 
self.target_trainer_list = [] - for name in detached_trainer_name_list: - self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) - self.strategy_str = strategy - self.strategy = get_strategy_from_args(strategy) - self.experience_batch_size = experience_batch_size - self.kl_coef = kl_coef - self.generate_kwargs = generate_kwargs - # Need a trainer to give an actor and a critic via initialize_experience_maker(...) - actor, critic, reward_model, initial_model = None, None, None, None - self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef) - self._model_visit_lock = Lock() - self.fully_initialized = False - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print('[maker] Waiting for INIT') - - def _get_ready(self): - while not self.fully_initialized: - time.sleep(1.0) - - def update_target_trainer_list(self, detached_trainer_name_list): - self.target_trainer_list = [] - for name in detached_trainer_name_list: - self.target_trainer_list.append(ray.get_actor(name)) - - # copy from ../trainer/base.py - @ray.method(concurrency_group="compute") - def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience: - self._get_ready() - if isinstance(inputs, Tensor): - return self.experience_maker.make_experience(inputs, **self.generate_kwargs) - elif isinstance(inputs, dict): - return self.experience_maker.make_experience(**inputs, **self.generate_kwargs) - else: - raise ValueError(f'Unsupported input type "{type(inputs)}"') - - @ray.method(concurrency_group="experience_io") - def _send_experience(self, experience): - ''' - ignore it - - # choose a trainer that has the least experience batch in its detached_replay_buffer - chosen_trainer = None - min_length = None - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[maker] choosing target trainer") - while chosen_trainer is None: - for target_trainer in self.target_trainer_list: - try: - temp_length = ray.get(target_trainer.buffer_get_length.remote(), timeout=0.1) - if min_length is None: - min_length = temp_length - chosen_trainer = target_trainer - else: - if temp_length < min_length: - min_length = temp_length - chosen_trainer = target_trainer - except GetTimeoutError: - pass - - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print(f"[maker] sending exp to {chosen_trainer}") - chosen_trainer.buffer_append.remote(experience) - ''' - # - if not hasattr(self, "_target_idx"): - self._target_idx = 0 - chosen_trainer = self.target_trainer_list[self._target_idx] - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print(f"[maker] sending exp to {chosen_trainer}") - chosen_trainer.buffer_append.remote(experience) - self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list) - - def workingloop(self, dataset, tokenizer: Optional[Callable[[Any], dict]] = None, times=5000 * 50000): - self._get_ready() - sampler = self.strategy.setup_sampler(dataset) - for _ in range(times): - rand_prompts = sampler.sample(self.experience_batch_size) - if tokenizer is not None: - inputs = tokenizer(rand_prompts) - else: - inputs = rand_prompts - self._model_visit_lock.acquire() - experience = self._make_experience(inputs=inputs) - self._model_visit_lock.release() - self._send_experience(experience=experience) - - @ray.method(concurrency_group="model_io") - def initialize_experience_maker(self, init_actor: Actor, init_critic: Critic): 
- ''' - called by trainer. Only once. - ''' - # TODO: reduce malloc - if self.fully_initialized: - return - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print('[maker] INIT') - with torch.no_grad(): - with self.strategy.model_init_context(): - actor = init_actor - critic = init_critic - initial_model = deepcopy(actor) - reward_model = RewardModel(deepcopy(critic.model), - deepcopy(critic.value_head)).to(torch.cuda.current_device()) - if self.strategy_str != 'colossalai_gemini': - actor.to(torch.float16).to(torch.cuda.current_device()) - critic.to(torch.float16).to(torch.cuda.current_device()) - initial_model.to(torch.float16).to(torch.cuda.current_device()) - reward_model.to(torch.float16).to(torch.cuda.current_device()) - - self.experience_maker.actor = self.strategy.prepare(actor) - self.experience_maker.critic = self.strategy.prepare(critic) - self.experience_maker.initial_model = self.strategy.prepare(initial_model) - self.experience_maker.reward_model = self.strategy.prepare(reward_model) - self.fully_initialized = True - - @ray.method(concurrency_group="model_io") - def update_experience_maker(self, new_actor: Actor, new_critic: Critic): - ''' - called by trainer - ''' - # TODO: reduce malloc - self._model_visit_lock.acquire() - with torch.no_grad(): - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[maker] UPDATE ") - if self.strategy_str != 'colossalai_gemini': - new_actor.to(torch.float16).to(torch.cuda.current_device()) - new_critic.to(torch.float16).to(torch.cuda.current_device()) - self.experience_maker.actor = self.strategy.prepare(new_actor) - self.experience_maker.critic = self.strategy.prepare(new_critic) - self._model_visit_lock.release() diff --git a/applications/Chat/coati/ray/src/pipeline_strategy.py b/applications/Chat/coati/ray/src/pipeline_strategy.py deleted file mode 100644 index 7ecb5d7d86d6..000000000000 --- a/applications/Chat/coati/ray/src/pipeline_strategy.py +++ /dev/null @@ -1,105 +0,0 @@ -# WIP - - -from coati.trainer.strategies import Strategy -from coati.trainer.strategies import NaiveStrategy -from coati.models.base import Actor, RewardModel, Critic - -import numpy as np -import torch -from torch._C._distributed_rpc import _is_current_rpc_agent_set - -import colossalai -from colossalai.pipeline.pipeline_process_group import ppg -from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine -from colossalai.fx import ColoTracer -from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass -from colossalai.pipeline.middleware.adaptor import get_fx_topology - - -import os -from functools import partial -import random - -rpc_is_initialized = _is_current_rpc_agent_set - -class PipelineModel(torch.nn.Module): - ''' - Actor has 2 kinds of jobs: forward and generate. 
- better to just pipeline the inner model - ''' - def __init__(self, - model: torch.nn.Module, - stage_num: int, - num_microbatches: int, - data_kwargs = None, - ): - super().__init__() - # create partition module - def create_partition_module(pp_rank:int, stage_num: int, model, data_kwargs): - model.eval() - tracer = ColoTracer() - meta_args = {k: v.to('meta') for k, v in data_kwargs.items()} - graph = tracer.trace(root=model, meta_args=meta_args) - gm = torch.fx.GraphModule(model, graph, model.__class__.__name__) - annotated_model = balanced_split_pass(gm, stage_num) - top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True) - topo = get_fx_topology(top_module) - for submodule in split_submodules: - if isinstance(submodule, torch.fx.GraphModule): - setattr(submodule, '_topo', topo) - return split_submodules[pp_rank + 1] - - def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int): - partition = create_partition_module(pp_rank, stage_num, model, data_kwargs) - return partition - self.inference_engine = OneFOneBPipelineEngine( - partition_fn=partial(partition, model, data_kwargs), - stage_num=stage_num, - num_microbatches=num_microbatches, - device='cuda', - ) - - def forward(self, - **model_inputs): - return self.inference_engine.forward_backward(**model_inputs, forward_only=True) - - - -class PPStrategy(NaiveStrategy): - """ - Strategy for Pipeline inference (inference only!) - - master node only - """ - def __init__( - self, - seed: int = 42 - ): - self.seed = seed - super().__init__() - - - def setup_distributed(self) -> None: - colossalai.launch_from_torch({}, seed=self.seed) - ppg.set_global_info(rank = int(os.environ['RANK']), - world_size=int(os.environ['WORLD_SIZE']), - dp_degree=1, - tp_degree=1, - num_worker_threads=128, - device="cuda") - - def model_init_context(self): - return super().model_init_context() - - def setup_model(self, model: torch.nn.Module) -> torch.nn.Module: - if isinstance(model, Actor) or \ - isinstance(model, RewardModel) or \ - isinstance(model, Critic): - model.model = PipelineModel(model.model) - - def set_seed(self, seed: int) -> None: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - diff --git a/applications/Chat/coati/ray/src/utils.py b/applications/Chat/coati/ray/src/utils.py deleted file mode 100644 index c750879b6d18..000000000000 --- a/applications/Chat/coati/ray/src/utils.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch.distributed as dist -from typing import Any, Callable, Dict, List, Optional -from coati.models.bloom import BLOOMActor, BLOOMCritic -from coati.models.gpt import GPTActor, GPTCritic -from coati.models.opt import OPTActor, OPTCritic -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -import torch -import os - -def is_rank_0() -> bool: - return not dist.is_initialized() or dist.get_rank() == 0 - - -def get_cuda_actor_critic_from_args(model: str, pretrained: str = None, lora_rank=0): - if model == 'gpt2': - actor = GPTActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - elif model == 'bloom': - actor = BLOOMActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - elif model == 'opt': - actor = OPTActor(pretrained=pretrained, 
lora_rank=lora_rank).to(torch.cuda.current_device()) - critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - else: - raise ValueError(f'Unsupported model "{model}"') - return actor, critic - - -def get_strategy_from_args(strategy: str): - if strategy == 'naive': - strategy_ = NaiveStrategy() - elif strategy == 'ddp': - strategy_ = DDPStrategy() - elif strategy == 'colossalai_gemini': - strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) - elif strategy == 'colossalai_zero2': - strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda') - else: - raise ValueError(f'Unsupported strategy "{strategy}"') - return strategy_ - - -def set_dist_env(env_info: Dict[str, str]): - os.environ["RANK"] = env_info['rank'] - os.environ["LOCAL_RANK"] = env_info['local_rank'] - os.environ["WORLD_SIZE"] = env_info['world_size'] - os.environ['MASTER_PORT'] = env_info['master_port'] - os.environ['MASTER_ADDR'] = env_info['master_addr'] diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py new file mode 100644 index 000000000000..4361ee236771 --- /dev/null +++ b/applications/Chat/coati/ray/utils.py @@ -0,0 +1,152 @@ +import os +from typing import Any, Callable, Dict, List, Optional +from collections import OrderedDict + +import torch +import torch.distributed as dist +import torch.nn as nn +from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic +from coati.models.gpt import GPTRM, GPTActor, GPTCritic +from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM +from coati.models.opt import OPTRM, OPTActor, OPTCritic +from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.utils import prepare_llama_tokenizer_and_embedding +from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer, RobertaTokenizer + + +def is_rank_0() -> bool: + return not dist.is_initialized() or dist.get_rank() == 0 + + +def get_rank() -> int: + return dist.get_rank() if dist.is_initialized() else 0 + + +def get_world_size() -> int: + return dist.get_world_size() if dist.is_initialized() else 1 + + +def get_actor_from_args(model: str, pretrained: str = None, config=None, lora_rank=0): + if model == 'gpt2': + actor = GPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'bloom': + actor = BLOOMActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'opt': + actor = OPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'llama': + actor = LlamaActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'roberta': + actor = RoBERTaActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + else: + raise ValueError(f'Unsupported actor model "{model}"') + return actor + + +def get_critic_from_args(model: str, pretrained: str = None, config=None, lora_rank=0): + if model == 'gpt2': + critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model == 'bloom': + critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model == 'opt': + critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model == 'llama': + critic = LlamaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model 
== 'roberta': + critic = RoBERTaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + else: + raise ValueError(f'Unsupported critic model "{model}"') + return critic + + +def get_reward_model_from_args(model: str, pretrained: str = None, config=None): + if model == 'gpt2': + reward_model = GPTRM(pretrained=pretrained, config=config) + elif model == 'bloom': + reward_model = BLOOMRM(pretrained=pretrained, config=config) + elif model == 'opt': + reward_model = OPTRM(pretrained=pretrained, config=config) + elif model == 'llama': + reward_model = LlamaRM(pretrained=pretrained, config=config) + elif model == 'roberta': + reward_model = RoBERTaRM(pretrained=pretrained, config=config) + else: + raise ValueError(f'Unsupported reward model "{model}"') + return reward_model + + +def get_strategy_from_args(strategy: str): + if strategy == 'naive': + strategy_ = NaiveStrategy() + elif strategy == 'ddp': + strategy_ = DDPStrategy() + elif strategy == 'colossalai_gemini': + strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) + elif strategy == 'colossalai_zero2': + strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda') + elif strategy == 'colossalai_gemini_cpu': + strategy_ = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) + elif strategy == 'colossalai_zero2_cpu': + strategy_ = ColossalAIStrategy(stage=2, placement_policy='cpu') + else: + raise ValueError(f'Unsupported strategy "{strategy}"') + return strategy_ + + +def get_tokenizer_from_args(model: str, **kwargs): + if model == 'gpt2': + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + elif model == 'bloom': + tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m') + elif model == 'opt': + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") + elif model == 'llama': + pretrain_path = kwargs["pretrain"] + tokenizer = AutoTokenizer.from_pretrained(pretrain_path) + elif model == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + else: + raise ValueError(f'Unsupported model "{model}"') + + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def set_dist_env(env_info: Dict[str, str]): + os.environ["RANK"] = env_info['rank'] + os.environ["LOCAL_RANK"] = env_info['local_rank'] + os.environ["WORLD_SIZE"] = env_info['world_size'] + os.environ['MASTER_PORT'] = env_info['master_port'] + os.environ['MASTER_ADDR'] = env_info['master_addr'] + + +def get_model_numel(model: nn.Module) -> int: + numel = sum(p.numel() for p in model.parameters()) + return numel + + +def get_receivers_per_sender(sender_idx: int, num_senders: int, num_receivers: int, allow_idle_sender: bool) -> list: + target_receivers = [] + if num_senders <= num_receivers or allow_idle_sender: + # a sender sends data to one or more receivers; + # each receiver has exactly one sender (see the example after this file) + for i in range(num_receivers): + if i % num_senders == sender_idx: + target_receivers.append(i) + else: + # a sender sends data to exactly one receiver; + # a receiver may have more than one sender + target_receivers.append(sender_idx % num_receivers) + return target_receivers + + +def state_dict_to(state_dict: Dict[str, Any], + dtype: torch.dtype = torch.float16, + device: torch.device = torch.device('cpu')): + ''' + Return a converted copy; the input state_dict is kept intact. + ''' + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + new_state_dict[k] = v.to(dtype=dtype, device=device) + return new_state_dict
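The mapping implemented by `get_receivers_per_sender` above is easiest to see with concrete numbers; the following checks follow directly from the code:

```python
from coati.ray.utils import get_receivers_per_sender

# 2 makers feeding 4 trainers: every trainer gets exactly one maker.
assert get_receivers_per_sender(0, 2, 4, allow_idle_sender=False) == [0, 2]
assert get_receivers_per_sender(1, 2, 4, allow_idle_sender=False) == [1, 3]

# 4 trainers syncing to 2 makers: trainers 2 and 3 are allowed to stay idle.
assert get_receivers_per_sender(2, 4, 2, allow_idle_sender=True) == []
```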
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py index b1452869179e..bd30422022ae 100644 --- a/applications/Chat/coati/trainer/strategies/base.py +++ b/applications/Chat/coati/trainer/strategies/base.py @@ -130,3 +130,7 @@ def save_pretrained(self, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: pass + + @abstractmethod + def get_model_state_dict_shard(self, model: nn.Module, **config): + pass \ No newline at end of file diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index 8aa302c77eee..88268b677eb2 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -186,3 +186,15 @@ def save_pretrained(self, if self.stage == 3: raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now') super().save_pretrained(model, path, only_rank0, tokenizer) + + def get_model_state_dict_shard(self, model: nn.Module, **config): + if self.stage != 3: + yield from super().get_model_state_dict_shard(model, **config) + else: + # unwrapped_model = self._unwrap_model(model) + # for module in unwrapped_model.modules(): + # if isinstance(module, LoraLinear): + # module.merge_weights = True + # module.eval() + base_model: ZeroDDP = get_base_model(model) + yield from base_model.state_dict_shard(max_shard_size=1024, only_rank_0=False) diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py index 7910b57878f8..a1fecb36373f 100644 --- a/applications/Chat/coati/trainer/strategies/ddp.py +++ b/applications/Chat/coati/trainer/strategies/ddp.py @@ -26,19 +26,8 @@ def __init__(self, seed: int = 42) -> None: super().__init__() def setup_distributed(self) -> None: - try: - rank = int(os.environ['RANK']) - local_rank = int(os.environ['LOCAL_RANK']) - world_size = int(os.environ['WORLD_SIZE']) - host = os.environ['MASTER_ADDR'] - port = int(os.environ['MASTER_PORT']) - except KeyError as e: - raise RuntimeError( - f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" - ) - dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank) + self._try_init_dist(force=True) self.set_seed(self.seed) - torch.cuda.set_device(local_rank) def set_seed(self, seed: int) -> None: random.seed(seed) diff --git a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py index 4d94026ce932..972deebeaa0d 100644 --- a/applications/Chat/coati/trainer/strategies/naive.py +++ b/applications/Chat/coati/trainer/strategies/naive.py @@ -1,10 +1,17 @@ -from typing import Any, Optional +import os +import sys +from collections import OrderedDict +from typing import Any, Dict, Optional import torch +import torch.distributed as dist import torch.nn as nn import torch.optim as optim from coati.models.base import get_base_model from coati.replay_buffer import ReplayBuffer +from coati.models.base import RewardModel +from coati.models.lora import LoraLinear +from coati.replay_buffer import ReplayBuffer from torch.optim import Optimizer from torch.utils.data import DataLoader from transformers.modeling_utils import PreTrainedModel @@ -13,6 +20,15 @@ from .base import Strategy +# TODO Move this to a util.py (Moving to ray.util introduces a circular import) +def get_grad_required_state_dict(model: nn.Module): + state_dict = 
OrderedDict() + for name, parameter in model.named_parameters(): + if parameter.requires_grad: + state_dict[name] = parameter.detach() + return state_dict + + class NaiveStrategy(Strategy): """ Strategy for single GPU. No parallelism is used. @@ -25,7 +41,7 @@ def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None: optimizer.step() def setup_distributed(self) -> None: - pass + self._try_init_dist(force=False) def setup_model(self, model: nn.Module) -> nn.Module: return model @@ -68,3 +84,45 @@ def save_pretrained(self, unwrapped_model.save_pretrained(path) if tokenizer is not None: tokenizer.save_pretrained(path) + + def get_model_state_dict_shard(self, model: nn.Module, **config): + # TODO: implement sharding on naive strategy + model = self.unwrap_model(model) + if 'requires_grad_only' in config and config['requires_grad_only'] == True: + state_dict = get_grad_required_state_dict(model) + else: + state_dict = model.state_dict() + + if 'shard_size' in config: + shard_size = config['shard_size'] + accumulate_size = 0 + state_dict_shard = OrderedDict() + for name, param in state_dict.items(): + state_dict_shard[name] = param + accumulate_size += param.numel() * param.element_size() + if accumulate_size >= shard_size: + accumulate_size = 0 + yield state_dict_shard + state_dict_shard = OrderedDict() + if accumulate_size > 0: + yield state_dict_shard + else: + yield state_dict + + def _try_init_dist(self, force: bool = False) -> None: + try: + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + world_size = int(os.environ['WORLD_SIZE']) + host = os.environ['MASTER_ADDR'] + port = int(os.environ['MASTER_PORT']) + dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank) + torch.cuda.set_device(local_rank) + except KeyError as e: + if force: + raise RuntimeError( + f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" + ) + except Exception as e: + if force: + raise e diff --git a/applications/Chat/coati/trainer/strategies/sampler.py b/applications/Chat/coati/trainer/strategies/sampler.py index d726fa640fa2..65e199dbf029 100644 --- a/applications/Chat/coati/trainer/strategies/sampler.py +++ b/applications/Chat/coati/trainer/strategies/sampler.py @@ -27,6 +27,7 @@ def __init__(self, dataset, num_replicas: int, rank: int) -> None: assert len(indices) == self.num_samples self.indices = indices + def sample(self, batch_size: int) -> list: sampled_indices = np.random.choice(self.indices, batch_size, replace=False) return [self.dataset[idx] for idx in sampled_indices] diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/Chat/examples/ray/1mmt_prompt.py new file mode 100644 index 000000000000..afdd6a922cc7 --- /dev/null +++ b/applications/Chat/examples/ray/1mmt_prompt.py @@ -0,0 +1,175 @@ +import argparse +import os +import socket +from functools import partial + +import pandas as pd +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_reward_model_from_args, + get_strategy_from_args, + get_tokenizer_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with 
socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_maker = { + 'local_rank': '0', + 'rank': '0', + 'world_size': '1', + 'master_port': maker_port, + 'master_addr': master_addr + } + + # configure tokenizer + tokenizer = get_tokenizer_from_args(args.model) + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, args.pretrain).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=["maker1"], + strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + ) for i, env_info_trainer in enumerate(env_info_trainers) + ] + + def model_fn(): + actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + experience_batch_size=args.experience_batch_size, + kl_coef=0.1, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + + # uncomment this function if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size) + for 
trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + dataset_size = args.experience_batch_size * 4 + + def build_dataloader(): + + def tokenize_fn(texts): + batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) + return {k: v.cuda() for k, v in batch.items()} + + dataset = pd.read_csv(args.prompt_path)['prompt'] + dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn) + return dataloader + + wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--prompt_path', type=str, default=None) + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py new file mode 100644 index 000000000000..fa7b2bd7edfd --- /dev/null +++ b/applications/Chat/examples/ray/mmmt_prompt.py @@ -0,0 +1,189 @@ +import argparse +import os +import socket +from functools import partial + +import pandas as pd +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_receivers_per_sender, + get_reward_model_from_args, + get_strategy_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig, AutoTokenizer +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = 
str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_makers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_makers), + 'master_port': maker_port, + 'master_addr': master_addr + } for rank in range(args.num_makers)] + + # configure tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.pretrain) + tokenizer.pad_token = tokenizer.eos_token + + def model_fn(): + actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_refs = [ + ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[ + f'trainer{x}' + for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False) + ], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + kl_coef=0.1, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + for i, env_info_maker in enumerate(env_info_makers) + ] + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, args.pretrain, lora_rank=args.lora_rank).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain, lora_rank=args.lora_rank).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=[ + f"maker{x}" + for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True) + ], + strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + ) + for i, env_info_trainer in enumerate(env_info_trainers) + ] + + dataset_size = args.experience_batch_size * 4 + + def build_dataloader(): + + def tokenize_fn(texts): + batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) + return {k: v.cuda() for k, v in batch.items()} + + dataset = 
pd.read_csv(args.prompt_path)['prompt'] + dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn) + return dataloader + + # uncomment this block if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + for experience_holder_ref in experience_holder_refs: + wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps)) + + total_steps = args.experience_batch_size * args.experience_steps * \ + args.num_makers // (args.num_trainers * args.train_batch_size) + for trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--prompt_path', type=str, default=None) + parser.add_argument('--num_makers', type=int, default=1) + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/examples/ray/requirements.txt b/applications/Chat/examples/ray/requirements.txt new file mode 100644 index 000000000000..e0275631807f --- /dev/null +++ b/applications/Chat/examples/ray/requirements.txt @@ -0,0 +1 @@ +ray diff --git a/applications/Chat/examples/ray/test_ci.sh b/applications/Chat/examples/ray/test_ci.sh new file mode 100755 index 000000000000..895f7de0fea9 --- /dev/null +++ b/applications/Chat/examples/ray/test_ci.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -xe +BASE=$(realpath $(dirname $0)) + +export RAY_NAMESPACE=admin +export DATA=/data/scratch/chatgpt/prompts.csv + +# install requirements +pip install -r ${BASE}/requirements.txt + +python ${BASE}/mmmt_prompt.py --prompt_path $DATA --num_makers 2 --num_trainers 2 --trainer_strategy colossalai_gemini --model opt --critic_model opt --pretrain facebook/opt-350m --critic_pretrain facebook/opt-125m --experience_batch_size 4 --train_batch_size 2
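Plugging in the values that `test_ci.sh` above passes to `mmmt_prompt.py` (with `--experience_steps` left at its default of 4) makes the step accounting concrete:

```python
# Each maker produces experience_batch_size samples per experience step;
# the trainers consume them in batches of train_batch_size, split evenly.
experience_batch_size, experience_steps = 4, 4
num_makers, num_trainers, train_batch_size = 2, 2, 2

total_steps = (experience_batch_size * experience_steps * num_makers
               // (num_trainers * train_batch_size))
assert total_steps == 8    # each trainer runs 8 fit steps
```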
diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh index 2b049163c801..2fa6c6052f8d 100755 --- a/applications/Chat/examples/test_ci.sh +++ b/applications/Chat/examples/test_ci.sh @@ -124,3 +124,6 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_datas rm -rf ${BASE}/rm_ckpt_gpt.pt rm -rf ${BASE}/actor_checkpoint_prompts.pt + +# 3080 doesn't support P2P, skip this test +# cd ${BASE}/ray && bash test_ci.sh && cd ${BASE} diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py index 61d912157449..4a42e204982f 100644 --- a/colossalai/booster/booster.py +++ b/colossalai/booster/booster.py @@ -25,11 +25,11 @@ class Booster: Examples: ```python colossalai.launch(...) - plugin = GeminiPlugin(stage=3, ...) + plugin = GeminiPlugin(...) booster = Booster(precision='fp16', plugin=plugin) model = GPT2() - optimizer = Adam(model.parameters()) + optimizer = HybridAdam(model.parameters()) dataloader = Dataloader(Dataset) lr_scheduler = LinearWarmupScheduler() criterion = GPTLMLoss() diff --git a/colossalai/lazy/lazy_init.py b/colossalai/lazy/lazy_init.py index c1fda3c53865..76f550dc4392 100644 --- a/colossalai/lazy/lazy_init.py +++ b/colossalai/lazy/lazy_init.py @@ -37,7 +37,7 @@ # If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset) # without autograd tracking the change, remove the .data / .detach() call and wrap the change in a `with torch.no_grad():` block. # These ops cannot be unwrapped using .data -_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__'] +_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__', 'numel', 'size', 'dim'] _LEGACY_TENSOR_CONSTRUCTOR = { 'FloatTensor': torch.float, diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 1ec8783c53d3..3a6d37103398 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -13,7 +13,7 @@ class CPUAdam(NVMeOptimizer): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of parameters. + Supports parameters updating on both GPU and CPU, depending on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py index 526071b06f95..84903ac36832 100644 --- a/colossalai/nn/optimizer/hybrid_adam.py +++ b/colossalai/nn/optimizer/hybrid_adam.py @@ -14,7 +14,7 @@ class HybridAdam(CPUAdam): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of parameters. + Supports parameters updating on both GPU and CPU, depending on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed.
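The device contract documented for `CPUAdam`/`HybridAdam` above is easy to exercise; a minimal sketch, assuming a CUDA-enabled colossalai install:

```python
import torch
from colossalai.nn.optimizer import HybridAdam

# Parameters live on the GPU, so their gradients will too -- the allowed case.
model = torch.nn.Linear(16, 16).cuda()
optimizer = HybridAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(4, 16, device='cuda')).sum()
loss.backward()
optimizer.step()    # the GPU path of the hybrid optimizer updates these params
```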
diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/pipeline/pipelinable.py index 9731530a6e15..79913987b7cc 100644 --- a/colossalai/pipeline/pipelinable.py +++ b/colossalai/pipeline/pipelinable.py @@ -83,7 +83,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs): for k, v in kwargs.items(): if isinstance(v, torch.nn.Module): v = self._layer_spec_dict[id(v)] - # (lyl)TODO: analyse ColoTensor as well + # (lyl)TODO: analyze ColoTensor as well modified_kwargs[k] = v # keep track of the module children @@ -117,7 +117,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs): def to_layer_list(self, exec_seq=None): """ Create a layer spec list and func list with execution sequence given by user. - If exec_seq is None, we will take the module initizing order as execution order. + If exec_seq is None, we will take the module initializing order as execution order. """ self._exec_seq = exec_seq @@ -177,7 +177,7 @@ def to_layer_list(self, exec_seq=None): def partition(self, num_chunks, pipeline_size, rank): """ - Partitioned model will be built respect to partion policy. + Partitioned model will be built with respect to the partition policy. The real module instance will be built in this method. """ if isinstance(self._policy, str): @@ -193,7 +193,7 @@ def partition(self, num_chunks, pipeline_size, rank): self.customized_parts = customized_partition(self._exec_seq) assert len(self.customized_parts) == gpc.get_world_size( ParallelMode.PIPELINE ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}' parts = self.customized_parts[rank] else: raise ValueError("A string partition policy should be one of ['uniform', 'balanced', 'customized'].") diff --git a/colossalai/pipeline/rpc/_pipeline_base.py b/colossalai/pipeline/rpc/_pipeline_base.py index 2d7e25c82e7b..9e549df58214 100644 --- a/colossalai/pipeline/rpc/_pipeline_base.py +++ b/colossalai/pipeline/rpc/_pipeline_base.py @@ -123,7 +123,7 @@ def __init__(self, self.device = device self._initialize_outstanding_range() - # variable and const for context managment + # variable and const for context management self.outstanding = 0 self.forward_times = 0 self.backward_times = 0 @@ -226,7 +226,7 @@ def sync_global_worker_rrefs(self, pp_rank_to_worker_rref: Dict[int, PyRRef]) -> self.pp_rank_to_worker_rref = pp_rank_to_worker_rref # for some schedule need the other worker's info to initialise partition (like Chimera) - # construction of partition is executed after the registion of pp_rank_to_worker_rref + # construction of partition is executed after the registration of pp_rank_to_worker_rref self._initialize_partition() # res_use works for lifecycle counter, @@ -418,7 +418,7 @@ def subscribe_producer(self, microbatch_id: int, forward_only: bool): # On current PP middleware design for DAG, get_output_by_key used by _subscribe_producer # can only be executed once for every producer-consumer stage pair, which is necessary # to count the lifecycle of work_item. So, keeping the _subscribe_producer in the same - # lock of work_item queue operation gurantees the consistency of lifecycle counter. + # lock of work_item queue operation guarantees the consistency of lifecycle counter. 
work_item_from_producer = self._subscribe_producer(microbatch_id, forward_only) self.work_list[key] = work_item_from_producer self.work_list_condition_lock.notify_all() @@ -460,7 +460,7 @@ def subscribe_consumer(self, microbatch_id: int): # On current PP middleware design for DAG, get_output_by_key used by subscribe_consumer # can only be executed once for every producer-consumer stage pair, which is necessary # to count the lifecycle of work_item. So, keeping the subscribe_consumer in the same - # lock of work_item queue operation gurantees the consistency of lifecycle counter. + # lock of work_item queue operation guarantees the consistency of lifecycle counter. work_item_from_consumer = self._subscribe_consumer(microbatch_id) self.work_list[key] = work_item_from_consumer self.work_list_condition_lock.notify_all() @@ -508,7 +508,7 @@ def _get_producer_consumer(self) -> None: assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed" assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed" - # should be aranged in order, the order of the input of current forward + # should be arranged in order, the order of the input of current forward self.producer_stage_ids = self.get_producer_stage_ids() self.consumer_stage_ids = self.get_consumer_stage_ids() diff --git a/colossalai/pipeline/rpc/_pipeline_schedule.py b/colossalai/pipeline/rpc/_pipeline_schedule.py index 0d572231d378..6eda8f3b34b7 100644 --- a/colossalai/pipeline/rpc/_pipeline_schedule.py +++ b/colossalai/pipeline/rpc/_pipeline_schedule.py @@ -123,7 +123,7 @@ def _get_producer_consumer(self) -> None: assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed" assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed" - # should be aranged in order, the order of the input of current forward + # should be arranged in order, the order of the input of current forward self.producer_stage_ids = [] self.consumer_stage_ids = [] @@ -174,7 +174,7 @@ def _initialize_partition(self): else: # if it is down pipeline, create partition by origin method co_up_pp_worker_rref = self.pp_rank_to_worker_rref[pp_rank - stage_num] - # get the coresponding model state dict and wait for its init + # get the corresponding model state dict and wait for its init state_dict = co_up_pp_worker_rref.rpc_sync().get_partition_state_dict() super()._initialize_partition() self.module_partition.load_state_dict(state_dict) @@ -228,7 +228,7 @@ def _hook_before_step(self): stage_num = self.actual_stage_num co_pp_rank = (pp_rank + stage_num) % (2 * stage_num) - # if currrent pp_rank is not the first to do step + # if current pp_rank is not the first to do step # wait its previous pp_rank finish step grads = self.get_parameter_gradients() diff --git a/colossalai/pipeline/utils.py b/colossalai/pipeline/utils.py index df7226644a7a..ac8a3ad7d1db 100644 --- a/colossalai/pipeline/utils.py +++ b/colossalai/pipeline/utils.py @@ -113,7 +113,7 @@ def _binary_search(weights, num): def partition_uniform(num_items, pipeline_parallel_size, num_chunks): assert num_items % num_chunks == 0, \ - "Layer length should be divisible by the number of chunks, otherwise the parameter method is recommended" logger = get_dist_logger() parts = [[] for _ in range(pipeline_parallel_size)] diff --git a/colossalai/tensor/d_tensor/comm_spec.py 
b/colossalai/tensor/d_tensor/comm_spec.py index 765d8ec1b01a..159125fa16db 100644 --- a/colossalai/tensor/d_tensor/comm_spec.py +++ b/colossalai/tensor/d_tensor/comm_spec.py @@ -28,7 +28,7 @@ class CommSpec: to determine the buffer shape, and logical_process_axis Argument: - comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec. + comm_pattern(CollectiveCommPattern): describes the communication method used in this spec. process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec. gather_dim(int, Optional): The gather_dim of the tensor will be gathered. shard_dim(int, Optional): The shard_dim of the tensor will be sharded. diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py index 2ea0c4db89fd..565012b58a03 100644 --- a/colossalai/tensor/d_tensor/sharding_spec.py +++ b/colossalai/tensor/d_tensor/sharding_spec.py @@ -41,7 +41,7 @@ def __repr__(self): def _convert_str_to_shard_list(self, str_spec): ''' - Conver str_spec into shard_list. + Convert str_spec into shard_list. Argument: str_spec(str): dim spec in str type. @@ -58,7 +58,7 @@ def _convert_str_to_shard_list(self, str_spec): def build_difference_2d_dict(self): ''' - Build a difference maping for 2D device mesh case. It will be used to + Build a difference mapping for 2D device mesh case. It will be used to compute the difference between DimSpec pairs. ''' diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py index 9c2e0d4adbf1..8ed8176d996a 100644 --- a/colossalai/tensor/param_op_hook.py +++ b/colossalai/tensor/param_op_hook.py @@ -164,7 +164,7 @@ def _get_grad_args(*args): for obj in args: if _is_grad_tensor(obj): return args, None - # otherwise, the first arguement should be a tuple of grad tensors + # otherwise, the first argument should be a tuple of grad tensors # if there is no grad tensor, the backward of PreFwdPostBwd can't be triggered arg_zero = args[0] if not isinstance(arg_zero, tuple): diff --git a/colossalai/tensor/process_group.py b/colossalai/tensor/process_group.py index f108bdc247f5..8d2e9a616d76 100644 --- a/colossalai/tensor/process_group.py +++ b/colossalai/tensor/process_group.py @@ -130,7 +130,7 @@ def set_cpu_groups(self): @property def has_cpu_groups(self) -> bool: """has_cpu_groups - If cpu groups have been initailized. + If cpu groups have been initialized. Returns: bool: cpu process groups have been initialized or not. diff --git a/colossalai/tensor/shape_consistency.py b/colossalai/tensor/shape_consistency.py index 0a840006f086..5bec552d69d5 100644 --- a/colossalai/tensor/shape_consistency.py +++ b/colossalai/tensor/shape_consistency.py @@ -252,7 +252,7 @@ def get_all_all_to_all_spec(self, source_spec: ShardingSpec, def get_all_shard_spec(self, source_spec: ShardingSpec, orig_cost_dict): ''' Get all valid sharding specs from source_spec with single shard operation, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. For the sharding operation, we just care about legal sharding dimensions. 
Argument: @@ -386,7 +386,7 @@ def get_all_mix_gather_spec(self, source_spec: ShardingSpec, def get_all_one_step_transform_spec(self, source_spec: ShardingSpec, orig_cost_dict) -> Dict[ShardingSpec, float]: ''' Get all valid sharding specs from source_spec with one step transform, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. Note: all-gather will eliminate a sharding dimension, all-to-all will keep sharding dimension same as before, and shard will add a sharding dimension. Therefore, the result of above operations are mutual exclusive, @@ -577,7 +577,7 @@ def shape_consistency(self, source_spec: ShardingSpec, Step3: Repeat above steps until the source spec transform to target spec. - During finding the transform path, commucation cost will be accumulated, and it + During finding the transform path, communication cost will be accumulated, and it will be finally used in auto parallel solver. Additionally, to avoid repeating the path search in runtime, we cached all solved path diff --git a/colossalai/tensor/sharding_spec.py b/colossalai/tensor/sharding_spec.py index bed320130ccd..406ad49097b5 100644 --- a/colossalai/tensor/sharding_spec.py +++ b/colossalai/tensor/sharding_spec.py @@ -45,7 +45,7 @@ def __repr__(self): def _convert_str_to_shard_list(self, str_spec): ''' - Conver str_spec into shard_list. + Convert str_spec into shard_list. Argument: str_spec(str): dim spec in str type. @@ -62,7 +62,7 @@ def _convert_str_to_shard_list(self, str_spec): def build_difference_2d_dict(self): ''' - Build a difference maping for 2D device mesh case. It will be used to + Build a difference mapping for 2D device mesh case. It will be used to compute the difference between DimSpec pairs. ''' @@ -166,7 +166,7 @@ class ShardingSpec: device_mesh(DeviceMesh): A logical view of a physical mesh. entire_shape(torch.Size): The entire shape of tensor before sharded. dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded, - and the value of the key decribe which logical axis will be sharded in that dimension. + and the value of the key describes which logical axis will be sharded in that dimension. sharding_sequence(List[_DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1]. ''' diff --git a/colossalai/tensor/utils.py b/colossalai/tensor/utils.py index 6e30f97fef03..e7d51d099e02 100644 --- a/colossalai/tensor/utils.py +++ b/colossalai/tensor/utils.py @@ -77,7 +77,7 @@ def shard_simulator(target_pair, legal_sharding_dims): Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. + and the second element describes which logical axis will be sharded in that dimension. ''' _, shard_list = target_pair shard_list_list = [] diff --git a/colossalai/trainer/_trainer.py b/colossalai/trainer/_trainer.py index 60bbc4eeee32..bfe1c403fd48 100644 --- a/colossalai/trainer/_trainer.py +++ b/colossalai/trainer/_trainer.py @@ -31,9 +31,9 @@ class Trainer: >>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler >>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion) >>> # Beginning training progress - >>> timier = ... + >>> timer = ... >>> logger = ... 
- >>> trainer = Trainer(engine=engine, logger=logger, timer=timier) + >>> trainer = Trainer(engine=engine, logger=logger, timer=timer) >>> # add hooks you would like to use here. >>> hook_list = [] >>> trainer.fit( @@ -56,7 +56,7 @@ def __init__( timer: MultiTimer = None, logger: DistributedLogger = None, ): - # training-ralated params + # training-related params self._engine = engine self._max_epochs = 0 self._cur_epoch = 0 @@ -118,7 +118,7 @@ def _set_current_step(self, epoch: int): self._cur_step = epoch * self._steps_per_epoch def _call_timer(self, action: str, item: str, *args, **kwargs) -> None: - """Call timer funciton with a given timer name. + """Call timer function with a given timer name. Args: action (str): Function to be called on timer. diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py index 945dc54b397a..2318e07a7f8d 100644 --- a/colossalai/utils/data_sampler/data_parallel_sampler.py +++ b/colossalai/utils/data_sampler/data_parallel_sampler.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -# adpated from torch.utils.data.DistributedSampler +# adapted from torch.utils.data.DistributedSampler import math import random diff --git a/colossalai/utils/model/utils.py b/colossalai/utils/model/utils.py index f49607376439..21bc530934d3 100644 --- a/colossalai/utils/model/utils.py +++ b/colossalai/utils/model/utils.py @@ -70,7 +70,7 @@ def _init_subclass(cls, **kwargs): cls.__init__ = preprocess_after(cls.__init__) # Replace .__init__() for all existing subclasses of torch.nn.Module - # Excution self._post_init_method after the default init function. + # Execute self._post_init_method after the default init function. substitute_init_recursively(torch.nn.modules.module.Module, _enable_class, set()) # holding on to the current __init__subclass__ for exit diff --git a/colossalai/utils/profiler/legacy/comm_profiler.py b/colossalai/utils/profiler/legacy/comm_profiler.py index a4f5729c97ec..334f0113ee90 100644 --- a/colossalai/utils/profiler/legacy/comm_profiler.py +++ b/colossalai/utils/profiler/legacy/comm_profiler.py @@ -111,7 +111,7 @@ def append(s: str = None): res.append(sep) if self.warn_flag: - append("Warnning: there exists multiple communication operations in the same time. As a result, " + append("Warning: there exist multiple communication operations at the same time. 
As a result, " "the profiling result is not accurate.") if self.total_cuda_time == 0: @@ -123,12 +123,12 @@ def append(s: str = None): append("total number of calls: {}".format(self.total_count)) append("All events:") - seperation = '-' * 74 + separation = '-' * 74 row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2 - append(seperation) + append(separation) append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls')) - append(seperation) + append(separation) show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time) for location, event in show_list: diff --git a/colossalai/utils/profiler/legacy/pcie_profiler.py b/colossalai/utils/profiler/legacy/pcie_profiler.py index 526222941ef9..8f812f5cfc7b 100644 --- a/colossalai/utils/profiler/legacy/pcie_profiler.py +++ b/colossalai/utils/profiler/legacy/pcie_profiler.py @@ -130,12 +130,12 @@ def append(s: str = None): append("Possible data transmission events in PCIE:") - seperation = '-' * 62 + separation = '-' * 62 row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2 - append(seperation) + append(separation) append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls')) - append(seperation) + append(separation) show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time) for location, event in show_list: diff --git a/colossalai/utils/profiler/legacy/prof_utils.py b/colossalai/utils/profiler/legacy/prof_utils.py index 87ad644a7ecc..2f7eee827651 100644 --- a/colossalai/utils/profiler/legacy/prof_utils.py +++ b/colossalai/utils/profiler/legacy/prof_utils.py @@ -32,9 +32,9 @@ def _format_memory(nbytes): return str(nbytes) + ' B' -def _format_bandwidth(volme: float or int, time_us: int): +def _format_bandwidth(volume: float or int, time_us: int): sec_div_mb = (1000.0 / 1024.0)**2 - mb_per_sec = volme / time_us * sec_div_mb + mb_per_sec = volume / time_us * sec_div_mb if mb_per_sec >= 1024.0: return '{:.3f} GB/s'.format(mb_per_sec / 1024.0) diff --git a/colossalai/utils/rank_recorder/README.md b/colossalai/utils/rank_recorder/README.md index e30a925d2a92..da8a6039d543 100644 --- a/colossalai/utils/rank_recorder/README.md +++ b/colossalai/utils/rank_recorder/README.md @@ -1,5 +1,5 @@ # Rank Recorder -This is a useful tool to get the records of certain functions in each rank. The records of each rank will dump into a json file after the end of multiple process program. You can parse and visualise the json file easily. +This is a useful tool to get the records of certain functions in each rank. The records of each rank will be dumped into a JSON file when the multi-process program ends. You can parse and visualize the JSON file easily. Before using the tool, you should ensure dist.is_initialized() return true before exit of program. @@ -20,7 +20,7 @@ with recorder(record_name, current_rank) as r: ``` ## Example -This is a demo to display kernel select in cuda and visualise the cost of several procedures in each rank. +This is a demo that displays kernel selection in CUDA and visualizes the cost of several procedures in each rank. 
```python import time diff --git a/colossalai/utils/rank_recorder/rank_recorder.py b/colossalai/utils/rank_recorder/rank_recorder.py index c088ceeb2e87..40bb7e184a12 100644 --- a/colossalai/utils/rank_recorder/rank_recorder.py +++ b/colossalai/utils/rank_recorder/rank_recorder.py @@ -133,7 +133,7 @@ def merge_recode(self): with open(self.export_name + '.json', 'w', encoding='utf-8') as f: json.dump(recoders, f, ensure_ascii=False) - def visualise_record(self): + def visualize_record(self): with open(self.export_name + '.json', 'r', encoding='utf-8') as f: records = json.load(f) records = dict(records) @@ -171,7 +171,7 @@ def exit_worker(self): if rank == 1: # take the base time of rank 0 as standard self.merge_recode() - self.visualise_record() + self.visualize_record() recorder = Recorder() diff --git a/colossalai/zero/gemini/chunk/chunk.py b/colossalai/zero/gemini/chunk/chunk.py index a7682eaf62e9..51da9be2b1f8 100644 --- a/colossalai/zero/gemini/chunk/chunk.py +++ b/colossalai/zero/gemini/chunk/chunk.py @@ -416,7 +416,7 @@ def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data_slice: torch.Ten Copy data slice to the memory space indexed by the input tensor in the chunk. Args: - tensor (torch.Tensor): the tensor used to retrive meta information + tensor (torch.Tensor): the tensor used to retrieve meta information data_slice (torch.Tensor): the tensor to be copied to the chunk """ # sanity check diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py index 77368d06d255..38d34f14863e 100644 --- a/colossalai/zero/gemini/chunk/manager.py +++ b/colossalai/zero/gemini/chunk/manager.py @@ -157,7 +157,7 @@ def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data: torch.Tensor) - Copy data to the chunk. Args: - tensor (torch.Tensor): the tensor used to retrive meta information + tensor (torch.Tensor): the tensor used to retrieve meta information data (torch.Tensor): the tensor to be copied to the chunk """ chunk = self.tensor_chunk_map[tensor] diff --git a/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py b/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py index f5eb05b4f22a..83903bbf4023 100644 --- a/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py +++ b/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py @@ -25,7 +25,7 @@ def __init__(self, chunk_manager: ChunkManager, memstats: Optional[MemStats] = N # override def record_model_data_volume(self) -> None: """ - record model data volumn on cuda and cpu. + record model data volume on cuda and cpu. """ if self._start_flag and not self.use_outside_memstats: cuda_mem = self._chunk_manager.total_mem['cuda'] diff --git a/colossalai/zero/gemini/memory_tracer/memory_monitor.py b/colossalai/zero/gemini/memory_tracer/memory_monitor.py index f8d99dbce7a4..4bb585677d5b 100644 --- a/colossalai/zero/gemini/memory_tracer/memory_monitor.py +++ b/colossalai/zero/gemini/memory_tracer/memory_monitor.py @@ -45,7 +45,7 @@ def clear(self): class AsyncMemoryMonitor(MemoryMonitor): """ - An Async Memory Monitor runing during computing. Sampling memory usage of the current GPU + An Async Memory Monitor running during computing. Sampling memory usage of the current GPU at interval of `1/(10**power)` sec. The idea comes from Runtime Memory Tracer of PatrickStar @@ -67,7 +67,7 @@ class AsyncMemoryMonitor(MemoryMonitor): async_mem_monitor.save('log.pkl') Args: - power (int, optional): the power of time interva. Defaults to 10. 
+ power (int, optional): the power of time interval. Defaults to 10. .. _PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management: https://arxiv.org/abs/2108.05818 diff --git a/colossalai/zero/gemini/utils.py b/colossalai/zero/gemini/utils.py index e52b5b836b0b..6f4a253b504b 100644 --- a/colossalai/zero/gemini/utils.py +++ b/colossalai/zero/gemini/utils.py @@ -73,7 +73,7 @@ def get_static_torch_model(zero_ddp_model, zero_ddp_model (ZeroDDP): a zero ddp model device (torch.device): the device of the final torch model dtype (torch.dtype): the dtype of the final torch model - only_rank_0 (bool): if True, only rank0 has the coverted torch model + only_rank_0 (bool): if True, only rank0 has the converted torch model Returns: torch.nn.Module: a static torch model used for saving checkpoints or numeric checks diff --git a/colossalai/zero/legacy/gemini/ophooks/utils.py b/colossalai/zero/legacy/gemini/ophooks/utils.py index 84e8298c1d51..f88ad2b00e9e 100644 --- a/colossalai/zero/legacy/gemini/ophooks/utils.py +++ b/colossalai/zero/legacy/gemini/ophooks/utils.py @@ -88,7 +88,7 @@ def register_ophooks_recursively(module: torch.nn.Module, ophook_list: List[BaseOpHook], name: str = "", filter_fn: Optional[Callable] = None): - r"""Recursilvely register pre/post hooks for all submodules in the module in FWD and BWD.""" + r"""Recursively register pre/post hooks for all submodules in the module in FWD and BWD.""" assert isinstance(module, torch.nn.Module) assert isinstance(ophook_list, (list, tuple)) assert len(ophook_list) > 0, 'expected at least 1 hook in the argument ophook_list but found 0' @@ -103,7 +103,7 @@ def register_ophooks_recursively(module: torch.nn.Module, if len(list(module.parameters(recurse=False))) == 0: return - # return from flitered module + # return from filtered module if filter_fn is not None and filter_fn(module): return diff --git a/colossalai/zero/legacy/gemini/tensor_utils.py b/colossalai/zero/legacy/gemini/tensor_utils.py index b7f23e0253fd..843e330ee2c6 100644 --- a/colossalai/zero/legacy/gemini/tensor_utils.py +++ b/colossalai/zero/legacy/gemini/tensor_utils.py @@ -77,7 +77,7 @@ def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor], t move a tensor to the target_device Args: t (Union[StatefulTensor, torch.Tensor]): the tensor be moved - target_device: a traget device, if type is int, it the index of cuda card. + target_device: a target device; if the type is int, it is the index of the cuda card. 
""" if not isinstance(target_device, torch.device): target_device = torch.device(f'cuda:{target_device}') diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index d6f6f611a64c..1b27d64b6897 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -195,7 +195,7 @@ def get_data(batch_size, seq_len, vocab_size): Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss: ```python -from torch.optim import Adam +from colossalai.nn.optimizer import HybridAdam from colossalai.booster import Booster from colossalai.zero import ColoInitContext @@ -211,7 +211,7 @@ def main(): # build criterion criterion = GPTLMLoss() - optimizer = Adam(model.parameters(), lr=0.001) + optimizer = HybridAdam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 9030464ddf9a..9fe5601bbd1b 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -197,7 +197,7 @@ def get_data(batch_size, seq_len, vocab_size): 最后,使用booster注入 Gemini + ZeRO DDP 特性, 并定义训练循环。由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数: ```python -from torch.optim import Adam +from colossalai.nn.optimizer import HybridAdam from colossalai.booster import Booster from colossalai.zero import ColoInitContext @@ -213,7 +213,7 @@ def main(): # build criterion criterion = GPTLMLoss() - optimizer = Adam(model.parameters(), lr=0.001) + optimizer = HybridAdam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) diff --git a/examples/language/bert/README.md b/examples/language/bert/README.md new file mode 100644 index 000000000000..c845a5c50387 --- /dev/null +++ b/examples/language/bert/README.md @@ -0,0 +1,34 @@ +## Overview + +This directory includes two parts: Using the Booster API fintune Huggingface Bert and AlBert models and benchmarking Bert and AlBert models with different Booster Plugin. + +## Finetune +``` +bash test_ci.sh +``` + +## Benchmark +``` +bash benchmark.sh +``` + +Now include these metrics in benchmark: CUDA mem occupy, throughput and the number of model parameters. If you have custom metrics, you can add them to benchmark_util. 
+ +## Results + +### Bert + +| | max cuda mem | throughput (samples/s) | params | +| :-----| -----------: | :--------: | :----: | +| ddp | 21.44 GB | 3.0 | 82M | +| ddp_fp16 | 16.26 GB | 11.3 | 82M | +| gemini | 11.0 GB | 12.9 | 82M | +| low_level_zero | 11.29 GB | 14.7 | 82M | + +### AlBert +| | max cuda mem | throughput (samples/s) | params | +| :-----| -----------: | :--------: | :----: | +| ddp | OOM | | | +| ddp_fp16 | OOM | | | +| gemini | 69.39 GB | 1.3 | 208M | +| low_level_zero | 56.89 GB | 1.4 | 208M | \ No newline at end of file diff --git a/examples/language/bert/benchmark.py b/examples/language/bert/benchmark.py new file mode 100644 index 000000000000..ae8b2269a534 --- /dev/null +++ b/examples/language/bert/benchmark.py @@ -0,0 +1,174 @@ +import argparse + +import torch +from benchmark_utils import benchmark +from torch.utils.data import DataLoader, Dataset +from transformers import ( + AlbertConfig, + AlbertForSequenceClassification, + BertConfig, + BertForSequenceClassification, + get_linear_schedule_with_warmup, +) + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.nn.optimizer import HybridAdam + +# ============================== +# Prepare Hyperparameters +# ============================== +NUM_EPOCHS = 3 +BATCH_SIZE = 32 +LEARNING_RATE = 2.4e-5 +WEIGHT_DECAY = 0.01 +WARMUP_FRACTION = 0.1 +SEQ_LEN = 512 +VOCAB_SIZE = 1000 +NUM_LABELS = 10 +DATASET_LEN = 1000 + + +class RandintDataset(Dataset): + + def __init__(self, dataset_length: int, sequence_length: int, vocab_size: int, n_class: int): + + self._sequence_length = sequence_length + self._vocab_size = vocab_size + self._n_class = n_class + self._dataset_length = dataset_length + self._datas = torch.randint( + low=0, + high=self._vocab_size, + size=(self._dataset_length, self._sequence_length,), + dtype=torch.long, + ) + self._labels = torch.randint(low=0, high=self._n_class, size=(self._dataset_length, 1), dtype=torch.long) + + def __len__(self): + return self._dataset_length + + def __getitem__(self, idx): + return self._datas[idx], self._labels[idx] + + +def main(): + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run") + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") + parser.add_argument( + "--model_type", + type=str, + default="bert", + help="bert or albert", + ) + + args = parser.parse_args() + + # ============================== + # Launch Distributed Environment + # ============================== + colossalai.launch_from_torch(config={}, seed=42) + coordinator = DistCoordinator() + + # local_batch_size = BATCH_SIZE // coordinator.world_size + lr = LEARNING_RATE * coordinator.world_size + + # ============================== + # Instantiate Plugin and Booster + # ============================== + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + + booster = 
Booster(plugin=plugin, **booster_kwargs) + + # ============================== + # Prepare Dataloader + # ============================== + + train_dataset = RandintDataset(dataset_length=DATASET_LEN, + sequence_length=SEQ_LEN, + vocab_size=VOCAB_SIZE, + n_class=NUM_LABELS) + train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE) + + # ==================================== + # Prepare model, optimizer + # ==================================== + # bert pretrained model + + if args.model_type == "bert": + cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS) + model = BertForSequenceClassification(cfg) + elif args.model_type == "albert": + cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS) + model = AlbertForSequenceClassification(cfg) + else: + raise RuntimeError + + # optimizer + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": WEIGHT_DECAY, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8) + + # lr scheduler + total_steps = len(train_dataloader) * NUM_EPOCHS + num_warmup_steps = int(WARMUP_FRACTION * total_steps) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=total_steps, + ) + + # criterion + criterion = lambda inputs: inputs[0] + + # ============================== + # Boost with ColossalAI + # ============================== + model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler) + + # ============================== + # Benchmark model + # ============================== + + results = benchmark(model, + booster, + optimizer, + lr_scheduler, + train_dataloader, + criterion=criterion, + epoch_num=NUM_EPOCHS) + + coordinator.print_on_master(results) + + +if __name__ == '__main__': + main() diff --git a/examples/language/bert/benchmark.sh b/examples/language/bert/benchmark.sh new file mode 100755 index 000000000000..9453d1373f2f --- /dev/null +++ b/examples/language/bert/benchmark.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -xe + +pip install -r requirements.txt + +for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do + torchrun --standalone --nproc_per_node 2 benchmark.py --plugin $plugin --model_type "bert" + torchrun --standalone --nproc_per_node 2 benchmark.py --plugin $plugin --model_type "albert" +done diff --git a/examples/language/bert/benchmark_utils.py b/examples/language/bert/benchmark_utils.py new file mode 100644 index 000000000000..886017a41826 --- /dev/null +++ b/examples/language/bert/benchmark_utils.py @@ -0,0 +1,146 @@ +import inspect +from logging import getLogger +from time import time +from typing import Callable + +import torch +import yaml +from torch.optim.lr_scheduler import _LRScheduler as LRScheduler +from torch.utils.data import DataLoader +from tqdm import tqdm + +from colossalai.booster import Booster +from colossalai.cluster import DistCoordinator + +logger = getLogger("colossalai-booster-benchmark") +_INVALID = float("nan") + + +def format_num(num: int, bytes=False): + """Scale bytes to its proper format, e.g. 
1253656 => '1.20MB'""" + factor = 1024 if bytes else 1000 + suffix = "B" if bytes else "" + for unit in ["", " K", " M", " G", " T", " P"]: + if num < factor: + return f"{num:.2f}{unit}{suffix}" + num /= factor + + +def _is_valid(val): + return val == val + + +def get_call_arg_names(module_or_fn): + if isinstance(module_or_fn, torch.nn.Module): + return inspect.getfullargspec(module_or_fn.forward)[0][1:] + return inspect.getfullargspec(module_or_fn)[0] + + +def measure_params(model): + num_params = _INVALID + + try: + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + except AttributeError as e: + logger.error(f"Unable to measure model params due to error: {e}") + + return num_params + + +def warm_up( + model, + booster, + dataloader, + criterion, + optimizer, + lr_scheduler, + num_runs=10, +): + for i, data in enumerate(dataloader): + if i >= num_runs: + break + inputs, labels = data[0].cuda(), data[1].cuda() + outputs = model(inputs, labels=labels) + loss = criterion(outputs) + booster.backward(loss, optimizer) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + +def fmt(d: dict): + return yaml.dump(d) + + +def benchmark( + model: torch.nn.Module, + booster: Booster, + optimizer: torch.optim.Optimizer, + lr_scheduler: LRScheduler, + dataloader: DataLoader, + criterion: Callable = None, + warm_up_fn=warm_up, + epoch_num: int = 3, + batch_size: int = 32, + warm_up_steps: int = 3, +): + results = {} + model_device = torch.cuda.current_device() + + # Warm up + warm_up_fn( + model, + booster, + dataloader, + criterion, + optimizer, + lr_scheduler, + num_runs=warm_up_steps, + ) + # Measure params + params = measure_params(model) + if _is_valid(params): + results["params"] = format_num(params) + logger.info(f"Model parameters: {params} ({format_num(params)})") + + # Measure Allocated Memory and Throughput + memory = {} + throughput = {} + torch.cuda.reset_peak_memory_stats(device=model_device) + pre_mem = torch.cuda.memory_allocated(device=model_device) + + start_time = time() + + for epoch in range(epoch_num): + with tqdm(dataloader, desc=f'Epoch [{epoch + 1}/{epoch_num}]', + disable=not DistCoordinator().is_master()) as pbar: + for data in pbar: + inputs, labels = data[0].cuda(), data[1].cuda() + outputs = model(inputs, labels=labels) + loss = criterion(outputs) + booster.backward(loss, optimizer) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + end_time = time() + + all_sample = epoch_num * len(dataloader) + + post_mem = torch.cuda.memory_allocated(device=model_device) + max_mem = torch.cuda.max_memory_allocated(device=model_device) + + memory[f"batch_size_{batch_size}"] = { + "cuda_pre_training_bytes": format_num(pre_mem, bytes=True), + "cuda_max_training_bytes": format_num(max_mem, bytes=True), + "cuda_post_training_bytes": format_num(post_mem, bytes=True), + } + logger.info(fmt({f"Memory results (batch_size={batch_size})": memory[f"batch_size_{batch_size}"]})) + + throughput[f"batch_size_{batch_size}"] = {"throughput": "{:.1f}".format(all_sample * DistCoordinator().world_size / (end_time - start_time))} + logger.info(fmt({f"Throughput results (batch_size={batch_size})": throughput[f"batch_size_{batch_size}"]})) + + results["throughput"] = throughput + results["memory"] = memory + + return results diff --git a/examples/language/bert/data.py b/examples/language/bert/data.py new file mode 100644 index 000000000000..981cedcca8c2 --- /dev/null +++ b/examples/language/bert/data.py @@ -0,0 +1,127 @@ +import datasets +from 
transformers import AutoTokenizer, PreTrainedTokenizer + +from colossalai.booster.plugin.dp_plugin_base import DPPluginBase + + +class GLUEDataBuilder: + + task_text_field_map = { + "cola": ["sentence"], + "sst2": ["sentence"], + "mrpc": ["sentence1", "sentence2"], + "qqp": ["question1", "question2"], + "stsb": ["sentence1", "sentence2"], + "mnli": ["premise", "hypothesis"], + "qnli": ["question", "sentence"], + "rte": ["sentence1", "sentence2"], + "wnli": ["sentence1", "sentence2"], + "ax": ["premise", "hypothesis"], + } + + glue_task_num_labels = { + "cola": 2, + "sst2": 2, + "mrpc": 2, + "qqp": 2, + "stsb": 1, + "mnli": 3, + "qnli": 2, + "rte": 2, + "wnli": 2, + "ax": 3, + } + + loader_columns = [ + "datasets_idx", + "input_ids", + "token_type_ids", + "attention_mask", + "start_positions", + "end_positions", + "labels", + ] + + def __init__( + self, + model_name_or_path: str, + plugin: DPPluginBase, + task_name: str = "mrpc", + max_seq_length: int = 128, + train_batch_size: int = 32, + eval_batch_size: int = 32, + **kwargs, + ): + super().__init__() + self.model_name_or_path = model_name_or_path + self.task_name = task_name + self.max_seq_length = max_seq_length + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.plugin = plugin + + self.text_fields = self.task_text_field_map[task_name] + self.num_labels = self.glue_task_num_labels[task_name] + self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + self.setup() + + def setup(self): + self.dataset = datasets.load_dataset("glue", self.task_name) + + for split in self.dataset.keys(): + self.dataset[split] = self.dataset[split].map( + self.convert_to_features, + batched=True, + remove_columns=["label"], + ) + self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns] + self.dataset[split].set_format(type="torch", columns=self.columns) + + self.eval_splits = [x for x in self.dataset.keys() if "validation" in x] + + def prepare_data(self): + datasets.load_dataset("glue", self.task_name) + AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + + def train_dataloader(self): + return self.plugin.prepare_dataloader(self.dataset["train"], + batch_size=self.train_batch_size, + shuffle=True, + drop_last=True) + + def val_dataloader(self): + if len(self.eval_splits) == 1: + return self.plugin.prepare_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size) + elif len(self.eval_splits) > 1: + return [ + self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + for x in self.eval_splits + ] + + def test_dataloader(self): + if len(self.eval_splits) == 1: + return self.plugin.prepare_dataloader(self.dataset["test"], batch_size=self.eval_batch_size) + elif len(self.eval_splits) > 1: + return [ + self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + for x in self.eval_splits + ] + + def convert_to_features(self, example_batch): + + # Either encode single sentence or sentence pairs + if len(self.text_fields) > 1: + texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]])) + else: + texts_or_text_pairs = example_batch[self.text_fields[0]] + + # Tokenize the text/text pairs + features = self.tokenizer.batch_encode_plus(texts_or_text_pairs, + max_length=self.max_seq_length, + padding='max_length', + truncation=True) + + # Rename label to labels to make it easier to pass to model forward + features["labels"] = 
example_batch["label"] + + return features diff --git a/examples/language/bert/finetune.py b/examples/language/bert/finetune.py new file mode 100644 index 000000000000..b209ffde85a4 --- /dev/null +++ b/examples/language/bert/finetune.py @@ -0,0 +1,220 @@ +import argparse +from typing import List, Union + +import evaluate +import torch +import torch.distributed as dist +import torch.nn as nn +from data import GLUEDataBuilder +from torch.optim import Optimizer +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import ( + AlbertForSequenceClassification, + AutoConfig, + BertForSequenceClassification, + get_linear_schedule_with_warmup, +) + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.nn.optimizer import HybridAdam +from colossalai.utils import get_current_device + +# ============================== +# Prepare Hyperparameters +# ============================== +NUM_EPOCHS = 3 +BATCH_SIZE = 32 +LEARNING_RATE = 2.4e-5 +WEIGHT_DECAY = 0.01 +WARMUP_FRACTION = 0.1 + + +def move_to_cuda(batch): + return {k: v.cuda() for k, v in batch.items()} + + +@torch.no_grad() +def evaluate_model(model: nn.Module, test_dataloader: Union[DataLoader, List[DataLoader]], num_labels: int, task_name: str, + eval_splits: List[str], coordinator: DistCoordinator): + metric = evaluate.load("glue", task_name, process_id=coordinator.rank, num_process=coordinator.world_size) + model.eval() + + def evaluate_subset(dataloader: DataLoader): + accum_loss = torch.zeros(1, device=get_current_device()) + for batch in dataloader: + batch = move_to_cuda(batch) + outputs = model(**batch) + val_loss, logits = outputs[:2] + accum_loss.add_(val_loss) + + if num_labels > 1: + preds = torch.argmax(logits, axis=1) + elif num_labels == 1: + preds = logits.squeeze() + + labels = batch["labels"] + + metric.add_batch(predictions=preds, references=labels) + + results = metric.compute() + dist.all_reduce(accum_loss.div_(len(dataloader))) + if coordinator.is_master(): + results['loss'] = accum_loss.item() / coordinator.world_size + return results + + if isinstance(test_dataloader, DataLoader): + return evaluate_subset(test_dataloader) + else: + assert len(test_dataloader) == len(eval_splits) + final_results = {} + for split, sub_loader in zip(eval_splits, test_dataloader): + results = evaluate_subset(sub_loader) + final_results.update({f'{k}_{split}': v for k, v in results.items()}) + return final_results + + +def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, lr_scheduler, train_dataloader: DataLoader, + booster: Booster, coordinator: DistCoordinator): + model.train() + with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar: + for batch in pbar: + # Forward pass + batch = move_to_cuda(batch) + outputs = model(**batch) + loss = outputs[0] + + # Backward and optimize + booster.backward(loss, optimizer) + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step() + + # Print log info + pbar.set_postfix({'loss': loss.item()}) + + +def main(): + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run") + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 
'low_level_zero'], + help="plugin to use") + parser.add_argument( + "--model_type", + type=str, + default="bert", + help="bert or albert", + ) + parser.add_argument('--target_f1', type=float, default=None, help="target f1 score. Raise exception if not reached") + args = parser.parse_args() + + if args.model_type == 'bert': + model_name = "bert-base-uncased" + elif args.model_type == 'albert': + model_name = "albert-xxlarge-v2" + else: + raise RuntimeError + # ============================== + # Launch Distributed Environment + # ============================== + colossalai.launch_from_torch(config={}, seed=42) + coordinator = DistCoordinator() + + # local_batch_size = BATCH_SIZE // coordinator.world_size + lr = LEARNING_RATE * coordinator.world_size + + # ============================== + # Instantiate Plugin and Booster + # ============================== + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + + booster = Booster(plugin=plugin, **booster_kwargs) + + # ============================== + # Prepare Dataloader + # ============================== + data_builder = GLUEDataBuilder(model_name, + plugin, + args.task, + train_batch_size=BATCH_SIZE, + eval_batch_size=BATCH_SIZE) + train_dataloader = data_builder.train_dataloader() + test_dataloader = data_builder.test_dataloader() + + # ==================================== + # Prepare model, optimizer + # ==================================== + # bert pretrained model + + cfg = AutoConfig.from_pretrained(model_name, num_labels=data_builder.num_labels) + if model_name == "bert-base-uncased": + model = BertForSequenceClassification.from_pretrained(model_name, config=cfg) + elif model_name == "albert-xxlarge-v2": + model = AlbertForSequenceClassification.from_pretrained(model_name, config=cfg) + else: + raise RuntimeError + + # optimizer + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": WEIGHT_DECAY, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8) + + # lr scheduler + total_steps = len(train_dataloader) * NUM_EPOCHS + num_warmup_steps = int(WARMUP_FRACTION * total_steps) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=total_steps, + ) + + # ============================== + # Boost with ColossalAI + # ============================== + model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler) + + # ============================== + # Train model + # ============================== + for epoch in range(NUM_EPOCHS): + train_epoch(epoch, model, optimizer, lr_scheduler, train_dataloader, booster, coordinator) + + results = evaluate_model(model, test_dataloader, data_builder.num_labels, args.task, data_builder.eval_splits, + coordinator) + + if coordinator.is_master(): + print(results) + if args.target_f1 is not None and 'f1' in results: + assert results['f1'] >= args.target_f1, f'f1 score {results["f1"]} is lower than 
target {args.target_f1}' + + +if __name__ == '__main__': + main() diff --git a/examples/language/bert/requirements.txt b/examples/language/bert/requirements.txt new file mode 100644 index 000000000000..377422c260ad --- /dev/null +++ b/examples/language/bert/requirements.txt @@ -0,0 +1,9 @@ +colossalai +evaluate +datasets +torch +tqdm +transformers +scipy +scikit-learn +ptflops diff --git a/examples/language/bert/run_gemini.sh b/examples/language/bert/run_gemini.sh deleted file mode 100644 index d791334e8c97..000000000000 --- a/examples/language/bert/run_gemini.sh +++ /dev/null @@ -1,22 +0,0 @@ -set -x -# distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"] -export DISTPLAN=${DISTPLAN:-"CAI_Gemini"} - -# The following options only valid when DISTPLAN="colossalai" -export GPUNUM=${GPUNUM:-1} -export PLACEMENT=${PLACEMENT:-"cpu"} -export BATCH_SIZE=${BATCH_SIZE:-16} - -# bert | albert -export MODEL_TYPE=${MODEL_TYPE:-"bert"} -export TRAIN_STEP=${TRAIN_STEP:-10} - -mkdir -p gemini_logs - -env CUDA_LAUNCH_BLOCKING=1 torchrun --standalone --nproc_per_node=${GPUNUM} ./train_bert_demo.py \ ---model_type=${MODEL_TYPE} \ ---batch_size=${BATCH_SIZE} \ ---placement=${PLACEMENT} \ ---distplan=${DISTPLAN} \ ---train_step=${TRAIN_STEP} \ -2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_${PLACEMENT}.log diff --git a/examples/language/bert/test_ci.sh b/examples/language/bert/test_ci.sh old mode 100644 new mode 100755 index 42c63fec50c0..7fc6daabb2f3 --- a/examples/language/bert/test_ci.sh +++ b/examples/language/bert/test_ci.sh @@ -1,2 +1,8 @@ -set -x -env GPUNUM=1 bash run_gemini.sh +#!/bin/bash +set -xe + +pip install -r requirements.txt + +for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do + torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin --model_type "bert" +done diff --git a/examples/language/bert/train_bert_demo.py b/examples/language/bert/train_bert_demo.py deleted file mode 100644 index 9a0278b2c711..000000000000 --- a/examples/language/bert/train_bert_demo.py +++ /dev/null @@ -1,331 +0,0 @@ -import os -from functools import partial -from time import time - -import psutil -import torch -from packaging import version -from torch import nn -from torch.nn.parallel import DistributedDataParallel as DDP -from transformers import AlbertConfig, AlbertForSequenceClassification, BertConfig, BertForSequenceClassification - -import colossalai -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.optimizer import HybridAdam -from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec -from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper - -CAI_VERSION = colossalai.__version__ - - -def get_tflops(model_numel, batch_size, seq_len, step_time): - return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) - - -def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir): - from contextlib import nullcontext - - from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler - if enable_flag: - return profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps), - on_trace_ready=tensorboard_trace_handler(save_dir), - record_shapes=True, - profile_memory=True) - else: - - class DummyProfiler: - - 
def __init__(self): - self.step_number = 0 - - def step(self): - self.step_number += 1 - - return nullcontext(DummyProfiler()) - - -def get_time_stamp(): - import time - cur_time = time.strftime("%d-%H:%M", time.localtime()) - return cur_time - - -def get_bert_data(batch_size: int, sequence_length: int, vacob_size: int, n_class: int, device: torch.device): - input = torch.randint( - low=0, - high=vacob_size, - size=(batch_size, sequence_length), - device=device, - dtype=torch.long, - ) - label = torch.randint(low=0, high=n_class, size=(batch_size,), device=device, dtype=torch.long) - return input, label - - -def parse_args(): - parser = colossalai.get_default_parser() - parser.add_argument( - "--distplan", - type=str, - default='CAI_Gemini', - help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].", - ) - parser.add_argument( - "--placement", - type=str, - default='cpu', - help="Placement Policy for Gemini. Valid when using colossalai as dist plan.", - ) - parser.add_argument( - "--batch_size", - type=int, - default=8, - help="batch size per DP group of training.", - ) - parser.add_argument( - "--model_type", - type=str, - default="bert", - help="bert or albert", - ) - parser.add_argument( - "--train_step", - type=int, - default=10, - help="training iterations for test", - ) - - args = parser.parse_args() - return args - - -SEQ_LEN = 512 -VOCAB_SIZE = 1000 -NUM_LABELS = 10 - - -# Parameter Sharding Strategies for Tensor Parallelism -def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup): - spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) - param.set_tensor_spec(*spec) - - -def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup): - split_param_single_dim_tp1d(0, param, pg) - - -def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): - split_param_single_dim_tp1d(-1, param, pg) - - -def get_cpu_mem(): - return psutil.Process().memory_info().rss / 1024**2 - - -def get_gpu_mem(): - return torch.cuda.memory_allocated() / 1024**2 - - -def get_mem_info(prefix=''): - return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB' - - -def get_model_size(model: nn.Module): - total_numel = 0 - for module in model.modules(): - for p in module.parameters(recurse=False): - total_numel += p.numel() - return total_numel - - -def model_builder(args): - if args.model_type == "bert": - cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS) - return BertForSequenceClassification(cfg) - elif args.model_type == "albert": - cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS) - return AlbertForSequenceClassification(cfg) - else: - raise RuntimeError - - -def model_size_formatter(numel: int) -> str: - GB_SIZE = 10**9 - MB_SIZE = 10**6 - KB_SIZE = 10**3 - if numel >= GB_SIZE: - return f'{numel / GB_SIZE:.1f}B' - elif numel >= MB_SIZE: - return f'{numel / MB_SIZE:.1f}M' - elif numel >= KB_SIZE: - return f'{numel / KB_SIZE:.1f}K' - else: - return str(numel) - - -def set_cpu_maximum_parallelism(): - conf_str = torch.__config__.parallel_info() - inter_str = conf_str.split("hardware_concurrency() : ")[1] - max_concurrency = inter_str.split('\n')[0] - os.environ["OMP_NUM_THREADS"] = max_concurrency - print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.") - - -def main(): - # version check - # this example is supposed to work for versions greater than 0.2.0 - assert version.parse(CAI_VERSION) >= version.parse("0.2.0") - - 
set_cpu_maximum_parallelism() - args = parse_args() - - # if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]: - if args.distplan not in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]: - raise TypeError(f"{args.distplan} is error") - - # batch size per DP degree - BATCH_SIZE = args.batch_size - - NUM_STEPS = args.train_step - - WARMUP_STEPS = 1 - assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps" - assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median" - PROF_FLAG = False # The flag of profiling, False by default - - disable_existing_loggers() - colossalai.launch_from_torch(config={}) - - logger = get_dist_logger() - logger.info(f" {args.distplan}, batch size {BATCH_SIZE}", ranks=[0]) - - torch.manual_seed(123) - if args.distplan.startswith("CAI"): - # all param must use the same process group. - world_size = torch.distributed.get_world_size() - - # build a base-bert model - with ColoInitContext(device=get_current_device(), dtype=torch.half): - model = model_builder(args) - # model = BertForSequenceClassification(BertConfig(vocal_size = VOCAB_SIZE)) - - # asign running configurations - gemini_config = None - if args.distplan.startswith("CAI_ZeRO"): - optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True) - elif args.distplan == "CAI_Gemini": - gemini_config = dict(strict_ddp_mode=True, - device=get_current_device(), - placement_policy=args.placement, - pin_memory=True, - hidden_dim=model.config.hidden_size, - search_range_mb=128) - optim_config = dict(gpu_margin_mem_ratio=0.) - else: - raise RuntimeError - - # build a highly optimized gpu/cpu optimizer - optimizer = HybridAdam(model.parameters(), lr=1e-3) - - if args.distplan == "CAI_ZeRO1": - zero_stage = 1 - elif args.distplan == "CAI_ZeRO2": - zero_stage = 2 - elif args.distplan == "CAI_Gemini": - zero_stage = 3 - else: - raise RuntimeError - - # wrap your model and optimizer - model = zero_model_wrapper(model, zero_stage, gemini_config) - optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config) - - logger.info(get_mem_info(prefix='After init optim, '), ranks=[0]) - elif args.distplan.startswith("Pytorch"): - model = model_builder(args).cuda() - model = DDP(model) - if args.distplan.endswith("DDP"): - optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) - elif args.distplan.endswith("ZeRO"): - from torch.distributed.optim import ZeroRedundancyOptimizer - optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3) - else: - raise RuntimeError - - # model is shared after TP - numel = get_model_size(model) - logger.info(f"the size of testing model size is {model_size_formatter(numel)}.") - logger.info(get_mem_info(prefix='After init model, '), ranks=[0]) - - # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu - # = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree) - # = batch_per_DP_group * numel * seq_len * 8 - get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) - - torch.cuda.synchronize() - model.train() - tflops_list = [] - - def train_step(): - # we just use randomly generated data here - input_ids, labels = get_bert_data(BATCH_SIZE, - SEQ_LEN, - VOCAB_SIZE, - NUM_LABELS, - device=torch.cuda.current_device()) - optimizer.zero_grad() - - start = time() - outputs = model(input_ids, labels=labels) - loss, logits = 
outputs[:2] - torch.cuda.synchronize() - fwd_end = time() - fwd_time = fwd_end - start - logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0]) - - if args.distplan.startswith("CAI"): - optimizer.backward(loss) - elif args.distplan.startswith("Pytorch"): - loss.backward() - else: - raise RuntimeError - - torch.cuda.synchronize() - bwd_end = time() - bwd_time = bwd_end - fwd_end - logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Backward '), ranks=[0]) - - optimizer.step() - torch.cuda.synchronize() - optim_time = time() - bwd_end - step_time = time() - start - logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Optimizer step '), ranks=[0]) - - step_tflops = get_tflops_func(step_time) - logger.info( - f"[{n + 1}/{NUM_STEPS}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}, FWD time: {fwd_time:.3f}s, BWD time: {bwd_time:.3f}s, OPTIM time: {optim_time:.3f}s", - ranks=[0], - ) - if n >= WARMUP_STEPS: - tflops_list.append(step_tflops) - - demo_profiler = get_profile_context(PROF_FLAG, - WARMUP_STEPS, - NUM_STEPS - WARMUP_STEPS, - save_dir=f"profile/{get_time_stamp()}-demo") - - with demo_profiler as prof: - for n in range(NUM_STEPS): - train_step() - prof.step() - - tflops_list.sort() - median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS - logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}") - torch.cuda.synchronize() - - -if __name__ == '__main__': - main() diff --git a/examples/language/palm/README.md b/examples/language/palm/README.md index 486bf240f89c..3ff3939d63d4 100644 --- a/examples/language/palm/README.md +++ b/examples/language/palm/README.md @@ -43,6 +43,9 @@ palm = PaLM( ) ``` +## New API +We have reworked our previous PaLM implementation with the new Booster API, which offers a more flexible and efficient way to train your model. You can find the new usage in train.py. We also offer a shell script test_ci.sh that runs through all the Booster plugins; a minimal sketch of the flow is shown below. For more information about the Booster API, you can refer to https://colossalai.org/docs/basics/booster_api/.
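A rough sketch of that flow, with an illustrative stand-in model and hyperparameters rather than the script's defaults:

```python
import colossalai
import torch.nn as nn

from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn import HybridAdam

colossalai.launch_from_torch(config={})

# pick a plugin, then let the booster wrap the model and optimizer
plugin = GeminiPlugin(placement_policy='cpu', strict_ddp_mode=True, initial_scale=2**5)
booster = Booster(plugin=plugin)

model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512))    # stand-in for PaLM
optimizer = HybridAdam(model.parameters(), lr=1e-3)
model, optimizer, _, _, _ = booster.boost(model, optimizer)
```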
+ ## Test on Enwik8 ```bash diff --git a/examples/language/palm/run.sh b/examples/language/palm/run.sh index 7a533509e009..2a846e81a9a7 100644 --- a/examples/language/palm/run.sh +++ b/examples/language/palm/run.sh @@ -3,9 +3,11 @@ export DISTPAN="colossalai" # The following options only valid when DISTPAN="colossalai" export TPDEGREE=1 -export GPUNUM=1 +export GPUNUM=4 export PLACEMENT='cpu' export USE_SHARD_INIT=False -export BATCH_SIZE=4 +export BATCH_SIZE=1 -env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py \ +--dummy_data=True --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --plugin='gemini' \ +--placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh index f21095578077..4de6a44e5bf7 100644 --- a/examples/language/palm/test_ci.sh +++ b/examples/language/palm/test_ci.sh @@ -4,6 +4,6 @@ for BATCH_SIZE in 2 do for GPUNUM in 1 4 do -env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data=True --batch_size=${BATCH_SIZE} 2>&1 | tee run.log +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} train.py --dummy_data=True --batch_size=${BATCH_SIZE} --plugin='gemini' 2>&1 | tee run.log done done diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py index b16da1c7744a..62062e8bd272 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -9,6 +9,8 @@ import torch.optim as optim import tqdm from packaging import version + +from colossalai.nn import HybridAdam from palm_pytorch import PaLM from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper from torch.utils.data import DataLoader, Dataset @@ -18,6 +20,8 @@ from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec from colossalai.utils import MultiTimer, get_current_device from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, ZeroDDP +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin # constants @@ -58,6 +62,12 @@ def parse_args(): help= "Shard the tensors when init the model to shrink peak memory size on the assigned device. 
Valid when using colossalai as dist plan.", ) + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") parser.add_argument( "--batch_size", type=int, @@ -101,28 +111,6 @@ def get_model_size(model: nn.Module): return total_numel -# Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - cai_version = colossalai.__version__ - if version.parse(cai_version) > version.parse("0.1.10"): - from colossalai.nn.parallel import GeminiDDP - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=32) - elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): - from colossalai.gemini import ChunkManager, GeminiManager - chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - chunk_manager = ChunkManager(chunk_size, - pg, - enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placement_policy)) - model = ZeroDDP(model, gemini_manager) - else: - raise NotImplemented(f"CAI version {cai_version} is not supported") - return model # Parameter Sharding Strategies for Tensor Parallelism @@ -218,6 +206,18 @@ def __len__(self): if args.distplan == "colossalai": # instantiate GPT-like decoder model + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy=args.placement, strict_ddp_mode=True, initial_scale=2 ** 5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) + logger.info(f"plugin: {plugin}") + booster = Booster(plugin=plugin, **booster_kwargs) + default_pg = ProcessGroup(tp_degree=args.tp_degree) default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None ctx = ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg) @@ -228,12 +228,12 @@ def __len__(self): pg = default_pg tensor_parallelize(model, pg) - model = gemini_zero_dpp(model, pg, args.placement) # optimizer - #optimizer = GeminiAdamOptimizer(model, lr=1e-7, initial_scale=2**5) - optimizer = GeminiAdamOptimizer(model, lr=LEARNING_RATE, initial_scale=2**5) + optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE, initial_scale=2**5) + model, optimizer, _, _, _ = booster.boost(model, optimizer) + else: model = PaLM(num_tokens=256, dim=512, depth=8) model = AutoregressiveWrapper(model, max_seq_len=2048)
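With the model and optimizer boosted, the training step presumably follows the same pattern as the other Booster examples in this PR: the loss is computed as before, but the backward pass is routed through `booster.backward` instead of `loss.backward()`. A minimal sketch under that assumption (the batch shape is illustrative):

```python
import torch


def train_step(model, optimizer, booster, batch: torch.Tensor):
    """One step with a boosted model; AutoregressiveWrapper returns the LM loss directly."""
    optimizer.zero_grad()
    loss = model(batch)                  # batch: LongTensor of token ids, e.g. shape (1, 2048)
    booster.backward(loss, optimizer)    # replaces loss.backward() under the Booster API
    optimizer.step()
    return loss.item()
```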