From b463651f3eeb313d13d10db28fc08d7a0277cfbf Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Thu, 22 Jun 2023 14:41:25 +0800
Subject: [PATCH 01/14] [workflow] cover all public repositories in weekly
 report (#4069)

---
 .../generate_leaderboard_and_send_to_lark.py  | 212 ++++++++++++------
 1 file changed, 149 insertions(+), 63 deletions(-)

diff --git a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
index d8f6c8fe309e..2884e38dd3dd 100644
--- a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
+++ b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py
@@ -1,5 +1,4 @@
 import os
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from typing import Any, Dict, List
 
@@ -10,8 +9,7 @@
 from requests_toolbelt import MultipartEncoder
 
 
-@dataclass
-class Contributor:
+class Counter(dict):
     """
     Dataclass for a github contributor.
 
@@ -19,8 +17,40 @@ class Contributor:
         name (str): name of the contributor
         num_commits_this_week (int): number of commits made within one week
     """
-    name: str
-    num_commits_this_week: int
+
+    def record(self, item: str):
+        # bump the tally for ``item``; a key seen for the first time starts from zero
+        seen_so_far = self.get(item, 0)
+        new_total = seen_so_far + 1
+        self[item] = new_total
+
+    def to_sorted_list(self):
+        # (key, count) pairs ordered by count, largest first; equal counts keep insertion order
+        pairs = list(self.items())
+        return sorted(pairs, key=lambda pair: pair[1], reverse=True)
+
+
+def get_utc_time_one_week_ago():
+    """
+    Return the naive UTC datetime exactly seven days before the current moment.
+    """
+    one_week = timedelta(days=7)
+    current_utc = datetime.utcnow()
+    return current_utc - one_week
+
+
+def datetime2str(dt):
+    """Render a datetime as a YYYY-MM-DDTHH:MM:SSZ string."""
+    # the trailing "Z" is a literal character; no timezone conversion is performed here
+    pattern = "%Y-%m-%dT%H:%M:%SZ"
+    return dt.strftime(pattern)
+
+
+def str2datetime(string):
+    """Parse a YYYY-MM-DDTHH:MM:SSZ string into a naive datetime."""
+    # the "Z" is matched as a literal character, so the result carries no tzinfo
+    pattern = "%Y-%m-%dT%H:%M:%SZ"
+    return datetime.strptime(string, pattern)
 
 
 def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title: str, output_path: str) -> None:
@@ -36,7 +66,28 @@ def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title:
     plt.savefig(output_path, dpi=1200)
 
 
-def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, int]:
+def get_organization_repositories(github_token, organization_name) -> List[str]:
+    """
+    Retrieve the names of all public repositories under the organization, following pagination.
+    """
+    headers = {
+        'Authorization': f'Bearer {github_token}',
+        'Accept': 'application/vnd.github+json',
+        'X-GitHub-Api-Version': '2022-11-28'
+    }
+    repo_list, page = [], 1
+    while True:
+        url = f"https://api.github.com/orgs/{organization_name}/repos?type=public&per_page=100&page={page}"
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()    # surface auth / rate-limit errors instead of iterating an error payload
+        res = response.json()
+        if len(res) == 0:    # an empty page marks the end of the listing
+            return repo_list
+        repo_list.extend(item['name'] for item in res)
+        page += 1
+
+
+def get_issue_pull_request_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
     """
     Retrieve the issue/PR comments made by our members in the last 7 days.
 
@@ -56,7 +107,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
     # do pagination to the API
     page = 1
     while True:
-        comment_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/comments?since={since}&page={page}'
+        comment_api = f'https://api.github.com/repos/{org_name}/{repo_name}/issues/comments?since={since}&page={page}'
         comment_response = requests.get(comment_api, headers=headers).json()
 
         if len(comment_response) == 0:
@@ -70,7 +121,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
                     continue
 
                 issue_id = item['issue_url'].split('/')[-1]
-                issue_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/{issue_id}'
+                issue_api = f'https://api.github.com/repos/{org_name}/{repo_name}/issues/{issue_id}'
                 issue_response = requests.get(issue_api, headers=headers).json()
                 issue_author_relationship = issue_response['author_association']
 
@@ -87,7 +138,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
     return user_engagement_count
 
 
-def get_discussion_comments(github_token, since) -> Dict[str, int]:
+def get_discussion_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
     """
     Retrieve the discussion comments made by our members in the last 7 days.
     This is only available via the GitHub GraphQL API.
@@ -105,7 +156,7 @@ def _generate_discussion_query(num, cursor: str = None):
             offset_str = f", after: \"{cursor}\""
         query = f"""
         {{
-            repository(owner: "hpcaitech", name: "ColossalAI"){{
+            repository(owner: "{org_name}", name: "{repo_name}"){{
                 discussions(first: {num} {offset_str}){{
                     edges {{
                         cursor
@@ -134,7 +185,7 @@ def _generate_comment_reply_count_for_discussion(discussion_number, num, cursor:
             offset_str = f", before: \"{cursor}\""
         query = f"""
         {{
-            repository(owner: "hpcaitech", name: "ColossalAI"){{
+            repository(owner: "{org_name}", name: "{repo_name}"){{
                 discussion(number: {discussion_number}){{
                     title
                     comments(last: {num} {offset_str}){{
@@ -191,8 +242,8 @@ def _call_graphql_api(query):
             for edge in edges:
                 # print the discussion title
                 discussion = edge['node']
+                discussion_updated_at = str2datetime(discussion['updatedAt'])
 
-                discussion_updated_at = datetime.strptime(discussion['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
                 # check if the updatedAt is within the last 7 days
                 # if yes, add it to discussion_numbers
                 if discussion_updated_at > since:
@@ -250,6 +301,7 @@ def _call_graphql_api(query):
                             if reply['authorAssociation'] == 'MEMBER':
                                 # check if the updatedAt is within the last 7 days
                                 # if yes, add it to discussion_numbers
+
                                 reply_updated_at = datetime.strptime(reply['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
                                 if reply_updated_at > since:
                                     member_name = reply['author']['login']
@@ -260,7 +312,7 @@ def _call_graphql_api(query):
     return user_engagement_count
 
 
-def generate_user_engagement_leaderboard_image(github_token: str, output_path: str) -> bool:
+def generate_user_engagement_leaderboard_image(github_token: str, org_name: str, repo_list: List[str], output_path: str) -> bool:
     """
     Generate the user engagement leaderboard image for stats within the last 7 days
 
@@ -270,23 +322,29 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
     """
 
     # request to the Github API to get the users who have replied the most in the last 7 days
-    now = datetime.utcnow()
-    start_datetime = now - timedelta(days=7)
-    start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
+    start_datetime = get_utc_time_one_week_ago()
+    start_datetime_str = datetime2str(start_datetime)
 
     # get the issue/PR comments and discussion comment count
-    issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, since=start_datetime_str)
-    discussion_engagement_count = get_discussion_comments(github_token=github_token, since=start_datetime)
     total_engagement_count = {}
 
-    # update the total engagement count
-    total_engagement_count.update(issue_pr_engagement_count)
-    for name, count in discussion_engagement_count.items():
-        if name in total_engagement_count:
-            total_engagement_count[name] += count
-        else:
-            total_engagement_count[name] = count
+    def _update_count(counter):
+        for name, count in counter.items():
+            if name in total_engagement_count:
+                total_engagement_count[name] += count
+            else:
+                total_engagement_count[name] = count
 
+
+    for repo_name in repo_list:
+        print(f"Fetching user engagement count for {org_name}/{repo_name}")
+        issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime_str)
+        discussion_engagement_count = get_discussion_comments(github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime)
+
+        # merge both per-repository tallies into the organization-wide total
+        _update_count(issue_pr_engagement_count)
+        _update_count(discussion_engagement_count)
+        
     # prepare the data for plotting
     x = []
     y = []
@@ -302,9 +360,6 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
             x.append(count)
             y.append(name)
 
-        # use Shanghai time to display on the image
-        start_datetime_str = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%dT%H:%M:%SZ")
-
         # plot the leaderboard
         xlabel = f"Number of Comments made (since {start_datetime_str})"
         ylabel = "Member"
@@ -315,7 +370,7 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
         return False
 
 
-def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
+def generate_contributor_leaderboard_image(github_token, org_name, repo_list, output_path) -> bool:
     """
     Generate the contributor leaderboard image for stats within the last 7 days
 
@@ -324,54 +379,81 @@ def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
         output_path (str): the path to save the image
     """
     # request to the Github API to get the users who have contributed in the last 7 days
-    URL = 'https://api.github.com/repos/hpcaitech/ColossalAI/stats/contributors'
     headers = {
         'Authorization': f'Bearer {github_token}',
         'Accept': 'application/vnd.github+json',
         'X-GitHub-Api-Version': '2022-11-28'
     }
 
-    while True:
-        response = requests.get(URL, headers=headers).json()
+    counter = Counter()
+    start_datetime = get_utc_time_one_week_ago()
 
-        if len(response) != 0:
-            # sometimes the Github API returns empty response for unknown reason
-            # request again if the response is empty
-            break
+    def _get_url(org_name, repo_name, page):
+        return f'https://api.github.com/repos/{org_name}/{repo_name}/pulls?per_page=50&page={page}&state=closed'
+
+    def _iterate_by_page(org_name, repo_name):
+        page = 1
+        stop = False
+
+        while not stop:
+            print(f"Fetching pull request data for {org_name}/{repo_name} - page{page}")
+            url = _get_url(org_name, repo_name, page)
 
-    contributor_list = []
+            while True:
+                response = requests.get(url, headers=headers).json()
 
-    # get number of commits for each contributor
-    start_timestamp = None
-    for item in response:
-        num_commits_this_week = item['weeks'][-1]['c']
-        name = item['author']['login']
-        contributor = Contributor(name=name, num_commits_this_week=num_commits_this_week)
-        contributor_list.append(contributor)
+                if isinstance(response, list):
+                    # sometimes the Github API returns nothing
+                    # request again if the response is not a list
+                    break
+                print("Empty response, request again...")
 
-        # update start_timestamp
-        start_timestamp = item['weeks'][-1]['w']
+            if len(response) == 0:
+                # if the response is empty, stop
+                stop = True
+                break
+
+            # count the pull request and author from response
+            for pr_data in response:
+                merged_at = pr_data['merged_at']
+                author = pr_data['user']['login']
+
+                if merged_at is None:
+                    continue
+
+                merge_datetime = str2datetime(merged_at)
+
+                if merge_datetime < start_datetime:
+                    # if we found a pull request that is merged before the start_datetime
+                    # we stop
+                    stop = True
+                    break
+                else:
+                    # record the author
+                    counter.record(author)
+
+            # next page
+            page += 1
+
+    for repo_name in repo_list:
+        _iterate_by_page(org_name, repo_name)
 
     # convert unix timestamp to Beijing datetime
-    start_datetime = datetime.fromtimestamp(start_timestamp, tz=pytz.timezone('Asia/Shanghai'))
-    start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
+    bj_start_datetime = pytz.utc.localize(start_datetime).astimezone(pytz.timezone('Asia/Shanghai'))    # naive UTC -> aware Beijing time
+    bj_start_datetime_str = datetime2str(bj_start_datetime)
 
-    # sort by number of commits
-    contributor_list.sort(key=lambda x: x.num_commits_this_week, reverse=True)
+    contribution_list = counter.to_sorted_list()
 
     # remove contributors who has zero commits
-    contributor_list = [x for x in contributor_list if x.num_commits_this_week > 0]
-
-    # prepare the data for plotting
-    x = [x.num_commits_this_week for x in contributor_list]
-    y = [x.name for x in contributor_list]
+    author_list = [x[0] for x in contribution_list]
+    num_commit_list = [x[1] for x in contribution_list]
 
     # plot
-    if len(x) > 0:
-        xlabel = f"Number of Commits (since {start_datetime_str})"
+    if len(author_list) > 0:
+        xlabel = f"Number of Pull Requests (since {bj_start_datetime_str})"
         ylabel = "Contributor"
         title = 'Active Contributor Leaderboard'
-        plot_bar_chart(x, y, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
+        plot_bar_chart(num_commit_list, author_list, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
         return True
     else:
         return False
@@ -438,10 +520,14 @@ def send_message_to_lark(message: str, webhook_url: str):
     GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
     CONTRIBUTOR_IMAGE_PATH = 'contributor_leaderboard.png'
     USER_ENGAGEMENT_IMAGE_PATH = 'engagement_leaderboard.png'
+    ORG_NAME = "hpcaitech"
+
+    # get all open source repositories
+    REPO_LIST = get_organization_repositories(GITHUB_TOKEN, ORG_NAME)
 
     # generate images
-    contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, CONTRIBUTOR_IMAGE_PATH)
-    engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
+    contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, ORG_NAME, REPO_LIST, CONTRIBUTOR_IMAGE_PATH)
+    engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, ORG_NAME, REPO_LIST, USER_ENGAGEMENT_IMAGE_PATH)
 
     # upload images
     APP_ID = os.environ['LARK_APP_ID']
@@ -457,8 +543,8 @@ def send_message_to_lark(message: str, webhook_url: str):
 2. 用户互动榜单
 
 注：
-- 开发贡献者测评标准为：本周由公司成员提交的commit次数
-- 用户互动榜单测评标准为：本周由公司成员在非成员创建的issue/PR/discussion中回复的次数
+- 开发贡献者测评标准为：本周由公司成员与社区在所有开源仓库提交的Pull Request次数
+- 用户互动榜单测评标准为：本周由公司成员在非成员在所有开源仓库创建的issue/PR/discussion中回复的次数
 """
 
     send_message_to_lark(message, LARK_WEBHOOK_URL)
@@ -467,7 +553,7 @@ def send_message_to_lark(message: str, webhook_url: str):
     if contrib_success:
         send_image_to_lark(contributor_image_key, LARK_WEBHOOK_URL)
     else:
-        send_message_to_lark("本周没有成员贡献commit，无榜单图片生成。", LARK_WEBHOOK_URL)
+        send_message_to_lark("本周没有成员贡献PR，无榜单图片生成。", LARK_WEBHOOK_URL)
 
     # send user engagement image to lark
     if engagement_success:

From 0bb0b481b475aa716b5f4141e3199e840315c81c Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <eddiezhang@pku.edu.cn>
Date: Sun, 25 Jun 2023 13:34:15 +0800
Subject: [PATCH 02/14] [gemini] fix argument naming during chunk configuration
 searching

---
 colossalai/booster/plugin/gemini_plugin.py    | 17 +++++-------
 colossalai/zero/gemini/chunk/search_utils.py  | 26 +++++++++----------
 colossalai/zero/gemini/chunk/utils.py         | 14 +++++-----
 colossalai/zero/gemini/gemini_ddp.py          | 16 ++++++------
 .../test_offload/test_perf.py                 |  2 +-
 .../test_compatibility_with_gemini.py         |  2 +-
 .../test_gemini_checkpoint_io.py              |  2 +-
 tests/test_tensor/test_tp_with_zero.py        |  2 +-
 tests/test_zero/test_gemini/test_fwd_bwd.py   |  4 +--
 .../test_gemini/test_gemini_use_rmt.py        |  2 +-
 tests/test_zero/test_gemini/test_grad_clip.py |  2 +-
 tests/test_zero/test_gemini/test_inference.py |  2 +-
 tests/test_zero/test_gemini/test_optim.py     |  4 +--
 tests/test_zero/test_gemini/test_search.py    | 22 ++++++++--------
 .../test_gemini/test_zeroddp_state_dict.py    |  4 +--
 .../test_zeroddp_state_dict_shard.py          |  3 ++-
 .../test_gemini/test_zerooptim_state_dict.py  |  2 +-
 17 files changed, 62 insertions(+), 64 deletions(-)

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index 60b25b2c400c..1173589fcd49 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -181,11 +181,11 @@ class GeminiPlugin(DPPluginBase):
         pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
         force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
         strict_ddp_mode (bool, optional): use strict ddp mode (only use dp without other parallelism). Defaults to False.
-        search_range_mb (int, optional): chunk size searching range in MegaByte. Defaults to 32.
+        search_range_m (int, optional): chunk size searching range divided by 2^20. Defaults to 32.
         hidden_dim (int, optional): the hidden dimension of DNN.
             Users can provide this argument to speed up searching.
             If users do not know this argument before training, it is ok. We will use a default value 1024.
-        min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
+        min_chunk_size_m (float, optional): the minimum chunk size divided by 2^20.
             If the aggregate size of parameters is still smaller than the minimum chunk size,
             all parameters will be compacted into one small chunk.
         memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
@@ -214,9 +214,9 @@ def __init__(
         pin_memory: bool = False,
         force_outputs_fp32: bool = False,
         strict_ddp_mode: bool = False,
-        search_range_mb: int = 32,
+        search_range_m: int = 32,
         hidden_dim: Optional[int] = None,
-        min_chunk_size_mb: float = 32,
+        min_chunk_size_m: float = 32,
         memstats: Optional[MemStats] = None,
         gpu_margin_mem_ratio: float = 0.0,
         initial_scale: float = 2**32,
@@ -238,9 +238,9 @@ def __init__(
             pin_memory=pin_memory,
             force_outputs_fp32=force_outputs_fp32,
             strict_ddp_mode=strict_ddp_mode,
-            search_range_mb=search_range_mb,
+            search_range_m=search_range_m,
             hidden_dim=hidden_dim,
-            min_chunk_size_mb=min_chunk_size_mb,
+            min_chunk_size_m=min_chunk_size_m,
             memstats=memstats,
             mixed_precision=PRECISION_STR_TO_DTYPE[precision],
         )
@@ -295,10 +295,7 @@ def configure(
 
         if optimizer is not None and \
                 not isinstance(optimizer, OptimizerWrapper):
-            optimizer = GeminiOptimizer(model.unwrap(),
-                                        optimizer,
-                                        self.zero_optim_config,
-                                        self.optim_kwargs,
+            optimizer = GeminiOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs,
                                         self.verbose)
 
         return model, optimizer, criterion, dataloader, lr_scheduler
diff --git a/colossalai/zero/gemini/chunk/search_utils.py b/colossalai/zero/gemini/chunk/search_utils.py
index 881ceb0b3b97..6c3d4f9a1b41 100644
--- a/colossalai/zero/gemini/chunk/search_utils.py
+++ b/colossalai/zero/gemini/chunk/search_utils.py
@@ -114,9 +114,9 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
 
 def search_chunk_configuration(
         model: nn.Module,
-        search_range_mb: float,
-        search_interval_byte: int,    # hidden size is the best value for the interval
-        min_chunk_size_mb: float = 32,
+        search_range_m: float,
+        search_interval: int,    # hidden size is the best value for the interval
+        min_chunk_size_m: float = 32,
         filter_exlarge_params: bool = True,
         strict_ddp_flag: bool = False,
         memstas: Optional[MemStats] = None) -> Tuple[Dict, int, int]:
@@ -126,9 +126,9 @@ def search_chunk_configuration(
 
     Args:
         model (nn.Module): torch module
-        search_range_mb (float): searching range in mega byte.
-        search_interval_byte (int): searching interval in byte.
-        min_chunk_size_mb (float, optional): the minimum size of a distributed chunk.
+        search_range_m (float): searching range divided by 2^20.
+        search_interval (int): searching interval.
+        min_chunk_size_m (float, optional): the minimum size of a distributed chunk, divided by 2^20.
         filter_exlarge_params (bool, optional): filter extreme large parameters. Defaults to True.
         strict_ddp_flag (bool, optional): whether to enable the strict ddp mode.
             all parameters keep replicated in this mode.
@@ -145,9 +145,9 @@ def search_chunk_configuration(
         for p in model.parameters():
             param_order.append(p)
 
-    search_range_byte = round(search_range_mb * 1024**2)
-    min_chunk_size_byte = round(min_chunk_size_mb * 1024**2)
-    assert search_range_byte >= 0
+    search_range = round(search_range_m * 1024**2)
+    min_chunk_size = round(min_chunk_size_m * 1024**2)
+    assert search_range >= 0
 
     params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
     size_lcm = np.lcm.reduce(list(params_dict.keys()))
@@ -162,7 +162,7 @@ def search_chunk_configuration(
         total_param_size += group_acc_size
 
         # let small parameters keep gathered in CUDA all the time
-        if group_acc_size < min_chunk_size_byte:
+        if group_acc_size < min_chunk_size:
             config_dict[dp_degree] = dict(chunk_size=group_acc_size, keep_gathered=True)
         else:
             size_dict[dp_degree] = size_list
@@ -170,15 +170,15 @@ def search_chunk_configuration(
     if filter_exlarge_params:
         _filter_exlarge_params(model, size_dict)
 
-    max_size = min_chunk_size_byte
+    max_size = min_chunk_size
     for key in size_dict:
         max_size = max(max_size, max(size_dict[key]))
-    start_size = int(math.ceil(max_size / search_interval_byte) * search_interval_byte)
+    start_size = int(math.ceil(max_size / search_interval) * search_interval)
 
     min_chunk_waste = float('+inf')
     best_chunk_size = start_size
 
-    for chunk_size in range(start_size, start_size + search_range_byte + 1, search_interval_byte):
+    for chunk_size in range(start_size, start_size + search_range + 1, search_interval):
         temp_waste = 0
         for key in size_dict:
             temp_waste += _get_unused_byte(size_dict[key], chunk_size)
diff --git a/colossalai/zero/gemini/chunk/utils.py b/colossalai/zero/gemini/chunk/utils.py
index 71242dcd6d49..e98e9cf9c314 100644
--- a/colossalai/zero/gemini/chunk/utils.py
+++ b/colossalai/zero/gemini/chunk/utils.py
@@ -23,10 +23,10 @@ def init_chunk_manager(model: nn.Module,
                        verbose: bool = False,
                        **kwargs) -> ChunkManager:
     if hidden_dim:
-        search_interval_byte = hidden_dim
+        search_interval = hidden_dim
     else:
-        search_interval_byte = 1024    # defaults to 1kb
-    kwargs["search_interval_byte"] = search_interval_byte
+        search_interval = 1024    # defaults to 1024
+    kwargs["search_interval"] = search_interval
 
     dist.barrier()
     begin = time()
@@ -36,13 +36,13 @@ def init_chunk_manager(model: nn.Module,
     dist.barrier()
     end = time()
     span_s = end - begin
-    mb_size = 1024**2
-    total_size /= mb_size
-    wasted_size /= mb_size
+    mega_unit = 1024**2
+    total_size /= mega_unit
+    wasted_size /= mega_unit
 
     if verbose and dist.get_rank() == 0:
         print("searching chunk configuration is completed in {:.2f} s.\n".format(span_s),
-              "used number: {:.2f} MB, wasted number: {:.2f} MB\n".format(total_size, wasted_size),
+              "used number: {:.2f} * 2^20, wasted number: {:.2f} * 2^20\n".format(total_size, wasted_size),
               "total wasted percentage is {:.2f}%".format(100 * safe_div(wasted_size, total_size + wasted_size)),
               sep='',
               flush=True)
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 094320c4aff4..08384ee82d0b 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -739,9 +739,9 @@ def __init__(self,
                  force_outputs_fp32: bool = False,
                  strict_ddp_mode: bool = False,
                  scatter_after_inference: bool = True,
-                 search_range_mb: int = 32,
+                 search_range_m: int = 32,
                  hidden_dim: Optional[int] = None,
-                 min_chunk_size_mb: float = 32,
+                 min_chunk_size_m: float = 32,
                  memstats: Optional[MemStats] = None,
                  mixed_precision: torch.dtype = torch.float16,
                  verbose: bool = False) -> None:
@@ -763,24 +763,24 @@ def __init__(self,
             placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
             pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
             force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
-            search_range_mb (int, optional): chunk size searching range in MegaByte. Defaults to 32.
+            search_range_m (int, optional): chunk size searching range divided by 2^20. Defaults to 32.
             hidden_dim (int, optional): the hidden dimension of DNN.
                 Users can provide this argument to speed up searching.
                 If users do not know this argument before training, it is ok. We will use a default value 1024.
-            min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
+            min_chunk_size_m (float, optional): the minimum chunk size divided by 2^20.
                 If the aggregate size of parameters is still smaller than the minimum chunk size,
                 all parameters will be compacted into one small chunk.
             memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
         """
         # some ugly hotfix for the compatibility with Lightning
-        if search_range_mb is None:
-            search_range_mb = 32
+        if search_range_m is None:
+            search_range_m = 32
 
         chunk_manager = init_chunk_manager(model=module,
                                            init_device=device,
                                            hidden_dim=hidden_dim,
-                                           search_range_mb=search_range_mb,
-                                           min_chunk_size_mb=min_chunk_size_mb,
+                                           search_range_m=search_range_m,
+                                           min_chunk_size_m=min_chunk_size_m,
                                            strict_ddp_flag=strict_ddp_mode,
                                            verbose=verbose)
         gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
diff --git a/tests/test_auto_parallel/test_offload/test_perf.py b/tests/test_auto_parallel/test_offload/test_perf.py
index 80f134fd85d0..45c22efc4127 100644
--- a/tests/test_auto_parallel/test_offload/test_perf.py
+++ b/tests/test_auto_parallel/test_offload/test_perf.py
@@ -60,7 +60,7 @@ def exam_fwd_bwd(model_name: str, memory_budget: float, solver_name: str):
                          placement_policy='cpu',
                          pin_memory=True,
                          hidden_dim=8192,
-                         search_range_mb=128)
+                         search_range_m=128)
     gemini_model = zero_model_wrapper(gemini_model, 3, gemini_config)
     optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
     gemini_optim = zero_optim_wrapper(gemini_model, hybrid_optimizer, optim_config=optim_config)
diff --git a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
index 05704acbf7fd..4e3c26c1ba9c 100644
--- a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
+++ b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py
@@ -75,7 +75,7 @@ def check_auto_parallel_with_gemini(rank, world_size, port):
                          device=get_current_device(),
                          placement_policy='cpu',
                          pin_memory=True,
-                         search_range_mb=128)
+                         search_range_m=128)
 
     post_process_colo_init_ctx(gm, device=get_current_device(), default_pg=dp_process_group)
     gm = zero_model_wrapper(gm, zero_stage=3, gemini_config=gemini_config)
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index 994412bbc63f..14d69cab2176 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -30,7 +30,7 @@ def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: b
         bert_model.config.save_pretrained(save_directory=pretrained_path)
 
         # TODO(ver217): use boost api
-        config_dict, *_ = search_chunk_configuration(bert_model, search_range_mb=1, search_interval_byte=100)
+        config_dict, *_ = search_chunk_configuration(bert_model, search_range_m=1, search_interval=100)
         chunk_manager = ChunkManager(config_dict)
         gemini_manager = GeminiManager(placement_policy, chunk_manager)
         bert_model = ZeroDDP(bert_model, gemini_manager)
diff --git a/tests/test_tensor/test_tp_with_zero.py b/tests/test_tensor/test_tp_with_zero.py
index c636d9442902..539806cb196a 100644
--- a/tests/test_tensor/test_tp_with_zero.py
+++ b/tests/test_tensor/test_tp_with_zero.py
@@ -79,7 +79,7 @@ def run_gpt(placement_policy, tp_init_spec_func=None):
         tp_init_spec_func(model, pg)
 
     dp_world_size = pg.dp_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[dp_world_size]['chunk_size'] = 5000
     config_dict[dp_world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':
diff --git a/tests/test_zero/test_gemini/test_fwd_bwd.py b/tests/test_zero/test_gemini/test_fwd_bwd.py
index f2cbb7fb77d6..9c5455b8371b 100644
--- a/tests/test_zero/test_gemini/test_fwd_bwd.py
+++ b/tests/test_zero/test_gemini/test_fwd_bwd.py
@@ -52,7 +52,7 @@ def exam_gpt_fwd_bwd(
         torch_p.data.copy_(p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gather
     chunk_manager = ChunkManager(config_dict)
@@ -113,7 +113,7 @@ def exam_gpt_inference(
         torch_p.data.copy_(p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gather
     chunk_manager = ChunkManager(config_dict)
diff --git a/tests/test_zero/test_gemini/test_gemini_use_rmt.py b/tests/test_zero/test_gemini/test_gemini_use_rmt.py
index dd580976d8ea..00e712050b32 100644
--- a/tests/test_zero/test_gemini/test_gemini_use_rmt.py
+++ b/tests/test_zero/test_gemini/test_gemini_use_rmt.py
@@ -56,7 +56,7 @@ def run_gemini_use_rmt(placement_policy, keep_gather, model_name: str, use_grad_
                 assert len(step_list) == 4
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gather
     chunk_manager = ChunkManager(config_dict)
diff --git a/tests/test_zero/test_gemini/test_grad_clip.py b/tests/test_zero/test_gemini/test_grad_clip.py
index 38b6e474ea98..ac19a27f4a37 100644
--- a/tests/test_zero/test_gemini/test_grad_clip.py
+++ b/tests/test_zero/test_gemini/test_grad_clip.py
@@ -51,7 +51,7 @@ def exam_grad_clipping(placement_policy, model_name: str):
         p.data.copy_(torch_p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':
diff --git a/tests/test_zero/test_gemini/test_inference.py b/tests/test_zero/test_gemini/test_inference.py
index 790a0611c9dd..fb2018f7b477 100644
--- a/tests/test_zero/test_gemini/test_inference.py
+++ b/tests/test_zero/test_gemini/test_inference.py
@@ -34,7 +34,7 @@ def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
 
 def multi_chunk_init(model: torch.nn.Module, placement_policy: str):
     world_size = dist.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':
diff --git a/tests/test_zero/test_gemini/test_optim.py b/tests/test_zero/test_gemini/test_optim.py
index 66611bcd2419..a9ee67368e9d 100644
--- a/tests/test_zero/test_gemini/test_optim.py
+++ b/tests/test_zero/test_gemini/test_optim.py
@@ -73,7 +73,7 @@ def exam_model_step(placement_policy, model_name: str, mixed_precision: torch.dt
         p.data.copy_(torch_p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = False
     if placement_policy != 'cuda':
@@ -130,7 +130,7 @@ def exam_tiny_example(placement_policy, model_name: str, mixed_precision: torch.
     for torch_p, p in zip(torch_model.parameters(), model.parameters()):
         p.data.copy_(torch_p.data)
 
-    chunk_manager = init_chunk_manager(model=model, init_device=get_current_device(), search_range_mb=1)
+    chunk_manager = init_chunk_manager(model=model, init_device=get_current_device(), search_range_m=1)
     gemini_manager = GeminiManager(placement_policy, chunk_manager)
     model = ZeroDDP(model, gemini_manager, pin_memory=True, mixed_precision=mixed_precision)
     optimizer = HybridAdam(model.parameters(), lr=1e-3)
diff --git a/tests/test_zero/test_gemini/test_search.py b/tests/test_zero/test_gemini/test_search.py
index 35b3b93ade0c..51dd84aace5b 100644
--- a/tests/test_zero/test_gemini/test_search.py
+++ b/tests/test_zero/test_gemini/test_search.py
@@ -30,9 +30,9 @@ def exam_search_chunk_size():
         model = model_builder()
     init_1d_row_spec(model, pg_tp)
     config_dict, *_ = search_chunk_configuration(model,
-                                                 search_range_mb=1,
-                                                 search_interval_byte=16,
-                                                 min_chunk_size_mb=0,
+                                                 search_range_m=1,
+                                                 search_interval=16,
+                                                 min_chunk_size_m=0,
                                                  filter_exlarge_params=True)
 
     for key in config_dict:
@@ -54,9 +54,9 @@ def exam_search_strict_ddp():
     with ColoInitContext(device=get_current_device()):
         ddp_model = model_builder()
     re_dict, re_total, re_wasted = search_chunk_configuration(ddp_model,
-                                                              search_range_mb=1,
-                                                              search_interval_byte=16,
-                                                              min_chunk_size_mb=0,
+                                                              search_range_m=1,
+                                                              search_interval=16,
+                                                              min_chunk_size_m=0,
                                                               filter_exlarge_params=True,
                                                               strict_ddp_flag=False)
     # get the chunk configuration over sharded ddp models
@@ -64,9 +64,9 @@ def exam_search_strict_ddp():
                          default_dist_spec=default_shard_spec):
         sharded_ddp_model = model_builder()
     sh_dict, sh_total, sh_wasted = search_chunk_configuration(sharded_ddp_model,
-                                                              search_range_mb=1,
-                                                              search_interval_byte=16,
-                                                              min_chunk_size_mb=0,
+                                                              search_range_m=1,
+                                                              search_interval=16,
+                                                              min_chunk_size_m=0,
                                                               filter_exlarge_params=True,
                                                               strict_ddp_flag=True)
     assert re_dict == sh_dict
@@ -91,8 +91,8 @@ def exam_chunk_manager():
     chunk_manager = init_chunk_manager(sharded_ddp_model,
                                        get_current_device(),
                                        hidden_dim=16,
-                                       search_range_mb=1,
-                                       min_chunk_size_mb=0,
+                                       search_range_m=1,
+                                       min_chunk_size_m=0,
                                        filter_exlarge_params=True,
                                        strict_ddp_flag=True)
     config_dict = chunk_manager.dp_degree_chunk_size_dict
diff --git a/tests/test_zero/test_gemini/test_zeroddp_state_dict.py b/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
index 66e05f3ed1ec..2a5a4ab83029 100644
--- a/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
+++ b/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
@@ -35,7 +35,7 @@ def exam_state_dict(placement_policy, keep_gathered, model_name: str):
         torch_p.data.copy_(p.data)
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gathered
     chunk_manager = ChunkManager(config_dict)
@@ -67,7 +67,7 @@ def exam_load_state_dict(placement_policy, keep_gathered, model_name: str):
     torch_model = model_builder()    # get a different model
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gathered
 
diff --git a/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py b/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py
index ad7d3a5a4859..d16bfb7d1622 100644
--- a/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py
+++ b/tests/test_zero/test_gemini/test_zeroddp_state_dict_shard.py
@@ -22,7 +22,7 @@ def exam_state_dict(placement_policy, model_name: str):
 
     model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2
 
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     chunk_manager = ChunkManager(config_dict)
     gemini_manager = GeminiManager(placement_policy, chunk_manager)
     model = ZeroDDP(model, gemini_manager)
@@ -38,6 +38,7 @@ def exam_state_dict(placement_policy, model_name: str):
             assert key in zero_dict, f"{key} not in ZeRO dictionary."
             assert torch.equal(value, zero_dict[key]), f"{key} not equal."
 
+
 def run_dist(rank, world_size, port):
     config = {}
     colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
diff --git a/tests/test_zero/test_gemini/test_zerooptim_state_dict.py b/tests/test_zero/test_gemini/test_zerooptim_state_dict.py
index a8af176c5b3d..ba016d6528dc 100644
--- a/tests/test_zero/test_gemini/test_zerooptim_state_dict.py
+++ b/tests/test_zero/test_gemini/test_zerooptim_state_dict.py
@@ -27,7 +27,7 @@ def exam_zero_optim_state_dict(placement_policy, keep_gathered):
     torch_model = model_builder()    # get a different model
 
     world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
+    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
     config_dict[world_size]['chunk_size'] = 5000
     config_dict[world_size]['keep_gathered'] = keep_gathered
 

From 153b957a1b5ba728528069b678c3cd30592ca912 Mon Sep 17 00:00:00 2001
From: Wenhao Chen <cwher@outlook.com>
Date: Sun, 25 Jun 2023 17:36:21 +0800
Subject: [PATCH 03/14] [chat] refactor strategy class with booster api (#3987)

* refactor: adapt boost API in base and naive strategies

* fix: initialize plugin after setup_distributed

* fix: fix save_pretrained fn

* refactor: adapt boost API in DDPStrategy

* to: add _post_init check

* to: fix ddp backward, modify ddp dataloader and unwrap

* feat: adapt boost API in ColossalAIStrategy

* fix: call setup_distributed before using get_current_device

* fix: fix save_model and save_optimizer

* test: remove save_sharded_optimizer test

* style: apply formatter

* fix: fix stage check and add comments

* feat: allow dict type arg in strategy.prepare

* to: temporarily remove lr_scheduler for testing

* style: simplify init of ColossalAIStrategy

* fix: fix lr_scheduler in sft and rm

* style: modify comments

* test: add train_prompts tests

* fix: fix inference only case and use in train_prompts

* test: skip failed tests in ci

* style: fix CodeFactor check

* fix: do not use model.to('cpu') with GeminiPlugin

* test: enable colossalai_gemini tests

* test: set CUDA_VISIBLE_DEVICES in ci

* docs: add note
---
 .../benchmarks/benchmark_opt_lora_dummy.py    |   6 +-
 applications/Chat/coati/trainer/ppo.py        |   9 +-
 applications/Chat/coati/trainer/rm.py         |  20 +-
 applications/Chat/coati/trainer/sft.py        |  20 +-
 .../Chat/coati/trainer/strategies/base.py     | 110 ++++++-----
 .../coati/trainer/strategies/colossalai.py    | 150 +++++++--------
 .../Chat/coati/trainer/strategies/ddp.py      |  59 +++---
 .../Chat/coati/trainer/strategies/naive.py    |  42 +---
 applications/Chat/examples/test_ci.sh         | 181 +++++++++++-------
 applications/Chat/examples/train_prompts.py   |   8 +-
 .../Chat/examples/train_reward_model.py       |  10 +-
 applications/Chat/examples/train_sft.py       |  16 +-
 applications/Chat/tests/test_checkpoint.py    |  11 +-
 13 files changed, 351 insertions(+), 291 deletions(-)

diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 7a47624f74d8..dea7ebc60a8b 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -19,8 +19,10 @@
 
 def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
     numel = sum(p.numel() for p in model.parameters())
-    if isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3 and strategy.shard_init:
-        numel *= dist.get_world_size()
+    if isinstance(strategy, ColossalAIStrategy):
+        from colossalai.booster.plugin import GeminiPlugin
+        if isinstance(strategy.plugin, GeminiPlugin) and strategy.shard_init:
+            numel *= dist.get_world_size()
     return numel
 
 
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index e2e44e62533e..cfb18e2ae483 100644
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -17,7 +17,7 @@
 
 from .base import Trainer
 from .callbacks import Callback
-from .strategies import Strategy
+from .strategies import ColossalAIStrategy, Strategy
 from .utils import is_rank_0, to_device
 
 
@@ -71,6 +71,11 @@ def __init__(self,
                  offload_inference_models: bool = True,
                  callbacks: List[Callback] = [],
                  **generate_kwargs) -> None:
+        if isinstance(strategy, ColossalAIStrategy):
+            from colossalai.booster.plugin import GeminiPlugin
+            assert not (isinstance(strategy.plugin, GeminiPlugin) and offload_inference_models), \
+                "GeminiPlugin is not compatible with manual model.to('cpu')"
+
         experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef)
         replay_buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
         generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
@@ -105,6 +110,8 @@ def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experien
     def _learn(self):
         # replay buffer may be empty at first, we should rebuild at each training
         if not self.sample_replay_buffer:
+            # HACK(cwher): according to the design of the Booster API, the dataloader should also be boosted,
+            #  but it is impractical to adopt this pattern in RL training, so the dataloader is left unboosted.
             dataloader = self.strategy.setup_dataloader(self.replay_buffer, self.dataloader_pin_memory)
         if self.sample_replay_buffer:
             pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0())
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py
index cdae5108ab00..316eded7ea5d 100644
--- a/applications/Chat/coati/trainer/rm.py
+++ b/applications/Chat/coati/trainer/rm.py
@@ -1,13 +1,12 @@
 from datetime import datetime
-from typing import List, Optional
+from typing import Callable, List
 
 import pandas as pd
 import torch
-import torch.distributed as dist
-from torch.optim import Optimizer, lr_scheduler
-from torch.utils.data import DataLoader, Dataset, DistributedSampler
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 from .base import Trainer
 from .callbacks import Callback
@@ -22,7 +21,8 @@ class RewardModelTrainer(Trainer):
     Args:
         model (torch.nn.Module): the model to train
         strategy (Strategy): the strategy to use for training
-        optim(Optimizer): the optimizer to use for training
+        optim (Optimizer): the optimizer to use for training
+        lr_scheduler (_LRScheduler): the lr scheduler to use for training
         loss_fn (callable): the loss function to use for training
         train_dataloader (DataLoader): the dataloader to use for training
         valid_dataloader (DataLoader): the dataloader to use for validation
@@ -37,7 +37,8 @@ def __init__(
         model,
         strategy: Strategy,
         optim: Optimizer,
-        loss_fn,
+        lr_scheduler: _LRScheduler,
+        loss_fn: Callable,
         train_dataloader: DataLoader,
         valid_dataloader: DataLoader,
         eval_dataloader: DataLoader,
@@ -53,7 +54,7 @@ def __init__(
         self.model = model
         self.loss_fn = loss_fn
         self.optimizer = optim
-        self.scheduler = lr_scheduler.CosineAnnealingLR(self.optimizer, self.train_dataloader.__len__() // 100)
+        self.scheduler = lr_scheduler
 
     def eval_acc(self, dataloader):
         dist = 0
@@ -116,7 +117,8 @@ def fit(self):
             # eval
             dist, acc = self.eval_acc(self.eval_dataloader)
             if is_rank_0():
-                log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]], columns=['step', 'loss', 'dist', 'acc'])
+                log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]],
+                                   columns=['step', 'loss', 'dist', 'acc'])
                 log.to_csv('log.csv', mode='a', header=False, index=False)
             epoch_bar.update()
             step_bar.set_postfix({'dist': dist, 'acc': acc})
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
index 63fde53956cc..da223f1f33ff 100644
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -1,15 +1,13 @@
-import math
 import time
-from typing import List, Optional
+from typing import List
 
 import torch
 import torch.distributed as dist
 import wandb
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from transformers.trainer import get_scheduler
 
 from .base import Trainer
 from .callbacks import Callback
@@ -38,14 +36,17 @@ def __init__(
         model,
         strategy: Strategy,
         optim: Optimizer,
+        lr_scheduler: _LRScheduler,
         train_dataloader: DataLoader,
         eval_dataloader: DataLoader = None,
         max_epochs: int = 2,
         accumulation_steps: int = 8,
         callbacks: List[Callback] = [],
     ) -> None:
-        if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3:
-            raise ValueError("Accumulation steps are not supported in stage 3 of ColossalAI")
+        if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy):
+            from colossalai.booster.plugin import GeminiPlugin
+            assert not isinstance(strategy.plugin, GeminiPlugin), \
+                "Accumulation steps are not supported in stage 3 of ColossalAI"
         super().__init__(strategy, max_epochs, callbacks=callbacks)
         self.train_dataloader = train_dataloader
         self.eval_dataloader = eval_dataloader
@@ -53,13 +54,8 @@ def __init__(
         self.optimizer = optim
 
         self.accumulation_steps = accumulation_steps
-        num_update_steps_per_epoch = len(train_dataloader) // self.accumulation_steps
-        max_steps = math.ceil(self.max_epochs * num_update_steps_per_epoch)
 
-        self.scheduler = get_scheduler("cosine",
-                                       self.optimizer,
-                                       num_warmup_steps=math.ceil(max_steps * 0.03),
-                                       num_training_steps=max_steps)
+        self.scheduler = lr_scheduler
 
     def fit(self, logger, use_wandb: bool = False):
         if use_wandb:
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index 06f81f21ab26..80bc3272872e 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from contextlib import nullcontext
-from typing import Any, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -9,10 +9,12 @@
 from torch.utils.data import DataLoader
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
+from colossalai.booster import Booster
+from colossalai.booster.plugin import Plugin
+
 from .sampler import DistributedSampler
 
-ModelOptimPair = Tuple[nn.Module, Optimizer]
-ModelOrModelOptimPair = Union[nn.Module, ModelOptimPair]
+_BoostArgSpec = Union[nn.Module, Tuple[nn.Module, Optimizer], Dict]
 
 
 class Strategy(ABC):
@@ -20,30 +22,28 @@ class Strategy(ABC):
         Base class for training strategies.
     """
 
-    def __init__(self) -> None:
+    def __init__(self, plugin_initializer: Callable[..., Optional[Plugin]] = lambda: None) -> None:
         super().__init__()
+        # NOTE: dist must be initialized before Booster
         self.setup_distributed()
+        self.plugin = plugin_initializer()
+        self.booster = Booster(plugin=self.plugin)
+        self._post_init()
 
     @abstractmethod
-    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
+    def _post_init(self) -> None:
         pass
 
-    @abstractmethod
+    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
+        self.booster.backward(loss, optimizer)
+
     def optimizer_step(self, optimizer: Optimizer, **kwargs) -> None:
-        pass
+        optimizer.step()
 
     @abstractmethod
     def setup_distributed(self) -> None:
         pass
 
-    @abstractmethod
-    def setup_model(self, model: nn.Module) -> nn.Module:
-        pass
-
-    @abstractmethod
-    def setup_optimizer(self, optimizer: Optimizer, model: nn.Module) -> Optimizer:
-        pass
-
     @abstractmethod
     def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
         pass
@@ -51,12 +51,13 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False
     def model_init_context(self):
         return nullcontext()
 
-    def prepare(
-        self, *models_or_model_optim_pairs: ModelOrModelOptimPair
-    ) -> Union[List[ModelOrModelOptimPair], ModelOrModelOptimPair]:
-        """Prepare models or model-optimizer-pairs based on each strategy.
+    def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:
+        """Prepare [model | (model, optimizer) | Dict] based on each strategy.
+        NOTE: the keys of Dict must be a subset of `self.booster.boost`'s arguments.
 
         Example::
+            >>> # e.g., include lr_scheduler
+            >>> result_dict = strategy.prepare(dict(model=model, lr_scheduler=lr_scheduler))
             >>> # when fine-tuning actor and critic
             >>> (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
             >>> # or when training reward model
@@ -65,25 +66,39 @@ def prepare(
             >>> actor, critic = strategy.prepare(actor, critic)
 
         Returns:
-            Union[List[ModelOrModelOptimPair], ModelOrModelOptimPair]: Models or model-optimizer-pairs in the original order.
+            Union[List[_BoostArgSpec], _BoostArgSpec]: [model | (model, optimizer) | Dict] in the original order.
         """
 
         rets = []
-        for arg in models_or_model_optim_pairs:
-            if isinstance(arg, tuple):
-                assert len(arg) == 2, f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"'
-                model, optimizer = arg
-                model = self.setup_model(model)
-                optimizer = self.setup_optimizer(optimizer, model)
+        for arg in boost_args:
+            if isinstance(arg, nn.Module):
+                model, *_ = self.booster.boost(arg)
+                rets.append(model)
+            elif isinstance(arg, tuple):
+                try:
+                    model, optimizer = arg
+                except ValueError:
+                    raise RuntimeError(f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"')
+                model, optimizer, *_ = self.booster.boost(model=model,
+                                                          optimizer=optimizer)
                 rets.append((model, optimizer))
-            elif isinstance(arg, nn.Module):
-                rets.append(self.setup_model(model))
+            elif isinstance(arg, Dict):
+                model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg)
+                boost_result = dict(model=model,
+                                    optimizer=optimizer,
+                                    criterion=criterion,
+                                    dataloader=dataloader,
+                                    lr_scheduler=lr_scheduler)
+                # remove None values
+                boost_result = {
+                    key: value
+                    for key, value in boost_result.items() if value is not None
+                }
+                rets.append(boost_result)
             else:
-                raise RuntimeError(f'Expect model or (model, optimizer) pair, got {type(arg)}')
+                raise RuntimeError(f'Type {type(arg)} is not supported')
 
-        if len(rets) == 1:
-            return rets[0]
-        return rets
+        return rets[0] if len(rets) == 1 else rets
 
     @staticmethod
     def unwrap_model(model: nn.Module) -> nn.Module:
@@ -97,23 +112,30 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         """
         return model
 
-    @abstractmethod
-    def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None:
-        pass
+    def save_model(self,
+                   model: nn.Module,
+                   path: str,
+                   only_rank0: bool = True,
+                   **kwargs
+                   ) -> None:
+        self.booster.save_model(model, path, shard=not only_rank0, **kwargs)
 
-    @abstractmethod
-    def load_model(self, model: nn.Module, path: str, map_location: Any = None, strict: bool = True) -> None:
-        pass
+    def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
+        self.booster.load_model(model, path, strict)
 
-    @abstractmethod
-    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
-        pass
+    def save_optimizer(self,
+                       optimizer: Optimizer,
+                       path: str,
+                       only_rank0: bool = False,
+                       **kwargs
+                       ) -> None:
+        self.booster.save_optimizer(optimizer, path, shard=not only_rank0, **kwargs)
 
-    @abstractmethod
-    def load_optimizer(self, optimizer: Optimizer, path: str, map_location: Any = None) -> None:
-        pass
+    def load_optimizer(self, optimizer: Optimizer, path: str) -> None:
+        self.booster.load_optimizer(optimizer, path)
 
     def setup_sampler(self, dataset) -> DistributedSampler:
+        # FIXME(cwher): this is only invoked in train_on_ray and has not been tested since adapting to the Booster API.
         return DistributedSampler(dataset, 1, 0)
 
     @abstractmethod
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index cfdab2806a25..8c9b8ac0334b 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,24 +1,23 @@
+import functools
 import warnings
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-import torch.optim as optim
-from torch.optim import Optimizer
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 import colossalai
-from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer import CPUAdam, HybridAdam
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
+from colossalai.booster.plugin.gemini_plugin import GeminiModel
+from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
 from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext, ZeroDDP, zero_model_wrapper, zero_optim_wrapper
+from colossalai.zero import ColoInitContext
+from colossalai.zero.gemini.gemini_ddp import GeminiDDP
 
 from .ddp import DDPStrategy
 
-logger = get_dist_logger(__name__)
-
 
 class ColossalAIStrategy(DDPStrategy):
     """
@@ -62,7 +61,6 @@ def __init__(
             placement_policy: str = 'cuda',
             pin_memory: bool = True,    # only for stage 3
             force_outputs_fp32: bool = False,    # only for stage 3
-            scatter_after_inference: bool = False,    # only for stage 3
             search_range_mb: int = 32,    # only for stage 3
             hidden_dim: Optional[int] = None,    # only for stage 3
             min_chunk_size_mb: float = 32,    # only for stage 3
@@ -78,50 +76,76 @@ def __init__(
             max_scale: float = 2**32,
             max_norm: float = 0.0,
             norm_type: float = 2.0) -> None:
-        super().__init__(seed)
+
+        assert stage in (1, 2, 3), f'Unsupported stage "{stage}"'
         assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
         assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
-        self.stage = stage
+
         # TODO(ver217): support shard_init when using from_pretrained()
         if shard_init:
             warnings.warn(
-                f'Shard init is not supported model.from_pretrained() yet. Please load weights after strategy.prepare()'
+                f'Shard init is not supported model.from_pretrained() yet. '
+                'Please load weights after strategy.prepare()'
             )
         if stage == 3 and precision == 'fp32':
             warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
             precision = 'fp16'
         self.precision = precision
         self.shard_init = shard_init
-        self.gemini_config = dict(device=get_current_device(),
-                                  placement_policy=placement_policy,
-                                  pin_memory=pin_memory,
-                                  force_outputs_fp32=force_outputs_fp32,
-                                  strict_ddp_mode=shard_init,
-                                  search_range_mb=search_range_mb,
-                                  hidden_dim=hidden_dim,
-                                  min_chunk_size_mb=min_chunk_size_mb,
-                                  scatter_after_inference=scatter_after_inference)
+
+        optim_kwargs = dict(
+            initial_scale=initial_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            hysteresis=hysteresis,
+            min_scale=min_scale,
+            max_scale=max_scale,
+            max_norm=max_norm,
+            norm_type=norm_type
+        )
+        # NOTE: dist should be initialized before calling get_current_device()
         if stage == 3:
-            self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio)
+            plugin_initializer = lambda: GeminiPlugin(
+                # gemini_config
+                device=get_current_device(),
+                placement_policy=placement_policy,
+                precision=precision,
+                pin_memory=pin_memory,
+                force_outputs_fp32=force_outputs_fp32,
+                strict_ddp_mode=shard_init,
+                search_range_mb=search_range_mb,
+                hidden_dim=hidden_dim,
+                min_chunk_size_mb=min_chunk_size_mb,
+                # zero_optim_config
+                gpu_margin_mem_ratio=gpu_margin_mem_ratio,
+                # optim_config
+                **optim_kwargs
+            )
         else:
-            self.zero_optim_config = dict(reduce_bucket_size=reduce_bucket_size,
-                                          overlap_communication=overlap_communication,
-                                          cpu_offload=(placement_policy == 'cpu'))
-        self.optim_kwargs = dict(initial_scale=initial_scale,
-                                 growth_factor=growth_factor,
-                                 backoff_factor=backoff_factor,
-                                 growth_interval=growth_interval,
-                                 hysteresis=hysteresis,
-                                 min_scale=min_scale,
-                                 max_scale=max_scale,
-                                 max_norm=max_norm,
-                                 norm_type=norm_type)
+            plugin_initializer = lambda: LowLevelZeroPlugin(
+                # zero_config
+                stage=stage,
+                precision=precision,
+                # zero_optim_config
+                reduce_bucket_size_in_m=reduce_bucket_size,
+                overlap_communication=overlap_communication,
+                cpu_offload=(placement_policy == 'cpu'),
+                # optim_config
+                **optim_kwargs
+            )
+
+        super().__init__(seed, plugin_initializer)
+
+    def _post_init(self) -> None:
+        assert isinstance(self.plugin, (LowLevelZeroPlugin, GeminiPlugin)), \
+            f'{type(self).__name__}\'s plugin is not initialized properly.'
 
     def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        if self.stage == 3:
+        if isinstance(self.plugin, GeminiPlugin):
             world_size = dist.get_world_size()
             shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
             default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
@@ -131,61 +155,29 @@ def model_init_context(self):
                                    default_dist_spec=default_dist_spec)
         return super().model_init_context()
 
-    def setup_model(self, model: nn.Module) -> nn.Module:
-
-        model = zero_model_wrapper(model, zero_stage=self.stage, gemini_config=self.gemini_config)
-
-        if self.stage != 3 and self.precision == 'fp16':
-            model = model.half().cuda()
-        return model
-
-    def setup_optimizer(self, optimizer: optim.Optimizer, model: nn.Module) -> optim.Optimizer:
-        assert isinstance(optimizer, (CPUAdam, HybridAdam)), f'Unsupported optimizer {type(optimizer)}'
-        return zero_optim_wrapper(model, optimizer, optim_config=self.zero_optim_config, **self.optim_kwargs)
-
-    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimizer, **kwargs) -> None:
-        optimizer.backward(loss)
-
-    def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
-        optimizer.step()
-
-    def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None:
-        if only_rank0 and dist.get_rank() != 0 and self.stage != 3:
-            return
-        if self.stage == 3:
-            assert isinstance(model, ZeroDDP)
-            # for stage 3, state_dict() method should be called on every rank
-            state_dict = model.state_dict(only_rank_0=only_rank0)
-        else:
-            # only_rank0 is false or rank == 0
-            state_dict = model.state_dict()
-        if only_rank0 and dist.get_rank() != 0:
-            return
-        torch.save(state_dict, path)
-
-    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
-        if only_rank0:
-            raise RuntimeError(
-                f'Optimizer states are sharded when using ColossalAIStrategy. Only rank0 is not supported.')
-        torch.save(optimizer.state_dict(), path)
-
     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        if self.stage == 3:
-            assert isinstance(model, ZeroDDP)
+        if isinstance(self.plugin, GeminiPlugin):
+            assert isinstance(model, GeminiModel)
+            ddp_model = model.unwrap()
+            assert isinstance(ddp_model, GeminiDDP)
+            return ddp_model.module
+        elif isinstance(self.plugin, LowLevelZeroPlugin):
+            assert isinstance(model, LowLevelZeroModel)
             return model.module
-        return model
+        else:
+            raise RuntimeError(f'Unsupported plugin {type(self.plugin)}')
 
     def save_pretrained(self,
                         model: nn.Module,
                         path: str,
                         only_rank0: bool = True,
                         tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
-        if self.stage == 3:
+        if isinstance(self.plugin, GeminiPlugin):
             raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now')
         super().save_pretrained(model, path, only_rank0, tokenizer)
 
     def get_model_state_dict_shard(self, model: nn.Module, **config):
-        if self.stage != 3:
+        if not isinstance(self.plugin, GeminiPlugin):
             yield from super().get_model_state_dict_shard(model, **config)
         else:
             # unwrapped_model = self._unwrap_model(model)
@@ -193,5 +185,5 @@ def get_model_state_dict_shard(self, model: nn.Module, **config):
             #     if isinstance(module, LoraLinear):
             #         module.merge_weights = True
             #         module.eval()
-            assert isinstance(model, ZeroDDP)
+            assert isinstance(model, LowLevelZeroModel)
             yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 713d7b90c6f0..42867645290c 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -1,17 +1,18 @@
-import os
 import random
-from typing import Optional
+from typing import Callable, Optional
 
 import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from coati.replay_buffer import ReplayBuffer
-from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
+from colossalai.booster.plugin import TorchDDPPlugin
+from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel
+
 from .naive import NaiveStrategy
 from .sampler import DistributedSampler
 
@@ -21,9 +22,16 @@ class DDPStrategy(NaiveStrategy):
         Strategy for distributed training using torch.distributed.
     """
 
-    def __init__(self, seed: int = 42) -> None:
+    def __init__(self,
+                 seed: int = 42,
+                 plugin_initializer: Callable = TorchDDPPlugin
+                 ) -> None:
         self.seed = seed
-        super().__init__()
+        super().__init__(plugin_initializer)
+
+    def _post_init(self) -> None:
+        assert isinstance(self.plugin, TorchDDPPlugin), \
+            f'{type(self).__name__}\'s plugin is not initialized properly.'
 
     def setup_distributed(self) -> None:
         self._try_init_dist(force=True)
@@ -34,43 +42,24 @@ def set_seed(self, seed: int) -> None:
         np.random.seed(seed)
         torch.manual_seed(seed)
 
-    def setup_model(self, model: nn.Module) -> nn.Module:
-        device = torch.cuda.current_device()
-        return DDP(model, device_ids=[device])
+    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
+        self.booster.backward(loss, optimizer)
 
     def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
-        # DDP only mode, replay buffers on each rank are different.
-        # sampler = DistributedSampler(replay_buffer,
-        #                              num_replicas=dist.get_world_size(),
-        #                              rank=dist.get_rank(),
-        #                              shuffle=True,
-        #                              seed=self.seed,
-        #                              drop_last=True)
-        return DataLoader(
-            replay_buffer,
-            batch_size=replay_buffer.sample_batch_size,
-        #   sampler=sampler,
-            shuffle=True,
-            drop_last=True,
-            pin_memory=pin_memory,
-            collate_fn=replay_buffer.collate_fn)
-
-    def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None:
-        if only_rank0 and dist.get_rank() != 0:
-            return
-        super().save_model(model, path, only_rank0)
-
-    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
-        if only_rank0 and dist.get_rank() != 0:
-            return
-        super().save_optimizer(optimizer, path, only_rank0)
+        return self.plugin.prepare_dataloader(replay_buffer,
+                                              batch_size=replay_buffer.sample_batch_size,
+                                              shuffle=True,
+                                              drop_last=True,
+                                              pin_memory=pin_memory,
+                                              collate_fn=replay_buffer.collate_fn)
 
     def setup_sampler(self, dataset) -> DistributedSampler:
+        # FIXME(cwher): this is only invoked in train_on_ray and has not been tested since adapting to the Booster API.
         return DistributedSampler(dataset, dist.get_world_size(), dist.get_rank())
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        assert isinstance(model, DDP)
-        return model.module
+        assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
+        return model.unwrap()
 
     def save_pretrained(self,
                         model: nn.Module,
diff --git a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py
index 202c480e06d9..d121237a68ea 100644
--- a/applications/Chat/coati/trainer/strategies/naive.py
+++ b/applications/Chat/coati/trainer/strategies/naive.py
@@ -1,16 +1,10 @@
 import os
-import sys
 from collections import OrderedDict
-from typing import Any, Dict, Optional
+from typing import Optional
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-import torch.optim as optim
-from coati.models.base import get_base_model
-from coati.replay_buffer import ReplayBuffer
-from coati.models.base import RewardModel
-from coati.models.lora import LoraLinear
 from coati.replay_buffer import ReplayBuffer
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
@@ -34,20 +28,18 @@ class NaiveStrategy(Strategy):
         Strategy for single GPU. No parallelism is used.
     """
 
-    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: optim.Optimizer, **kwargs) -> None:
-        loss.backward()
-
-    def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:
-        optimizer.step()
+    def _post_init(self) -> None:
+        assert self.plugin is None, \
+            f'{type(self).__name__}\'s plugin is not initialized properly.'
 
     def setup_distributed(self) -> None:
         self._try_init_dist(force=False)
 
-    def setup_model(self, model: nn.Module) -> nn.Module:
-        return model
-
-    def setup_optimizer(self, optimizer: optim.Optimizer, model: nn.Module) -> optim.Optimizer:
-        return optimizer
+    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
+        # HACK: self.booster.backward(loss, optimizer) does not work when plugin is None:
+        #  it would call `optimizer.backward(loss)`, which is not compatible with torch.optim.Optimizer
+        assert self.plugin is None, "DO NOT call this method if plugin is not None"
+        loss.backward()
 
     def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
         return DataLoader(replay_buffer,
@@ -57,22 +49,6 @@ def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False
                           pin_memory=pin_memory,
                           collate_fn=replay_buffer.collate_fn)
 
-    def save_model(self, model: nn.Module, path: str, only_rank0: bool = True) -> None:
-        state_dict = model.state_dict()
-        torch.save(state_dict, path)
-
-    def load_model(self, model: nn.Module, path: str, map_location: Any = None, strict: bool = True) -> None:
-        unwrapped_model = self.unwrap_model(model)
-        state_dict = torch.load(path, map_location=map_location)
-        unwrapped_model.load_state_dict(state_dict, strict=strict)
-
-    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
-        torch.save(optimizer.state_dict(), path)
-
-    def load_optimizer(self, optimizer: Optimizer, path: str, map_location: Any = None) -> None:
-        state_dict = torch.load(path, map_location=map_location)
-        optimizer.load_state_dict(state_dict)
-
     def save_pretrained(self,
                         model: nn.Module,
                         path: str,
diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh
index ac3a9b507864..85728e95820c 100755
--- a/applications/Chat/examples/test_ci.sh
+++ b/applications/Chat/examples/test_ci.sh
@@ -1,5 +1,22 @@
 #!/usr/bin/env bash
 
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
 set -xue
 
 if [ -z "$SFT_DATASET" ]; then
@@ -26,109 +43,137 @@ pip install -r ${BASE}/requirements.txt
 
 wandb init -m offline
 
+# FIXME: This is a hack to skip tests that are not working (tested at commit b3ab7fbabf)
+#  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
+#  - llama-*: Repository Not Found for url: https://huggingface.co/{...}/resolve/main/tokenizer.model.
+#  - roberta-*: RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
+SKIPPED_TESTS=(
+    "gpt2-ddp"
+    "llama-naive" "llama-ddp" "llama-colossalai_gemini" "llama-colossalai_zero2"
+    "roberta-naive" "roberta-ddp" "roberta-colossalai_gemini" "roberta-colossalai_zero2"
+)
+
+# These tests are quick and do not have any dependencies
+for model in 'gpt2' 'bloom' 'opt' 'llama' 'roberta'; do
+    for strategy in 'naive' 'ddp' 'colossalai_gemini' 'colossalai_zero2'; do
+        if [[ " ${SKIPPED_TESTS[*]} " =~ " ${model}-${strategy} " ]]; then
+            echo "[Test]: Skipped $model-$strategy"
+            continue
+        fi
+        torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
+            --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+            --strategy $strategy --model $model \
+            --num_episodes 1 --max_timesteps 2 \
+            --update_timesteps 2 --max_epochs 1 --train_batch_size 2
+    done
+done
+
 # train sft
 torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'bigscience/bloom-560m' \
-        --model 'bloom' --strategy colossalai_zero2 --lora_rank 4\
-        --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
-        --save_path ${BASE}/output
+    --model 'bloom' --strategy colossalai_zero2 --lora_rank 4 \
+    --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
+    --save_path ${BASE}/output
 rm -rf ${BASE}/output
 
 torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'gpt2' \
-        --model 'gpt2' --strategy colossalai_zero2 \
-        --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
-        --save_path ${BASE}/output
+    --model 'gpt2' --strategy colossalai_zero2 \
+    --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
+    --save_path ${BASE}/output
 rm -rf ${BASE}/output
 
 torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'facebook/opt-350m' \
-        --model 'opt' --strategy colossalai_zero2 --lora_rank 4\
-        --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
-        --save_path ${BASE}/output
+    --model 'opt' --strategy colossalai_zero2 --lora_rank 4 \
+    --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
+    --save_path ${BASE}/output
 rm -rf ${BASE}/output
 
 torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'gpt2' \
-        --model 'gpt2' --strategy ddp --lora_rank 4\
-        --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
-        --save_path ${BASE}/output
+    --model 'gpt2' --strategy ddp --lora_rank 4 \
+    --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
+    --save_path ${BASE}/output
 
-#torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'facebook/opt-350m' \
-#        --model 'opt' --strategy naive \
-#        --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
-#        --save_path ${BASE}/output
+# torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'facebook/opt-350m' \
+#     --model 'opt' --strategy naive \
+#     --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
+#     --save_path ${BASE}/output
 
 rm -rf ${BASE}/output
 
 # train rm
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
-                            --pretrain 'facebook/opt-350m' --model 'opt' \
-                            --strategy colossalai_zero2 --loss_fn 'log_sig'\
-                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
-                            --test True --lora_rank 0 \
-                            --save_path ${BASE}/rm_ckpt_opt.pt
+    --pretrain 'facebook/opt-350m' --model 'opt' \
+    --strategy colossalai_zero2 --loss_fn 'log_sig' \
+    --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
+    --test True --lora_rank 0 \
+    --save_path ${BASE}/rm_ckpt_opt.pt
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
-                            --pretrain 'gpt2' --model 'gpt2' \
-                            --strategy colossalai_zero2 --loss_fn 'log_exp' \
-                            --dataset 'Dahoas/rm-static' \
-                            --test True  --lora_rank 0 \
-                            --save_path ${BASE}/rm_ckpt_gpt.pt
+    --pretrain 'gpt2' --model 'gpt2' \
+    --strategy colossalai_zero2 --loss_fn 'log_exp' \
+    --dataset 'Dahoas/rm-static' \
+    --test True --lora_rank 0 \
+    --save_path ${BASE}/rm_ckpt_gpt.pt
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
-                            --pretrain 'gpt2' --model 'gpt2' \
-                            --strategy ddp --loss_fn 'log_exp' \
-                            --dataset 'Dahoas/rm-static' \
-                            --test True --lora_rank 4 \
-                            --save_path ${BASE}/rm_ckpt.pt
+    --pretrain 'gpt2' --model 'gpt2' \
+    --strategy ddp --loss_fn 'log_exp' \
+    --dataset 'Dahoas/rm-static' \
+    --test True --lora_rank 4 \
+    --save_path ${BASE}/rm_ckpt.pt
 rm -rf ${BASE}/rm_ckpt.pt
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
-                            --pretrain 'bigscience/bloom-560m' --model 'bloom' \
-                            --strategy colossalai_zero2 --loss_fn 'log_sig' \
-                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
-                            --test True --lora_rank 4 \
-                            --save_path ${BASE}/rm_ckpt.pt
+    --pretrain 'bigscience/bloom-560m' --model 'bloom' \
+    --strategy colossalai_zero2 --loss_fn 'log_sig' \
+    --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
+    --test True --lora_rank 4 \
+    --save_path ${BASE}/rm_ckpt.pt
 rm -rf ${BASE}/rm_ckpt.pt
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
-                            --pretrain 'microsoft/deberta-v3-large' --model 'deberta' \
-                            --strategy colossalai_zero2 --loss_fn 'log_sig' \
-                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
-                            --test True --lora_rank 4 \
-                            --save_path ${BASE}/rm_ckpt.pt
+    --pretrain 'microsoft/deberta-v3-large' --model 'deberta' \
+    --strategy colossalai_zero2 --loss_fn 'log_sig' \
+    --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
+    --test True --lora_rank 4 \
+    --save_path ${BASE}/rm_ckpt.pt
 rm -rf ${BASE}/rm_ckpt.pt
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
-                            --pretrain 'roberta-base' --model 'roberta' \
-                            --strategy colossalai_zero2 --loss_fn 'log_exp'\
-                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base'\
-                            --test True --lora_rank 4 \
-                            --save_path ${BASE}/rm_ckpt.pt
+    --pretrain 'roberta-base' --model 'roberta' \
+    --strategy colossalai_zero2 --loss_fn 'log_exp' \
+    --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
+    --test True --lora_rank 4 \
+    --save_path ${BASE}/rm_ckpt.pt
 
 rm -rf ${BASE}/rm_ckpt.pt
 
-torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
-        --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
-        --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
-        --pretrain 'facebook/opt-350m' --model opt \
-        --rm_pretrain 'facebook/opt-350m' \
-        --rm_path ${BASE}/rm_ckpt_opt.pt \
-        --save_path ${BASE}/actor_checkpoint_prompts.pt
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
+    --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+    --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
+    --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+    --pretrain 'facebook/opt-350m' --model opt \
+    --rm_pretrain 'facebook/opt-350m' \
+    --rm_path ${BASE}/rm_ckpt_opt.pt \
+    --save_path ${BASE}/actor_checkpoint_prompts.pt
 rm -rf ${BASE}/rm_ckpt_opt.pt
 
-torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
-         --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
-         --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
-         --pretrain 'gpt2' --model gpt2 \
-         --rm_pretrain 'gpt2' \
-         --rm_path ${BASE}/rm_ckpt_gpt.pt \
-         --save_path ${BASE}/actor_checkpoint_prompts.pt
-
-torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
-         --strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
-         --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
-         --pretrain 'gpt2' --model gpt2 \
-         --rm_pretrain 'gpt2' \
-         --rm_path ${BASE}/rm_ckpt_gpt.pt \
-         --save_path ${BASE}/actor_checkpoint_prompts.pt
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
+    --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+    --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
+    --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+    --pretrain 'gpt2' --model gpt2 \
+    --rm_pretrain 'gpt2' \
+    --rm_path ${BASE}/rm_ckpt_gpt.pt \
+    --save_path ${BASE}/actor_checkpoint_prompts.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
+    --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+    --strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
+    --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+    --pretrain 'gpt2' --model gpt2 \
+    --rm_pretrain 'gpt2' \
+    --rm_path ${BASE}/rm_ckpt_gpt.pt \
+    --save_path ${BASE}/actor_checkpoint_prompts.pt
 rm -rf ${BASE}/rm_ckpt_gpt.pt
 
 rm -rf ${BASE}/actor_checkpoint_prompts.pt
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 134f21f80ef1..2a47dda637bb 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -1,6 +1,5 @@
 import argparse
 
-import pandas as pd
 import torch
 import torch.distributed as dist
 from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
@@ -51,7 +50,7 @@ def main(args):
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')
 
-        if args.rm_model == None:
+        if args.rm_model is None:
             rm_model_name = args.model
         else:
             rm_model_name = args.rm_model
@@ -163,7 +162,9 @@ def main(args):
                                      batch_size=args.ptx_batch_size,
                                      collate_fn=data_collator)
 
-    (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
+    # NOTE: For small models like opt-1.3b, reward model and initial model are not required to be parallelized.
+    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = \
+        strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
 
     # configure trainer
     trainer = PPOTrainer(
@@ -185,6 +186,7 @@ def main(args):
         top_k=50,
         pad_token_id=tokenizer.pad_token_id,
         eos_token_id=tokenizer.eos_token_id,
+        offload_inference_models=args.strategy != 'colossalai_gemini'
     )
 
     trainer.fit(prompt_dataloader=prompt_dataloader,
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 48b12336fa67..2df3bc391b9b 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -18,6 +18,7 @@
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
 from torch.optim import Adam
+from torch.optim.lr_scheduler import CosineAnnealingLR
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 from transformers import AutoTokenizer, BloomTokenizerFast, DebertaV2Tokenizer, LlamaTokenizer, RobertaTokenizer
@@ -165,10 +166,17 @@ def train(args):
                                  batch_size=args.batch_size,
                                  pin_memory=True)
 
-    (model, optim) = strategy.prepare((model, optim))
+    lr_scheduler = CosineAnnealingLR(optim, train_dataloader.__len__() // 100)
+    strategy_dict = strategy.prepare(
+        dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler)
+    )
+    model = strategy_dict['model']
+    optim = strategy_dict['optimizer']
+    lr_scheduler = strategy_dict['lr_scheduler']
     trainer = RewardModelTrainer(model=model,
                                  strategy=strategy,
                                  optim=optim,
+                                 lr_scheduler=lr_scheduler,
                                  loss_fn=loss_fn,
                                  train_dataloader=train_dataloader,
                                  valid_dataloader=valid_dataloader,
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index 7fcd026fb538..717eb95311fb 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -1,4 +1,5 @@
 import argparse
+import math
 import os
 
 import loralib as lora
@@ -19,6 +20,7 @@
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 from transformers.models.opt.configuration_opt import OPTConfig
 from transformers.models.opt.modeling_opt import OPTForCausalLM
+from transformers.trainer import get_scheduler
 
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
@@ -152,10 +154,22 @@ def train(args):
     else:
         eval_dataloader = None
 
-    (model, optim) = strategy.prepare((model, optim))
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    max_steps = math.ceil(args.max_epochs * num_update_steps_per_epoch)
+    lr_scheduler = get_scheduler("cosine",
+                                 optim,
+                                 num_warmup_steps=math.ceil(max_steps * 0.03),
+                                 num_training_steps=max_steps)
+    strategy_dict = strategy.prepare(
+        dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler)
+    )
+    model = strategy_dict['model']
+    optim = strategy_dict['optimizer']
+    lr_scheduler = strategy_dict['lr_scheduler']
     trainer = SFTTrainer(model=model,
                          strategy=strategy,
                          optim=optim,
+                         lr_scheduler=lr_scheduler,
                          train_dataloader=train_dataloader,
                          eval_dataloader=eval_dataloader,
                          max_epochs=args.max_epochs,
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index d93a5c94d8ea..cfa39e44b476 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -60,10 +60,15 @@ def run_step():
         rank0_dirname = rank0_dirname[0]
 
         model_path = os.path.join(rank0_dirname, 'model.pt')
-        optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')
-
         strategy.save_model(actor, model_path, only_rank0=True)
-        strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)
+
+        optim_path = os.path.join(rank0_dirname, f'optim.pt')
+        strategy.save_optimizer(actor_optim, optim_path, only_rank0=True)
+
+        # FIXME(cwher): Sharded optimizer checkpoint is not supported yet.
+        #  at "ColossalAI/colossalai/checkpoint_io/general_checkpoint_io.py", line 62
+        # optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')
+        # strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)
 
         dist.barrier()
 

From e89b127d8ec9c14fc34ff9a1208b630069eb026f Mon Sep 17 00:00:00 2001
From: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Date: Mon, 26 Jun 2023 15:26:07 +0800
Subject: [PATCH 04/14] [chat]: fix chat evaluation possible bug (#4064)

* fix chat eval

* fix utils

* fix utils

* add comment

---------

Co-authored-by: Qianran Ma <qianranm@luchentech.com>
---
 applications/Chat/evaluate/metrics.py           |  4 ++--
 applications/Chat/evaluate/unieval/evaluator.py |  3 ++-
 applications/Chat/evaluate/utils.py             | 13 +------------
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/applications/Chat/evaluate/metrics.py b/applications/Chat/evaluate/metrics.py
index e220226ec041..77f9b6e98044 100644
--- a/applications/Chat/evaluate/metrics.py
+++ b/applications/Chat/evaluate/metrics.py
@@ -141,8 +141,8 @@ def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
             count_segs = len(pred_seg_list)
             unique_segs = set(pred_seg_list)
             count_unique_chars = len(unique_segs)
-
-            cumulative_distinct.append(count_unique_chars / count_segs)
+            # prevent denominator from being 0
+            cumulative_distinct.append(count_unique_chars / (count_segs + 1e-6))
         elif language == "en":
             # calculate distinct 1-gram, 2-gram, 3-gram
             unique_ngram = [set() for _ in range(0, 3)]
diff --git a/applications/Chat/evaluate/unieval/evaluator.py b/applications/Chat/evaluate/unieval/evaluator.py
index d7f2f87f8c52..56cc6d2f9e41 100644
--- a/applications/Chat/evaluate/unieval/evaluator.py
+++ b/applications/Chat/evaluate/unieval/evaluator.py
@@ -80,7 +80,8 @@ def evaluate(self, data, category, dims=None, overall=True):
                 start_idx = 0
                 score = []
                 for cur_n_sent in n_sents:
-                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
+                    # prevent denominator from being 0
+                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
                     start_idx += cur_n_sent
 
             # Calculate summary-level score for 'coherence' and 'relevance'
diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py
index fefe25f5e764..406e43db99aa 100644
--- a/applications/Chat/evaluate/utils.py
+++ b/applications/Chat/evaluate/utils.py
@@ -72,17 +72,6 @@ def get_data_per_category(data, categories):
     return data_per_category
 
 
-def remove_articles(text: str) -> str:
-    """
-    Remove articles "a, an, the" in the given text.
-    It is used in evaluation of automatic metrics.
-
-    """
-
-    pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
-    return re.sub(pattern, " ", text)
-
-
 def remove_punctuations(text: str) -> str:
     """
     Remove punctuations in the given text.
@@ -121,7 +110,7 @@ def preprocessing_text(text: str) -> str:
 
     """
 
-    return remove_redundant_space(remove_articles(remove_punctuations(text.lower())))
+    return remove_redundant_space(remove_punctuations(text.lower()))
 
 
 def save_automatic_results(model_name: str, automatic_metric_stats: Dict[str, Dict], save_path: str) -> None:

From 4da324cd609427ef9825aa16f856d04bc10e56db Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <eddiezhang@pku.edu.cn>
Date: Mon, 26 Jun 2023 23:50:04 +0800
Subject: [PATCH 05/14] [hotfix]fix argument naming in docs and examples
 (#4083)

---
 .../coati/trainer/strategies/colossalai.py    | 56 +++++++++----------
 ...parallelize_your_training_like_Megatron.md |  4 +-
 docs/source/en/features/zero_with_chunk.md    |  6 +-
 ...parallelize_your_training_like_Megatron.md |  5 +-
 .../zh-Hans/features/zero_with_chunk.md       |  6 +-
 .../roberta/pretraining/run_pretraining.py    |  2 +-
 examples/community/roberta/test_ci.sh         |  0
 .../language/gpt/gemini/train_gpt_demo.py     |  2 +-
 8 files changed, 40 insertions(+), 41 deletions(-)
 create mode 100644 examples/community/roberta/test_ci.sh

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 8c9b8ac0334b..f31551f22318 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -34,9 +34,9 @@ class ColossalAIStrategy(DDPStrategy):
                           If it is “cuda”, they will not be offloaded, which means max CUDA memory will be used. It is the fastest.
         pin_memory(bool): Whether to pin the memory for the data loader. Only for ZeRO-3.
         force_outputs_fp32(bool): Whether to force the outputs to be fp32. Only for ZeRO-3.
-        search_range_mb(int): The search range in MB for the chunk size. Only for ZeRO-3.
+        search_range_m(int): The search range for the chunk size, divided by 2^20. Only for ZeRO-3.
         hidden_dim(optional, int): The hidden dimension for the gemini. Only for ZeRO-3.
-        min_chunk_size_mb(float): The minimum chunk size in MB. Only for ZeRO-3.
+        min_chunk_size_m(float): The minimum chunk size divided by 2^20. Only for ZeRO-3.
         gpu_margin_mem_ratio(float): The margin memory ratio for the GPU. Only for ZeRO-3.
         reduce_bucket_size(int): The reduce bucket size in bytes. Only for ZeRO-1 and ZeRO-2.
         overlap_communication(bool): Whether to overlap communication and computation. Only for ZeRO-1 and ZeRO-2.
@@ -61,9 +61,9 @@ def __init__(
             placement_policy: str = 'cuda',
             pin_memory: bool = True,    # only for stage 3
             force_outputs_fp32: bool = False,    # only for stage 3
-            search_range_mb: int = 32,    # only for stage 3
+            search_range_m: int = 32,    # only for stage 3
             hidden_dim: Optional[int] = None,    # only for stage 3
-            min_chunk_size_mb: float = 32,    # only for stage 3
+            min_chunk_size_m: float = 32,    # only for stage 3
             gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
             reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
             overlap_communication: bool = True,    # only for stage 1&2
@@ -83,57 +83,51 @@ def __init__(
 
         # TODO(ver217): support shard_init when using from_pretrained()
         if shard_init:
-            warnings.warn(
-                f'Shard init is not supported model.from_pretrained() yet. '
-                'Please load weights after strategy.prepare()'
-            )
+            warnings.warn(f'Shard init is not supported model.from_pretrained() yet. '
+                          'Please load weights after strategy.prepare()')
         if stage == 3 and precision == 'fp32':
             warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
             precision = 'fp16'
         self.precision = precision
         self.shard_init = shard_init
 
-        optim_kwargs = dict(
-            initial_scale=initial_scale,
-            growth_factor=growth_factor,
-            backoff_factor=backoff_factor,
-            growth_interval=growth_interval,
-            hysteresis=hysteresis,
-            min_scale=min_scale,
-            max_scale=max_scale,
-            max_norm=max_norm,
-            norm_type=norm_type
-        )
+        optim_kwargs = dict(initial_scale=initial_scale,
+                            growth_factor=growth_factor,
+                            backoff_factor=backoff_factor,
+                            growth_interval=growth_interval,
+                            hysteresis=hysteresis,
+                            min_scale=min_scale,
+                            max_scale=max_scale,
+                            max_norm=max_norm,
+                            norm_type=norm_type)
         # NOTE: dist should be initialized before calling get_current_device()
         if stage == 3:
             plugin_initializer = lambda: GeminiPlugin(
-                # gemini_config
+            # gemini_config
                 device=get_current_device(),
                 placement_policy=placement_policy,
                 precision=precision,
                 pin_memory=pin_memory,
                 force_outputs_fp32=force_outputs_fp32,
                 strict_ddp_mode=shard_init,
-                search_range_mb=search_range_mb,
+                search_range_m=search_range_m,
                 hidden_dim=hidden_dim,
-                min_chunk_size_mb=min_chunk_size_mb,
-                # zero_optim_config
+                min_chunk_size_m=min_chunk_size_m,
+            # zero_optim_config
                 gpu_margin_mem_ratio=gpu_margin_mem_ratio,
-                # optim_config
-                **optim_kwargs
-            )
+            # optim_config
+                **optim_kwargs)
         else:
             plugin_initializer = lambda: LowLevelZeroPlugin(
-                # zero_config
+            # zero_config
                 stage=stage,
                 precision=precision,
-                # zero_optim_config
+            # zero_optim_config
                 reduce_bucket_size_in_m=reduce_bucket_size,
                 overlap_communication=overlap_communication,
                 cpu_offload=(placement_policy == 'cpu'),
-                # optim_config
-                **optim_kwargs
-            )
+            # optim_config
+                **optim_kwargs)
 
         super().__init__(seed, plugin_initializer)
 
diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
index 978ac32fc78e..281fd47554ca 100644
--- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -181,7 +181,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy:
                         device=get_current_device(),
                         placement_policy=placement_policy,
                         pin_memory=True,
-                        search_range_mb=32)
+                        search_range_m=32)
     return model
 ```
 
@@ -190,3 +190,5 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy:
 The above optimization we made allows us to pretrain the GPT-2 model on a single GPU. We only need to set the parameter `GPUNUM`=1 in `run.sh`, and then we can complete the model training on a single GPU when running the file.
 
 The GPT-2 example is accessible at [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt).
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 parallelize_your_training_like_Megatron.py  -->
diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md
index 1b27d64b6897..b50d2d02217b 100644
--- a/docs/source/en/features/zero_with_chunk.md
+++ b/docs/source/en/features/zero_with_chunk.md
@@ -67,12 +67,12 @@ Define the model parameters as follows:
 chunk_manager = init_chunk_manager(model=module,
                                            init_device=device,
                                            hidden_dim=hidden_dim,
-                                           search_range_mb=search_range_mb,
-                                           min_chunk_size_mb=min_chunk_size_mb)
+                                           search_range_m=search_range_m,
+                                           min_chunk_size_m=min_chunk_size_m)
 gemini_manager = GeminiManager(placement_policy, chunk_manager)
 ```
 
-`hidden_dim` is the hidden dimension of DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. `min_chunk_size_mb` is the the minimum chunk size in MegaByte. If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk.
+`hidden_dim` is the hidden dimension of the DNN. Users can provide this argument to speed up searching. If users do not know this value before training, that is fine: a default value of 1024 is used. `min_chunk_size_m` is a floating-point number giving the minimum chunk size divided by 2^20 (e.g., if `min_chunk_size_m=2.5`, the minimum chunk size is 2.5*(2^20)). If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk.
 
 Initialization of the optimizer.
 ```python
diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
index b4e0d18a2647..3f85d50454ae 100644
--- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -165,7 +165,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy:
                         device=get_current_device(),
                         placement_policy=placement_policy,
                         pin_memory=True,
-                        search_range_mb=32)
+                        search_range_m=32)
     return model
 ```
 
@@ -174,3 +174,6 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy:
 我们做的上述优化让我们可以在单GPU上训练GPT-2模型，只需要将`run.sh`中设置参数`GPUNUM`=1，再运行文件时就可以在单个GPU上完成模型的训练。
 
 GPT-2 示例在[Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt). 获得。
+
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 parallelize_your_training_like_Megatron.py  -->
diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md
index 9fe5601bbd1b..513850f5cab7 100644
--- a/docs/source/zh-Hans/features/zero_with_chunk.md
+++ b/docs/source/zh-Hans/features/zero_with_chunk.md
@@ -66,13 +66,13 @@ with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_
 chunk_manager = init_chunk_manager(model=module,
                                    init_device=device,
                                    hidden_dim=hidden_dim,
-                                   search_range_mb=search_range_mb,
-                                   min_chunk_size_mb=min_chunk_size_mb)
+                                   search_range_m=search_range_m,
+                                   min_chunk_size_m=min_chunk_size_m)
 gemini_manager = GeminiManager(placement_policy, chunk_manager)
 model = ZeroDDP(model, gemini_manager)
 ```
 
-`hidden dim`是DNN的隐藏维度。用户可以提供这个参数来加快搜索速度。如果用户在训练前不知道这个参数也可以。 我们将使用默认值 1024。`min_chunk_size_mb`是以兆字节为单位的最小块大小。如果参数的总大小仍然小于最小块大小，则所有参数将被压缩为一个小块。
+`hidden dim`是DNN的隐藏维度。用户可以提供这个参数来加快搜索速度。如果用户在训练前不知道这个参数也可以。 我们将使用默认值 1024。`min_chunk_size_m`是以兆（2^20）为单位的最小块大小。如果参数的总大小仍然小于最小块大小，则所有参数将被压缩为一个小块。
 
 初始化优化器。
 ```python
diff --git a/examples/community/roberta/pretraining/run_pretraining.py b/examples/community/roberta/pretraining/run_pretraining.py
index a72bdf775644..9fae4bef227a 100644
--- a/examples/community/roberta/pretraining/run_pretraining.py
+++ b/examples/community/roberta/pretraining/run_pretraining.py
@@ -88,7 +88,7 @@ def main():
                                  placement_policy=args.placement,
                                  pin_memory=True,
                                  hidden_dim=model.config.hidden_size,
-                                 search_range_mb=128)
+                                 search_range_m=128)
             optim_config = dict(gpu_margin_mem_ratio=0.)
         else:
             raise RuntimeError
diff --git a/examples/community/roberta/test_ci.sh b/examples/community/roberta/test_ci.sh
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index a7b552c9e23d..9e61779a1dbf 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -258,7 +258,7 @@ def main():
                                   placement_policy=args.placement,
                                   pin_memory=True,
                                   strict_ddp_mode=args.tp_degree == 1,
-                                  search_range_mb=128,
+                                  search_range_m=128,
                                   hidden_dim=model.config.n_embd,
                                   gpu_margin_mem_ratio=0.)
         else:

From 95e95b6d588d27aeba8d62e6656392f771b69aee Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Tue, 27 Jun 2023 11:02:25 +0800
Subject: [PATCH 06/14] [testing] move pytest to be inside the function (#4087)

---
 colossalai/testing/pytest_wrapper.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/colossalai/testing/pytest_wrapper.py b/colossalai/testing/pytest_wrapper.py
index b264b009028a..6a80e1dcc548 100644
--- a/colossalai/testing/pytest_wrapper.py
+++ b/colossalai/testing/pytest_wrapper.py
@@ -1,10 +1,9 @@
 """
 This file will not be automatically imported by `colossalai.testing`
-as this file has a dependency on `pytest`. Therefore, you need to 
+as this file has a dependency on `pytest`. Therefore, you need to
 explicitly import this file `from colossalai.testing.pytest_wrapper import <func>`.from
 """
 
-import pytest
 import os
 
 
@@ -30,6 +29,12 @@ def test_for_something():
         pytest test_for_something.py
 
     """
+    try:
+        import pytest
+    except ImportError:
+        raise ImportError(
+            'This function requires `pytest` to be installed, please do `pip install pytest` and try again.')
+
     assert isinstance(name, str)
     flag = os.environ.get(name.upper(), '0')
 

From 31dc302017ff491a36088dd27ed4c76e11d5b5b7 Mon Sep 17 00:00:00 2001
From: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Date: Tue, 27 Jun 2023 16:40:46 +0800
Subject: [PATCH 07/14] [examples] copy resnet example to image (#4090)

* copy resnet example

* add pytest package

* skip test_ci

* skip test_ci

* skip test_ci
---
 examples/images/resnet/.gitignore       |   4 +
 examples/images/resnet/README.md        |  56 +++++++
 examples/images/resnet/eval.py          |  48 ++++++
 examples/images/resnet/requirements.txt |   5 +
 examples/images/resnet/test_ci.sh       |  12 ++
 examples/images/resnet/train.py         | 204 ++++++++++++++++++++++++
 6 files changed, 329 insertions(+)
 create mode 100644 examples/images/resnet/.gitignore
 create mode 100644 examples/images/resnet/README.md
 create mode 100644 examples/images/resnet/eval.py
 create mode 100644 examples/images/resnet/requirements.txt
 create mode 100755 examples/images/resnet/test_ci.sh
 create mode 100644 examples/images/resnet/train.py

diff --git a/examples/images/resnet/.gitignore b/examples/images/resnet/.gitignore
new file mode 100644
index 000000000000..a79cf5236c08
--- /dev/null
+++ b/examples/images/resnet/.gitignore
@@ -0,0 +1,4 @@
+data
+checkpoint
+ckpt-fp16
+ckpt-fp32
diff --git a/examples/images/resnet/README.md b/examples/images/resnet/README.md
new file mode 100644
index 000000000000..c69828637269
--- /dev/null
+++ b/examples/images/resnet/README.md
@@ -0,0 +1,56 @@
+# Train ResNet on CIFAR-10 from scratch
+
+## 🚀 Quick Start
+
+This example provides a training script and an evaluation script. The training script provides an example of training ResNet on CIFAR10 dataset from scratch.
+
+- Training Arguments
+  - `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `low_level_zero`. Defaults to `torch_ddp`.
+  - `-r`, `--resume`: Epoch number whose checkpoint to resume from. Defaults to `-1`, which means not resuming.
+  - `-c`, `--checkpoint`: The folder to save checkpoints. Defaults to `./checkpoint`.
+  - `-i`, `--interval`: Epoch interval to save checkpoints. Defaults to `5`. If set to `0`, no checkpoint will be saved.
+  - `--target_acc`: Target accuracy. Raise exception if not reached. Defaults to `None`.
+
+- Eval Arguments
+  - `-e`, `--epoch`: select the epoch to evaluate
+  - `-c`, `--checkpoint`: the folder where checkpoints are found
+
+### Install requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### Train
+The folders will be created automatically.
+```bash
+# train with torch DDP with fp32
+colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp32
+
+# train with torch DDP with mixed precision training
+colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp16 -p torch_ddp_fp16
+
+# train with low level zero
+colossalai run --nproc_per_node 2 train.py -c ./ckpt-low_level_zero -p low_level_zero
+```
+
+### Eval
+
+```bash
+# evaluate fp32 training
+python eval.py -c ./ckpt-fp32 -e 80
+
+# evaluate fp16 mixed precision training
+python eval.py -c ./ckpt-fp16 -e 80
+
+# evaluate low level zero training
+python eval.py -c ./ckpt-low_level_zero -e 80
+```
+
+Expected accuracy performance will be:
+
+| Model     | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero |
+| --------- | ------------------------ | --------------------- | --------------------- | ---------------------- |
+| ResNet-18 | 85.85%                   | 84.91%                | 85.46%                | 84.50%                 |
+
+**Note: the baseline is adapted from the [script](https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter03_intermediate/3_2_2_cnn_resnet_cifar10/) to use `torchvision.models.resnet18`**
diff --git a/examples/images/resnet/eval.py b/examples/images/resnet/eval.py
new file mode 100644
index 000000000000..657708ec3ff2
--- /dev/null
+++ b/examples/images/resnet/eval.py
@@ -0,0 +1,48 @@
+import argparse
+
+import torch
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+
+# ==============================
+# Parse Arguments
+# ==============================
+parser = argparse.ArgumentParser()
+parser.add_argument('-e', '--epoch', type=int, default=80, help="resume from the epoch's checkpoint")
+parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
+args = parser.parse_args()
+
+# ==============================
+# Prepare Test Dataset
+# ==============================
+# CIFAR-10 dataset
+test_dataset = torchvision.datasets.CIFAR10(root='./data/', train=False, transform=transforms.ToTensor())
+
+# Data loader
+test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)
+
+# ==============================
+# Load Model
+# ==============================
+model = torchvision.models.resnet18(num_classes=10).cuda()
+state_dict = torch.load(f'{args.checkpoint}/model_{args.epoch}.pth')
+model.load_state_dict(state_dict)
+
+# ==============================
+# Run Evaluation
+# ==============================
+model.eval()
+
+with torch.no_grad():
+    correct = 0
+    total = 0
+    for images, labels in test_loader:
+        images = images.cuda()
+        labels = labels.cuda()
+        outputs = model(images)
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+
+    print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))
diff --git a/examples/images/resnet/requirements.txt b/examples/images/resnet/requirements.txt
new file mode 100644
index 000000000000..3c7da7743702
--- /dev/null
+++ b/examples/images/resnet/requirements.txt
@@ -0,0 +1,5 @@
+colossalai
+torch
+torchvision
+tqdm
+pytest
\ No newline at end of file
diff --git a/examples/images/resnet/test_ci.sh b/examples/images/resnet/test_ci.sh
new file mode 100755
index 000000000000..b3fb67830dda
--- /dev/null
+++ b/examples/images/resnet/test_ci.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -xe
+
+export DATA=/data/scratch/cifar-10
+
+pip install -r requirements.txt
+
+# TODO: skip ci test due to time limits, train.py needs to be rewritten.
+
+# for plugin in "torch_ddp" "torch_ddp_fp16" "low_level_zero"; do
+#     colossalai run --nproc_per_node 4 train.py --interval 0 --target_acc 0.84 --plugin $plugin
+# done
diff --git a/examples/images/resnet/train.py b/examples/images/resnet/train.py
new file mode 100644
index 000000000000..fe0dabf08377
--- /dev/null
+++ b/examples/images/resnet/train.py
@@ -0,0 +1,204 @@
+import argparse
+import os
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import MultiStepLR
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
+from colossalai.cluster import DistCoordinator
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 80
+LEARNING_RATE = 1e-3
+
+
+def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
+    # transform
+    transform_train = transforms.Compose(
+        [transforms.Pad(4),
+         transforms.RandomHorizontalFlip(),
+         transforms.RandomCrop(32),
+         transforms.ToTensor()])
+    transform_test = transforms.ToTensor()
+
+    # CIFAR-10 dataset
+    data_path = os.environ.get('DATA', './data')
+    with coordinator.priority_execution():
+        train_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                     train=True,
+                                                     transform=transform_train,
+                                                     download=True)
+        test_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                    train=False,
+                                                    transform=transform_test,
+                                                    download=True)
+
+    # Data loader
+    train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
+    test_dataloader = plugin.prepare_dataloader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
+    return train_dataloader, test_dataloader
+
+
+@torch.no_grad()
+def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
+    model.eval()
+    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    for images, labels in test_dataloader:
+        images = images.cuda()
+        labels = labels.cuda()
+        outputs = model(images)
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+    dist.all_reduce(correct)
+    dist.all_reduce(total)
+    accuracy = correct.item() / total.item()
+    if coordinator.is_master():
+        print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
+    return accuracy
+
+
+def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
+                booster: Booster, coordinator: DistCoordinator):
+    model.train()
+    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
+        for images, labels in pbar:
+            images = images.cuda()
+            labels = labels.cuda()
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            optimizer.zero_grad()
+
+            # Print log info
+            pbar.set_postfix({'loss': loss.item()})
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    # FIXME(ver217): gemini does not support resnet yet
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
+    parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
+    parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
+    parser.add_argument('--target_acc',
+                        type=float,
+                        default=None,
+                        help="target accuracy. Raise exception if not reached")
+    args = parser.parse_args()
+
+    # ==============================
+    # Prepare Checkpoint Directory
+    # ==============================
+    if args.interval > 0:
+        Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
+
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={})
+    coordinator = DistCoordinator()
+
+    # update the learning rate with linear scaling
+    # old_gpu_num / old_lr = new_gpu_num / new_lr
+    global LEARNING_RATE
+    LEARNING_RATE *= coordinator.world_size
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+    train_dataloader, test_dataloader = build_dataloader(100, coordinator, plugin)
+
+    # ====================================
+    # Prepare model, optimizer, criterion
+    # ====================================
+    # resnet18
+    model = torchvision.models.resnet18(num_classes=10)
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
+
+    # lr scheduler
+    lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
+                                                                 optimizer,
+                                                                 criterion=criterion,
+                                                                 lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Resume from checkpoint
+    # ==============================
+    if args.resume >= 0:
+        booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
+        booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
+        booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
+
+    # ==============================
+    # Train model
+    # ==============================
+    start_epoch = args.resume if args.resume >= 0 else 0
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
+        lr_scheduler.step()
+
+        # save checkpoint
+        if args.interval > 0 and (epoch + 1) % args.interval == 0:
+            booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
+            booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
+            booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
+
+    accuracy = evaluate(model, test_dataloader, coordinator)
+    if args.target_acc is not None:
+        assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'
+
+
+if __name__ == '__main__':
+    main()

From 1ee947f617aaffde010392089d05508e3597267d Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Wed, 28 Jun 2023 14:33:43 +0800
Subject: [PATCH 08/14] [workflow] added status check for test coverage
 workflow (#4106)

---
 .github/workflows/report_test_coverage.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml
index d9b131fd994c..c9dc541b8a33 100644
--- a/.github/workflows/report_test_coverage.yml
+++ b/.github/workflows/report_test_coverage.yml
@@ -9,6 +9,7 @@ on:
 jobs:
   report-test-coverage:
     runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
     steps:
       - name: "Download artifact"
         uses: actions/github-script@v6

From 2d40759a5351aa08866d37a39b5f141e7a11e0eb Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Wed, 28 Jun 2023 15:29:44 +0800
Subject: [PATCH 09/14] fix #3852 path error (#4058)

---
 examples/language/gpt/titans/train_gpt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
index 66225d6c8044..6be0b9e8da30 100644
--- a/examples/language/gpt/titans/train_gpt.py
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -15,7 +15,7 @@
 from colossalai.trainer import Trainer, hooks
 from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
 from colossalai.utils.timer import MultiTimer
-from colossalai.zero.init_ctx import ZeroInitContext
+from colossalai.zero.legacy.init_ctx import ZeroInitContext
 
 
 def calc_local_model_size(model: torch.nn.Module):

From 769cddcb2cc45ec78cb27148520e10bc8d7307d9 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Wed, 28 Jun 2023 15:30:30 +0800
Subject: [PATCH 10/14] fix typo docs/ (#4033)

---
 docs/source/en/advanced_tutorials/meet_gemini.md                | 2 +-
 .../advanced_tutorials/train_vit_with_hybrid_parallelism.md     | 2 +-
 docs/source/zh-Hans/features/mixed_precision_training.md        | 2 +-
 .../zh-Hans/features/mixed_precision_training_with_booster.md   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md
index c1c23a355efa..e94e3fea3710 100644
--- a/docs/source/en/advanced_tutorials/meet_gemini.md
+++ b/docs/source/en/advanced_tutorials/meet_gemini.md
@@ -9,7 +9,7 @@ When you only have a few GPUs for large model training tasks, **heterogeneous tr
 
 ## Usage
 
-At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the feathures of `GeminiPlugin` into training components with `booster`. More instructions of `booster` please refer to [**usage of booster**](../basics/booster_api.md).
+At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the features of `GeminiPlugin` into training components with `booster`. More instructions of `booster` please refer to [**usage of booster**](../basics/booster_api.md).
 
 ```python
 from torchvision.models import resnet18
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
index e2f2c90a3791..5ad08392049e 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -150,7 +150,7 @@ Colossal-AI 提供了自己的优化器、损失函数和学习率调度器。Py
 optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1)
 # build loss
 criterion = torch.nn.CrossEntropyLoss()
-# lr_scheduelr
+# lr_scheduler
 lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)
 ```
 
diff --git a/docs/source/zh-Hans/features/mixed_precision_training.md b/docs/source/zh-Hans/features/mixed_precision_training.md
index 4628b09cd910..a92e7e093015 100644
--- a/docs/source/zh-Hans/features/mixed_precision_training.md
+++ b/docs/source/zh-Hans/features/mixed_precision_training.md
@@ -303,7 +303,7 @@ colossalai.launch_from_torch(config=args.config)
     # build loss
     criterion = torch.nn.CrossEntropyLoss()
 
-    # lr_scheduelr
+    # lr_scheduler
     lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)
 ```
 
diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
index 187aef1a6c4a..ba9451341d15 100644
--- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
+++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
@@ -181,7 +181,7 @@ optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=0.1)
 # build loss
 criterion = torch.nn.CrossEntropyLoss()
 
-# lr_scheduelr
+# lr_scheduler
 lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=NUM_EPOCHS)
 ```
 

From 711e2b4c00d68083af64dd2422f33c3b08d42856 Mon Sep 17 00:00:00 2001
From: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Date: Wed, 28 Jun 2023 19:30:37 +0800
Subject: [PATCH 11/14] [doc] update and revise some typos and errs in docs
 (#4107)

* fix some typos and problems in doc

* fix some typos and problems in doc

* add doc test
---
 docs/source/en/basics/booster_api.md          | 16 +++---
 .../mixed_precision_training_with_booster.md  | 26 ++++++----
 docs/source/en/get_started/run_demo.md        | 19 +++----
 docs/source/zh-Hans/basics/booster_api.md     | 31 +++++++-----
 .../mixed_precision_training_with_booster.md  | 49 ++++++++++++-------
 docs/source/zh-Hans/get_started/run_demo.md   | 18 ++++---
 6 files changed, 98 insertions(+), 61 deletions(-)

diff --git a/docs/source/en/basics/booster_api.md b/docs/source/en/basics/booster_api.md
index a446ff31be83..22d5ee818019 100644
--- a/docs/source/en/basics/booster_api.md
+++ b/docs/source/en/basics/booster_api.md
@@ -1,31 +1,36 @@
 # Booster API
-Author: [Mingyan Jiang](https://github.com/jiangmingyan)
+
+Author: [Mingyan Jiang](https://github.com/jiangmingyan), [Jianghai Chen](https://github.com/CjhHa1)
 
 **Prerequisite:**
+
 - [Distributed Training](../concepts/distributed_training.md)
 - [Colossal-AI Overview](../concepts/colossalai_overview.md)
 
 **Example Code**
+
 - [Train with Booster](https://github.com/hpcaitech/ColossalAI/blob/main/examples/tutorial/new_api/cifar_resnet/README.md)
 
 ## Introduction
+
 In our new design, `colossalai.booster` replaces the role of `colossalai.initialize` to inject features into your training components (e.g. model, optimizer, dataloader) seamlessly. With these new APIs, you can integrate your model with our parallelism features more friendly. Also calling `colossalai.booster` is the standard procedure before you run into your training loops. In the sections below, I will cover how `colossalai.booster` works and what we should take note of.
 
 ### Plugin
+
 Plugin is an important component that manages parallel configuration (eg: The gemini plugin encapsulates the gemini acceleration solution). Currently supported plugins are as follows:
 
-***GeminiPlugin:*** This plugin wraps the Gemini acceleration solution, that ZeRO with chunk-based memory management.
+**_GeminiPlugin:_** This plugin wraps the Gemini acceleration solution, that is, ZeRO with chunk-based memory management.
 
-***TorchDDPPlugin:*** This plugin wraps the DDP acceleration solution, it implements data parallelism at the module level which can run across multiple machines.
+**_TorchDDPPlugin:_** This plugin wraps the DDP acceleration solution. It implements data parallelism at the module level, which can run across multiple machines.
 
-***LowLevelZeroPlugin:*** This plugin wraps the 1/2 stage of Zero Redundancy Optimizer. Stage 1 : Shards optimizer states across data parallel workers/GPUs. Stage 2 : Shards optimizer states + gradients across data parallel workers/GPUs.
+**_LowLevelZeroPlugin:_** This plugin wraps the 1/2 stage of Zero Redundancy Optimizer. Stage 1 : Shards optimizer states across data parallel workers/GPUs. Stage 2 : Shards optimizer states + gradients across data parallel workers/GPUs.
 
 ### API of booster
 
-
 {{ autodoc:colossalai.booster.Booster }}
 
 ## Usage
+
 In a typical workflow, you should launch distributed environment at the beginning of training script and create objects needed (such as models, optimizers, loss function, data loaders etc.) firstly, then call `colossalai.booster` to inject features into these objects, After that, you can use our booster APIs and these returned objects to continue the rest of your training processes.
 
 A pseudo-code example is like below:
@@ -67,5 +72,4 @@ def train():
 
 [more design details](https://github.com/hpcaitech/ColossalAI/discussions/3046)
 
-
 <!-- doc-test-command: torchrun --standalone --nproc_per_node=1 booster_api.py  -->
diff --git a/docs/source/en/features/mixed_precision_training_with_booster.md b/docs/source/en/features/mixed_precision_training_with_booster.md
index e9b6f684f613..1240b47d5d2e 100644
--- a/docs/source/en/features/mixed_precision_training_with_booster.md
+++ b/docs/source/en/features/mixed_precision_training_with_booster.md
@@ -3,12 +3,13 @@
 Author: [Mingyan Jiang](https://github.com/jiangmingyan)
 
 **Prerequisite**
+
 - [Define Your Configuration](../basics/define_your_config.md)
 - [Training Booster](../basics/booster_api.md)
 
 **Related Paper**
-- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
 
+- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
 
 ## Introduction
 
@@ -19,12 +20,11 @@ In Colossal-AI, we have incorporated different implementations of mixed precisio
 2. apex.amp
 3. naive amp
 
-
-| Colossal-AI | support tensor parallel | support pipeline parallel | fp16 extent |
-| ----------- | ----------------------- | ------------------------- | ----------- |
-| AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
-| AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 |
-| AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 |
+| Colossal-AI    | support tensor parallel | support pipeline parallel | fp16 extent                                                                                          |
+| -------------- | ----------------------- | ------------------------- | ---------------------------------------------------------------------------------------------------- |
+| AMP_TYPE.TORCH | ✅                      | ❌                        | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
+| AMP_TYPE.APEX  | ❌                      | ❌                        | More fine-grained, we can choose opt_level O0, O1, O2, O3                                            |
+| AMP_TYPE.NAIVE | ✅                      | ✅                        | Model parameters, forward and backward operations are all downcast to fp16                           |
 
 The first two rely on the original implementation of PyTorch (version 1.6 and above) and NVIDIA Apex.
 The last method is similar to Apex O2 level.
@@ -64,8 +64,11 @@ However, there are other operations, like reductions, which require the dynamic
 We supported three AMP training methods and allowed the user to train with AMP with no code. If you want to train with amp, just assign `mixed_precision` with `fp16` when you instantiate the `Booster`. Now booster support torch amp, the other two(apex amp, naive amp) are still started by `colossalai.initialize`, if needed, please refer to [this](./mixed_precision_training.md). Next we will support `bf16`, `fp8`.
 
 ### Start with Booster
+
 instantiate `Booster` with `mixed_precision="fp16"`, then you can train with torch amp.
+
 <!--- doc-test-ignore-start -->
+
 ```python
 """
     Mapping:
@@ -78,9 +81,13 @@ instantiate `Booster` with `mixed_precision="fp16"`, then you can train with tor
 from colossalai import Booster
 booster = Booster(mixed_precision='fp16',...)
 ```
+
 <!--- doc-test-ignore-end -->
+
 or you can create a `FP16TorchMixedPrecision` object, such as:
+
 <!--- doc-test-ignore-start -->
+
 ```python
 from colossalai.mixed_precision import FP16TorchMixedPrecision
 mixed_precision = FP16TorchMixedPrecision(
@@ -90,9 +97,10 @@ mixed_precision = FP16TorchMixedPrecision(
     growth_interval=2000)
 booster = Booster(mixed_precision=mixed_precision,...)
 ```
+
 <!--- doc-test-ignore-end -->
-The same goes for other types of amps.
 
+The same goes for other types of amps.
 
 ### Torch AMP Configuration
 
@@ -121,7 +129,6 @@ The output model is converted to AMP model of smaller memory consumption.
 If your input model is already too large to fit in a GPU, please instantiate your model weights in `dtype=torch.float16`.
 Otherwise, try smaller models or checkout more parallelization training techniques!
 
-
 ## Hands-on Practice
 
 Now we will introduce the use of AMP with Colossal-AI. In this practice, we will use Torch AMP as an example.
@@ -248,4 +255,5 @@ Use the following command to start the training scripts. You can change `--nproc
 ```shell
 colossalai run --nproc_per_node 1 train.py
 ```
+
 <!-- doc-test-command: torchrun --standalone --nproc_per_node=1 mixed_precision_training_with_booster.py  -->
diff --git a/docs/source/en/get_started/run_demo.md b/docs/source/en/get_started/run_demo.md
index f47bdbbd62fc..1ce185e26db0 100644
--- a/docs/source/en/get_started/run_demo.md
+++ b/docs/source/en/get_started/run_demo.md
@@ -7,19 +7,18 @@ can also run on systems with only one GPU. Quick demos showing how to use Coloss
 ## Single GPU
 
 Colossal-AI can be used to train deep learning models on systems with only one GPU and achieve baseline
-performances. We provided an example to [train ResNet on CIFAR10 dataset](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet)
-with only one GPU. You can find the example in [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples).
+performances. We provided an example to [train ResNet on CIFAR10 dataset](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/resnet)
+with only one GPU. You can find the example in [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI/tree/main/examples).
 Detailed instructions can be found in its `README.md`.
 
 ## Multiple GPUs
 
 Colossal-AI can be used to train deep learning models on distributed systems with multiple GPUs and accelerate the
-training process drastically by applying efficient parallelization techniques. When we have several parallelism for you
-to try out.
+training process drastically by applying efficient parallelization techniques. We have several parallelism techniques for you to try out.
 
 #### 1. data parallel
 
-You can use the same [ResNet example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet) as the
+You can use the same [ResNet example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/resnet) as the
 single-GPU demo above. By setting `--nproc_per_node` to be the number of GPUs you have on your machine, the example
 is turned into a data parallel example.
 
@@ -27,17 +26,19 @@ is turned into a data parallel example.
 
 Hybrid parallel includes data, tensor, and pipeline parallelism. In Colossal-AI, we support different types of tensor
 parallelism (i.e. 1D, 2D, 2.5D and 3D). You can switch between different tensor parallelism by simply changing the configuration
-in the `config.py`. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt).
+in the `config.py`. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt).
 Detailed instructions can be found in its `README.md`.
 
 #### 3. MoE parallel
 
-We provided [an example of WideNet](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet) to demonstrate
+We provided [an example of ViT-MoE](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/moe) to demonstrate
 MoE parallelism. WideNet uses mixture of experts (MoE) to achieve better performance. More details can be found in
 [Tutorial: Integrate Mixture-of-Experts Into Your Model](../advanced_tutorials/integrate_mixture_of_experts_into_your_model.md)
 
 #### 4. sequence parallel
 
 Sequence parallel is designed to tackle memory efficiency and sequence length limit problems in NLP tasks. We provided
-[an example of BERT](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/bert/sequene_parallel) in
-[ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples). You can follow the `README.md` to execute the code.
+[an example of BERT](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/sequence_parallel) in
+[ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI/tree/main/examples). You can follow the `README.md` to execute the code.
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 run_demo.py  -->
diff --git a/docs/source/zh-Hans/basics/booster_api.md b/docs/source/zh-Hans/basics/booster_api.md
index 1bb5fd69bd15..1df821ce7d6e 100644
--- a/docs/source/zh-Hans/basics/booster_api.md
+++ b/docs/source/zh-Hans/basics/booster_api.md
@@ -1,35 +1,44 @@
 # booster 使用
-作者: [Mingyan Jiang](https://github.com/jiangmingyan)
+
+作者: [Mingyan Jiang](https://github.com/jiangmingyan), [Jianghai Chen](https://github.com/CjhHa1)
 
 **预备知识:**
+
 - [分布式训练](../concepts/distributed_training.md)
 - [Colossal-AI 总览](../concepts/colossalai_overview.md)
 
 **示例代码**
-- [使用booster训练](https://github.com/hpcaitech/ColossalAI/blob/main/examples/tutorial/new_api/cifar_resnet/README.md)
+
+<!-- update this url-->
+
+- [使用 booster 训练](https://github.com/hpcaitech/ColossalAI/blob/main/examples/tutorial/new_api/cifar_resnet/README.md)
 
 ## 简介
-在我们的新设计中， `colossalai.booster` 代替 `colossalai.initialize` 将特征(例如，模型、优化器、数据加载器）无缝注入您的训练组件中。 使用booster API, 您可以更友好地将我们的并行策略整合到待训练模型中. 调用 `colossalai.booster` 是您进入训练循环前的基本操作。
+
+在我们的新设计中， `colossalai.booster` 代替 `colossalai.initialize` 将特征(例如，模型、优化器、数据加载器)无缝注入您的训练组件中。 使用 booster API, 您可以更友好地将我们的并行策略整合到待训练模型中. 调用 `colossalai.booster` 是您进入训练循环前的基本操作。
 在下面的章节中，我们将介绍 `colossalai.booster` 是如何工作的以及使用时我们要注意的细节。
 
-### Booster插件
-Booster插件是管理并行配置的重要组件（eg：gemini插件封装了gemini加速方案）。目前支持的插件如下：
+### Booster 插件
+
+Booster 插件是管理并行配置的重要组件（eg：gemini 插件封装了 gemini 加速方案）。目前支持的插件如下：
+
+**_GeminiPlugin:_** GeminiPlugin 插件封装了 gemini 加速解决方案，即基于块内存管理的 ZeRO 优化方案。
 
-***GeminiPlugin:*** GeminiPlugin插件封装了 gemini 加速解决方案，即基于块内存管理的 ZeRO优化方案。
+**_TorchDDPPlugin:_** TorchDDPPlugin 插件封装了 DDP 加速方案，实现了模型级别的数据并行，可以跨多机运行。
 
-***TorchDDPPlugin:*** TorchDDPPlugin插件封装了DDP加速方案，实现了模型级别的数据并行，可以跨多机运行。
+**_LowLevelZeroPlugin:_** LowLevelZeroPlugin 插件封装了零冗余优化器的 1/2 阶段。阶段 1：切分优化器参数，分发到各并发进程或并发 GPU 上。阶段 2：切分优化器参数及梯度，分发到各并发进程或并发 GPU 上。
 
-***LowLevelZeroPlugin:*** LowLevelZeroPlugin插件封装了零冗余优化器的 1/2 阶段。阶段 1：切分优化器参数，分发到各并发进程或并发GPU上。阶段 2：切分优化器参数及梯度，分发到各并发进程或并发GPU上。
+### Booster 接口
 
-### Booster接口
+<!--TODO: update autodoc -->
 
 {{ autodoc:colossalai.booster.Booster }}
 
 ## 使用方法及示例
 
-在使用colossalai训练时，首先需要在训练脚本的开头启动分布式环境，并创建需要使用的模型、优化器、损失函数、数据加载器等对象。之后，调用`colossalai.booster` 将特征注入到这些对象中，您就可以使用我们的booster API去进行您接下来的训练流程。
+在使用 colossalai 训练时，首先需要在训练脚本的开头启动分布式环境，并创建需要使用的模型、优化器、损失函数、数据加载器等对象。之后，调用`colossalai.booster` 将特征注入到这些对象中，您就可以使用我们的 booster API 去进行您接下来的训练流程。
 
-以下是一个伪代码示例，将展示如何使用我们的booster API进行模型训练:
+以下是一个伪代码示例，将展示如何使用我们的 booster API 进行模型训练:
 
 ```python
 import torch
diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
index ba9451341d15..0354f92ee7ce 100644
--- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
+++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
@@ -3,12 +3,13 @@
 作者: [Mingyan Jiang](https://github.com/jiangmingyan)
 
 **前置教程**
+
 - [定义配置文件](../basics/define_your_config.md)
-- [booster使用](../basics/booster_api.md)
+- [booster 使用](../basics/booster_api.md)
 
 **相关论文**
-- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
 
+- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
 
 ## 引言
 
@@ -19,18 +20,17 @@ AMP 代表自动混合精度训练。
 2. apex.amp
 3. naive amp
 
+| Colossal-AI    | 支持张量并行 | 支持流水并行 | fp16 范围                                                 |
+| -------------- | ------------ | ------------ | --------------------------------------------------------- |
+| AMP_TYPE.TORCH | ✅           | ❌           | 在前向和反向传播期间，模型参数、激活和梯度向下转换至 fp16 |
+| AMP_TYPE.APEX  | ❌           | ❌           | 更细粒度，我们可以选择 opt_level O0, O1, O2, O3           |
+| AMP_TYPE.NAIVE | ✅           | ✅           | 模型参数、前向和反向操作，全都向下转换至 fp16             |
 
-| Colossal-AI | 支持张量并行 | 支持流水并行 | fp16范围 |
-| ----------- | ----------------------- | ------------------------- | ----------- |
-| AMP_TYPE.TORCH | ✅ | ❌ | 在前向和反向传播期间，模型参数、激活和梯度向下转换至fp16 |
-| AMP_TYPE.APEX | ❌ | ❌ | 更细粒度，我们可以选择 opt_level O0, O1, O2, O3 |
-| AMP_TYPE.NAIVE | ✅ | ✅ | 模型参数、前向和反向操作，全都向下转换至fp16 |
-
-前两个依赖于 PyTorch (1.6及以上) 和 NVIDIA Apex 的原始实现。最后一种方法类似 Apex O2。在这些方法中，Apex-AMP 与张量并行不兼容。这是因为张量是以张量并行的方式在设备之间拆分的，因此，需要在不同的进程之间进行通信，以检查整个模型权重中是否出现inf或nan。我们修改了torch amp实现，使其现在与张量并行兼容。
+前两个依赖于 PyTorch (1.6 及以上) 和 NVIDIA Apex 的原始实现。最后一种方法类似 Apex O2。在这些方法中，Apex-AMP 与张量并行不兼容。这是因为张量是以张量并行的方式在设备之间拆分的，因此，需要在不同的进程之间进行通信，以检查整个模型权重中是否出现 inf 或 nan。我们修改了 torch amp 实现，使其现在与张量并行兼容。
 
-> ❌️ fp16与ZeRO不兼容
+> ❌️ fp16 与 ZeRO 不兼容
 >
-> ⚠️ 流水并行目前仅支持naive amp
+> ⚠️ 流水并行目前仅支持 naive amp
 
 我们建议使用 torch AMP，因为在不使用流水并行时，它通常比 NVIDIA AMP 提供更好的准确性。
 
@@ -57,11 +57,14 @@ AMP 代表自动混合精度训练。
 
 ## Colossal-AI 中的 AMP
 
-我们支持三种 AMP 训练方法，并允许用户在没有改变代码的情况下使用 AMP 进行训练。booster支持amp特性注入，如果您要使用混合精度训练，则在创建booster实例时指定`mixed_precision`参数，我们现已支持torch amp，apex amp, naive amp（现已移植torch amp至booster，apex amp, naive amp仍由`colossalai.initialize`方式启动，如您需使用，请[参考](./mixed_precision_training.md）;后续将会拓展`bf16`,`pf8`的混合精度训练.
+我们支持三种 AMP 训练方法，并允许用户在没有改变代码的情况下使用 AMP 进行训练。booster 支持 amp 特性注入，如果您要使用混合精度训练，则在创建 booster 实例时指定`mixed_precision`参数，我们现已支持 torch amp，apex amp, naive amp（现已移植 torch amp 至 booster，apex amp, naive amp 仍由`colossalai.initialize`方式启动，如您需使用，请[参考](./mixed_precision_training.md)）;后续将会拓展`bf16`,`fp8`的混合精度训练.
+
+#### booster 启动方式
+
+您可以在创建 booster 实例时，指定`mixed_precision="fp16"`即使用 torch amp。
 
-#### booster启动方式
-您可以在创建booster实例时，指定`mixed_precision="fp16"`即使用torch amp。
 <!--- doc-test-ignore-start -->
+
 ```python
 """
     初始化映射关系如下：
@@ -74,9 +77,13 @@ AMP 代表自动混合精度训练。
 from colossalai import Booster
 booster = Booster(mixed_precision='fp16',...)
 ```
+
 <!--- doc-test-ignore-end -->
+
 或者您可以自定义一个`FP16TorchMixedPrecision`对象，如
+
 <!--- doc-test-ignore-start -->
+
 ```python
 from colossalai.mixed_precision import FP16TorchMixedPrecision
 mixed_precision = FP16TorchMixedPrecision(
@@ -86,8 +93,10 @@ mixed_precision = FP16TorchMixedPrecision(
     growth_interval=2000)
 booster = Booster(mixed_precision=mixed_precision,...)
 ```
+
 <!--- doc-test-ignore-end -->
-其他类型的amp使用方式也是一样的。
+
+其他类型的 amp 使用方式也是一样的。
 
 ### Torch AMP 配置
 
@@ -96,7 +105,7 @@ booster = Booster(mixed_precision=mixed_precision,...)
 ### Apex AMP 配置
 
 对于这种模式，我们依靠 Apex 实现混合精度训练。我们支持这个插件，因为它允许对混合精度的粒度进行更精细的控制。
-例如, O2 水平 (优化器水平2) 将保持 batch normalization 为 FP32。
+例如, O2 水平 (优化器水平 2) 将保持 batch normalization 为 FP32。
 
 如果你想了解更多细节，请参考 [Apex Documentation](https://nvidia.github.io/apex/)。
 
@@ -104,7 +113,7 @@ booster = Booster(mixed_precision=mixed_precision,...)
 
 ### Naive AMP 配置
 
-在 Naive AMP 模式中, 我们实现了混合精度训练，同时保持了与复杂张量和流水并行的兼容性。该 AMP 模式将所有操作转为 FP16 。下列代码块展示了该模式的booster启动方式。
+在 Naive AMP 模式中, 我们实现了混合精度训练，同时保持了与复杂张量和流水并行的兼容性。该 AMP 模式将所有操作转为 FP16 。下列代码块展示了该模式的 booster 启动方式。
 
 {{ autodoc:colossalai.booster.mixed_precision.FP16NaiveMixedPrecision }}
 
@@ -186,7 +195,8 @@ lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=NUM_EPOCHS
 ```
 
 ### 步骤 4. 插入 AMP
-创建一个MixedPrecision对象（如果需要）及torchDDPPlugin对象，调用 `colossalai.boost` 将所有训练组件转为为FP16模式.
+
+创建一个 MixedPrecision 对象（如果需要）及 TorchDDPPlugin 对象，调用 `colossalai.boost` 将所有训练组件转换为 FP16 模式.
 
 ```python
 plugin = TorchDDPPlugin()
@@ -209,7 +219,7 @@ model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(model, opt
 
 ### 步骤 5. 使用 booster 训练
 
-使用booster构建一个普通的训练循环。
+使用 booster 构建一个普通的训练循环。
 
 ```python
 model.train()
@@ -232,4 +242,5 @@ for epoch in range(NUM_EPOCHS):
 ```shell
 colossalai run --nproc_per_node 1 train.py
 ```
+
 <!-- doc-test-command: torchrun --standalone --nproc_per_node=1 mixed_precision_training_with_booster.py  -->
diff --git a/docs/source/zh-Hans/get_started/run_demo.md b/docs/source/zh-Hans/get_started/run_demo.md
index edfc246c22d5..70ed5ebe251b 100755
--- a/docs/source/zh-Hans/get_started/run_demo.md
+++ b/docs/source/zh-Hans/get_started/run_demo.md
@@ -4,8 +4,8 @@ Colossal-AI 是一个集成的大规模深度学习系统，具有高效的并
 
 ## 单 GPU
 
-Colossal-AI 可以用在只有一个 GPU 的系统上训练深度学习模型，并达到 baseline 的性能。 我们提供了一个 [在CIFAR10数据集上训练ResNet](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet) 的例子，该例子只需要一个 GPU。
-您可以在 [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples) 中获取该例子。详细说明可以在其 `README.md` 中获取。
+Colossal-AI 可以用在只有一个 GPU 的系统上训练深度学习模型，并达到 baseline 的性能。 我们提供了一个 [在 CIFAR10 数据集上训练 ResNet](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/resnet) 的例子，该例子只需要一个 GPU。
+您可以在 [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI/tree/main/examples) 中获取该例子。详细说明可以在其 `README.md` 中获取。
 
 ## 多 GPU
 
@@ -13,16 +13,20 @@ Colossal-AI 可用于在具有多个 GPU 的分布式系统上训练深度学习
 
 #### 1. 数据并行
 
-您可以使用与上述单 GPU 演示相同的 [ResNet例子](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/resnet)。 通过设置 `--nproc_per_node` 为您机器上的 GPU 数量，您就能把数据并行应用在您的例子上了。
+您可以使用与上述单 GPU 演示相同的 [ResNet 例子](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/resnet)。 通过设置 `--nproc_per_node` 为您机器上的 GPU 数量，您就能把数据并行应用在您的例子上了。
 
 #### 2. 混合并行
 
-混合并行包括数据、张量和流水线并行。在 Colossal-AI 中，我们支持不同类型的张量并行（即 1D、2D、2.5D 和 3D）。您可以通过简单地改变 `config.py` 中的配置在不同的张量并行之间切换。您可以参考 [GPT example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt), 更多细节能在它的 `README.md` 中被找到。
+混合并行包括数据、张量和流水线并行。在 Colossal-AI 中，我们支持不同类型的张量并行（即 1D、2D、2.5D 和 3D）。您可以通过简单地改变 `config.py` 中的配置在不同的张量并行之间切换。您可以参考 [GPT example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt), 更多细节能在它的 `README.md` 中被找到。
 
-#### 3. MoE并行
+#### 3. MoE 并行
 
-我们提供了一个 [WideNet例子](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet) 来验证 MoE 的并行性。 WideNet 使用 Mixture of Experts（MoE）来实现更好的性能。更多的细节可以在我们的教程中获取：[教会您如何把Mixture of Experts整合到模型中](../advanced_tutorials/integrate_mixture_of_experts_into_your_model.md)。
+<!-- TODO: 在colossalai中实现这个例子 -->
+
+我们提供了一个 [ViT-MoE 例子](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/moe) 来验证 MoE 的并行性。 WideNet 使用 Mixture of Experts（MoE）来实现更好的性能。更多的细节可以在我们的教程中获取：[教会您如何把 Mixture of Experts 整合到模型中](../advanced_tutorials/integrate_mixture_of_experts_into_your_model.md)。
 
 #### 4. 序列并行
 
-序列并行是为了解决NLP任务中的内存效率和序列长度限制问题。 我们在 [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples) 中提供了一个 [BERT例子](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/bert/sequene_parallel)。您可以按照 `README.md` 来执行代码。
+序列并行是为了解决 NLP 任务中的内存效率和序列长度限制问题。 我们在 [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI/tree/main/examples) 中提供了一个 [Sequence Parallelism 例子](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/sequence_parallel)。您可以按照 `README.md` 来执行代码。
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=1 run_demo.py  -->

From b03d64d010cb6803b66230a0386bc62d989e6ef6 Mon Sep 17 00:00:00 2001
From: Wenhao Chen <cwher@outlook.com>
Date: Thu, 29 Jun 2023 10:48:09 +0800
Subject: [PATCH 12/14] [chat] refactor trainer class (#4080)

* to: add SLTrainer

* refactor: refactor RMTrainer and SFTTrainer

* fix: fix init file

* feat: remove on_learn_epoch fn as not used

* fix: align with modified gemini arguments

* to: add OnPolicyTrainer

* revert: add _on_learn_epoch fn

* refactor: refactor PPOTrainer

* style: rename PPOTrainer argument

* fix: align with modified PPO arguments

* test: align with modified train_prompts arguments

* chore: modify train_prompts

* docs: align with modified arguments

* fix: remove unnecessary output

* fix: move dataloader to fit fn of SLTrainer

* fix: move dataloader to fit fn of OnPolicyTrainer

* fix: modify usage of prompt and pretrain dataloader
---
 applications/Chat/README.md                   |   2 +-
 .../benchmarks/benchmark_opt_lora_dummy.py    |  26 ++-
 applications/Chat/coati/trainer/__init__.py   |   8 +-
 applications/Chat/coati/trainer/base.py       | 168 ++++++++++++++---
 applications/Chat/coati/trainer/ppo.py        | 175 +++++++-----------
 applications/Chat/coati/trainer/rm.py         | 156 +++++++---------
 applications/Chat/coati/trainer/sft.py        | 171 +++++++++--------
 .../coati/trainer/strategies/colossalai.py    |  13 +-
 applications/Chat/coati/trainer/utils.py      |  27 +++
 applications/Chat/examples/README.md          |   5 +-
 .../community/peft/train_peft_prompts.py      |  10 +-
 applications/Chat/examples/test_ci.sh         |  16 +-
 applications/Chat/examples/train_prompts.py   |  10 +-
 applications/Chat/examples/train_prompts.sh   |  21 ++-
 .../Chat/examples/train_reward_model.py       |   7 +-
 applications/Chat/examples/train_sft.py       |   7 +-
 16 files changed, 461 insertions(+), 361 deletions(-)

diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index 29cd581d7cc9..082cbb22b587 100644
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -83,7 +83,7 @@ More details can be found in the latest news.
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
 </p>
 
-> DeepSpeedChat performance comes from its blog on 2023 April 12, ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --max_timesteps 1 --update_timesteps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32
+> DeepSpeedChat performance comes from its blog on 2023 April 12, ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --num_collect_steps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32
 
 ## Install
 
diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index dea7ebc60a8b..39f2f28eca16 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -137,6 +137,12 @@ def main(args):
 
     (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
 
+    random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device())
+    dataloader = DataLoader(random_prompts,
+                            batch_size=args.experience_batch_size,
+                            shuffle=True,
+                            collate_fn=preprocess_batch)
+
     trainer = PPOTrainer(strategy,
                          actor,
                          critic,
@@ -145,7 +151,6 @@ def main(args):
                          actor_optim,
                          critic_optim,
                          ptx_coef=0,
-                         max_epochs=args.max_epochs,
                          train_batch_size=args.train_batch_size,
                          offload_inference_models=args.offload_inference_models,
                          max_length=512,
@@ -157,17 +162,11 @@ def main(args):
                          eos_token_id=tokenizer.eos_token_id,
                          callbacks=[performance_evaluator])
 
-    random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device())
-    dataloader = DataLoader(random_prompts,
-                            batch_size=args.experience_batch_size,
-                            shuffle=True,
-                            collate_fn=preprocess_batch)
-
-    trainer.fit(dataloader,
-                None,
+    trainer.fit(prompt_dataloader=dataloader,
+                pretrain_dataloader=None,
                 num_episodes=args.num_episodes,
-                max_timesteps=args.max_timesteps,
-                update_timesteps=args.update_timesteps)
+                num_update_steps=args.num_update_steps,
+                num_collect_steps=args.num_collect_steps)
 
     print_rank_0(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB')
 
@@ -183,9 +182,8 @@ def main(args):
                         ],
                         default='ddp')
     parser.add_argument('--num_episodes', type=int, default=3)
-    parser.add_argument('--max_timesteps', type=int, default=8)
-    parser.add_argument('--update_timesteps', type=int, default=8)
-    parser.add_argument('--max_epochs', type=int, default=1)
+    parser.add_argument('--num_collect_steps', type=int, default=8)
+    parser.add_argument('--num_update_steps', type=int, default=1)
     parser.add_argument('--train_batch_size', type=int, default=8)
     parser.add_argument('--experience_batch_size', type=int, default=8)
     parser.add_argument('--lora_rank', type=int, default=0)
diff --git a/applications/Chat/coati/trainer/__init__.py b/applications/Chat/coati/trainer/__init__.py
index 525b57bf21d3..86142361f3ff 100644
--- a/applications/Chat/coati/trainer/__init__.py
+++ b/applications/Chat/coati/trainer/__init__.py
@@ -1,6 +1,10 @@
-from .base import Trainer
+from .base import OnPolicyTrainer, SLTrainer
 from .ppo import PPOTrainer
 from .rm import RewardModelTrainer
 from .sft import SFTTrainer
 
-__all__ = ['Trainer', 'PPOTrainer', 'RewardModelTrainer', 'SFTTrainer']
+__all__ = [
+    'SLTrainer', 'OnPolicyTrainer',
+    'RewardModelTrainer', 'SFTTrainer',
+    'PPOTrainer'
+]
diff --git a/applications/Chat/coati/trainer/base.py b/applications/Chat/coati/trainer/base.py
index ac3a878be884..13571cdcc23a 100644
--- a/applications/Chat/coati/trainer/base.py
+++ b/applications/Chat/coati/trainer/base.py
@@ -1,54 +1,108 @@
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Optional, Union
+from contextlib import contextmanager
+from typing import List
 
-import torch
+import torch.nn as nn
+import tqdm
 from coati.experience_maker import Experience
+from coati.replay_buffer import NaiveReplayBuffer
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader
 
 from .callbacks import Callback
 from .strategies import Strategy
+from .utils import CycledDataLoader, is_rank_0
 
 
-class Trainer(ABC):
+class SLTrainer(ABC):
     """
-        Base class for rlhf trainers.
+        Base class for supervised learning trainers.
 
     Args:
         strategy (Strategy):the strategy to use for training
         max_epochs (int, defaults to 1): the number of epochs of training process
+        model (nn.Module): the model to train
+        optimizer (Optimizer): the optimizer to use for training
+    """
+
+    def __init__(self,
+                 strategy: Strategy,
+                 max_epochs: int,
+                 model: nn.Module,
+                 optimizer: Optimizer,
+                 ) -> None:
+        super().__init__()
+        self.strategy = strategy
+        self.max_epochs = max_epochs
+        self.model = model
+        self.optimizer = optimizer
+
+    @abstractmethod
+    def _train(self, epoch):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _eval(self, epoch):
+        raise NotImplementedError()
+
+    def _before_fit(self):
+        self.no_epoch_bar = False
+
+    def fit(self, *args, **kwargs):
+        self._before_fit(*args, **kwargs)
+        for epoch in tqdm.trange(self.max_epochs,
+                                 desc="Epochs",
+                                 disable=not is_rank_0() or self.no_epoch_bar
+                                 ):
+            self._train(epoch)
+            self._eval(epoch)
+
+
+class OnPolicyTrainer(ABC):
+    """
+        Base class for on-policy rl trainers, e.g. PPO.
+
+    Args:
+        strategy (Strategy):the strategy to use for training
+        buffer (NaiveReplayBuffer): the buffer to collect experiences
+        sample_buffer (bool): whether to sample from buffer
         dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
         callbacks (List[Callback], defaults to []): the callbacks to call during training process
-        generate_kwargs (dict, optional): the kwargs to use while model generating
     """
 
     def __init__(self,
                  strategy: Strategy,
-                 max_epochs: int = 1,
-                 dataloader_pin_memory: bool = True,
-                 callbacks: List[Callback] = [],
-                 **generate_kwargs) -> None:
+                 buffer: NaiveReplayBuffer,
+                 sample_buffer: bool,
+                 dataloader_pin_memory: bool,
+                 callbacks: List[Callback] = []
+                 ) -> None:
         super().__init__()
         self.strategy = strategy
-        self.max_epochs = max_epochs
-        self.generate_kwargs = generate_kwargs
+        self.buffer = buffer
+        self.sample_buffer = sample_buffer
         self.dataloader_pin_memory = dataloader_pin_memory
         self.callbacks = callbacks
 
-    # TODO(ver217): maybe simplify these code using context
-    def _on_fit_start(self) -> None:
+    @contextmanager
+    def _fit_ctx(self) -> None:
         for callback in self.callbacks:
             callback.on_fit_start()
-
-    def _on_fit_end(self) -> None:
-        for callback in self.callbacks:
-            callback.on_fit_end()
-
-    def _on_episode_start(self, episode: int) -> None:
+        try:
+            yield
+        finally:
+            for callback in self.callbacks:
+                callback.on_fit_end()
+
+    @contextmanager
+    def _episode_ctx(self, episode: int) -> None:
         for callback in self.callbacks:
             callback.on_episode_start(episode)
-
-    def _on_episode_end(self, episode: int) -> None:
-        for callback in self.callbacks:
-            callback.on_episode_end(episode)
+        try:
+            yield
+        finally:
+            for callback in self.callbacks:
+                callback.on_episode_end(episode)
 
     def _on_make_experience_start(self) -> None:
         for callback in self.callbacks:
@@ -73,3 +127,71 @@ def _on_learn_batch_start(self) -> None:
     def _on_learn_batch_end(self, metrics: dict, experience: Experience) -> None:
         for callback in self.callbacks:
             callback.on_learn_batch_end(metrics, experience)
+
+    @abstractmethod
+    def _make_experience(self, collect_step: int):
+        """
+        Implement this method to make experience.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _learn(self, update_step: int):
+        """
+        Implement this method to learn from experience, either by
+        sampling from the buffer or by transforming the buffer into a dataloader.
+        """
+        raise NotImplementedError()
+
+    def _collect_phase(self, collect_step: int):
+        self._on_make_experience_start()
+        experience = self._make_experience(collect_step)
+        self._on_make_experience_end(experience)
+        self.buffer.append(experience)
+
+    def _update_phase(self, update_step: int):
+        self._on_learn_epoch_start(update_step)
+        self._learn(update_step)
+        self._on_learn_epoch_end(update_step)
+
+    def fit(self,
+            prompt_dataloader: DataLoader,
+            pretrain_dataloader: DataLoader,
+            num_episodes: int,
+            num_collect_steps: int,
+            num_update_steps: int,
+            ):
+        """
+        The main training loop of on-policy rl trainers.
+
+        Args:
+            prompt_dataloader (DataLoader): the dataloader to use for prompt data
+            pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
+            num_episodes (int): the number of episodes to train
+            num_collect_steps (int): the number of collect steps per episode
+            num_update_steps (int): the number of update steps per episode
+        """
+        self.prompt_dataloader = CycledDataLoader(prompt_dataloader)
+        self.pretrain_dataloader = CycledDataLoader(pretrain_dataloader)
+
+        with self._fit_ctx():
+            for episode in tqdm.trange(num_episodes,
+                                       desc="Episodes",
+                                       disable=not is_rank_0()):
+                with self._episode_ctx(episode):
+                    for collect_step in tqdm.trange(num_collect_steps,
+                                                    desc="Collect steps",
+                                                    disable=not is_rank_0()):
+                        self._collect_phase(collect_step)
+                    if not self.sample_buffer:
+                        # HACK(cwher): according to the design of the boost API, the dataloader should also be boosted,
+                        #  but it is impractical to adopt this pattern in RL training. Thus, the dataloader is left unboosted
+                        #  and is only set up via strategy.setup_dataloader().
+                        self.dataloader = self.strategy.setup_dataloader(self.buffer,
+                                                                         self.dataloader_pin_memory)
+                    for update_step in tqdm.trange(num_update_steps,
+                                                   desc="Update steps",
+                                                   disable=not is_rank_0()):
+                        self._update_phase(update_step)
+                    # NOTE: this is for on-policy algorithms
+                    self.buffer.clear()
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index cfb18e2ae483..451abe2a7438 100644
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -1,6 +1,5 @@
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Dict, List
 
-import torch
 import torch.nn as nn
 from coati.experience_maker import Experience, NaiveExperienceMaker
 from coati.models.base import Actor, Critic, get_base_model
@@ -9,19 +8,32 @@
 from coati.replay_buffer import NaiveReplayBuffer
 from torch import Tensor
 from torch.optim import Optimizer
-from torch.utils.data import DistributedSampler
+from torch.utils.data import DataLoader, DistributedSampler
 from tqdm import tqdm
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 from colossalai.utils import get_current_device
 
-from .base import Trainer
+from .base import OnPolicyTrainer
 from .callbacks import Callback
 from .strategies import ColossalAIStrategy, Strategy
 from .utils import is_rank_0, to_device
 
 
-class PPOTrainer(Trainer):
+def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> Dict:
+    unwrapper_model = strategy.unwrap_model(actor)
+    hf_model = get_base_model(unwrapper_model)
+    new_kwargs = {**generate_kwargs}
+    # use huggingface models method directly
+    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(hf_model, 'prepare_inputs_for_generation'):
+        new_kwargs['prepare_inputs_fn'] = hf_model.prepare_inputs_for_generation
+
+    if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(hf_model, '_update_model_kwargs_for_generation'):
+        new_kwargs['update_model_kwargs_fn'] = hf_model._update_model_kwargs_for_generation
+
+    return new_kwargs
+
+
+class PPOTrainer(OnPolicyTrainer):
     """
         Trainer for PPO algorithm.
 
@@ -35,14 +47,13 @@ class PPOTrainer(Trainer):
         critic_optim (Optimizer): the optimizer to use for critic model
         kl_coef (float, defaults to 0.1): the coefficient of kl divergence loss
         train_batch_size (int, defaults to 8): the batch size to use for training
-        buffer_limit (int, defaults to 0): the max_size limitation of replay buffer
-        buffer_cpu_offload (bool, defaults to True): whether to offload replay buffer to cpu
+        buffer_limit (int, defaults to 0): the max_size limitation of buffer
+        buffer_cpu_offload (bool, defaults to True): whether to offload buffer to cpu
         eps_clip (float, defaults to 0.2): the clip coefficient of policy loss
         vf_coef (float, defaults to 1.0): the coefficient of value loss
         ptx_coef (float, defaults to 0.9): the coefficient of ptx loss
         value_clip (float, defaults to 0.4): the clip coefficient of value loss
-        max_epochs (int, defaults to 1): the number of epochs of training process
-        sample_replay_buffer (bool, defaults to False): whether to sample from replay buffer
+        sample_buffer (bool, defaults to False): whether to sample from buffer
         dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
         offload_inference_models (bool, defaults to True): whether to offload inference models to cpu during training process
         callbacks (List[Callback], defaults to []): the callbacks to call during training process
@@ -65,25 +76,26 @@ def __init__(self,
                  eps_clip: float = 0.2,
                  vf_coef: float = 1.0,
                  value_clip: float = 0.4,
-                 max_epochs: int = 1,
-                 sample_replay_buffer: bool = False,
+                 sample_buffer: bool = False,
                  dataloader_pin_memory: bool = True,
                  offload_inference_models: bool = True,
                  callbacks: List[Callback] = [],
-                 **generate_kwargs) -> None:
+                 **generate_kwargs
+                 ) -> None:
         if isinstance(strategy, ColossalAIStrategy):
             from colossalai.booster.plugin import GeminiPlugin
             assert not (isinstance(strategy.plugin, GeminiPlugin) and offload_inference_models), \
                 "GeminiPlugin is not compatible with manual model.to('cpu')"
 
-        experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef)
-        replay_buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
-        generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
-        super().__init__(strategy, max_epochs, dataloader_pin_memory, callbacks, **generate_kwargs)
+        buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
+        super().__init__(
+            strategy, buffer,
+            sample_buffer, dataloader_pin_memory,
+            callbacks
+        )
 
-        self.experience_maker = experience_maker
-        self.replay_buffer = replay_buffer
-        self.sample_replay_buffer = sample_replay_buffer
+        self.generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
+        self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef)
         self.offload_inference_models = offload_inference_models
 
         self.actor = actor
@@ -99,76 +111,20 @@ def __init__(self,
 
         self.device = get_current_device()
 
-    def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience:
-        if isinstance(inputs, Tensor):
-            return self.experience_maker.make_experience(inputs, **self.generate_kwargs)
-        elif isinstance(inputs, dict):
-            return self.experience_maker.make_experience(**inputs, **self.generate_kwargs)
-        else:
-            raise ValueError(f'Unsupported input type "{type(inputs)}"')
-
-    def _learn(self):
-        # replay buffer may be empty at first, we should rebuild at each training
-        if not self.sample_replay_buffer:
-            # HACK(cwher): according to the design of boost API, dataloader should also be boosted,
-            #  but it is impractical to adapt this pattern in RL training. Thus, I left dataloader unboosted.
-            dataloader = self.strategy.setup_dataloader(self.replay_buffer, self.dataloader_pin_memory)
-        if self.sample_replay_buffer:
-            pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0())
-            for _ in pbar:
-                experience = self.replay_buffer.sample()
-                experience.to_device(self.device)
-                metrics = self.training_step(experience)
-                pbar.set_postfix(metrics)
+    def _make_experience(self, collect_step: int) -> Experience:
+        prompts = self.prompt_dataloader.next()
+        if self.offload_inference_models:
+            # TODO(ver217): this may be controlled by strategy if they are prepared by strategy
+            self.experience_maker.initial_model.to(self.device)
+            self.experience_maker.reward_model.to(self.device)
+        if isinstance(prompts, Tensor):
+            return self.experience_maker.make_experience(prompts, **self.generate_kwargs)
+        elif isinstance(prompts, dict):
+            return self.experience_maker.make_experience(**prompts, **self.generate_kwargs)
         else:
-            for epoch in range(self.max_epochs):
-                self._on_learn_epoch_start(epoch)
-                if isinstance(dataloader.sampler, DistributedSampler):
-                    dataloader.sampler.set_epoch(epoch)
-                pbar = tqdm(dataloader, desc=f'Train epoch [{epoch+1}/{self.max_epochs}]', disable=not is_rank_0())
-                for experience in pbar:
-                    self._on_learn_batch_start()
-                    experience.to_device(self.device)
-                    metrics = self.training_step(experience)
-                    self._on_learn_batch_end(metrics, experience)
-                    pbar.set_postfix(metrics)
-                self._on_learn_epoch_end(epoch)
-
-    def fit(self,
-            prompt_dataloader,
-            pretrain_dataloader,
-            num_episodes: int = 50000,
-            max_timesteps: int = 500,
-            update_timesteps: int = 5000) -> None:
-        time = 0
-        self.pretrain_dataloader = pretrain_dataloader
-        self.prompt_dataloader = prompt_dataloader
-        self._on_fit_start()
-        for episode in range(num_episodes):
-            self._on_episode_start(episode)
-            for timestep in tqdm(range(max_timesteps),
-                                 desc=f'Episode [{episode+1}/{num_episodes}]',
-                                 disable=not is_rank_0()):
-                time += 1
-                prompts = next(iter(self.prompt_dataloader))
-                self._on_make_experience_start()
-                if self.offload_inference_models:
-                    # TODO(ver217): this may be controlled by strategy if they are prepared by strategy
-                    self.experience_maker.initial_model.to(self.device)
-                    self.experience_maker.reward_model.to(self.device)
-                experience = self._make_experience(prompts)
-                self._on_make_experience_end(experience)
-                self.replay_buffer.append(experience)
-                if time % update_timesteps == 0:
-                    if self.offload_inference_models:
-                        self.experience_maker.initial_model.to('cpu')
-                        self.experience_maker.reward_model.to('cpu')
-                    self._learn()
-                    self.replay_buffer.clear()
-            self._on_episode_end(episode)
-        self._on_fit_end()
-
-    def training_step(self, experience: Experience) -> Dict[str, float]:
+            raise ValueError(f'Unsupported input type "{type(prompts)}"')
+
+    def _training_step(self, experience: Experience) -> Dict[str, float]:
         self.actor.train()
         self.critic.train()
         # policy loss
@@ -182,7 +138,7 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
 
         # ptx loss
         if self.ptx_coef != 0:
-            batch = next(iter(self.pretrain_dataloader))
+            batch = self.pretrain_dataloader.next()
             batch = to_device(batch, self.device)
             ptx_log_probs = self.actor(batch['input_ids'],
                                        attention_mask=batch['attention_mask'])['logits']
@@ -208,16 +164,29 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
 
         return {'reward': experience.reward.mean().item()}
 
-
-def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> Dict:
-    unwrapper_model = strategy.unwrap_model(actor)
-    hf_model = get_base_model(unwrapper_model)
-    new_kwargs = {**generate_kwargs}
-    # use huggingface models method directly
-    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(hf_model, 'prepare_inputs_for_generation'):
-        new_kwargs['prepare_inputs_fn'] = hf_model.prepare_inputs_for_generation
-
-    if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(hf_model, '_update_model_kwargs_for_generation'):
-        new_kwargs['update_model_kwargs_fn'] = hf_model._update_model_kwargs_for_generation
-
-    return new_kwargs
+    def _learn(self, update_step: int):
+        if self.offload_inference_models:
+            self.experience_maker.initial_model.to('cpu')
+            self.experience_maker.reward_model.to('cpu')
+
+        # either train on one batch sampled from the buffer, or iterate the dataloader that fit() built over the buffer
+        if self.sample_buffer:
+            experience = self.buffer.sample()
+            self._on_learn_batch_start()
+            experience.to_device(self.device)
+            metrics = self._training_step(experience)
+            self._on_learn_batch_end(metrics, experience)
+        else:
+            if isinstance(self.dataloader.sampler, DistributedSampler):
+                self.dataloader.sampler.set_epoch(update_step)
+            pbar = tqdm(
+                self.dataloader,
+                desc=f'Train epoch [{update_step + 1}]',
+                disable=not is_rank_0()
+            )
+            for experience in pbar:
+                self._on_learn_batch_start()
+                experience.to_device(self.device)
+                metrics = self._training_step(experience)
+                self._on_learn_batch_end(metrics, experience)
+                pbar.set_postfix(metrics)
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py
index 316eded7ea5d..54a5d0f40dea 100644
--- a/applications/Chat/coati/trainer/rm.py
+++ b/applications/Chat/coati/trainer/rm.py
@@ -1,20 +1,19 @@
 from datetime import datetime
-from typing import Callable, List
+from typing import Callable
 
 import pandas as pd
 import torch
+import tqdm
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
-from tqdm import tqdm
 
-from .base import Trainer
-from .callbacks import Callback
+from .base import SLTrainer
 from .strategies import Strategy
 from .utils import is_rank_0
 
 
-class RewardModelTrainer(Trainer):
+class RewardModelTrainer(SLTrainer):
     """
         Trainer to use while training reward model.
 
@@ -24,12 +23,7 @@ class RewardModelTrainer(Trainer):
         optim (Optimizer): the optimizer to use for training
         lr_scheduler (_LRScheduler): the lr scheduler to use for training
         loss_fn (callable): the loss function to use for training
-        train_dataloader (DataLoader): the dataloader to use for training
-        valid_dataloader (DataLoader): the dataloader to use for validation
-        eval_dataloader (DataLoader): the dataloader to use for evaluation
-        batch_size (int, defaults to 1): the batch size while training
         max_epochs (int, defaults to 2): the number of epochs to train
-        callbacks (List[Callback], defaults to []): the callbacks to call during training process
     """
 
     def __init__(
@@ -39,87 +33,79 @@ def __init__(
         optim: Optimizer,
         lr_scheduler: _LRScheduler,
         loss_fn: Callable,
-        train_dataloader: DataLoader,
-        valid_dataloader: DataLoader,
-        eval_dataloader: DataLoader,
         max_epochs: int = 1,
-        callbacks: List[Callback] = [],
     ) -> None:
-        super().__init__(strategy, max_epochs, callbacks=callbacks)
+        super().__init__(strategy, max_epochs, model, optim)
 
-        self.train_dataloader = train_dataloader
-        self.valid_dataloader = valid_dataloader
-        self.eval_dataloader = eval_dataloader
-
-        self.model = model
         self.loss_fn = loss_fn
-        self.optimizer = optim
         self.scheduler = lr_scheduler
 
-    def eval_acc(self, dataloader):
-        dist = 0
-        on = 0
-        cnt = 0
-        self.model.eval()
-        with torch.no_grad():
-            for chosen_ids, c_mask, reject_ids, r_mask in dataloader:
-                chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
-                c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
-                reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
-                r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
-                chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
-                reject_reward = self.model(reject_ids, attention_mask=r_mask)
-                for i in range(len(chosen_reward)):
-                    cnt += 1
-                    if chosen_reward[i] > reject_reward[i]:
-                        on += 1
-                dist += (chosen_reward - reject_reward).mean().item()
-            dist_mean = dist / len(dataloader)
-            acc = on / cnt
-        self.model.train()
-        return dist_mean, acc
+    def _eval(self, epoch):
+        if self.eval_dataloader is not None:
+            self.model.eval()
+            dist, on, cnt = 0, 0, 0
+            with torch.no_grad():
+                for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
+                    chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
+                    c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
+                    reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
+                    r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
+                    chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
+                    reject_reward = self.model(reject_ids, attention_mask=r_mask)
+                    for i in range(len(chosen_reward)):
+                        cnt += 1
+                        if chosen_reward[i] > reject_reward[i]:
+                            on += 1
+                    dist += (chosen_reward - reject_reward).mean().item()
+                self.dist = dist / len(self.eval_dataloader)
+                self.acc = on / cnt
 
-    def fit(self):
-        time = datetime.now()
-        epoch_bar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0())
-        for epoch in range(self.max_epochs):
-            step_bar = tqdm(range(self.train_dataloader.__len__()),
-                            desc='Train step of epoch %d' % epoch,
-                            disable=not is_rank_0())
-            # train
-            self.model.train()
-            cnt = 0
-            acc = 0
-            dist = 0
-            for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
-                chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
-                c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
-                reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
-                r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
-                chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
-                reject_reward = self.model(reject_ids, attention_mask=r_mask)
-                loss = self.loss_fn(chosen_reward, reject_reward)
-                self.strategy.backward(loss, self.model, self.optimizer)
-                self.strategy.optimizer_step(self.optimizer)
-                self.optimizer.zero_grad()
-                cnt += 1
-                if cnt == 100:
-                    self.scheduler.step()
-                    dist, acc = self.eval_acc(self.valid_dataloader)
-                    cnt = 0
-                    if is_rank_0():
-                        log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]],
-                                           columns=['step', 'loss', 'dist', 'acc'])
-                        log.to_csv('log_%s.csv' % time, mode='a', header=False, index=False)
-                step_bar.update()
-                step_bar.set_postfix({'dist': dist, 'acc': acc})
-
-            # eval
-            dist, acc = self.eval_acc(self.eval_dataloader)
             if is_rank_0():
-                log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]],
-                                   columns=['step', 'loss', 'dist', 'acc'])
+                log = pd.DataFrame(
+                    [[(epoch + 1) * len(self.train_dataloader),
+                      self.loss.item(), self.dist, self.acc]],
+                    columns=['step', 'loss', 'dist', 'acc']
+                )
                 log.to_csv('log.csv', mode='a', header=False, index=False)
-            epoch_bar.update()
-            step_bar.set_postfix({'dist': dist, 'acc': acc})
-            step_bar.close()
+
+    def _train(self, epoch):
+        self.model.train()
+        step_bar = tqdm.trange(
+            len(self.train_dataloader),
+            desc='Train step of epoch %d' % epoch,
+            disable=not is_rank_0()
+        )
+        cnt = 0
+        for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
+            chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
+            c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
+            reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
+            r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
+            chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
+            reject_reward = self.model(reject_ids, attention_mask=r_mask)
+            self.loss = self.loss_fn(chosen_reward, reject_reward)
+            self.strategy.backward(self.loss, self.model, self.optimizer)
+            self.strategy.optimizer_step(self.optimizer)
+            self.optimizer.zero_grad()
+            cnt += 1
+            if cnt % 100 == 0:
+                self.scheduler.step()
+            step_bar.update()
+        step_bar.close()
+
+    def _before_fit(self,
+                    train_dataloader: DataLoader,
+                    valid_dataloader: DataLoader,
+                    eval_dataloader: DataLoader):
+        """
+        Args:
+            train_dataloader (DataLoader): the dataloader to use for training
+            valid_dataloader (DataLoader): the dataloader to use for validation
+            eval_dataloader (DataLoader): the dataloader to use for evaluation
+        """
+        super()._before_fit()
+        self.datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+
+        self.train_dataloader = train_dataloader
+        self.valid_dataloader = valid_dataloader
+        self.eval_dataloader = eval_dataloader
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
index da223f1f33ff..12c51d7a80c3 100644
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -1,21 +1,22 @@
 import time
-from typing import List
+from typing import Optional
 
 import torch
 import torch.distributed as dist
+import tqdm
 import wandb
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
-from tqdm import tqdm
 
-from .base import Trainer
-from .callbacks import Callback
+from colossalai.logging import DistributedLogger
+
+from .base import SLTrainer
 from .strategies import ColossalAIStrategy, Strategy
 from .utils import is_rank_0, to_device
 
 
-class SFTTrainer(Trainer):
+class SFTTrainer(SLTrainer):
     """
         Trainer to use while training reward model.
 
@@ -23,12 +24,9 @@ class SFTTrainer(Trainer):
         model (torch.nn.Module): the model to train
         strategy (Strategy): the strategy to use for training
         optim(Optimizer): the optimizer to use for training
-        train_dataloader: the dataloader to use for training
-        eval_dataloader: the dataloader to use for evaluation
-        batch_size (int, defaults to 1): the batch size while training
+        lr_scheduler(_LRScheduler): the lr scheduler to use for training
         max_epochs (int, defaults to 2): the number of epochs to train
-        callbacks (List[Callback], defaults to []): the callbacks to call during training process
-        optim_kwargs (dict, defaults to {'lr':1e-4}): the kwargs to use while initializing optimizer
+        accumulation_steps (int, defaults to 8): the number of steps to accumulate gradients
     """
 
     def __init__(
@@ -37,95 +35,92 @@ def __init__(
         strategy: Strategy,
         optim: Optimizer,
         lr_scheduler: _LRScheduler,
-        train_dataloader: DataLoader,
-        eval_dataloader: DataLoader = None,
         max_epochs: int = 2,
         accumulation_steps: int = 8,
-        callbacks: List[Callback] = [],
     ) -> None:
         if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy):
             from colossalai.booster.plugin import GeminiPlugin
             assert not isinstance(strategy.plugin, GeminiPlugin), \
                 "Accumulation steps are not supported in stage 3 of ColossalAI"
-        super().__init__(strategy, max_epochs, callbacks=callbacks)
-        self.train_dataloader = train_dataloader
-        self.eval_dataloader = eval_dataloader
-        self.model = model
-        self.optimizer = optim
 
-        self.accumulation_steps = accumulation_steps
+        super().__init__(strategy, max_epochs, model, optim)
 
+        self.accumulation_steps = accumulation_steps
         self.scheduler = lr_scheduler
 
-    def fit(self, logger, use_wandb: bool = False):
+    def _train(self, epoch: int):
+        """Run one pass over self.train_dataloader, stepping the optimizer every accumulation_steps batches."""
+        self.model.train()
+        for batch_id, batch in enumerate(self.train_dataloader):
+            batch = to_device(batch, torch.cuda.current_device())
+            outputs = self.model(batch["input_ids"],
+                                 attention_mask=batch["attention_mask"],
+                                 labels=batch["labels"])
+
+            loss = outputs.loss
+            loss = loss / self.accumulation_steps
+
+            self.strategy.backward(loss, self.model, self.optimizer)
+            # total_loss sums the already-scaled loss; the wandb "loss" below divides by accumulation_steps again
+            self.total_loss += loss.item()
+            # NOTE(review): batches left over when the epoch length is not a multiple of accumulation_steps are
+            # backpropagated, but this method neither steps the optimizer nor resets total_loss for them.
+            if (batch_id + 1) % self.accumulation_steps == 0:
+                self.strategy.optimizer_step(self.optimizer)
+                self.optimizer.zero_grad()
+                self.scheduler.step()
+                if is_rank_0() and self.use_wandb:
+                    wandb.log({
+                        "loss": self.total_loss / self.accumulation_steps,
+                        "lr": self.scheduler.get_last_lr()[0],
+                        "epoch": epoch,
+                        "batch_id": batch_id
+                    })
+                self.total_loss = 0
+                self.step_bar.update()
+
+    def _eval(self, epoch: int):
+        """Evaluate on self.eval_dataloader (skipped when it is None) under no_grad and log the loss on rank 0."""
+        if self.eval_dataloader is not None:
+            self.model.eval()
+            with torch.no_grad():
+                loss_sum, num_seen = 0, 0
+                for batch in self.eval_dataloader:
+                    batch = to_device(batch, torch.cuda.current_device())
+                    outputs = self.model(batch["input_ids"],
+                                         attention_mask=batch["attention_mask"],
+                                         labels=batch["labels"])
+                    loss = outputs.loss
+                    loss_sum += loss.item()
+                    num_seen += batch["input_ids"].size(0)
+                # NOTE(review): adds one loss per batch but divides by summed input_ids.size(0); zero batches -> ZeroDivisionError
+                loss_mean = loss_sum / num_seen
+                if dist.get_rank() == 0:  # NOTE(review): self.logger defaults to None in _before_fit
+                    self.logger.info(f'Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}')
+
+    def _before_fit(self,
+                    train_dataloader: DataLoader,
+                    eval_dataloader: Optional[DataLoader] = None,
+                    logger: Optional[DistributedLogger] = None,
+                    use_wandb: bool = False):
+        """
+        Args:
+            train_dataloader: the dataloader to use for training
+            eval_dataloader: the dataloader to use for evaluation
+        """
+        self.train_dataloader = train_dataloader
+        self.eval_dataloader = eval_dataloader
+
+        self.logger = logger
+        self.use_wandb = use_wandb
         if use_wandb:
             wandb.init(project="Coati", name=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
             wandb.watch(self.model)
-        total_loss = 0
-        # epoch_bar = tqdm(range(self.epochs), desc='Epochs', disable=not is_rank_0())
-        step_bar = tqdm(range(len(self.train_dataloader) // self.accumulation_steps * self.max_epochs),
-                        desc=f'steps',
-                        disable=not is_rank_0())
-        for epoch in range(self.max_epochs):
-
-            # process_bar = tqdm(range(len(self.train_dataloader)), desc=f'Train process for{epoch}', disable=not is_rank_0())
-            # train
-            self.model.train()
-            for batch_id, batch in enumerate(self.train_dataloader):
-
-                batch = to_device(batch, torch.cuda.current_device())
-                outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
-
-                loss = outputs.loss
-
-                if loss >= 2.5 and is_rank_0():
-                    logger.warning(f"batch_id:{batch_id}, abnormal loss: {loss}")
-
-                loss = loss / self.accumulation_steps
-
-                self.strategy.backward(loss, self.model, self.optimizer)
-
-                total_loss += loss.item()
-
-                # gradient accumulation
-                if (batch_id + 1) % self.accumulation_steps == 0:
-                    self.strategy.optimizer_step(self.optimizer)
-                    self.optimizer.zero_grad()
-                    self.scheduler.step()
-                    if is_rank_0() and use_wandb:
-                        wandb.log({
-                            "loss": total_loss / self.accumulation_steps,
-                            "lr": self.scheduler.get_last_lr()[0],
-                            "epoch": epoch,
-                            "batch_id": batch_id
-                        })
-                    total_loss = 0
-                    step_bar.update()
-
-                # if batch_id % log_interval == 0:
-                # logger.info(f'Train Epoch {epoch}/{self.epochs} Batch {batch_id} Rank {dist.get_rank()} loss {loss.item()}')
-                # wandb.log({"loss": loss.item()})
-
-                # process_bar.update()
-
-            # eval
-            if self.eval_dataloader is not None:
-                self.model.eval()
-                with torch.no_grad():
-                    loss_sum = 0
-                    num_seen = 0
-                    for batch in self.eval_dataloader:
-                        batch = to_device(batch, torch.cuda.current_device())
-                        outputs = self.model(batch["input_ids"],
-                                             attention_mask=batch["attention_mask"],
-                                             labels=batch["labels"])
-                        loss = outputs.loss
-
-                        loss_sum += loss.item()
-                        num_seen += batch["input_ids"].size(0)
-
-                    loss_mean = loss_sum / num_seen
-                    if dist.get_rank() == 0:
-                        logger.info(f'Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}')
-
-            # epoch_bar.update()
+
+        self.total_loss = 0
+        self.no_epoch_bar = True
+        self.step_bar = tqdm.trange(
+            len(self.train_dataloader) // self.accumulation_steps * self.max_epochs,
+            desc=f'steps',
+            disable=not is_rank_0()
+        )
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index f31551f22318..e5a69f3351cb 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,4 +1,3 @@
-import functools
 import warnings
 from typing import Optional
 
@@ -103,7 +102,7 @@ def __init__(
         # NOTE: dist should be initialized before calling get_current_device()
         if stage == 3:
             plugin_initializer = lambda: GeminiPlugin(
-            # gemini_config
+                # gemini_config
                 device=get_current_device(),
                 placement_policy=placement_policy,
                 precision=precision,
@@ -113,20 +112,20 @@ def __init__(
                 search_range_m=search_range_m,
                 hidden_dim=hidden_dim,
                 min_chunk_size_m=min_chunk_size_m,
-            # zero_optim_config
+                # zero_optim_config
                 gpu_margin_mem_ratio=gpu_margin_mem_ratio,
-            # optim_config
+                # optim_config
                 **optim_kwargs)
         else:
             plugin_initializer = lambda: LowLevelZeroPlugin(
-            # zero_config
+                # zero_config
                 stage=stage,
                 precision=precision,
-            # zero_optim_config
+                # zero_optim_config
                 reduce_bucket_size_in_m=reduce_bucket_size,
                 overlap_communication=overlap_communication,
                 cpu_offload=(placement_policy == 'cpu'),
-            # optim_config
+                # optim_config
                 **optim_kwargs)
 
         super().__init__(seed, plugin_initializer)
diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py
index 9cccb5c92603..c9fc8d0fe19f 100644
--- a/applications/Chat/coati/trainer/utils.py
+++ b/applications/Chat/coati/trainer/utils.py
@@ -3,6 +3,33 @@
 import torch
 import torch.distributed as dist
 from torch.utils._pytree import tree_map
+from torch.utils.data import DataLoader
+
+
+class CycledDataLoader:
+    """
+    Wrap a DataLoader so batches can be drawn one at a time with ``next()``, restarting once it is exhausted.
+    Why: version 4da324cd60 sampled a batch of prompts/pretrain with "prompts = next(iter(self.prompt_dataloader))",
+    which may be inefficient due to frequent re-initialization of the dataloader. (re-initialize workers...)
+    NOTE: next(iter(dataloader)) is not equivalent to "for batch in dataloader: break"; the behavior differs slightly.
+    """
+
+    def __init__(self,
+                 dataloader: DataLoader,
+                 ) -> None:
+        self.dataloader = dataloader
+        # count: next() calls since the last restart (the call that triggers a restart leaves it at 0)
+        self.count = 0
+        self.dataloader_iter = iter(dataloader)  # one iterator kept alive across next() calls
+
+    def next(self):  # return the next batch, restarting the iterator when it is exhausted
+        self.count += 1
+        try:
+            return next(self.dataloader_iter)
+        except StopIteration:
+            self.count = 0  # exhausted: begin a new pass over the same dataloader
+            self.dataloader_iter = iter(self.dataloader)
+            return next(self.dataloader_iter)  # if the new iterator is also empty, StopIteration propagates
 
 
 def is_rank_0() -> bool:
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md
index 72810738d017..3e9d9c4325d8 100644
--- a/applications/Chat/examples/README.md
+++ b/applications/Chat/examples/README.md
@@ -171,9 +171,8 @@ Pretrain dataset: the pretrain dataset including the instruction and correspondi
 - --pretrain_dataset:  path of the ptx dataset, type=str, default=None
 - --need_optim_ckpt:   whether to save optim ckpt, type=bool, default=False
 - --num_episodes:      num of episodes for training, type=int, default=10
-- --max_epochs:        max epochs for training in one episode, type=int, default=5
-- --max_timesteps:     max episodes in one batch, type=int, default=10
-- --update_timesteps:  timesteps to update, type=int, default=10
+- --num_update_steps:  number of steps to update policy per episode, type=int
+- --num_collect_steps: number of steps to collect experience per episode, type=int
 - --train_batch_size:  batch size while training, type=int, default=8
 - --ptx_batch_size:    batch size to compute ptx loss, type=int, default=1
 - --experience_batch_size: batch size to make experience, type=int, default=8
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index ba8470f38fad..00ed7aa36257 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -171,7 +171,6 @@ def tokenize_fn(texts):
         critic_optim,
         kl_coef=args.kl_coef,
         ptx_coef=args.ptx_coef,
-        max_epochs=args.max_epochs,
         train_batch_size=args.train_batch_size,
         experience_batch_size=args.experience_batch_size,
         tokenizer=tokenize_fn,
@@ -186,8 +185,8 @@ def tokenize_fn(texts):
     trainer.fit(prompt_dataloader=prompt_dataloader,
                 pretrain_dataloader=pretrain_dataloader,
                 num_episodes=args.num_episodes,
-                max_timesteps=args.max_timesteps,
-                update_timesteps=args.update_timesteps)
+                num_update_steps=args.num_update_steps,
+                num_collect_steps=args.num_collect_steps)
 
     # save model checkpoint after fitting
     trainer.save_model(args.save_path, only_rank0=True, tokenizer=tokenizer)
@@ -215,9 +214,8 @@ def tokenize_fn(texts):
     parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts')
     parser.add_argument('--need_optim_ckpt', type=bool, default=False)
     parser.add_argument('--num_episodes', type=int, default=10)
-    parser.add_argument('--max_timesteps', type=int, default=10)
-    parser.add_argument('--update_timesteps', type=int, default=10)
-    parser.add_argument('--max_epochs', type=int, default=5)
+    parser.add_argument('--num_collect_steps', type=int, default=10)
+    parser.add_argument('--num_update_steps', type=int, default=5)
     parser.add_argument('--train_batch_size', type=int, default=2)
     parser.add_argument('--ptx_batch_size', type=int, default=1)
     parser.add_argument('--experience_batch_size', type=int, default=8)
diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh
index 85728e95820c..4bf5524afb01 100755
--- a/applications/Chat/examples/test_ci.sh
+++ b/applications/Chat/examples/test_ci.sh
@@ -63,8 +63,8 @@ for model in 'gpt2' 'bloom' 'opt' 'llama' 'roberta'; do
         torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
             --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
             --strategy $strategy --model $model \
-            --num_episodes 1 --max_timesteps 2 \
-            --update_timesteps 2 --max_epochs 1 --train_batch_size 2
+            --num_episodes 1 --num_collect_steps 2 --num_update_steps 1 \
+            --train_batch_size 2
     done
 done
 
@@ -149,8 +149,8 @@ rm -rf ${BASE}/rm_ckpt.pt
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
     --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
-    --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
-    --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+    --strategy colossalai_zero2 --num_episodes 1 \
+    --num_collect_steps 2 --num_update_steps 1 --train_batch_size 2 \
     --pretrain 'facebook/opt-350m' --model opt \
     --rm_pretrain 'facebook/opt-350m' \
     --rm_path ${BASE}/rm_ckpt_opt.pt \
@@ -159,8 +159,8 @@ rm -rf ${BASE}/rm_ckpt_opt.pt
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
     --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
-    --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
-    --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+    --strategy colossalai_zero2 --num_episodes 1 \
+    --num_collect_steps 2 --num_update_steps 1 --train_batch_size 2 \
     --pretrain 'gpt2' --model gpt2 \
     --rm_pretrain 'gpt2' \
     --rm_path ${BASE}/rm_ckpt_gpt.pt \
@@ -168,8 +168,8 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
 
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
     --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
-    --strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
-    --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+    --strategy colossalai_gemini --num_episodes 1 \
+    --num_collect_steps 2 --num_update_steps 1 --train_batch_size 2 \
     --pretrain 'gpt2' --model gpt2 \
     --rm_pretrain 'gpt2' \
     --rm_path ${BASE}/rm_ckpt_gpt.pt \
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 2a47dda637bb..a9bc0e532e5d 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -177,7 +177,6 @@ def main(args):
         critic_optim,
         kl_coef=args.kl_coef,
         ptx_coef=args.ptx_coef,
-        max_epochs=args.max_epochs,
         train_batch_size=args.train_batch_size,
         max_length=args.max_seq_len,
         use_cache=True,
@@ -192,8 +191,8 @@ def main(args):
     trainer.fit(prompt_dataloader=prompt_dataloader,
                 pretrain_dataloader=pretrain_dataloader,
                 num_episodes=args.num_episodes,
-                max_timesteps=args.max_timesteps,
-                update_timesteps=args.update_timesteps)
+                num_collect_steps=args.num_collect_steps,
+                num_update_steps=args.num_update_steps)
 
     # save model checkpoint after fitting
     strategy.save_model(actor, args.save_path, only_rank0=True)
@@ -220,9 +219,8 @@ def main(args):
     parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts')
     parser.add_argument('--need_optim_ckpt', type=bool, default=False)
     parser.add_argument('--num_episodes', type=int, default=10)
-    parser.add_argument('--max_timesteps', type=int, default=10)
-    parser.add_argument('--update_timesteps', type=int, default=10)
-    parser.add_argument('--max_epochs', type=int, default=5)
+    parser.add_argument('--num_collect_steps', type=int, default=10)
+    parser.add_argument('--num_update_steps', type=int, default=5)
     parser.add_argument('--train_batch_size', type=int, default=8)
     parser.add_argument('--ptx_batch_size', type=int, default=1)
     parser.add_argument('--experience_batch_size', type=int, default=8)
diff --git a/applications/Chat/examples/train_prompts.sh b/applications/Chat/examples/train_prompts.sh
index 7f3b2636ca32..d04c416015b1 100755
--- a/applications/Chat/examples/train_prompts.sh
+++ b/applications/Chat/examples/train_prompts.sh
@@ -1,13 +1,13 @@
 set_n_least_used_CUDA_VISIBLE_DEVICES() {
     local n=${1:-"9999"}
     echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
-        | tail -n +2 \
-        | nl -v 0 \
-        | tee /dev/tty \
-        | sort -g -k 2 \
-        | awk '{print $1}' \
-        | head -n $n)
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
     export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
@@ -17,4 +17,9 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
 # torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai_zero2
 
-torchrun --standalone --nproc_per_node=2 train_prompts.py --prompt_dataset /path/to/data.json --strategy colossalai_zero2
+torchrun --standalone --nproc_per_node=2 train_prompts.py \
+    --pretrain_dataset /path/to/data.json \
+    --prompt_dataset /path/to/data.json \
+    --strategy colossalai_zero2 \
+    --num_episodes 1 --num_collect_steps 2 --num_update_steps 1 \
+    --train_batch_size 2
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 2df3bc391b9b..4a6851ab5b24 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -178,12 +178,11 @@ def train(args):
                                  optim=optim,
                                  lr_scheduler=lr_scheduler,
                                  loss_fn=loss_fn,
-                                 train_dataloader=train_dataloader,
-                                 valid_dataloader=valid_dataloader,
-                                 eval_dataloader=eval_dataloader,
                                  max_epochs=args.max_epochs)
 
-    trainer.fit()
+    trainer.fit(train_dataloader=train_dataloader,
+                valid_dataloader=valid_dataloader,
+                eval_dataloader=eval_dataloader)
     # save model checkpoint after fitting on only rank0
     strategy.save_model(model, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index 717eb95311fb..967b7c277c6a 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -170,12 +170,13 @@ def train(args):
                          strategy=strategy,
                          optim=optim,
                          lr_scheduler=lr_scheduler,
-                         train_dataloader=train_dataloader,
-                         eval_dataloader=eval_dataloader,
                          max_epochs=args.max_epochs,
                          accumulation_steps=args.accumulation_steps)
 
-    trainer.fit(logger=logger, use_wandb=args.use_wandb)
+    trainer.fit(train_dataloader=train_dataloader,
+                eval_dataloader=eval_dataloader,
+                logger=logger,
+                use_wandb=args.use_wandb)
 
     # save model checkpoint after fitting on only rank0
     strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)

From edd75a59eada232a7d093b070e4ec7bfd81f31c3 Mon Sep 17 00:00:00 2001
From: Wenhao Chen <cwher@outlook.com>
Date: Thu, 29 Jun 2023 18:11:00 +0800
Subject: [PATCH 13/14] [chat] remove naive strategy and split colossalai
 strategy (#4094)

* feat: remove on_learn_epoch fn as not used

* revert: add _on_learn_epoch fn

* to: remove the use of NaiveStrategy

* test: remove NaiveStrategy tests

* feat: remove NaiveStrategy

* style: modify comments and params

* feat: split ColossalAIStrategy into LowLevelZeroStrategy and GeminiStrategy

* fix: remove naive

* fix: align with modified colossal strategy

* fix: fix ddp _try_init_dist arg
---
 applications/Chat/README.md                   |   2 +-
 .../benchmarks/benchmark_opt_lora_dummy.py    |  20 +-
 .../Chat/benchmarks/ray/1mmt_dummy.py         |   8 +-
 .../Chat/benchmarks/ray/mmmt_dummy.py         |   8 +-
 .../Chat/coati/ray/detached_trainer_ppo.py    |   4 +-
 applications/Chat/coati/ray/utils.py          |  16 +-
 .../trainer/callbacks/save_checkpoint.py      |   4 +-
 applications/Chat/coati/trainer/ppo.py        |   7 +-
 applications/Chat/coati/trainer/sft.py        |   7 +-
 .../Chat/coati/trainer/strategies/__init__.py |   8 +-
 .../coati/trainer/strategies/colossalai.py    | 259 ++++++++++--------
 .../Chat/coati/trainer/strategies/ddp.py      |  68 ++++-
 .../Chat/coati/trainer/strategies/naive.py    | 103 -------
 applications/Chat/examples/README.md          |   6 +-
 .../community/peft/train_peft_prompts.py      |  14 +-
 .../examples/community/peft/train_peft_sft.py |  24 +-
 .../community/ray/train_prompts_on_ray.py     |  16 +-
 applications/Chat/examples/ray/1mmt_prompt.py |   8 +-
 applications/Chat/examples/ray/mmmt_prompt.py |   8 +-
 applications/Chat/examples/test_ci.sh         |  14 +-
 applications/Chat/examples/train_prompts.py   |  12 +-
 .../Chat/examples/train_reward_model.py       |  12 +-
 applications/Chat/examples/train_sft.py       |  16 +-
 applications/Chat/tests/test_checkpoint.py    |   6 +-
 applications/Chat/tests/test_data.py          |   4 +-
 25 files changed, 314 insertions(+), 340 deletions(-)
 delete mode 100644 applications/Chat/coati/trainer/strategies/naive.py

diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index 082cbb22b587..016272ed8c89 100644
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -287,7 +287,7 @@ If you only have a single 24G GPU, you can use the following script. `batch_size
 torchrun --standalone --nproc_per_node=1 train_sft.py \
     --pretrain "/path/to/LLaMa-7B/" \
     --model 'llama' \
-    --strategy naive \
+    --strategy ddp \
     --log_interval 10 \
     --save_path  /path/to/Coati-7B \
     --dataset /path/to/data.json \
diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 39f2f28eca16..90471ed727b0 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -8,7 +8,7 @@
 from coati.models.opt import OPTActor, OPTCritic
 from coati.trainer import PPOTrainer
 from coati.trainer.callbacks import PerformanceEvaluator
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from transformers import AutoTokenizer
@@ -19,10 +19,8 @@
 
 def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
     numel = sum(p.numel() for p in model.parameters())
-    if isinstance(strategy, ColossalAIStrategy):
-        from colossalai.booster.plugin import GeminiPlugin
-        if isinstance(strategy.plugin, GeminiPlugin) and strategy.shard_init:
-            numel *= dist.get_world_size()
+    if isinstance(strategy, GeminiStrategy) and strategy.shard_init:
+        numel *= dist.get_world_size()
     return numel
 
 
@@ -78,17 +76,17 @@ def main(args):
     if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
     elif args.strategy == 'colossalai_gemini_cpu':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
     elif args.strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     elif args.strategy == 'colossalai_zero2_cpu':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
     elif args.strategy == 'colossalai_zero1':
-        strategy = ColossalAIStrategy(stage=1, placement_policy='cuda')
+        strategy = LowLevelZeroStrategy(stage=1, placement_policy='cuda')
     elif args.strategy == 'colossalai_zero1_cpu':
-        strategy = ColossalAIStrategy(stage=1, placement_policy='cpu')
+        strategy = LowLevelZeroStrategy(stage=1, placement_policy='cpu')
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py
index 9e8f36cefc4f..7fc990448805 100644
--- a/applications/Chat/benchmarks/ray/1mmt_dummy.py
+++ b/applications/Chat/benchmarks/ray/1mmt_dummy.py
@@ -83,8 +83,8 @@ def model_fn():
         env_info=env_info_maker,
         kl_coef=0.1,
         debug=args.debug,
-    # sync_models_from_trainers=True,
-    # generation kwargs:
+        # sync_models_from_trainers=True,
+        # generation kwargs:
         max_length=512,
         do_sample=True,
         temperature=1.0,
@@ -153,10 +153,10 @@ def build_dataloader(size):
     parser.add_argument('--num_trainers', type=int, default=1)
     parser.add_argument('--trainer_strategy',
                         choices=[
-                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
                             'colossalai_zero2_cpu'
                         ],
-                        default='naive')
+                        default='ddp')
     parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
     parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py
index 46a0062893b8..ca1df22070fc 100644
--- a/applications/Chat/benchmarks/ray/mmmt_dummy.py
+++ b/applications/Chat/benchmarks/ray/mmmt_dummy.py
@@ -87,8 +87,8 @@ def model_fn():
             env_info=env_info_maker,
             kl_coef=0.1,
             debug=args.debug,
-    # sync_models_from_trainers=True,
-    # generation kwargs:
+            # sync_models_from_trainers=True,
+            # generation kwargs:
             max_length=512,
             do_sample=True,
             temperature=1.0,
@@ -164,10 +164,10 @@ def build_dataloader(size):
     parser.add_argument('--num_trainers', type=int, default=1)
     parser.add_argument('--trainer_strategy',
                         choices=[
-                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
                             'colossalai_zero2_cpu'
                         ],
-                        default='naive')
+                        default='ddp')
     parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
     parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
diff --git a/applications/Chat/coati/ray/detached_trainer_ppo.py b/applications/Chat/coati/ray/detached_trainer_ppo.py
index 5f0032716f93..2f2aa0e29579 100644
--- a/applications/Chat/coati/ray/detached_trainer_ppo.py
+++ b/applications/Chat/coati/ray/detached_trainer_ppo.py
@@ -6,7 +6,7 @@
 from coati.models.base import Actor, Critic
 from coati.models.loss import PolicyLoss, ValueLoss
 from coati.trainer.callbacks import Callback
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
 from torch.optim import Adam
 
 from colossalai.nn.optimizer import HybridAdam
@@ -85,7 +85,7 @@ def __init__(
             evaluator = TrainerPerformanceEvaluator(actor_numel, critic_numel)
             callbacks = callbacks + [evaluator]
 
-        if isinstance(self.strategy, ColossalAIStrategy):
+        if isinstance(self.strategy, (LowLevelZeroStrategy, GeminiStrategy)):
             self.actor_optim = HybridAdam(self.actor.parameters(), lr=1e-7)
             self.critic_optim = HybridAdam(self.critic.parameters(), lr=1e-7)
         else:
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 4361ee236771..4f8e0b8a87e9 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -1,6 +1,6 @@
 import os
-from typing import Any, Callable, Dict, List, Optional
 from collections import OrderedDict
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 import torch.distributed as dist
@@ -10,7 +10,7 @@
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer, RobertaTokenizer
 
@@ -76,18 +76,16 @@ def get_reward_model_from_args(model: str, pretrained: str = None, config=None):
 
 
 def get_strategy_from_args(strategy: str):
-    if strategy == 'naive':
-        strategy_ = NaiveStrategy()
-    elif strategy == 'ddp':
+    if strategy == 'ddp':
         strategy_ = DDPStrategy()
     elif strategy == 'colossalai_gemini':
-        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+        strategy_ = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
     elif strategy == 'colossalai_zero2':
-        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     elif strategy == 'colossalai_gemini_cpu':
-        strategy_ = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
+        strategy_ = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
     elif strategy == 'colossalai_zero2_cpu':
-        strategy_ = ColossalAIStrategy(stage=2, placement_policy='cpu')
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
     return strategy_
diff --git a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py b/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
index d2dcc0dd4c65..f0d77a191a88 100644
--- a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
+++ b/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
@@ -1,7 +1,7 @@
 import os
 
 import torch.distributed as dist
-from coati.trainer.strategies import ColossalAIStrategy, Strategy
+from coati.trainer.strategies import GeminiStrategy, LowLevelZeroStrategy, Strategy
 from coati.trainer.utils import is_rank_0
 from torch import nn
 from torch.optim import Optimizer
@@ -69,7 +69,7 @@ def on_episode_end(self, episode: int) -> None:
             # save optimizer
             if self.model_dict[model][1] is None:
                 continue
-            only_rank0 = not isinstance(self.strategy, ColossalAIStrategy)
+            only_rank0 = not isinstance(self.strategy, (LowLevelZeroStrategy, GeminiStrategy))
             rank = 0 if is_rank_0() else dist.get_rank()
             optim_path = os.path.join(base_path, f'{model}-optim-rank-{rank}.pt')
             self.strategy.save_optimizer(optimizer=self.model_dict[model][1], path=optim_path, only_rank0=only_rank0)
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index 451abe2a7438..4c4a1002e96d 100644
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -15,7 +15,7 @@
 
 from .base import OnPolicyTrainer
 from .callbacks import Callback
-from .strategies import ColossalAIStrategy, Strategy
+from .strategies import GeminiStrategy, Strategy
 from .utils import is_rank_0, to_device
 
 
@@ -82,9 +82,8 @@ def __init__(self,
                  callbacks: List[Callback] = [],
                  **generate_kwargs
                  ) -> None:
-        if isinstance(strategy, ColossalAIStrategy):
-            from colossalai.booster.plugin import GeminiPlugin
-            assert not (isinstance(strategy.plugin, GeminiPlugin) and offload_inference_models), \
+        if isinstance(strategy, GeminiStrategy):
+            assert not offload_inference_models, \
                 "GeminiPlugin is not compatible with manual model.to('cpu')"
 
         buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
index 12c51d7a80c3..0812ba165286 100644
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -12,7 +12,7 @@
 from colossalai.logging import DistributedLogger
 
 from .base import SLTrainer
-from .strategies import ColossalAIStrategy, Strategy
+from .strategies import GeminiStrategy, Strategy
 from .utils import is_rank_0, to_device
 
 
@@ -38,9 +38,8 @@ def __init__(
         max_epochs: int = 2,
         accumulation_steps: int = 8,
     ) -> None:
-        if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy):
-            from colossalai.booster.plugin import GeminiPlugin
-            assert not isinstance(strategy.plugin, GeminiPlugin), \
+        if accumulation_steps > 1:
+            assert not isinstance(strategy, GeminiStrategy), \
                 "Accumulation steps are not supported in stage 3 of ColossalAI"
 
         super().__init__(strategy, max_epochs, model, optim)
diff --git a/applications/Chat/coati/trainer/strategies/__init__.py b/applications/Chat/coati/trainer/strategies/__init__.py
index f258c9b8a873..b49a2c742db3 100644
--- a/applications/Chat/coati/trainer/strategies/__init__.py
+++ b/applications/Chat/coati/trainer/strategies/__init__.py
@@ -1,6 +1,8 @@
 from .base import Strategy
-from .colossalai import ColossalAIStrategy
+from .colossalai import GeminiStrategy, LowLevelZeroStrategy
 from .ddp import DDPStrategy
-from .naive import NaiveStrategy
 
-__all__ = ['Strategy', 'NaiveStrategy', 'DDPStrategy', 'ColossalAIStrategy']
+__all__ = [
+    'Strategy', 'DDPStrategy',
+    'LowLevelZeroStrategy', 'GeminiStrategy'
+]
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index e5a69f3351cb..1b59d704eec3 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -18,13 +18,96 @@
 from .ddp import DDPStrategy
 
 
-class ColossalAIStrategy(DDPStrategy):
+class LowLevelZeroStrategy(DDPStrategy):
+    """
+        The strategy for training with ColossalAI's low-level ZeRO plugin (ZeRO stage 1 or 2).
+
+    Args:
+        stage(int): The stage to use in ZeRO. Choose in (1, 2). Defaults to 2.
+        precision(str): The precision to use. Choose in ('fp32', 'fp16').
+        seed(int): The seed for the random number generator.
+        placement_policy(str): Whether to offload to CPU. Choose in ('cpu', 'cuda')
+                          If it is "cpu", the plugin is created with cpu_offload=True,
+                          If it is "cuda", the plugin is created with cpu_offload=False.
+        reduce_bucket_size(int): The reduce bucket size, forwarded to the plugin as reduce_bucket_size_in_m.
+        overlap_communication(bool): Whether to overlap communication and computation.
+        initial_scale(float): The initial scale for the optimizer.
+        growth_factor(float): The growth factor for the optimizer.
+        backoff_factor(float): The backoff factor for the optimizer.
+        growth_interval(int): The growth interval for the optimizer.
+        hysteresis(int): The hysteresis for the optimizer.
+        min_scale(float): The minimum scale for the optimizer.
+        max_scale(float): The maximum scale for the optimizer.
+        max_norm(float): The maximum norm for the optimizer.
+        norm_type(float): The norm type for the optimizer.
+    """
+
+    def __init__(self,
+                 stage: int = 2,
+                 precision: str = 'fp16',
+                 seed: int = 42,
+                 placement_policy: str = 'cuda',
+                 reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
+                 overlap_communication: bool = True,    # only for stage 1&2
+                 initial_scale: float = 2**16,
+                 growth_factor: float = 2,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 1000,
+                 hysteresis: int = 2,
+                 min_scale: float = 1,
+                 max_scale: float = 2**32,
+                 max_norm: float = 0.0,
+                 norm_type: float = 2.0) -> None:
+        assert stage in (1, 2), f'Unsupported stage "{stage}"'
+        assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
+        assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
+
+        plugin_initializer = lambda: LowLevelZeroPlugin(
+            # zero_config
+            stage=stage,
+            precision=precision,
+            # zero_optim_config
+            reduce_bucket_size_in_m=reduce_bucket_size,    # NOTE(review): keyword is "_in_m" but the default looks like bytes - confirm units
+            overlap_communication=overlap_communication,
+            cpu_offload=(placement_policy == 'cpu'),
+            # optim_config
+            initial_scale=initial_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            hysteresis=hysteresis,
+            min_scale=min_scale,
+            max_scale=max_scale,
+            max_norm=max_norm,
+            norm_type=norm_type)
+
+        super().__init__(seed, plugin_initializer)
+
+    def _post_init(self) -> None:
+        """Check that self.plugin is a LowLevelZeroPlugin."""
+        assert isinstance(self.plugin, LowLevelZeroPlugin), \
+            f'{type(self).__name__}\'s plugin is not initialized properly.'
+
+    def setup_distributed(self) -> None:
+        """Call colossalai.launch_from_torch with an empty config and self.seed."""
+        colossalai.launch_from_torch({}, seed=self.seed)
+
+    def unwrap_model(self, model: nn.Module) -> nn.Module:
+        """Return the `module` attribute of a LowLevelZeroModel."""
+        assert isinstance(model, LowLevelZeroModel)
+        return model.module
+
+    def get_model_state_dict_shard(self, model: nn.Module, **config):
+        """Yield the shards produced by model.state_dict_shard(); `config` is not used here."""
+        assert isinstance(model, LowLevelZeroModel)
+        yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
+
+
+class GeminiStrategy(DDPStrategy):
     """
         The strategy for training with ColossalAI.
 
     Args:
-        stage(int): The stage to use in ZeRO. Choose in (1, 2, 3)
-        precision(str): The precision to use. Choose in ('fp32', 'fp16'). Stage 3 only supports fp16.
         seed(int): The seed for the random number generator.
         shard_init(bool): Whether to shard the model parameters during initialization. Only for ZeRO-3.
             This is not compatible with `from_pretrained()`. We temporarily disable this and will support it in the future.
@@ -37,8 +120,6 @@ class ColossalAIStrategy(DDPStrategy):
         hidden_dim(optional, int): The hidden dimension for the gemini. Only for ZeRO-3.
         min_chunk_size_m(float): The minimum chunk size divided by 2^20. Only for ZeRO-3.
         gpu_margin_mem_ratio(float): The margin memory ratio for the GPU. Only for ZeRO-3.
-        reduce_bucket_size(int): The reduce bucket size in bytes. Only for ZeRO-1 and ZeRO-2.
-        overlap_communication(bool): Whether to overlap communication and computation. Only for ZeRO-1 and ZeRO-2.
         initial_scale(float): The initial scale for the optimizer.
         growth_factor(float): The growth factor for the optimizer.
         backoff_factor(float): The backoff factor for the optimizer.
@@ -51,132 +132,96 @@ class ColossalAIStrategy(DDPStrategy):
 
     """
 
-    def __init__(
-            self,
-            stage: int = 3,
-            precision: str = 'fp16',
-            seed: int = 42,
-            shard_init: bool = False,    # only for stage 3
-            placement_policy: str = 'cuda',
-            pin_memory: bool = True,    # only for stage 3
-            force_outputs_fp32: bool = False,    # only for stage 3
-            search_range_m: int = 32,    # only for stage 3
-            hidden_dim: Optional[int] = None,    # only for stage 3
-            min_chunk_size_m: float = 32,    # only for stage 3
-            gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
-            reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
-            overlap_communication: bool = True,    # only for stage 1&2
-            initial_scale: float = 2**16,
-            growth_factor: float = 2,
-            backoff_factor: float = 0.5,
-            growth_interval: int = 1000,
-            hysteresis: int = 2,
-            min_scale: float = 1,
-            max_scale: float = 2**32,
-            max_norm: float = 0.0,
-            norm_type: float = 2.0) -> None:
-
-        assert stage in (1, 2, 3), f'Unsupported stage "{stage}"'
+    def __init__(self,
+                 seed: int = 42,
+                 shard_init: bool = False,    # only for stage 3
+                 placement_policy: str = 'cuda',
+                 pin_memory: bool = True,    # only for stage 3
+                 force_outputs_fp32: bool = False,    # only for stage 3
+                 search_range_m: int = 32,    # only for stage 3
+                 hidden_dim: Optional[int] = None,    # only for stage 3
+                 min_chunk_size_m: float = 32,    # only for stage 3
+                 gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
+                 initial_scale: float = 2**16,
+                 growth_factor: float = 2,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 1000,
+                 hysteresis: int = 2,
+                 min_scale: float = 1,
+                 max_scale: float = 2**32,
+                 max_norm: float = 0.0,
+                 norm_type: float = 2.0
+                 ) -> None:
+
         assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
-        assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
 
         # TODO(ver217): support shard_init when using from_pretrained()
         if shard_init:
-            warnings.warn(f'Shard init is not supported model.from_pretrained() yet. '
-                          'Please load weights after strategy.prepare()')
-        if stage == 3 and precision == 'fp32':
-            warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
-            precision = 'fp16'
-        self.precision = precision
+            warnings.warn(
+                f'Shard init is not supported model.from_pretrained() yet. '
+                'Please load weights after strategy.prepare()'
+            )
         self.shard_init = shard_init
 
-        optim_kwargs = dict(initial_scale=initial_scale,
-                            growth_factor=growth_factor,
-                            backoff_factor=backoff_factor,
-                            growth_interval=growth_interval,
-                            hysteresis=hysteresis,
-                            min_scale=min_scale,
-                            max_scale=max_scale,
-                            max_norm=max_norm,
-                            norm_type=norm_type)
+        warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
+
         # NOTE: dist should be initialized before calling get_current_device()
-        if stage == 3:
-            plugin_initializer = lambda: GeminiPlugin(
-                # gemini_config
-                device=get_current_device(),
-                placement_policy=placement_policy,
-                precision=precision,
-                pin_memory=pin_memory,
-                force_outputs_fp32=force_outputs_fp32,
-                strict_ddp_mode=shard_init,
-                search_range_m=search_range_m,
-                hidden_dim=hidden_dim,
-                min_chunk_size_m=min_chunk_size_m,
-                # zero_optim_config
-                gpu_margin_mem_ratio=gpu_margin_mem_ratio,
-                # optim_config
-                **optim_kwargs)
-        else:
-            plugin_initializer = lambda: LowLevelZeroPlugin(
-                # zero_config
-                stage=stage,
-                precision=precision,
-                # zero_optim_config
-                reduce_bucket_size_in_m=reduce_bucket_size,
-                overlap_communication=overlap_communication,
-                cpu_offload=(placement_policy == 'cpu'),
-                # optim_config
-                **optim_kwargs)
+        plugin_initializer = lambda: GeminiPlugin(
+            # gemini_config
+            device=get_current_device(),
+            placement_policy=placement_policy,
+            precision='fp16',
+            pin_memory=pin_memory,
+            force_outputs_fp32=force_outputs_fp32,
+            strict_ddp_mode=shard_init,
+            search_range_m=search_range_m,
+            hidden_dim=hidden_dim,
+            min_chunk_size_m=min_chunk_size_m,
+            # zero_optim_config
+            gpu_margin_mem_ratio=gpu_margin_mem_ratio,
+            # optim_config
+            initial_scale=initial_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            hysteresis=hysteresis,
+            min_scale=min_scale,
+            max_scale=max_scale,
+            max_norm=max_norm,
+            norm_type=norm_type
+        )
 
         super().__init__(seed, plugin_initializer)
 
     def _post_init(self) -> None:
-        assert isinstance(self.plugin, (LowLevelZeroPlugin, GeminiPlugin)), \
+        assert isinstance(self.plugin, GeminiPlugin), \
             f'{type(self).__name__}\'s plugin is not initialized properly.'
 
     def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        if isinstance(self.plugin, GeminiPlugin):
-            world_size = dist.get_world_size()
-            shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-            default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-            return ColoInitContext(device=get_current_device(),
-                                   dtype=torch.half,
-                                   default_pg=shard_pg,
-                                   default_dist_spec=default_dist_spec)
-        return super().model_init_context()
+        world_size = dist.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        return ColoInitContext(device=get_current_device(),
+                               dtype=torch.half,
+                               default_pg=shard_pg,
+                               default_dist_spec=default_dist_spec)
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        if isinstance(self.plugin, GeminiPlugin):
-            assert isinstance(model, GeminiModel)
-            ddp_model = model.unwrap()
-            assert isinstance(ddp_model, GeminiDDP)
-            return ddp_model.module
-        elif isinstance(self.plugin, LowLevelZeroPlugin):
-            assert isinstance(model, LowLevelZeroModel)
-            return model.module
-        else:
-            raise RuntimeError(f'Unsupported plugin {type(self.plugin)}')
+        assert isinstance(model, GeminiModel)
+        ddp_model = model.unwrap()
+        assert isinstance(ddp_model, GeminiDDP)
+        return ddp_model.module
 
     def save_pretrained(self,
                         model: nn.Module,
                         path: str,
                         only_rank0: bool = True,
                         tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
-        if isinstance(self.plugin, GeminiPlugin):
-            raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now')
-        super().save_pretrained(model, path, only_rank0, tokenizer)
+        raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now')
 
     def get_model_state_dict_shard(self, model: nn.Module, **config):
-        if not isinstance(self.plugin, GeminiPlugin):
-            yield from super().get_model_state_dict_shard(model, **config)
-        else:
-            # unwrapped_model = self._unwrap_model(model)
-            # for module in unwrapped_model.modules():
-            #     if isinstance(module, LoraLinear):
-            #         module.merge_weights = True
-            #         module.eval()
-            assert isinstance(model, LowLevelZeroModel)
-            yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
+        assert isinstance(self.plugin, GeminiPlugin)
+        yield from super().get_model_state_dict_shard(model, **config)
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 42867645290c..e1c1bbf19f35 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -1,4 +1,6 @@
+import os
 import random
+from collections import OrderedDict
 from typing import Callable, Optional
 
 import numpy as np
@@ -6,18 +8,27 @@
 import torch.distributed as dist
 import torch.nn as nn
 from coati.replay_buffer import ReplayBuffer
-from torch.optim import Optimizer
 from torch.utils.data import DataLoader
+from transformers.modeling_utils import PreTrainedModel
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel
 
-from .naive import NaiveStrategy
+from .base import Strategy
 from .sampler import DistributedSampler
 
 
-class DDPStrategy(NaiveStrategy):
+# TODO Move this to a util.py   (Moving to ray.util introduces ringed import)
+def get_grad_required_state_dict(model: nn.Module):
+    state_dict = OrderedDict()
+    for name, parameter in model.named_parameters():
+        if parameter.requires_grad:
+            state_dict[name] = parameter.detach()
+    return state_dict
+
+
+class DDPStrategy(Strategy):
     """
         Strategy for distributed training using torch.distributed.
     """
@@ -29,6 +40,24 @@ def __init__(self,
         self.seed = seed
         super().__init__(plugin_initializer)
 
+    def _try_init_dist(self, force: bool = False) -> None:
+        """Init the NCCL process group from the RANK/LOCAL_RANK/WORLD_SIZE/MASTER_* env vars; errors raise only if force."""
+        try:
+            rank = int(os.environ['RANK'])
+            local_rank = int(os.environ['LOCAL_RANK'])
+            world_size = int(os.environ['WORLD_SIZE'])
+            host = os.environ['MASTER_ADDR']
+            port = int(os.environ['MASTER_PORT'])
+            dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
+            torch.cuda.set_device(local_rank)
+        except KeyError as e:
+            if force:
+                raise RuntimeError(f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ "
+                                   "for more information on launching with torch") from e
+        except Exception:
+            if force:
+                raise
+
     def _post_init(self) -> None:
         assert isinstance(self.plugin, TorchDDPPlugin), \
             f'{type(self).__name__}\'s plugin is not initialized properly.'
@@ -42,9 +71,6 @@ def set_seed(self, seed: int) -> None:
         np.random.seed(seed)
         torch.manual_seed(seed)
 
-    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
-        self.booster.backward(loss, optimizer)
-
     def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
         return self.plugin.prepare_dataloader(replay_buffer,
                                               batch_size=replay_buffer.sample_batch_size,
@@ -68,4 +94,32 @@ def save_pretrained(self,
                         tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
         if only_rank0 and dist.get_rank() != 0:
             return
-        super().save_pretrained(model, path, only_rank0, tokenizer)
+        unwrapped_model = self.unwrap_model(model)
+        assert isinstance(unwrapped_model, PreTrainedModel)
+        unwrapped_model.save_pretrained(path)
+        if tokenizer is not None:
+            tokenizer.save_pretrained(path)
+
+    def get_model_state_dict_shard(self, model: nn.Module, **config):
+        # Yields shards split by accumulated byte size when 'shard_size' is given, else the whole state dict at once.
+        model = self.unwrap_model(model)
+        if 'requires_grad_only' in config and config['requires_grad_only'] == True:
+            state_dict = get_grad_required_state_dict(model)
+        else:
+            state_dict = model.state_dict()
+
+        if 'shard_size' in config:
+            shard_size = config['shard_size']
+            accumulate_size = 0
+            state_dict_shard = OrderedDict()
+            for name, param in state_dict.items():
+                state_dict_shard[name] = param
+                accumulate_size += param.numel() * param.element_size()
+                if accumulate_size >= shard_size:
+                    accumulate_size = 0
+                    yield state_dict_shard
+                    state_dict_shard = OrderedDict()
+            if accumulate_size > 0:
+                yield state_dict_shard
+        else:
+            yield state_dict
diff --git a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py
deleted file mode 100644
index d121237a68ea..000000000000
--- a/applications/Chat/coati/trainer/strategies/naive.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import os
-from collections import OrderedDict
-from typing import Optional
-
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from coati.replay_buffer import ReplayBuffer
-from torch.optim import Optimizer
-from torch.utils.data import DataLoader
-from transformers.modeling_utils import PreTrainedModel
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-
-from .base import Strategy
-
-
-# TODO Move this to a util.py   (Moving to ray.util introduces ringed import)
-def get_grad_required_state_dict(model: nn.Module):
-    state_dict = OrderedDict()
-    for name, parameter in model.named_parameters():
-        if parameter.requires_grad:
-            state_dict[name] = parameter.detach()
-    return state_dict
-
-
-class NaiveStrategy(Strategy):
-    """
-        Strategy for single GPU. No parallelism is used.
-    """
-
-    def _post_init(self) -> None:
-        assert self.plugin is None, \
-            f'{type(self).__name__}\'s plugin is not initialized properly.'
-
-    def setup_distributed(self) -> None:
-        self._try_init_dist(force=False)
-
-    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
-        # HACK: self.booster.backward(loss, optimizer) can't work if plugin is None,
-        #  it would run `optimizer.backward(loss)`, which is not compatible with torch.optim.Optimizer
-        assert self.plugin is None, "DO NOT call this method if plugin is not None"
-        loss.backward()
-
-    def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
-        return DataLoader(replay_buffer,
-                          batch_size=replay_buffer.sample_batch_size,
-                          shuffle=True,
-                          drop_last=True,
-                          pin_memory=pin_memory,
-                          collate_fn=replay_buffer.collate_fn)
-
-    def save_pretrained(self,
-                        model: nn.Module,
-                        path: str,
-                        only_rank0: bool = True,
-                        tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
-        unwrapped_model = self.unwrap_model(model)
-        assert isinstance(unwrapped_model, PreTrainedModel)
-        unwrapped_model.save_pretrained(path)
-        if tokenizer is not None:
-            tokenizer.save_pretrained(path)
-
-    def get_model_state_dict_shard(self, model: nn.Module, **config):
-        # TODO: implement sharding on naive strategy
-        model = self.unwrap_model(model)
-        if 'requires_grad_only' in config and config['requires_grad_only'] == True:
-            state_dict = get_grad_required_state_dict(model)
-        else:
-            state_dict = model.state_dict()
-
-        if 'shard_size' in config:
-            shard_size = config['shard_size']
-            accumulate_size = 0
-            state_dict_shard = OrderedDict()
-            for name, param in state_dict.items():
-                state_dict_shard[name] = param
-                accumulate_size += param.numel() * param.element_size()
-                if accumulate_size >= shard_size:
-                    accumulate_size = 0
-                    yield state_dict_shard
-                    state_dict_shard = OrderedDict()
-            if accumulate_size > 0:
-                yield state_dict_shard
-        else:
-            yield state_dict
-
-    def _try_init_dist(self, force: bool = False) -> None:
-        try:
-            rank = int(os.environ['RANK'])
-            local_rank = int(os.environ['LOCAL_RANK'])
-            world_size = int(os.environ['WORLD_SIZE'])
-            host = os.environ['MASTER_ADDR']
-            port = int(os.environ['MASTER_PORT'])
-            dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
-            torch.cuda.set_device(local_rank)
-        except KeyError as e:
-            if force:
-                raise RuntimeError(
-                    f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
-                )
-        except Exception as e:
-            if force:
-                raise e
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md
index 3e9d9c4325d8..56e4cc992c17 100644
--- a/applications/Chat/examples/README.md
+++ b/applications/Chat/examples/README.md
@@ -69,7 +69,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
     --grad_checkpoint
 ```
 ### Arg List
-- --strategy:          the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
+- --strategy:          the strategy used for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
 - --model:             model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
 - --pretrain:          pretrain model, type=str, default=None
 - --max_datasets_size: the max size of dataset, type=int, default=None
@@ -118,7 +118,7 @@ Model performance in [Anthropics paper](https://arxiv.org/abs/2204.05862):
 <div align=left>We also train the reward model based on LLaMA-7B, which reaches the ACC of 72.06% after 1 epoch, performing almost the same as Anthropic's best RM.
 
 ### Arg List
-- --strategy:          the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
+- --strategy:          the strategy used for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
 - --model:             model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
 - --pretrain:          pretrain model, type=str, default=None
 - --model_path:        the path of rm model(if continue to train), type=str, default=None
@@ -160,7 +160,7 @@ Prompt dataset: the instruction dataset mentioned in the above figure which incl
 Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning.
 
 ### Arg List
-- --strategy:          the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
+- --strategy:          the strategy used for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
 - --model:             model type of actor, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
 - --pretrain:          pretrain model, type=str, default=None
 - --rm_model:          reward model type, type=str, choices=['gpt2', 'bloom', 'opt', 'llama'], default=None
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index 00ed7aa36257..9d8dbb38ae5d 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -9,7 +9,7 @@
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.trainer import PPOTrainer
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
 from easy_models import BLOOMActor
@@ -24,14 +24,12 @@
 
 def main(args):
     # configure strategy
-    if args.strategy == 'naive':
-        strategy = NaiveStrategy()
-    elif args.strategy == 'ddp':
+    if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
     elif args.strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
@@ -202,8 +200,8 @@ def tokenize_fn(texts):
     parser.add_argument('--prompt_path', type=str, default=None, help='path to the prompt dataset')
     parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset')
     parser.add_argument('--strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive',
+                        choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='ddp',
                         help='strategy to use')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
     parser.add_argument('--pretrain', type=str, default=None)
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index d2b08b72ca95..54fe0ad55049 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -11,7 +11,7 @@
 from coati.models.llama import LlamaLM
 from coati.models.opt import OPTLM
 from coati.trainer import SFTTrainer
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
 from easy_dataset import EasyDataset
@@ -30,14 +30,12 @@
 
 def train(args):
     # configure strategy
-    if args.strategy == 'naive':
-        strategy = NaiveStrategy()
-    elif args.strategy == 'ddp':
+    if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+        strategy = GeminiStrategy(placement_policy='cuda')
     elif args.strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
@@ -45,15 +43,15 @@ def train(args):
     with strategy.model_init_context():
         print('Warning: currently only bloom is tested, gpt2,llama and opt are not tested')
         model = AutoModelForCausalLM.from_pretrained(args.pretrain).to(torch.cuda.current_device())
-        #if the args.save_path exists and args.save_path+'/adapter_config.json' exists, we'll load the adapter_config.json
-        if os.path.exists(args.save_path) and os.path.exists(args.save_path+'/adapter_config.json') \
-            and os.path.exists(args.save_path+'/adapter_model.bin'):
+        # if args.save_path already contains both adapter_config.json and adapter_model.bin, load the saved peft model from it
+        if os.path.exists(args.save_path) and os.path.exists(args.save_path + '/adapter_config.json') \
+                and os.path.exists(args.save_path + '/adapter_model.bin'):
             print("loading from saved peft model ", args.save_path)
             model = PeftModel.from_pretrained(model, args.save_path)
         else:
-            #we'll use peft lora library to do the lora
+            # we'll use peft lora library to do the lora
             lora_rank = args.lora_rank if args.lora_rank > 0 else 32
-            #config lora with rank of lora_rank
+            # config lora with rank of lora_rank
             lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                                      inference_mode=False,
                                      r=lora_rank,
@@ -170,8 +168,8 @@ def train(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
+                        choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='ddp')
     parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--dataset', type=str, default=None)
diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/Chat/examples/community/ray/train_prompts_on_ray.py
index 289330ad8415..1bba9ad66fbc 100644
--- a/applications/Chat/examples/community/ray/train_prompts_on_ray.py
+++ b/applications/Chat/examples/community/ray/train_prompts_on_ray.py
@@ -15,7 +15,7 @@
 from coati.models.loss import PolicyLoss, ValueLoss
 from coati.models.opt import OPTActor, OPTCritic
 from coati.models.utils import compute_reward
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 from torch.optim import Adam
@@ -99,19 +99,17 @@ def make_experience(self, experience_computation_ref: ExperienceCompositionRefs)
 
     def _init_strategy(self, strategy: str):
         # configure strategy
-        if strategy == 'naive':
-            self._strategy = NaiveStrategy()
-        elif strategy == 'ddp':
+        if strategy == 'ddp':
             self._strategy = DDPStrategy()
         elif strategy == 'colossalai_gemini':
-            self._strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+            self._strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
         elif strategy == 'colossalai_zero2':
-            self._strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+            self._strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
         else:
             raise ValueError(f'Unsupported strategy "{strategy}"')
 
     def _init_optimizer(self):
-        if isinstance(self._strategy, ColossalAIStrategy):
+        if isinstance(self._strategy, (GeminiStrategy, LowLevelZeroStrategy)):
             self._optimizer = HybridAdam(self._model.parameters(), lr=5e-6)
         else:
             self._optimizer = Adam(self._model.parameters(), lr=5e-6)
@@ -534,8 +532,8 @@ def main(args):
     parser = argparse.ArgumentParser()
     parser.add_argument('--prompt_csv_url', type=str)
     parser.add_argument('--strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='naive')
+                        choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='ddp')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
     parser.add_argument('--pretrain', type=str, default='gpt2')
     parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/Chat/examples/ray/1mmt_prompt.py
index afdd6a922cc7..5dd52f1790e6 100644
--- a/applications/Chat/examples/ray/1mmt_prompt.py
+++ b/applications/Chat/examples/ray/1mmt_prompt.py
@@ -103,8 +103,8 @@ def model_fn():
         kl_coef=0.1,
         debug=args.debug,
         update_lora_weights=not (args.lora_rank == 0),
-    # sync_models_from_trainers=True,
-    # generation kwargs:
+        # sync_models_from_trainers=True,
+        # generation kwargs:
         max_length=512,
         do_sample=True,
         temperature=1.0,
@@ -150,10 +150,10 @@ def tokenize_fn(texts):
     parser.add_argument('--num_trainers', type=int, default=1)
     parser.add_argument('--trainer_strategy',
                         choices=[
-                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
                             'colossalai_zero2_cpu'
                         ],
-                        default='naive')
+                        default='ddp')
     parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
     parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py
index fa7b2bd7edfd..60f049bd5b70 100644
--- a/applications/Chat/examples/ray/mmmt_prompt.py
+++ b/applications/Chat/examples/ray/mmmt_prompt.py
@@ -87,8 +87,8 @@ def model_fn():
             kl_coef=0.1,
             debug=args.debug,
             update_lora_weights=not (args.lora_rank == 0),
-    # sync_models_from_trainers=True,
-    # generation kwargs:
+            # sync_models_from_trainers=True,
+            # generation kwargs:
             max_length=512,
             do_sample=True,
             temperature=1.0,
@@ -163,10 +163,10 @@ def tokenize_fn(texts):
     parser.add_argument('--num_trainers', type=int, default=1)
     parser.add_argument('--trainer_strategy',
                         choices=[
-                            'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
+                            'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
                             'colossalai_zero2_cpu'
                         ],
-                        default='naive')
+                        default='ddp')
     parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
     parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh
index 4bf5524afb01..dec1f7c036c8 100755
--- a/applications/Chat/examples/test_ci.sh
+++ b/applications/Chat/examples/test_ci.sh
@@ -49,13 +49,13 @@ wandb init -m offline
 #  - roberta-*: RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
 SKIPPED_TESTS=(
     "gpt2-ddp"
-    "llama-naive" "llama-ddp" "llama-colossalai_gemini" "llama-colossalai_zero2"
-    "roberta-naive" "roberta-ddp" "roberta-colossalai_gemini" "roberta-colossalai_zero2"
+    "llama-ddp" "llama-colossalai_gemini" "llama-colossalai_zero2"
+    "roberta-ddp" "roberta-colossalai_gemini" "roberta-colossalai_zero2"
 )
 
 # These tests are quick and do not have any dependencies
 for model in 'gpt2' 'bloom' 'opt' 'llama' 'roberta'; do
-    for strategy in 'naive' 'ddp' 'colossalai_gemini' 'colossalai_zero2'; do
+    for strategy in 'ddp' 'colossalai_gemini' 'colossalai_zero2'; do
         if [[ " ${SKIPPED_TESTS[*]} " =~ " ${model}-${strategy} " ]]; then
             echo "[Test]: Skipped $model-$strategy"
             continue
@@ -91,12 +91,6 @@ torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'gpt2'
     --model 'gpt2' --strategy ddp --lora_rank 4 \
     --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
     --save_path ${BASE}/output
-
-# torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'facebook/opt-350m' \
-#     --model 'opt' --strategy naive \
-#     --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
-#     --save_path ${BASE}/output
-
 rm -rf ${BASE}/output
 
 # train rm
@@ -144,9 +138,9 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
     --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
     --test True --lora_rank 4 \
     --save_path ${BASE}/rm_ckpt.pt
-
 rm -rf ${BASE}/rm_ckpt.pt
 
+# train rl
 torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \
     --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
     --strategy colossalai_zero2 --num_episodes 1 \
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index a9bc0e532e5d..c748eeb21065 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -9,7 +9,7 @@
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM
 from coati.trainer import PPOTrainer
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from torch.optim import Adam
 from torch.utils.data import DataLoader
@@ -21,14 +21,12 @@
 
 def main(args):
     # configure strategy
-    if args.strategy == 'naive':
-        strategy = NaiveStrategy()
-    elif args.strategy == 'ddp':
+    if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
     elif args.strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
@@ -208,7 +206,7 @@ def main(args):
     parser.add_argument('--prompt_dataset', type=str, default=None, help='path to the prompt dataset')
     parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset')
     parser.add_argument('--strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
                         default='colossalai_zero2',
                         help='strategy to use')
     parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama', 'roberta'])
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 4a6851ab5b24..e9618e0c1d5e 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -14,7 +14,7 @@
 from coati.models.opt import OPTRM
 from coati.models.roberta import RoBERTaRM
 from coati.trainer import RewardModelTrainer
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
 from torch.optim import Adam
@@ -29,14 +29,12 @@
 
 def train(args):
     # configure strategy
-    if args.strategy == 'naive':
-        strategy = NaiveStrategy()
-    elif args.strategy == 'ddp':
+    if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+        strategy = GeminiStrategy(placement_policy='cuda')
     elif args.strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
@@ -195,7 +193,7 @@ def train(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
                         default='colossalai_zero2')
     parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'deberta', 'llama', 'roberta'], default='bloom')
     parser.add_argument('--pretrain', type=str, default=None)
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index 967b7c277c6a..30becd8a68a1 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -8,7 +8,7 @@
 from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
 from coati.models import convert_to_lora_module
 from coati.trainer import SFTTrainer
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
 from torch.optim import Adam
@@ -29,18 +29,16 @@
 
 def train(args):
     # configure strategy
-    if args.strategy == 'naive':
-        strategy = NaiveStrategy()
-    elif args.strategy == 'ddp':
+    if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
         raise NotImplementedError(
             'Gemini is not supported .from_pretrained() yet. We will update this after checkpoint io is ready.')
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+        strategy = GeminiStrategy(placement_policy='cuda')
     elif args.strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     elif args.strategy == 'colossalai_zero2_cpu':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
@@ -66,7 +64,7 @@ def train(args):
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'bloom':
-        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
+        tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'opt':
         tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
@@ -190,7 +188,7 @@ def train(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--strategy',
-                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'],
+                        choices=['ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'],
                         default='colossalai_zero2')
     parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
     parser.add_argument('--pretrain', type=str, default=None)
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index cfa39e44b476..19338da437ab 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -7,7 +7,7 @@
 import torch.distributed as dist
 from coati.models.gpt import GPTActor
 from coati.models.utils import calc_action_log_probs
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 
 from colossalai.nn.optimizer import HybridAdam
@@ -28,9 +28,9 @@ def run_test_checkpoint(strategy):
     if strategy == 'ddp':
         strategy = DDPStrategy()
     elif strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
     elif strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
 
diff --git a/applications/Chat/tests/test_data.py b/applications/Chat/tests/test_data.py
index 67016f6ed286..db641a6218b1 100644
--- a/applications/Chat/tests/test_data.py
+++ b/applications/Chat/tests/test_data.py
@@ -8,7 +8,7 @@
 from coati.models.base import RewardModel
 from coati.models.gpt import GPTActor, GPTCritic
 from coati.replay_buffer import NaiveReplayBuffer
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config
 
 from colossalai.testing import rerun_if_address_is_in_use, spawn
@@ -39,7 +39,7 @@ def run_test_data(strategy):
     if strategy == 'ddp':
         strategy = DDPStrategy()
     elif strategy == 'colossalai':
-        strategy = ColossalAIStrategy(placement_policy='cuda')
+        strategy = GeminiStrategy(placement_policy='cuda')
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
 

From 09fe9dc704dd388f08e3b19dca65d3d0be64f106 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Fri, 30 Jun 2023 17:23:22 +0800
Subject: [PATCH 14/14] [nfc]fix ColossalaiOptimizer is not defined (#4122)

---
 colossalai/engine/_base_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py
index ff8979d82401..db27ad0e8abe 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -12,7 +12,7 @@
 from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
 from colossalai.logging import get_dist_logger
 from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
-
+from colossalai.nn.optimizer import ColossalaiOptimizer
 
 class Engine:
     """Basic engine class for training and evaluation. It runs a specific process method