diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8fc14e0d531a..f40f4cc86d1b 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -14,7 +14,7 @@ - [Compatibility Test on Dispatch](#compatibility-test-on-dispatch) - [Release](#release) - [User Friendliness](#user-friendliness) - - [Commmunity](#commmunity) + - [Community](#community) - [Configuration](#configuration) - [Progress Log](#progress-log) @@ -97,7 +97,7 @@ This workflow is triggered by manually dispatching the workflow. It has the foll | `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. | | `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. | -### Commmunity +### Community | Workflow Name | File name | Description | | -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- | diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index a9e50e231164..a5a17d176c9d 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -3,24 +3,27 @@ name: Build on PR on: pull_request: types: [synchronize, opened, reopened] + branches: + - "main" + - "develop" + - "feature/**" paths: - - '.github/workflows/build_on_pr.yml' # run command & env variables change - - 'colossalai/**' # source code change - - '!colossalai/**.md' # ignore doc change - - 'op_builder/**' # cuda extension change - - '!op_builder/**.md' # ignore doc change - - 'requirements/**' # requirements change - - 'tests/**' # test change - - '!tests/**.md' # ignore doc change - - 'pytest.ini' # test config change - - 'setup.py' # install command change + - ".github/workflows/build_on_pr.yml" # run command & env variables change + - "colossalai/**" # source code change + - "!colossalai/**.md" # ignore doc change + - "op_builder/**" # cuda extension change + - "!op_builder/**.md" # ignore doc change + - "requirements/**" # requirements change + - "tests/**" # test change + - "!tests/**.md" # ignore doc change + - "pytest.ini" # test config change + - "setup.py" # install command change jobs: detect: name: Detect file change if: | github.event.pull_request.draft == false && - github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' outputs: changedExtenisonFiles: ${{ steps.find-extension-change.outputs.all_changed_files }} @@ -133,7 +136,7 @@ jobs: - name: Restore Testmon Cache run: | if [ -d /github/home/testmon_cache ]; then - [ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata /__w/ColossalAI/ColossalAI/ + [ ! 
-z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata* /__w/ColossalAI/ColossalAI/ fi - name: Execute Unit Testing @@ -147,7 +150,7 @@ jobs: - name: Store Testmon Cache run: | [ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache - cp -p -r /__w/ColossalAI/ColossalAI/.testmondata /github/home/testmon_cache/ + cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/ - name: Collate artifact env: diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 717cf729b3f3..3dcc4dfd182a 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -19,26 +19,26 @@ jobs: outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - - id: set-matrix - env: - TORCH_VERSIONS: ${{ inputs.torch_version }} - CUDA_VERSIONS: ${{ inputs.cuda_version }} - run: | - IFS=',' - DOCKER_IMAGE=() + - id: set-matrix + env: + TORCH_VERSIONS: ${{ inputs.torch_version }} + CUDA_VERSIONS: ${{ inputs.cuda_version }} + run: | + IFS=',' + DOCKER_IMAGE=() - for tv in $TORCH_VERSIONS - do - for cv in $CUDA_VERSIONS - do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"") - done - done + for tv in $TORCH_VERSIONS + do + for cv in $CUDA_VERSIONS + do + DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"") + done + done - container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) - container="[${container}]" - echo "$container" - echo "::set-output name=matrix::{\"container\":$(echo "$container")}" + container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) + container="[${container}]" + echo "$container" + echo "::set-output name=matrix::{\"container\":$(echo "$container")}" build: name: Test for PyTorch Compatibility @@ -70,6 +70,17 @@ jobs: - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + - name: Download cub for CUDA 10.2 + run: | + CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2) + + # check if it is CUDA 10.2 + # download cub + if [ "$CUDA_VERSION" = "10.2" ]; then + wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip + unzip 1.8.0.zip + cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + fi - name: Install Colossal-AI run: | pip install -r requirements/requirements.txt diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 2fca67b820a1..94a723388872 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -3,8 +3,8 @@ name: Compatibility Test on PR on: pull_request: paths: - - 'version.txt' - - '.compatibility' + - "version.txt" + - ".compatibility" jobs: matrix_preparation: @@ -58,6 +58,18 @@ jobs: - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + - name: Download cub for CUDA 10.2 + run: | + CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2) + + # check if it is CUDA 10.2 + # download cub + if [ "$CUDA_VERSION" = "10.2" ]; then + wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip + unzip 1.8.0.zip + cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + fi + - name: Install Colossal-AI run: | pip install -v --no-cache-dir . 
diff --git a/.github/workflows/doc_build_after_merge.yml b/.github/workflows/doc_build_on_schedule_after_release.yml similarity index 69% rename from .github/workflows/doc_build_after_merge.yml rename to .github/workflows/doc_build_on_schedule_after_release.yml index ede04b336620..62dfdc67257c 100644 --- a/.github/workflows/doc_build_after_merge.yml +++ b/.github/workflows/doc_build_on_schedule_after_release.yml @@ -1,18 +1,16 @@ -name: Build Documentation After Merge +name: Build Documentation On Schedule & After Release on: workflow_dispatch: - pull_request: - paths: - - 'version.txt' - - 'docs/**' - types: - - closed + schedule: + - cron: "0 12 * * *" # build doc every day at 8pm Singapore time (12pm UTC time) + release: + types: [published] jobs: build-doc: name: Trigger Documentation Build Workflow - if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' + if: github.repository == 'hpcaitech/ColossalAI' runs-on: ubuntu-latest steps: - name: trigger workflow in ColossalAI-Documentation diff --git a/.github/workflows/doc_check_on_pr.yml b/.github/workflows/doc_check_on_pr.yml index a863fcd70b44..992cc93b008c 100644 --- a/.github/workflows/doc_check_on_pr.yml +++ b/.github/workflows/doc_check_on_pr.yml @@ -2,47 +2,49 @@ name: Check Documentation on PR on: pull_request: + branches: + - "main" + - "develop" + - "feature/**" paths: - - 'docs/**' + - "docs/**" jobs: check-i18n: name: Check docs in diff languages if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' + github.event.pull_request.draft == false && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: - python-version: '3.8.14' + python-version: "3.8.14" - run: python .github/workflows/scripts/check_doc_i18n.py -d docs/source check-doc-build: name: Test if the docs can be built if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' + github.event.pull_request.draft == false && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 with: - path: './ColossalAI' + path: "./ColossalAI" fetch-depth: 0 - uses: actions/checkout@v2 with: - path: './ColossalAI-Documentation' - repository: 'hpcaitech/ColossalAI-Documentation' + path: "./ColossalAI-Documentation" + repository: "hpcaitech/ColossalAI-Documentation" - uses: actions/setup-python@v2 with: - python-version: '3.8.14' + python-version: "3.8.14" # we use the versions in the main branch as the guide for versions to display # checkout will give your merged branch @@ -57,7 +59,6 @@ jobs: git config user.name 'github-actions' git config user.email 'github-actions@github.com' - - name: Build docs run: | cache_dir=ColossalAI-Documentation/doc-build/.cache diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index fb2e28cd9b2e..325e2a7c95a4 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -1,17 +1,20 @@ name: Test Documentation on PR on: pull_request: + branches: + - "main" + - "develop" + - "feature/**" # any change in the examples folder will trigger check for the corresponding example. 
paths: - - 'docs/source/**.md' + - "docs/source/**.md" jobs: # This is for changed example files detect and output a matrix containing all the corresponding directory name. detect-changed-doc: if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' + github.event.pull_request.draft == false && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' runs-on: ubuntu-latest outputs: any_changed: ${{ steps.changed-files.outputs.any_changed }} @@ -26,10 +29,10 @@ jobs: - name: Locate base commit id: locate-base-sha run: | - curBranch=$(git rev-parse --abbrev-ref HEAD) - commonCommit=$(git merge-base origin/main $curBranch) - echo $commonCommit - echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT + curBranch=$(git rev-parse --abbrev-ref HEAD) + commonCommit=$(git merge-base origin/main $curBranch) + echo $commonCommit + echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT - name: Get all changed example files id: changed-files @@ -43,10 +46,9 @@ jobs: check-changed-doc: # Add this condition to avoid executing this job if the trigger event is workflow_dispatch. if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' && - needs.detect-changed-doc.outputs.any_changed == 'true' + github.event.pull_request.draft == false && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' && + needs.detect-changed-doc.outputs.any_changed == 'true' name: Test the changed Doc needs: detect-changed-doc runs-on: [self-hosted, gpu] @@ -61,8 +63,8 @@ jobs: - name: Checkout ColossalAI-Documentation uses: actions/checkout@v2 with: - path: './ColossalAI-Documentation' - repository: 'hpcaitech/ColossalAI-Documentation' + path: "./ColossalAI-Documentation" + repository: "hpcaitech/ColossalAI-Documentation" - name: Install Docer run: | diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index b22664ee47cc..31dbf7540091 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -1,17 +1,20 @@ name: Test Example on PR on: pull_request: + branches: + - "main" + - "develop" + - "feature/**" # any change in the examples folder will trigger check for the corresponding example. paths: - - 'examples/**' + - "examples/**" jobs: # This is for changed example files detect and output a matrix containing all the corresponding directory name. 
detect-changed-example: if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' + github.event.pull_request.draft == false && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' runs-on: ubuntu-latest outputs: matrix: ${{ steps.setup-matrix.outputs.matrix }} @@ -26,10 +29,10 @@ jobs: - name: Locate base commit id: locate-base-sha run: | - curBranch=$(git rev-parse --abbrev-ref HEAD) - commonCommit=$(git merge-base origin/main $curBranch) - echo $commonCommit - echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT + curBranch=$(git rev-parse --abbrev-ref HEAD) + commonCommit=$(git merge-base origin/main $curBranch) + echo $commonCommit + echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT - name: Get all changed example files id: changed-files @@ -61,10 +64,9 @@ jobs: check-changed-example: # Add this condition to avoid executing this job if the trigger event is workflow_dispatch. if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' && - needs.detect-changed-example.outputs.anyChanged == 'true' + github.event.pull_request.draft == false && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' && + needs.detect-changed-example.outputs.anyChanged == 'true' name: Test the changed example needs: detect-changed-example runs-on: [self-hosted, gpu] diff --git a/.github/workflows/release_docker_after_merge.yml b/.github/workflows/release_docker_after_publish.yml similarity index 84% rename from .github/workflows/release_docker_after_merge.yml rename to .github/workflows/release_docker_after_publish.yml index 607c19b05472..22698ca192ed 100644 --- a/.github/workflows/release_docker_after_merge.yml +++ b/.github/workflows/release_docker_after_publish.yml @@ -1,17 +1,14 @@ -name: Publish Docker Image to DockerHub after Merge +name: Publish Docker Image to DockerHub after Publish on: workflow_dispatch: - pull_request: - paths: - - 'version.txt' - types: - - closed + release: + types: [published] jobs: release: name: Publish Docker Image to DockerHub - if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' + if: github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: image: "hpcaitech/docker-in-docker:latest" @@ -26,7 +23,7 @@ jobs: run: | version=$(cat version.txt) tag=hpcaitech/colossalai:$version - docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 -t $tag ./docker + docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 --build-arg VERSION=v${version} -t $tag ./docker echo "tag=${tag}" >> $GITHUB_OUTPUT - name: Log in to Docker Hub @@ -50,7 +47,7 @@ jobs: - uses: actions/setup-python@v2 with: - python-version: '3.8.14' + python-version: "3.8.14" - name: Install requests run: pip install requests diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml index bbada74e6850..d9b131fd994c 100644 --- a/.github/workflows/report_test_coverage.yml +++ b/.github/workflows/report_test_coverage.yml @@ -10,7 +10,7 @@ jobs: report-test-coverage: runs-on: ubuntu-latest 
steps: - - name: 'Download artifact' + - name: "Download artifact" uses: actions/github-script@v6 with: script: | @@ -31,7 +31,7 @@ jobs: let fs = require('fs'); fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/report.zip`, Buffer.from(download.data)); - - name: 'Unzip artifact' + - name: "Unzip artifact" id: unzip run: | unzip report.zip @@ -58,7 +58,7 @@ jobs: echo "" >> coverage_report.txt mv coverage_report.txt coverage.txt - - name: 'Comment on PR' + - name: "Comment on PR" if: steps.unzip.outputs.hasReport == 'true' uses: actions/github-script@v6 with: diff --git a/.gitignore b/.gitignore index bf74a753894f..81113fa99dd5 100644 --- a/.gitignore +++ b/.gitignore @@ -155,3 +155,7 @@ colossalai/version.py # ignore coverage test file coverage.lcov coverage.xml + +# ignore testmon and coverage files +.coverage +.testmondata* diff --git a/README.md b/README.md index 2e6dcaa1eaf4..34c8a6b730a3 100644 --- a/README.md +++ b/README.md @@ -132,9 +132,9 @@ distributed training and inference in a few lines. -[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. -[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) -[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) +[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. +[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) +[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) [[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0) [[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg) @@ -362,6 +362,22 @@ If you want to install and enable CUDA kernel fusion (compulsory installation wh CUDA_EXT=1 pip install . ``` +For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory. + +```bash +# clone the repository +git clone https://github.com/hpcaitech/ColossalAI.git +cd ColossalAI + +# download the cub library +wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip +unzip 1.8.0.zip +cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + +# install +CUDA_EXT=1 pip install . +``` +
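If you are not sure which CUDA toolkit your environment actually provides, a quick check before building may save a failed compile. This is only an optional sanity check; it assumes `nvcc` is on your `PATH`, or that `CUDA_HOME` points to a toolkit that still ships a `version.txt` (the same file the CI workflows in this patch read):

```bash
# Print the CUDA toolkit release reported by the compiler, e.g. "release 10.2"
nvcc --version | grep -i "release"

# Alternatively, mirror the CI workflows and read the toolkit directory directly
cat "$CUDA_HOME/version.txt"
```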

(back to top)

## Use Docker diff --git a/applications/Chat/README.md b/applications/Chat/README.md index bc8481d96de3..29cd581d7cc9 100644 --- a/applications/Chat/README.md +++ b/applications/Chat/README.md @@ -73,9 +73,9 @@ More details can be found in the latest news. -[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. -[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) -[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) +[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. +[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) +[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) [[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0) [[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg) diff --git a/applications/Chat/coati/dataset/reward_dataset.py b/applications/Chat/coati/dataset/reward_dataset.py index faa1c94d2728..5dacf7e81464 100644 --- a/applications/Chat/coati/dataset/reward_dataset.py +++ b/applications/Chat/coati/dataset/reward_dataset.py @@ -6,7 +6,7 @@ from .utils import is_rank_0 -# Dahaos/rm-static +# Dahoas/rm-static class RmStaticDataset(Dataset): """ Dataset for reward model diff --git a/applications/Chat/coati/ray/src/detached_replay_buffer.py b/applications/Chat/coati/ray/src/detached_replay_buffer.py index 855eee48c5a5..18c8db388e88 100644 --- a/applications/Chat/coati/ray/src/detached_replay_buffer.py +++ b/applications/Chat/coati/ray/src/detached_replay_buffer.py @@ -34,7 +34,7 @@ def __init__(self, sample_batch_size: int, tp_world_size: int = 1, limit : int = ''' Workers in the same tp group share this buffer and need same sample for one step. Therefore a held_sample should be returned tp_world_size times before it could be dropped. - worker_state records wheter a worker got the held_sample + worker_state records whether a worker got the held_sample ''' self.tp_world_size = tp_world_size self.worker_state = [False] * self.tp_world_size diff --git a/applications/Chat/coati/ray/src/experience_maker_holder.py b/applications/Chat/coati/ray/src/experience_maker_holder.py index 94e4a3d537a5..0ae4e3125b70 100644 --- a/applications/Chat/coati/ray/src/experience_maker_holder.py +++ b/applications/Chat/coati/ray/src/experience_maker_holder.py @@ -22,7 +22,7 @@ class ExperienceMakerHolder: ''' Args: - detached_trainer_name_list: str list to get ray actor handleskkk + detached_trainer_name_list: str list to get ray actor handles strategy: experience_batch_size: batch size of generated experience kl_coef: the coefficient of kl divergence loss diff --git a/applications/Chat/coati/ray/src/pipeline_strategy.py b/applications/Chat/coati/ray/src/pipeline_strategy.py index 1780839c62ee..7ecb5d7d86d6 100644 --- a/applications/Chat/coati/ray/src/pipeline_strategy.py +++ b/applications/Chat/coati/ray/src/pipeline_strategy.py @@ -26,7 +26,7 @@ class PipelineModel(torch.nn.Module): ''' Actor has 2 kinds of jobs: forward and generate. 
- better to just pipelinize the inner model + better to just pipeline the inner model ''' def __init__(self, model: torch.nn.Module, diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md index 7ace4bfe6d18..ae3499bf268c 100644 --- a/applications/Chat/evaluate/README.md +++ b/applications/Chat/evaluate/README.md @@ -1,182 +1,329 @@ -# Evaluation - -In this directory, we introduce how you can evaluate your model with GPT-4. - -## Evaluation Pipeline - -The whole evaluation process undergoes the following three steps: -1. Prepare the questions following the internal data structure in the data format section (described below). -2. Generate answers from different models: - * Generate answers using GPT-3.5: [`generate_gpt35_answers.py`](generate_gpt35_answers.py). - * Generate answers using your own models: [`generate_answers.py`](generate_answers.py). -3. Evaluate models using GPT-4: [`evaluate.py`](evaluate.py). - -### Generate Answers -#### Generate Answers Using GPT-3.5 -You can provide your own OpenAI key to generate answers from GPT-3.5 using [`generate_gpt35_answers.py`](./generate_gpt35_answers.py). - -An example script is provided as follows: -```shell -python generate_gpt35_answers.py \ - --dataset "path to the question dataset" \ - --answer_path "path to answer folder" \ - --num_workers 4 \ - --openai_key "your openai key" \ - --max_tokens 512 \ -``` - -#### Generate Answers Using our Own Model -You can also generate answers using your own models. The generation process is divided into two stages: -1. Generate answers using multiple GPUs (optional) with batch processing: [`generate_answers.py`](./generate_answers.py). -2. Merge multiple shards and output a single file: [`merge.py`](./merge.py). - -An example script is given as follows: - -```shell -device_number=number of your devices -model_name="name of your model" -model_path="path to your model" -dataset="path to the question dataset" -answer_path="path to save the model answers" - -torchrun --standalone --nproc_per_node=$device_number generate_answers.py \ - --model 'llama' \ - --strategy ddp \ - --model_path $model_path \ - --model_name $model_name \ - --dataset $dataset \ - --batch_size 8 \ - --max_datasets_size 80 \ - --answer_path $answer_path \ - --max_length 512 - -python merge.py \ - --model_name $model_name \ - --shards $device_number \ - --answer_path $answer_path \ - -for (( i=0; iDescription | +| :-----------------: | :----------------------------------------------------------- | +| Brainstorming | Models are asked to generate a range of creative and diverse ideas according to the question. The capability of creativity is required. | +| Chat | Models are asked to continue a multi-round dialogue given the roles involved. The capability of understanding, memorizing previous rounds of the dialogue and answering according to the persona provided is required. | +| Classification | Models are asked to do classification tasks. The capability of accurate classification is required. | +| Closed QA | Models are asked to answer a closed QA question. The capability of answering questions with limited scope (such as single/multiple choice question) is required. | +| Extraction | Models are asked to extract information from a given material. The capability of extracting required information is required. | +| Generation | Models are asked to generate an email, letter, article, etc. The capability of generating texts in a high quality and human-written way is required. 
| +| Open QA | Models are asked to answer an open QA question (without context provided). The capability of answering questions with the models' own knowledge base is required. | +| Roleplay | Models are asked to play the role provided. The capability of engaging in the scenario and effectively interacting with the user is required. | +| Rewriting | Models are asked to do rewriting tasks such as translation and grammar correction. The capability of rewriting according to different instructions is required. | +| Summarization | Models are asked to summarize the given paragraph or passage. The capability of summarization is required. | + +To better understand each evaluation category, some example questions are provided below. + + +| Evaluation Category |
Chinese Example
|
English Example
| +| :-----------------: | :----------------------------------------------------------- | :----------------------------------------------------------- | +| Brainstorming | **Example 1:**
请介绍一下人工智能的多个领域。

**Example 2:**
请给出管理家庭财务的3个小技巧。
| **Example 1:**
How can I improve my memory? Any useful techniques you can suggest?

**Example 2:**
What are some ways to increase productivity while working from home? | +| Chat | **Example 1:**
基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。
小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。
老李:你好,小张,我很乐意帮助你。你想问些什么?
小张:我想知道如何确定鸡的品种和性别?
老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗?
小张:
**Example 2:**
基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。
小明:你好,王叔叔,我了解你想要让你父亲停药。
王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。
小明: | **Example 1:**
Complete a conversation based on the following character information. Amy is a 30-year-old chef who runs her own restaurant. Jack is a food blogger who specializes in reviewing local restaurants.
Amy: Hi Jack, I heard that you're a food blogger. Nice to meet you.
Jack: Hi Amy, yes I am. Your restaurant has been receiving a lot of good reviews lately.
Amy: Yes, we use only fresh and quality ingredients, and every dish is carefully crafted.
Jack:
**Example 2:**
Complete a dialogue based on the following role information. A: Elementary student B: Teacher
B: Good morning, Student A. Today we're going to learn about addition and subtraction.
A: Teacher, I already know this very well. Why do I need to learn it again?
B: | +| Classification | **Example 1:**
新闻标题:今日立夏,有一上联,立夏万物并秀,下联怎么对?
请根据以上新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。

**Example 2:**
新闻标题:赵丽颖很久没有登上微博热搜了,但你们别急,她只是在憋大招而已。
请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | **Example 1:**
Title: Fighting for Love (2020)
Description: Jasmine got obsessed with a man and now he's obsessed with her. Steamy nights, kisses and rules being broken awaits them. She turned his whole world upside down and now he's doing it to hers. In this free fall, can they survive each others love?\"
Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\".

**Example 2:**
Title: Summer Breeze: The Isley Brothers Greatest Hits Live (2005)
Description: Filmed in the US in 2005 and captured in excellent form led by Ron Isley's vocals and Ernie Isley's hard edged guitar. Virtually every track is a hit including Shout, Who's That Lady, Twist And Shout, Summer Breeze and Harvest For The World.
Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\"." | +| Closed QA | **Example 1:**
请从以下选项中选择正确答案。以下哪个是世界上最高山峰?
A. 长城
B. 泰山
C. 珠穆朗玛峰
D. 黄山

**Example 2:**
请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山?
选项:
A. 麦金利山
B. 喜马拉雅山
C. 乞力马扎罗山 | **Example 1:**
Which of the following options is NOT a primary color?
(a) yellow
(b) blue
(c) orange
(d) red
**Example 2:**
Choose the correct option to complete the following sentence: \"Harry Potter and the Chamber of Secrets\" is the ________ book in the Harry Potter series.
(A) first
(B) second
(C) third
(D) fourth | +| Extraction | **Example 1:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2007-4-7中新网4月7日电据中国消防在线消息,4月4日晚上7时30分左右,湖南长潭高速公路上发生一起6车连环相撞失火事故。长株潭三地消防部门共出动消防车21台,警力100余人。经过消防官兵近2个小时奋力扑救,大火被成功扑灭。据初步调查,有1人在此次事故中死亡。

**Example 2:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2014年1月15日,据外媒《俄罗斯报》报道称,位于北半球的澳大利亚现在正处于炎热的夏季,而近日也到了高温酷暑的时候,当地时间1月14日晚,澳大利亚南部一夜间发生至少250起火灾。受炎热天气及雷雨天气影响,澳大利亚南部一夜间发生至少250起火灾,灾情多集中在维多利亚州。火灾发生后,救援人员立即展开救灾行动。目前,大部分起火点火势已被控制。 | **Example 1:**
Ernest Hemingway, an American literary giant known for his spare and direct writing style, has penned timeless works such as 'The Old Man and the Sea', 'For Whom the Bell Tolls', and 'A Farewell to Arms', which have made a profound impact on the literary world and continue to be widely read and admired today.
Extract the name of the author mentioned above.

**Example 2:**
In the epic fantasy series 'A Song of Ice and Fire', George R.R. Martin weaves a complex web of political intrigue, war, and magic across the fictional continents of Westeros and Essos. Martin's richly developed characters and intricate plotlines have captivated readers worldwide, much like his other acclaimed works such as 'A Clash of Kings' and 'A Storm of Swords'.
Extract the name of the author in the above material. | +| Generation | **Example 1:**
请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。

**Example 2:**
请根据以下情节撰写一篇短篇小说:一名年轻人被困在一个荒岛上,他必须想办法生存下去直到被救援。但他很快发现自己并不孤单。 | **Example 1:**
Write a descriptive paragraph about an island to relax and unwind, including details about the location and atmosphere.

**Example 2:**
Can you help me write a persuasive email to my colleagues encouraging them to participate in a charitable fundraising event? | +| Open QA | **Example 1:**
请问万有引力定律由谁提出的?

**Example 2:**
哪些国家参与了第一次世界大战? | **Example 1:**
What are the four basic tastes of the human palate?

**Example 2:**
Who painted The Scream? | +| Rewriting | **Example 1:**
请将以下句子改为正确的语序。
生日快乐你祝他了吗?

**Example 2:**
将以下文本翻译成英语:
“这个周末我要去海边玩” | **Example 1:**
Please translate the following sentences, which are a mixture of Chinese and English, into full English.
我需要买一些healthy snacks,比如nuts和dried fruits,作为我的office的午餐.

**Example 2:**
Please rewrite the sentence using an inverted sentence structure.
We won't begin our journey until the sun sets. | +| Roleplay | **Example 1:**
我想让你担任Android开发工程师面试官。我将成为候选人,您将向我询问Android开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题,等待我的回答。不要写解释。像面试官一样一个一个问我,等我回答。我的第一句话是“面试官你好”。

**Example 2:**
我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1:**
Assume the role of a marriage counselor. Develop a series of communication exercises for a couple who are experiencing difficulties in their relationship. These exercises should promote active listening, empathy, and effective expression of emotions. Your first assignment is to provide a set of three exercises that focus on resolving conflicts and rebuilding trust.

**Example 2:**
I want you to act as a travel agent. I will tell you my desired destination, travel dates, and budget, and it will be your job to suggest the best travel itinerary for me. Your recommendations should include the best transportation options, hotel accommodations, and any popular tourist attractions nearby. My first request is "I want to plan a trip to Tokyo for a week, with a budget of $2000. I want to explore the culture and food of the city." | +| Summarization | **Example 1:**
请简要总结概括以下段落材料。
当地时间29日,泰国卫生部通报,新增143名新冠肺炎确诊病例和1名死亡病例。截止到当地时间29日上午,泰国累计确诊病例1388例,其中泰国籍1172例,非泰国籍216例。死亡病例累计7例。(原题为《泰国新增143例新冠肺炎确诊病例累计确诊1388例》)

**Example 2:**
请简要总结概括以下段落材料。
近期,参与京雄高铁站站房建设的中铁十二局,因在施工过程中存在环境违法行为被雄安新区公开通报。通报发出后,引起社会广泛关注。近日,人民网记者从雄安新区相关部门及中铁十二局获悉,新区有关部门已经集中约谈了中铁十二局等24个参与雄安建设的项目单位。对于约谈内容和结果,中铁十二局有关宣传负责人回应:“具体内容不清楚,最好找雄安新区相关部门了解情况。”新区有关部门负责人表示,此前涉及的环境违法行为,中铁十二局已基本整改到位,但约谈内容和结果暂不公开,接下来,将按部就班推进环境治理工作。(原题为《雄安新区:中铁十二局涉环境违法已基本整改到位》) | **Example 1:**
The 21 year-old-woman was treated by paramedics after the kitchen fire in Botfield Road in Shifnal, Shropshire. West Mercia Police said it is treating Wednesday morning's incident as arson and are appealing for any witnesses to contact them.The 50-year-old man has been arrested on suspicion of arson with intent to endanger life. For more on this and other stories from Shropshire.
Please briefly summarize the above material within 20 words.

**Example 2:**
South Wales Police were called to a property in Heolgerrig, Merthyr Tydfil, at about 13:40 BST on Sunday. The child was airlifted to Prince Charles Hospital but died shortly afterwards. Police are investigating the circumstances surrounding the incident and have appealed for witnesses. The girl's family are being supported by specially trained officers.
Please briefly summarize the above material within 20 words. | + + +### Evaluation Metrics + +#### GPT Evaluation + +GPT evaluation uses GPT models to evaluate the predictions of different models, and different pre-defined evaluation metrics are applied to different categories. The following table shows the 11 pre-defined evaluation metrics in Chinese: + +| Evaluation Metric |
Prompt Words
|
CoT (Chain-of-Thought)
| +| :-------------------: | :----------------------------------------------------------- | :----------------------------------------------------------- | +| Language organization | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。 | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。
2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。
3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。
4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。
5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。
6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。 | +| Relevance | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。 | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。
2. 阅读答案,确认答案是否直接回答了题目所问的问题。
3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。
4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。 | +| Creativity | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。
3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。
4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。 | +| Practicality | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。
3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。
4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。 | +| Correctness | 正确性(1-5):答案应该符合常识、生活实际等等 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。
3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。
4. 根据答案的正确性,给出一个1到5的评分。如果答案存在明显的错误或不合理之处,则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等,则应给出一个较高的评分。 | +| Naturalness | 自然(1-5):答案是否自然,并且符合问题给定的身份。 | 1. 阅读题目,确定题目提供的身份信息。
2. 检查答案内容是否符合题目给定的身份。
3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。 | +| Engagingness | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。 | 1. 阅读题目,确定对话的语境和背景。
2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。
3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。 | +| Reasonableness | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。 | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。
2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。
3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。 | +| Diversity | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。 | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。
2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。
3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。
4. 检查回答的合理性和适度,看看回答是否夸张或离题。5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。 | +| Fidelity | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。 | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。
2. 阅读题目的请求,确认回答请求时需要注意的细节。
3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。
4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。 | +| Conciseness | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。 | 1. 阅读题目,提取出材料的重点。
2. 阅读该总结,并注意其中的主要观点和信息。
3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。
4. 检查总结是否包含与主要观点无关的信息或冗余信息。
5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。
6. 给总结打出1-5的分数,其中5表示总结简明扼要,没有冗余内容,而1表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。 | + +GPT models evaluate the quality of model predictions based on the given prompt words and give a score between 1 and 5. + +#### Automatic Evaluation + +Automated metrics evaluate the capability of a model by comparing model predictions with reference answers. +There are two ways to obtain reference answers: + +* For instructions coming from human-designed problems (such as roleplay and chat), the reference answers are generated by GPT-3.5. +* For instructions related to classic NLP problems (such as classification, extraction, and summarization), the reference answers are collected from open-source datasets with target answers. + +There are 5 types of automatic evaluation metrics listed in the table below: + +| Automatic Evaluation Metric |
Description
| +| :---------------------------------: | :----------------------------------------------------------- | +| BLEU-n | Measure the accuracy between prediction and reference.
BLEU-1 (unigram) evaluates accuracy at the word level.
BLEU-n (n-gram) evaluates fluency at the sentence level. | +| ROUGE | ROUGE-N measures the number of matching n-grams between prediction and reference.
ROUGE-L measures the longest common subsequence (LCS) between prediction and reference. | +| Distinct | Measure the diversity of the generated text by counting the unique n-grams. | +| BERTScore | Measure the semantic similarity between tokens of predictions and references with BERT. | +| Precision
Recall
F1 Score | Measure the number of overlaps between prediction and reference (design for classification and extraction categories). | + +## Evaluation Process + +### Data Format + +#### Target Answers / Predictions + +A JSON file contains one list. Each element in the list is a target answer / prediction record for one instruction / question. +An element should have the following fields: + +* `category` (str, compulsory): The category of the instruction / question. +* `instruction` (str, compulsory): The instruction / question for the LLM. +* `input` (str, optional): The additional context of the instruction / question. +* `output` (str, optional): The sample output of the instruction (default: GPT-3.5). +* `target` (str, optional): The target answer for the instruction. +* `id` (int, compulsory): The ID of the instruction / question. + +If the `input` has a target answer, the `output` can be empty. Otherwise, we generate answers from GPT-3.5 as the `output`, and the `target` field is empty. + +Example: + +```json +[ + { + "category": "brainstorming", + "instruction": "请介绍一下人工智能的多个领域。", + "input": "", + "output": "{GPT-3.5 Answers}", + "target": "", + "id": 1 + }, + { + "category": "classification", + "instruction": "新闻标题:为什么电影《倩女幽魂》中燕赤霞一个道士却拿着金刚经?请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。", + "input": "", + "output": "", + "target": "{target answer}", + "id": 2 + } +] +``` + +#### Model Answers / Predictions + +A JSON file contains one list. Each element in the list is a model answer / prediction record for one instruction / question. + +An element should have the following fields: + +* `category` (str, compulsory): The category of the instruction / question. +* `instruction` (str, compulsory): The instruction / question for the LLM. +* `input` (str, optional): The additional context of the instruction / question. +* `output` (str, compulsory): The output from the LLM. +* `target` (str, optional): The target answer for the instruction. +* `id` (int, compulsory): The ID of the instruction / question. + +Example: + +```json +[ + { + "category": "brainstorming", + "instruction": "请介绍一下人工智能的多个领域。", + "input": "", + "output": "{Model Answers / Predictions}", + "target": "", + "id": 1 + }, + { + "category": "classification", + "instruction": "新闻标题:为什么电影《倩女幽魂》中燕赤霞一个道士却拿着金刚经?请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。", + "input": "", + "output": "{Model Answers / Predictions}", + "target": "{target answer}", + "id": 2 + } +] +``` + +### Prompt + +#### Battle Prompt + +The following is the Chinese battle prompt. In the battle prompt, the question and answers from two different models are fed into the prompt template. You can find an example battle prompt file in `prompt/battle_prompt`. + +```json +{ + "id": 1, + "system_prompt": "你是一个检查回答质量的好助手。", + "prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答 案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n", + "prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。" +} +``` + +#### Evaluation Prompt + +The following is an example of a Chinese GPT evaluation prompt. In an evaluation prompt, you should define your metrics in `metrics` and provide CoT(Chain-of-Thought) in `CoT`. You can find an example evaluation prompt file in `prompt/evaluation_prompt`. 
+ +```json +{ + "brainstorming": { + "id": 1, + "category": "brainstorming", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:" + }, + "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + } +} +``` + +`"metrics"`: the metrics that can be used in GPT evaluation. This field determines which metrics can be added to your config file. + +`"CoT"`: evaluation steps you prompt to GPT models for each metric defined in `"metrics"`. + +### Evaluation + +#### Configuration + +The following is an example of a Chinese config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics and automatic metrics in key `GPT` and `Metrics`. You can find an example Chinese config file in `config`. + +```json +{ + "language": "cn", + "category": { + "brainstorming": { + "GPT": ["relevance", "creativity", "practicality", "correctness"], + "Metrics": ["Distinct"] + }, + "chat": { + "GPT": [ "relevance", "naturalness", "engagingness", "reasonableness"], + "Metrics": ["Distinct"] + } + } +} +``` + +`"language"`: the language used to evaluate the model capability. We only support Chinese `"cn"` for now. + +`"category"`: the category/categories needed to evaluate the model capability. + +`"GPT"`: the metrics you want to use for GPT evaluation. + +`"Metrics"`: the metrics you want to use for automatic metrics evaluation. + +You can create your config file based on available settings listed in following table. + +| "category" | "GPT" | "Metrics" | +| :--------------: | :---------------------: | :---------: | +| "brainstorming" | "language organization" | "BLEU" | +| "chat" | "relevance" | "ROUGE" | +| "classification" | "creativity" | "Distinct" | +| "closed_qa" | "practicality" | "BERTScore" | +| "extraction" | "correctness" | "Precision" | +| "generation" | "naturalness" | "Recall" | +| "open_qa" | "engagingness" | "F1 score" | +| "rewriting" | "reasonableness" | | +| "roleplay" | "diversity" | | +| "summarization" | "fidelity" | | +| | "conciseness" | | + +> **NOTE:** For categories which don't have standard answers such as `brainstorming`, you should avoid using automatic metrics such as `BLEU` and `ROUGE` which are based on similarity measures and you should use `Distinct` instead in your config file. + +#### Evaluate + +After setting the configuration file, you can evaluate the model using `eval.py`. If you want to make comparisons between answers of two different models, you should specify two answer files in the argument `answer_file_list` and two model names in the argument `model_name_list`. If you want to evaluate one answer file, the length of both `answer_file_list` and `model_name_list` should be 1 and the program will perform evaluation using automatic metrics and GPT models. 
+ +An example script is provided as follows: + +```shell +python eval.py \ + --config_file "path to the config file" \ + --battle_prompt_file "path to the prompt file for battle" \ + --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \ + --target_file "path to the target answer file" \ + --answer_file_list "path to the answer files of at most 2 models" \ + --model_name_list "the names of at most 2 models" \ + --gpt_model "which GPT model to use for evaluation" \ + --save_path "path to save results" \ + --openai_key "your openai key" \ +``` + +## FAQ + +
How can I add a new GPT evaluation metric? + +For example, if you want to add a new metric `persuasiveness` into category `brainstorming`, you should add the metric definition and its corresponding CoT (Chain-of-Thought) in the evaluation prompt file in `prompt/evaluation_prompt`. The CoT can be generated using ChatGPT. You can prompt ChatGPT to generate evaluation steps for the new metric. + +```json +{ + "brainstorming": { + "id": 1, + "category": "brainstorming", + "metrics": { + "persuasiveness": "说服力(1-5):XXX" + }, + "CoT": { + "persuasiveness": "XXX\n\n说服力:" + }, + "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + } +} +``` + +
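Beyond the GPT-based metrics covered in this FAQ, the automatic metrics listed earlier (BLEU, ROUGE, Distinct, BERTScore, Precision/Recall/F1) are computed directly from the model outputs. The snippet below is only a rough sketch of the Distinct-n idea (unique n-grams divided by all n-grams); it is not the pipeline's implementation, which lives in the `metrics` module imported by `evaluator.py` and may tokenize text (especially Chinese) differently.

```python
from collections import Counter
from typing import List


def distinct_n(predictions: List[str], n: int = 2) -> float:
    """Rough Distinct-n: unique n-grams divided by total n-grams across all
    predictions. Whitespace tokenization is a simplification; Chinese text
    would normally be split per character or word instead."""
    ngram_counts = Counter()
    for text in predictions:
        tokens = text.split()
        for i in range(len(tokens) - n + 1):
            ngram_counts[tuple(tokens[i:i + n])] += 1
    total = sum(ngram_counts.values())
    return len(ngram_counts) / total if total else 0.0


if __name__ == "__main__":
    sample = ["the cat sat on the mat", "the cat sat on the rug"]
    print(f"Distinct-2: {distinct_n(sample):.3f}")
```

Because Distinct needs no reference answer, the configuration guide above suggests it for open-ended categories such as `brainstorming`, where overlap-based metrics like BLEU and ROUGE are not meaningful.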
+ +## To Do + +- [ ] Add evaluation for English capability +- [ ] Support UniEval +- [x] Support GPT-4 evaluation + +## Citations + +```bibtex +@misc{vicuna2023, + title = {Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90\%* ChatGPT Quality}, + url = {https://vicuna.lmsys.org}, + author = {Chiang, Wei-Lin and Li, Zhuohan and Lin, Zi and Sheng, Ying and Wu, Zhanghao and Zhang, Hao and Zheng, Lianmin and Zhuang, Siyuan and Zhuang, Yonghao and Gonzalez, Joseph E. and Stoica, Ion and Xing, Eric P.}, + month = {March}, + year = {2023} +} + +@misc{liu2023geval, + title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, + author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, + year={2023}, + eprint={2303.16634}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/applications/Chat/evaluate/config/config_cn.json b/applications/Chat/evaluate/config/config_cn.json new file mode 100644 index 000000000000..a8c7ea8a3135 --- /dev/null +++ b/applications/Chat/evaluate/config/config_cn.json @@ -0,0 +1,123 @@ +{ + "language": "cn", + "category": { + "brainstorming": { + "GPT": [ + "language organization", + "relevance", + "creativity", + "practicality", + "correctness" + ], + "Metrics": [ + "Distinct" + ] + }, + "chat": { + "GPT": [ + "language organization", + "relevance", + "naturalness", + "engagingness", + "reasonableness" + ], + "Metrics": [ + "Distinct" + ] + }, + "classification": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "Precision", + "Recall", + "F1 score" + ] + }, + "closed_qa": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + }, + "extraction": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "Precision", + "Recall", + "F1 score" + ] + }, + "generation": { + "GPT": [ + "language organization", + "relevance", + "diversity" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + }, + "open_qa": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "Distinct" + ] + }, + "rewriting": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + }, + "roleplay": { + "GPT": [ + "language organization", + "relevance", + "fidelity", + "creativity" + ], + "Metrics": [ + "Distinct" + ] + }, + "summarization": { + "GPT": [ + "language organization", + "relevance", + "correctness", + "conciseness" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + } + } +} diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py new file mode 100644 index 000000000000..4067b15db6e8 --- /dev/null +++ b/applications/Chat/evaluate/eval.py @@ -0,0 +1,103 @@ +import argparse +import json +import os + +import openai +from evaluator import Evaluator +from utils import jload + + +def main(args): + assert len(args.answer_file_list) == len( + args.model_name_list), "The number of answer files and model names should be equal!" 
+ + # load config + config = jload(args.config_file) + + if config["language"] == "cn": + # get metric settings for all categories + metrics_per_category = {} + for category in config["category"].keys(): + metrics_all = {} + for metric_type, metrics in config["category"][category].items(): + metrics_all[metric_type] = metrics + metrics_per_category[category] = metrics_all + + battle_prompt = None + if args.battle_prompt_file: + battle_prompt = jload(args.battle_prompt_file) + + gpt_evaluation_prompt = None + if args.gpt_evaluation_prompt_file: + gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file) + + if len(args.model_name_list) == 2 and not battle_prompt: + raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!") + + if len(args.model_name_list) == 1 and not gpt_evaluation_prompt: + raise Exception( + "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!") + + # initialize evaluator + evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model, + config["language"]) + if len(args.model_name_list) == 2: + answers1 = jload(args.answer_file_list[0]) + answers2 = jload(args.answer_file_list[1]) + + assert len(answers1) == len(answers2), "The number of answers for two models should be equal!" + + evaluator.battle(answers1=answers1, answers2=answers2) + evaluator.save(args.save_path, args.model_name_list) + elif len(args.model_name_list) == 1: + targets = jload(args.target_file) + answers = jload(args.answer_file_list[0]) + + assert len(targets) == len(answers), "The number of target answers and model answers should be equal!" + + evaluator.evaluate(answers=answers, targets=targets) + evaluator.save(args.save_path, args.model_name_list) + else: + raise ValueError("Unsupported number of answer files and model names!") + else: + raise ValueError(f'Unsupported language {config["language"]}!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.') + parser.add_argument('--config_file', + type=str, + default=None, + required=True, + help='path to the file of target results') + parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle') + parser.add_argument('--gpt_evaluation_prompt_file', + type=str, + default=None, + help='path to the prompt file for gpt evaluation') + parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file') + parser.add_argument('--answer_file_list', + type=str, + nargs='+', + default=[], + required=True, + help='path to the answer files of at most 2 models') + parser.add_argument('--model_name_list', + type=str, + nargs='+', + default=[], + required=True, + help='the names of at most 2 models') + parser.add_argument('--gpt_model', + default="gpt-3.5-turbo", + choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"], + help='which GPT model to use for evaluation') + parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results') + parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key') + args = parser.parse_args() + + if args.openai_key is not None: + os.environ["OPENAI_API_KEY"] = args.openai_key + openai.api_key = os.getenv("OPENAI_API_KEY") + + main(args) diff --git a/applications/Chat/evaluate/eval.sh b/applications/Chat/evaluate/eval.sh new file mode 100755 index 000000000000..f5729e6ee5c7 --- 
/dev/null +++ b/applications/Chat/evaluate/eval.sh @@ -0,0 +1,9 @@ +python eval.py \ + --config_file "path to the config file" \ + --battle_prompt_file "path to the prompt file for battle" \ + --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \ + --target_file "path to the target answer file" \ + --answer_file_list "path to the answer files of at most 2 models" \ + --model_name_list "the names of at most 2 models" \ + --save_path "path to save results" \ + --openai_key "your openai key" \ diff --git a/applications/Chat/evaluate/evaluate.py b/applications/Chat/evaluate/evaluate.py deleted file mode 100644 index 2f9c9ce8e10d..000000000000 --- a/applications/Chat/evaluate/evaluate.py +++ /dev/null @@ -1,256 +0,0 @@ -# Adapted form https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py -# Copyright 2023 LM-SYS@FastChat - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import json -import os -import time -import re -import concurrent.futures - -import openai -import tqdm -import shortuuid -import logging - -from utils import jload, jdump, get_json_list - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -MAX_API_RETRY = 3 - - -def get_eval(sys_prompt, user_prompt: str, answer_id: int, max_tokens: int, model: str): - logging.basicConfig(level=logging.INFO) - for _ in range(MAX_API_RETRY): - try: - response = openai.ChatCompletion.create( - model=model, - messages=[{ - 'role': 'system', - 'content': sys_prompt - }, { - 'role': 'user', - 'content': user_prompt, - }], - temperature=0.2, - max_tokens=max_tokens, - ) - review = response['choices'][0]['message']['content'] - return {"review": review, 'id': answer_id} - except Exception as e: - logger.error(e) - time.sleep(1) - logger.error(f' Review {answer_id} failed after {MAX_API_RETRY} retries.') - return 'error' - - -def parse_score(review): - try: - pattern = re.compile('([0-9]|10) out of 10') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - pattern = re.compile('a score of ([0-9]|10)') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - pattern = re.compile('([0-9]|10)/10') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - score_pair = review.split('\n')[0] - score_pair = score_pair.replace(',', ' ') - sp = score_pair.split(' ') - if len(sp) == 2: - return [float(sp[0]), float(sp[1])] - else: - raise Exception('Invalid score pair.') - except Exception as e: - return [-1, -1] - - -def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2): - reviewer_idx = 0 - for idx, reviewer in enumerate(reviewer_jsons): - if reviewer['category'] == cat: - reviewer_idx = idx - break - prompt_id = reviewer_jsons[reviewer_idx]['prompt_id'] - prompt_json = prompt_jsons[prompt_id-1] - assert prompt_json['prompt_id'] == prompt_id - - sys_prompt = 
prompt_json['system_prompt'] - prompt_template = prompt_json['prompt_template'] - defaults = prompt_json['defaults'] - prompt = prompt_template.format( - question=ques, answer_1=ans1, answer_2=ans2, **defaults) - - return sys_prompt, prompt, reviewer_idx+1 - - -def evaluate(args): - answer1_jsons = jload(args.answer_file_list[0]) - answer2_jsons = jload(args.answer_file_list[1]) - reviewer_jsons = get_json_list(args.reviewer_file) - prompt_jsons = get_json_list(args.prompt_file) - - assert len(answer1_jsons) == len(answer2_jsons) - - handles = [] - review_jsons = [] - - total_len = len(answer1_jsons) - question_idx_list = list(range(total_len)) - - logger.info( - f' Total number of answers: {len(answer2_jsons)}.') - - reviews = [] - with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor: - futures = [] - for i in question_idx_list: - assert answer1_jsons[i]['id'] == answer2_jsons[i]['id'] - answer_id = answer1_jsons[i]['id'] - - ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \ - " " + answer1_jsons[i]['input'] - cat = answer1_jsons[i]['category'] - ans1 = answer1_jsons[i]['output'] - ans2 = answer2_jsons[i]['output'] - - sys_prompt, prompt, reviewer_id = gen_prompt( - reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2) - - review_id = shortuuid.uuid() - review_jsons.append({ - 'review_id': review_id, - 'id': answer_id, - 'reviewer_id': reviewer_id, - 'metadata': {} - }) - - future = executor.submit( - get_eval, sys_prompt, prompt, answer_id, args.max_tokens, args.model) - futures.append(future) - - for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): - reviews.append(future.result()) - - reviews.sort(key=lambda x: x['id']) - review_jsons.sort(key=lambda x: x['id']) - - ans1_score = 0 - ans2_score = 0 - better_count = 0 - worse_count = 0 - tie_count = 0 - invalid_count = 0 - - better_file = [] - worse_file = [] - tie_file = [] - invalid_file = [] - output_review_file = [] - - for idx, review in enumerate(reviews): - scores = parse_score(review['review']) - review_jsons[idx]['review'] = review['review'] - review_jsons[idx]['score'] = scores - - if scores[0] == -1 and scores[1] == -1: - invalid_count += 1 - invalid_file.append(review_jsons[idx]) - logger.info(f' Invalid score pair: {review_jsons[idx]["id"]}.') - else: - if scores[0] > scores[1]: - worse_count += 1 - worse_file.append(review_jsons[idx]) - elif scores[0] < scores[1]: - better_count += 1 - better_file.append(review_jsons[idx]) - else: - tie_count += 1 - tie_file.append(review_jsons[idx]) - ans1_score += scores[0] - ans2_score += scores[1] - - output_review_file.append(review_jsons[idx]) - - better_file.sort(key=lambda x: x['id']) - worse_file.sort(key=lambda x: x['id']) - tie_file.sort(key=lambda x: x['id']) - invalid_file.sort(key=lambda x: x['id']) - output_review_file.sort(key=lambda x: x['id']) - - name1 = os.path.basename(args.answer_file_list[0]).split("_answers")[0] - name2 = os.path.basename(args.answer_file_list[1]).split("_answers")[0] - prefix = f"{name1}_vs_{name2}" - - jdump(better_file, os.path.join( - args.output_folder, prefix, f"{prefix}_better.json")) - jdump(worse_file, os.path.join( - args.output_folder, prefix, f"{prefix}_worse.json")) - jdump(tie_file, os.path.join( - args.output_folder, prefix, f"{prefix}_tie.json")) - jdump(invalid_file, os.path.join( - args.output_folder, prefix, f"{prefix}_invalid.json")) - jdump(output_review_file, os.path.join( - args.output_folder, 
prefix, f"{prefix}_review.json")) - - if os.path.exists(os.path.join(args.output_folder, "results.json")): - results = jload(os.path.join(args.output_folder, "results.json")) - else: - results = {} - results[prefix] = {'model': [name1, name2], 'better': better_count, 'worse': worse_count, 'tie': tie_count, 'win_rate': better_count / - (len(reviews)-invalid_count), 'score': [ans1_score/(len(reviews)-invalid_count), ans2_score/(len(reviews)-invalid_count)]} - jdump(results, os.path.join(args.output_folder, "results.json")) - - logger.info(f' Total {invalid_count} invalid score pair(s).') - logger.info(f' Model {name2} has {better_count} better answer(s).') - logger.info(f' Model {name2} has {worse_count} worse answer(s).') - logger.info(f' {tie_count} answer(s) play(s) to a tie.') - logger.info( - f' Win rate of model {name2}: {better_count/(len(reviews)-invalid_count):.2f}') - logger.info( - f' Model {name1} average score: {ans1_score/(len(reviews)-invalid_count):.2f}') - logger.info( - f' Model {name2} average score: {ans2_score/(len(reviews)-invalid_count):.2f}') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Model evaluation.') - parser.add_argument('--answer_file_list', nargs='+', default=[]) - parser.add_argument('--prompt_file') - parser.add_argument('--reviewer_file') - parser.add_argument('--output_folder', type=str, default="./output") - parser.add_argument('--openai_key', type=str, default=None) - parser.add_argument('--model', type=str, default="gpt-4") - parser.add_argument('--num_workers', type=int, default=8) - parser.add_argument('--max_tokens', type=int, default=512, - help='maximum number of tokens produced in the output') - args = parser.parse_args() - - if args.openai_key is not None: - os.environ["OPENAI_API_KEY"] = args.openai_key - openai.api_key = os.getenv("OPENAI_API_KEY") - - evaluate(args) diff --git a/applications/Chat/evaluate/evaluate.sh b/applications/Chat/evaluate/evaluate.sh deleted file mode 100755 index c51aa941019e..000000000000 --- a/applications/Chat/evaluate/evaluate.sh +++ /dev/null @@ -1,9 +0,0 @@ -python evaluate.py \ - --answer_file_list "path to answers of model 1" "path to answers of model 2" \ - --prompt_file "path to prompt file" \ - --reviewer_file "path to reviewer file" \ - --output_folder "path to output folder" \ - --openai_key "your openai key" \ - --model "gpt-4" \ - --num_workers 8 \ - --max_tokens 512 \ diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py new file mode 100644 index 000000000000..433d775d27ed --- /dev/null +++ b/applications/Chat/evaluate/evaluator.py @@ -0,0 +1,140 @@ +import os +from typing import Any, Dict, List + +import gpt_evaluate +import metrics +import pandas as pd +from utils import get_data_per_category, jdump + + +class Evaluator(object): + """ + A class named Evaluator includes GPT-3.5/GPT-4 evaluation + and automatic evaluation + + """ + + def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any], + gpt_model: str, language: str) -> None: + self.params = params + self.battle_prompt = battle_prompt + self.gpt_evaluation_prompt = gpt_evaluation_prompt + self.gpt_model = gpt_model + self.language = language + self.automatic_metric_stats = dict() + self.gpt_evaluation_results = dict() + self.battle_results = [] + + def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None: + """ + Comparison between two models using GPT-4 as the reviewer. 
+ """ + + self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt) + + def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None: + """ + A comprehensive evaluation of the answers from the model. + The function evaluates the model's performance from different perspectives + using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics. + + The metrics will be decided by the config file. + + """ + + def switch(metric): + if metric == "BLEU": + return metrics.bleu_score(preds=predicts_list, targets=targets_list) + elif metric == "ROUGE": + return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list) + elif (metric == "Distinct"): + return metrics.distinct_score(preds=predicts_list) + elif (metric == "BERTScore"): + return metrics.bert_score(preds=predicts_list, targets=targets_list) + elif (metric == "Precision"): + return metrics.precision(preds=predicts_list, targets=targets_list) + elif (metric == "Recall"): + return metrics.recall(preds=predicts_list, targets=targets_list) + elif (metric == "F1 score"): + return metrics.F1_score(preds=predicts_list, targets=targets_list) + else: + raise ValueError(f"Unexpected metric") + + answers_per_category = get_data_per_category(answers, list(self.params.keys())) + targets_per_category = get_data_per_category(targets, list(self.params.keys())) + + # automatic evaluation + for category in self.params: + if len(answers_per_category[category]) == 0: + print(f"Category {category} specified in your config doesn't have corresponding answers!") + continue + + category_metrics = self.params[category]["Metrics"] + self.automatic_metric_stats[category] = {} + + targets_list = [ + target["target"] if target["target"] else target["output"] for target in targets_per_category[category] + ] + predicts_list = [answer["output"] for answer in answers_per_category[category]] + + for metric in category_metrics: + self.automatic_metric_stats[category].update(switch(metric=metric)) + + # gpt evaluation + for category in self.params: + if len(answers_per_category[category]) == 0: + print(f"Category {category} specified in your config doesn't have corresponding answers!") + continue + + category_metrics = self.params[category]["GPT"] + + prompt = self.gpt_evaluation_prompt.get(category, None) + if prompt is None: + print(f"No prompt for category {category}! Use prompt for category general now.") + prompt = self.gpt_evaluation_prompt["general"] + + self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt, + category_metrics, category, self.gpt_model) + + def save(self, path: str, model_name_list: List[str]) -> None: + """ + Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics. + + """ + + if len(model_name_list) == 2: + save_path = os.path.join(path, "gpt_evaluate", "battle_results") + gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path) + else: + # save evaluation results for automatic metrics + automatic_df = pd.DataFrame(self.automatic_metric_stats) + + automatic_results_save_path = os.path.join(path, "automatic_results") + if not os.path.exists(automatic_results_save_path): + os.makedirs(automatic_results_save_path) + automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True) + + # Save evaluation results for GPT-3.5 evaluation metrics. 
+ all_evaluations = [] + base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results") + evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results") + + for category, evaluations in self.gpt_evaluation_results.items(): + jdump( + evaluations, + os.path.join(evaluation_results_save_path, model_name_list[0], + f"{category}_evaluation_results.json")) + all_evaluations.extend(evaluations) + + jdump(all_evaluations, + os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json")) + + # Start to calculate scores and save statistics. + evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics") + gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations, + evaluation_statistics_save_path) + + # Save charts and csv. + evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses") + gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path, + evaluation_analyses_save_path) diff --git a/applications/Chat/evaluate/generate_answers.py b/applications/Chat/evaluate/generate_answers.py deleted file mode 100644 index fbebf5c5e6f6..000000000000 --- a/applications/Chat/evaluate/generate_answers.py +++ /dev/null @@ -1,173 +0,0 @@ -import argparse -import os -import random -import copy -import math -from tqdm import tqdm - -import torch -import torch.distributed as dist -import transformers - -from coati.models.bloom import BLOOMActor -from coati.models.gpt import GPTActor -from coati.models.opt import OPTActor -from coati.models.roberta import RoBERTaActor -from coati.models.llama import LlamaActor -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from transformers import AutoTokenizer, RobertaTokenizer -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.logging import get_dist_logger - -from utils import jload, jdump, is_rank_0 - - -logger = get_dist_logger() - -PROMPT_DICT = { - "prompt_input": - ("Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"), - "prompt_no_input": ("Below is an instruction that describes a task. 
" - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Response:"), -} - - -def generate(args): - # torch.cuda.set_per_process_memory_fraction(0.4) - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': - strategy = DDPStrategy() - elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') - elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') - elif args.strategy == 'colossalai_zero2_cpu': - strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') - else: - raise ValueError(f'Unsupported strategy "{args.strategy}"') - - world_size = dist.get_world_size() - rank = dist.get_rank() - - with strategy.model_init_context(): - if args.model == 'gpt2': - actor = GPTActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'bloom': - actor = BLOOMActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'opt': - actor = OPTActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'roberta': - actor = RoBERTaActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'llama': - actor = LlamaActor(pretrained=args.model_path).to( - torch.float16).to(torch.cuda.current_device()) - else: - raise ValueError(f'Unsupported model "{args.model}"') - - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m') - elif args.model == 'roberta': - tokenizer = RobertaTokenizer.from_pretrained("roberta-base") - elif args.model == 'llama': - tokenizer = AutoTokenizer.from_pretrained(args.model_path, - padding_side="right", - use_fast=False, - ) - tokenizer.eos_token = '<\s>' - else: - raise ValueError(f'Unsupported model "{args.model}"') - - questions = [] - if args.max_datasets_size is not None: - questions = random.sample(jload(args.dataset), args.max_datasets_size) - if is_rank_0(): - logger.info( - f"Limiting dataset to {args.max_datasets_size} examples.") - questions = questions[rank:args.max_datasets_size:world_size] - - answers = copy.deepcopy(questions) - - prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"] - sources = [ - prompt_input.format_map(example) if example.get( - "input", "") != "" else prompt_no_input.format_map(example) - for example in questions - ] - - if is_rank_0(): - logger.info("Tokenizing inputs... 
This may take some time...") - - input_ids_list = [] - - for string in sources: - input_ids = tokenizer.encode(string, return_tensors='pt').squeeze(0) - input_ids_list.append(input_ids) - - bar = tqdm(range(math.ceil(len(input_ids_list)/args.batch_size)), - desc=f'steps', disable=not is_rank_0()) - - actor.eval() - with torch.no_grad(): - for i in range(0, len(input_ids_list), args.batch_size): - batch = input_ids_list[i:i+args.batch_size] - batch = [i.flip(dims=[0]) for i in batch] - batch = torch.nn.utils.rnn.pad_sequence(batch, - batch_first=True, - padding_value=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0).to(torch.cuda.current_device()) - batch = batch.flip(dims=[1]) - attention_mask = batch.ne(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0) - - outputs = actor.model.generate(batch, attention_mask=attention_mask, - max_length=args.max_length, - do_sample=True, - top_k=50, - top_p=0.95, - num_return_sequences=1) - - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - for j in range(batch.size(0)): - answers[i + - j]['output'] = outputs[j].split("### Response:")[1].strip() - - bar.update() - - jdump(answers, os.path.join(args.answer_path, - f'{args.model_name}_answers_rank{rank}.json')) - - if is_rank_0(): - logger.info( - f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', - 'colossalai_zero2', 'colossalai_zero2_cpu'], - default='naive') - parser.add_argument('--model', default='gpt2', - choices=['gpt2', 'bloom', 'opt', 'roberta', 'llama']) - parser.add_argument('--model_path', type=str, default=None) - parser.add_argument('--model_name', type=str, default='model') - parser.add_argument('--dataset', type=str, default=None) - parser.add_argument('--batch_size', type=int, default=1) - parser.add_argument('--max_datasets_size', type=int, default=None) - parser.add_argument('--answer_path', type=str, default="answer") - parser.add_argument('--max_length', type=int, default=1024) - args = parser.parse_args() - generate(args) diff --git a/applications/Chat/evaluate/generate_answers.sh b/applications/Chat/evaluate/generate_answers.sh deleted file mode 100755 index 36881f5f4f29..000000000000 --- a/applications/Chat/evaluate/generate_answers.sh +++ /dev/null @@ -1,25 +0,0 @@ -device_number=number of your devices -model_name="name of your model" -model_path="path to your model" -dataset="path to the question dataset" -answer_path="path to save the model answers" - -torchrun --standalone --nproc_per_node=$device_number generate_answers.py \ - --model 'llama' \ - --strategy ddp \ - --model_path $model_path \ - --model_name $model_name \ - --dataset $dataset \ - --batch_size 8 \ - --max_datasets_size 80 \ - --answer_path $answer_path \ - --max_length 512 - -python merge.py \ - --model_name $model_name \ - --shards $device_number \ - --answer_path $answer_path \ - -for (( i=0; i Dict[str, Any]: + """ + Get battle evaluation from GPT-4. + + Args: + sys_prompt: prompt for the system. + user_prompt: prompt for the user. + id: id of the answers for comparison. + max_tokens: the maximum number of tokens to generate in the chat completion. + + Returns: + An evaluation of one comparison. 
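To show what `sys_prompt` and `user_prompt` typically contain here: `battle()` below fills the `prompt_template` from the battle prompt file (compare `battle_prompt_cn.json` added in this PR) with the question, the two answers and the requirement text. A rough English stand-in with made-up content:

```python
# Hypothetical battle prompt entry (English stand-in for battle_prompt_cn.json)
# and the user prompt that battle() builds from it before calling get_battle_result.
battle_prompt = {
    "system_prompt": "You are a helpful assistant that checks the quality of answers.",
    "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[Assistant 2]\n{answer_2}\n\n[Requirements]\n{prompt}\n\n",
    "prompt": "Rate each assistant from 1 to 10 and explain your reasoning.",
}
user_prompt = battle_prompt["prompt_template"].format(
    question="世界上最高的山峰是哪座?",
    answer_1="珠穆朗玛峰。",
    answer_2="乔戈里峰。",
    prompt=battle_prompt["prompt"],
)
# get_battle_result(battle_prompt["system_prompt"], user_prompt, answer_id, 2048)
```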
+ """ + + MAX_API_RETRY = 3 + for _ in range(MAX_API_RETRY): + try: + response = openai.ChatCompletion.create( + model="gpt-4", + messages=[ + { + "role": "system", + "content": sys_prompt + }, + { + "role": "user", + "content": user_prompt, + }, + ], + temperature=0.2, + max_tokens=max_tokens, + ) + evaluation = response["choices"][0]["message"]["content"] + return {"evaluation": evaluation, "id": id} + except Exception as e: + print(e) + time.sleep(1) + print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.") + return {"evaluation": "", "id": id} + + +def parse_battle_score(evaluation: str) -> List[float]: + """ + Parse evaluation from GPT-4 and get the scores of model 1 and 2. + + Args: + evaluation: evaluation from GPT-4. + + Returns: + A score pair of two different model answers. + """ + + try: + pattern = re.compile("([0-9]|10) out of 10") + sp = re.findall(pattern, evaluation) + if len(re.findall(pattern, evaluation)) == 2: + return [float(sp[0]), float(sp[1])] + + pattern = re.compile("a score of ([0-9]|10)") + sp = re.findall(pattern, evaluation) + if len(re.findall(pattern, evaluation)) == 2: + return [float(sp[0]), float(sp[1])] + + pattern = re.compile("([0-9]|10)/10") + sp = re.findall(pattern, evaluation) + if len(re.findall(pattern, evaluation)) == 2: + return [float(sp[0]), float(sp[1])] + + score_pair = evaluation.split("\n")[0] + score_pair = score_pair.replace(",", " ") + sp = score_pair.split(" ") + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + raise Exception(f"Invalid score pair. Got {evaluation}.") + except Exception as e: + return [-1, -1] + + +def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]) -> List[Dict]: + """ + Use GPT-4 to compare answers of two different models. + + Args: + answer1: answers of model 1. + answer2: answers of model 2. + prompt_dict: prompt for battle. + + Returns: + Evaluations of all comparison pairs. + """ + + assert len(answer1) == len(answer2) + + handles = [] + evaluation_file = [] + + total_len = len(answer1) + question_idx_list = list(range(total_len)) + + print(f" Total number of answers: {len(answer1)}.") + + evaluations = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for i in question_idx_list: + assert answer1[i]["id"] == answer2[i]["id"] + answer_id = answer1[i]["id"] + + ques = answer1[i]["instruction"] if answer1[i][ + "input"] == "" else answer1[i]["instruction"] + " " + answer1[i]["input"] + cat = answer1[i]["category"] + ans1 = answer1[i]["output"] + ans2 = answer2[i]["output"] + + sys_prompt = prompt_dict["system_prompt"] + prompt_template = prompt_dict["prompt_template"] + prompt = prompt_template.format( + question=ques, + answer_1=ans1, + answer_2=ans2, + prompt=prompt_dict["prompt"], + ) + + future = executor.submit(get_battle_result, sys_prompt, prompt, answer_id, 2048) + futures.append(future) + + for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + evaluations.append(future.result()) + + evaluations.sort(key=lambda x: x["id"]) + + return evaluations + + +def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None: + """ + Save evaluation results (model 1 vs model 2) from GPT-4. + + Args: + evaluations: evaluation results from GPT-4. + name1: model 1 's name. + name2: model 2 's name. + save_path: path to save battle results. 
+ """ + + evaluation_file = deepcopy(evaluations) + + ans1_score = 0 + ans2_score = 0 + better_count = 0 + worse_count = 0 + tie_count = 0 + invalid_count = 0 + + better_file = [] + worse_file = [] + tie_file = [] + invalid_file = [] + + for idx, evaluation in enumerate(evaluations): + scores = parse_battle_score(evaluation["evaluation"]) + evaluation_file[idx]["score"] = scores + + if scores[0] == -1 and scores[1] == -1: + invalid_count += 1 + invalid_file.append(evaluation_file[idx]) + print(f'Invalid score pair: {evaluation_file[idx]["id"]}.') + else: + if scores[0] > scores[1]: + worse_count += 1 + worse_file.append(evaluation_file[idx]) + elif scores[0] < scores[1]: + better_count += 1 + better_file.append(evaluation_file[idx]) + else: + tie_count += 1 + tie_file.append(evaluation_file[idx]) + ans1_score += scores[0] + ans2_score += scores[1] + + prefix = f"{name1}_vs_{name2}" + + if not os.path.exists(save_path): + os.makedirs(save_path) + + jdump(better_file, os.path.join(save_path, prefix, f"{name2}_better.json")) + jdump(worse_file, os.path.join(save_path, prefix, f"{name2}_worse.json")) + jdump(tie_file, os.path.join(save_path, prefix, f"{prefix}_tie.json")) + jdump(invalid_file, os.path.join(save_path, prefix, f"{prefix}_invalid.json")) + jdump(evaluation_file, os.path.join(save_path, prefix, f"{prefix}_evaluations.json")) + + if os.path.exists(os.path.join(save_path, "battle_results.json")): + results = jload(os.path.join(save_path, "battle_results.json")) + else: + results = {} + + results[prefix] = { + "model": [name1, name2], + "better": better_count, + "worse": worse_count, + "tie": tie_count, + "win_rate": better_count / (len(evaluations) - invalid_count), + "score": [ + ans1_score / (len(evaluations) - invalid_count), + ans2_score / (len(evaluations) - invalid_count), + ], + } + jdump(results, os.path.join(save_path, "battle_results.json")) + + print(f"Total {invalid_count} invalid score pair(s).") + print(f"Model {name2} has {better_count} better answer(s).") + print(f"Model {name2} has {worse_count} worse answer(s).") + print(f"{tie_count} answer(s) play(s) to a tie.") + print(f"Win rate of model {name2}: {better_count/(len(evaluations)-invalid_count):.2f}") + print(f"Model {name1} average score: {ans1_score/(len(evaluations)-invalid_count):.2f}") + print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}") + + +def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any], + inst: Dict[str, Any], + metrics: List[str], + model: str = "gpt-3.5-turbo", + max_tokens: int = 2048) -> Dict[str, Any]: + """ + Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer. + + Args: + prompt: a dictionary including prompt template, CoT and metrics. + inst: the instruction that is needed to be evaluated. + metrics: the metrics for evaluation. + model: the model used to evaluate answers. + max_tokens: the maximum number of tokens to generate in the chat completion. + + Returns: + An evaluation of one answer. + """ + + MAX_API_RETRY = 3 + + question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"]) + answer = inst["output"] + inst["evaluation"] = {} + + for metric in metrics: + if prompt["metrics"].get(metric, None) is None: + raise Exception( + f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!" 
+ ) + for i in range(MAX_API_RETRY): + try: + response = openai.ChatCompletion.create( + model=model, + messages=[ + { + "role": + "user", + "content": + prompt["prompt"].format( + question=question, + answer=answer, + metric=prompt["metrics"][metric], + steps=prompt["CoT"][metric], + ), + }, + ], + temperature=0, + max_tokens=max_tokens, + ) + inst["evaluation"][metric] = { + "response": response["choices"][0]["message"]["content"], + "logprobs": None, + } + break + except Exception as e: + print(e) + time.sleep(1) + if metric not in inst["evaluation"]: + print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.") + inst["evaluation"][metric] = {} + return inst + + +def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any], + inst: Dict[str, Any], + metrics: List[str], + max_tokens: int = 2048) -> Dict[str, Any]: + """ + Use completion model(text-davinci-003) to evaluate one model answer. + Only completion models can return log probabilities. + + Args: + prompt: a dictionary including prompt template, CoT and metrics. + inst: the instruction that is needed to be evaluated. + metrics: the metrics for evaluation. + max_tokens: the maximum number of tokens to generate in the completion. + + Returns: + An evaluation of one answer. + """ + + MAX_API_RETRY = 3 + + question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"]) + answer = inst["output"] + inst["evaluation"] = {} + + for metric in metrics: + if prompt["metrics"].get(metric, None) is None: + raise Exception( + f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!" + ) + for i in range(MAX_API_RETRY): + try: + response = openai.Completion.create( + model="text-davinci-003", + prompt=prompt["prompt"].format( + question=question, + answer=answer, + metric=prompt["metrics"][metric], + steps=prompt["CoT"][metric], + ), + logprobs=5, + temperature=0, + max_tokens=max_tokens, + ) + inst["evaluation"][metric] = { + "response": response["choices"][0]["text"], + "logprobs": response["choices"][0]["logprobs"]["top_logprobs"], + } + break + except Exception as e: + print(e) + time.sleep(1) + if metric not in inst["evaluation"]: + print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.") + inst["evaluation"][metric] = {} + return inst + + +def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]: + """ + Use GPT models to evaluate model answers and save evaluation results. + + Args: + answers: model answers. + prompt: prompt for GPT evaluation. + metrics: metrics for GPT evaluation. + category: the category of the model answers for evaluation. + model: the specific GPT model used to evaluate answers. + + Returns: + Evaluations of the given answers. + """ + + print(f"The number of instances of category {category}'s is {len(answers)}.") + + evaluations = [] + + metrics_str = ", ".join(x for x in metrics) + print(f"Category {category}'s metrics are {metrics_str}.") + + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for inst in answers: + # Completion models can return log probabilities. 
+ if model == "text-davinci-003": + future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1) + else: + future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1) + + futures.append(future) + + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + desc=f"{category}: ", + total=len(futures), + ): + evaluations.append(future.result()) + + evaluations.sort(key=lambda x: x["id"]) + + print(f"{category} done.") + + return evaluations + + +def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float: + """ + Calculate the score according to log probabilities returned by text-davinci-003. + + Calculation formula: + score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key (predicted token) and value is its log probability. + + Ref: https://arxiv.org/abs/2303.16634 + This paper proposes NLG evaluation methods using text-davinci-003 (log probabilities returned by completion models) and GPT-4 (probabilities obtained by sampling). + + Args: + logprobs: logprobs returned by openai.Completion. + + Returns: + The score of one answer. + """ + + # The evaluation prompts ask for a score from 1 to 5. + prob = np.zeros(5) + + for key, value in logprobs.items(): + # Sometimes the key will be one byte of a unicode character which takes the form of "bytes:\\xe7". + # It is meaningless and thus we don't calculate probability. + if "bytes" in key: + continue + # results[0] is the score which corresponds to the key (predicted token). + # For example, key "5" corresponds to score 5. + results = re.findall(r"\d", key) + if len(results) == 1: + prob[int(results[0]) - 1] = prob[int(results[0]) - 1] + np.exp(value) + + score = np.dot(np.arange(1, 6), prob) + + return score + + +def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int: + """ + Calculate the score from the response returned by gpt-3.5-turbo or gpt-4. + Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4. + Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo. + + Args: + response: the plain text response returned by gpt-3.5-turbo or gpt-4. + evaluation: the evaluation corresponding to the question. + + Returns: + The score of one answer. + """ + + try: + results = re.findall(r"\d", response) + if len(results) == 1: + return int(results[0]) + else: + raise Exception(f"Invalid score. Got {evaluation}.") + except Exception as e: + return 0 + + +def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None: + """ + Generate statistics for one model. + + Args: + model_name: name of the model for saving statistics. + evaluations: evaluations for all of the model answers. + save_path: path to save GPT-3.5 evaluation statistics. 
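As a sanity check on the formula above, a small self-contained example with made-up log probabilities for the single generated score token:

```python
import re

import numpy as np

# Made-up values standing in for response["choices"][0]["logprobs"]["top_logprobs"][0].
logprobs = {"4": -0.36, "5": -1.61, "3": -3.00, "bytes:\\xe7": -5.0}

prob = np.zeros(5)
for key, value in logprobs.items():
    if "bytes" in key:  # skip partial unicode tokens, as the function above does
        continue
    digits = re.findall(r"\d", key)
    if len(digits) == 1:
        prob[int(digits[0]) - 1] += np.exp(value)

# Expected score = sum_i i * P(i) ~= 4*0.70 + 5*0.20 + 3*0.05 ~= 3.94
print(np.dot(np.arange(1, 6), prob))
```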
+ """ + + if not os.path.exists(save_path): + os.makedirs(save_path) + + data_per_category = {} + for evaluation in evaluations: + category = evaluation["category"] + if evaluation["category"] in data_per_category.keys(): + data_per_category[category].append(evaluation) + else: + data_per_category[category] = [evaluation] + + all_statistics = {} + for category, data in data_per_category.items(): + metrics = data[0]["evaluation"].keys() + scores = {metric: [] for metric in metrics} + for evaluation in data: + for metric in metrics: + if evaluation["evaluation"][metric] == {}: + # This means after 3 retries, the server still returns an error and we set the score to 0. + scores[metric].append(0) + elif evaluation["evaluation"][metric]["logprobs"] is not None: + scores[metric].append( + calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])) + else: + scores[metric].append( + calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation)) + + statistics = {} + for metric in metrics: + arg_sort = np.argsort(scores[metric]) + statistics[metric] = {} + statistics[metric]["avg_score"] = sum(scores[metric]) / len(data) + statistics[metric]["best_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[-3:][::-1]} + statistics[metric]["worst_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[:3]} + + all_statistics[category] = statistics + + jdump( + all_statistics, + os.path.join(save_path, f"{model_name}_evaluation_statistics.json"), + ) + + +def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None: + """ + Analyze and visualize all GPT-3.5 evaluation statistics in the given directory. + + Args: + statistics_path: path to all the models' statistics. + save_path: path to save table and visualization results. + """ + + if not os.path.exists(statistics_path): + raise Exception(f'The given directory "{statistics_path}" doesn\'t exist! 
No statistics found!') + + all_statistics = {} + + for file_name in os.listdir(statistics_path): + if file_name.endswith("_evaluation_statistics.json"): + model_name = file_name.split("_evaluation_statistics.json")[0] + all_statistics[model_name] = jload(os.path.join(statistics_path, file_name)) + + if len(list(all_statistics.keys())) == 0: + raise Exception(f'There are no statistics in the given directory "{statistics_path}"!') + + frame_all = { + "model": [], + "category": [], + "metric": [], + "avg_score": [], + "best_3": [], + "worst_3": [], + } + frame_per_category = {} + for model_name, model_statistics in all_statistics.items(): + for category, category_statistics in model_statistics.items(): + if frame_per_category.get(category) is None: + frame_per_category[category] = { + "model": [], + "metric": [], + "avg_score": [], + "best_3": [], + "worst_3": [], + } + + for metric, metric_statistics in category_statistics.items(): + frame_all["model"].append(model_name) + frame_all["category"].append(category) + frame_all["metric"].append(metric) + frame_all["avg_score"].append(metric_statistics["avg_score"]) + frame_all["best_3"].append(metric_statistics["best_3"]) + frame_all["worst_3"].append(metric_statistics["worst_3"]) + + frame_per_category[category]["model"].append(model_name) + frame_per_category[category]["metric"].append(metric) + frame_per_category[category]["avg_score"].append(metric_statistics["avg_score"]) + frame_per_category[category]["best_3"].append(metric_statistics["best_3"]) + frame_per_category[category]["worst_3"].append(metric_statistics["worst_3"]) + + if not os.path.exists(save_path): + os.makedirs(save_path) + + frame_all = pd.DataFrame(frame_all) + frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv")) + + for category in tqdm.tqdm( + frame_per_category.keys(), + desc=f"category: ", + total=len(frame_per_category.keys()), + ): + data = pd.DataFrame(frame_per_category[category]) + + sns.set() + fig = plt.figure(figsize=(16, 10)) + plt.ylim((0, 5)) + + fig = sns.barplot(x="metric", y="avg_score", hue="model", data=data, dodge=True) + fig.set_title(f"Comparison between Different Models for Category {category.title()}") + plt.xlabel("Evaluation Metric") + plt.ylabel("Average Score") + + figure = fig.get_figure() + figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400) diff --git a/applications/Chat/evaluate/merge.py b/applications/Chat/evaluate/merge.py deleted file mode 100644 index 295dd7fa7cb3..000000000000 --- a/applications/Chat/evaluate/merge.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -import os - -from utils import jload, jdump - - -def generate(args): - dataset = [] - for i in range(args.shards): - shard = jload(os.path.join(args.answer_path, - f'{args.model_name}_answers_rank{i}.json')) - dataset.extend(shard) - - dataset.sort(key=lambda x: x['id']) - jdump(dataset, os.path.join(args.answer_path, - f'{args.model_name}_answers.json')) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, default='model') - parser.add_argument('--shards', type=int, default=4) - parser.add_argument('--answer_path', type=str, default="answer") - args = parser.parse_args() - generate(args) diff --git a/applications/Chat/evaluate/metrics.py b/applications/Chat/evaluate/metrics.py new file mode 100644 index 000000000000..5e657234c61a --- /dev/null +++ b/applications/Chat/evaluate/metrics.py @@ -0,0 +1,169 @@ +import statistics + +import jieba +from bert_score import score +from 
nltk.translate.bleu_score import sentence_bleu +from rouge_chinese import Rouge as Rouge_cn +from sklearn.metrics import f1_score, precision_score, recall_score + + +def bleu_score(preds: list, targets: list) -> dict: + """Calculate BLEU Score Metric + + The calculation includes BLEU-1 for unigram, BLEU-2 for bigram, + BLEU-3 for trigram and BLEU-4 for 4-gram. Unigram evaluates the + accuracy at word level, while the other n-grams evaluate fluency at + sentence level. + """ + bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0} + cumulative_bleu = [0] * 4 + weights = [(1. / 1., 0., 0., 0.), (1. / 2., 1. / 2., 0., 0.), (1. / 3., 1. / 3., 1. / 3., 0.), + (1. / 4., 1. / 4., 1. / 4., 1. / 4.)] + + for pred, target in zip(preds, targets): + pred_list = (' '.join(jieba.cut(pred))).split() + target_list = [(' '.join(jieba.cut(target))).split()] + + bleu = sentence_bleu(target_list, pred_list, weights=weights) + cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)] + + for i in range(len(cumulative_bleu)): + bleu_scores[f"bleu{i+1}"] = cumulative_bleu[i] / len(preds) + + return bleu_scores + + +def rouge_cn_score(preds: list, targets: list) -> dict: + """Calculate Chinese ROUGE Score Metric + + The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram + and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between + the preds and targets. ROUGE-L measures the number of matching + longest common subsequence (LCS) between preds and targets. + """ + rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}} + all_preds = [] + all_targets = [] + + for pred, target in zip(preds, targets): + pred_list = ' '.join(jieba.cut(pred)) + target_list = ' '.join(jieba.cut(target)) + all_preds.append(pred_list) + all_targets.append(target_list) + + rouge_cn = Rouge_cn() + rouge_avg = rouge_cn.get_scores(all_preds, all_targets, avg=True) + + rouge_scores["rouge1"] = rouge_avg["rouge-1"]["f"] + rouge_scores["rouge2"] = rouge_avg["rouge-2"]["f"] + rouge_scores["rougeL"] = rouge_avg["rouge-l"]["f"] + + return rouge_scores + + +def distinct_score(preds: list) -> dict: + """Calculate Distinct Score Metric + + This metric refers to https://arxiv.org/abs/1510.03055. + It evaluates the diversity of the generated text by counting + the unique characters in each prediction. + """ + distinct_score = {"distinct": 0} + cumulative_distinct = [] + + for pred in preds: + pred_seg_list = list(' '.join(jieba.cut(pred))) + count_segs = len(pred_seg_list) + unique_segs = set(pred_seg_list) + count_unique_chars = len(unique_segs) + + cumulative_distinct.append(count_unique_chars / count_segs) + + distinct_score["distinct"] = statistics.mean(cumulative_distinct) + + return distinct_score + + +def bert_score(preds: list, targets: list) -> dict: + """Calculate BERTScore Metric + + The BERTScore evaluates the semantic similarity between + tokens of preds and targets with BERT. + """ + bert_score = {"bert_score": 0} + pred_list = [] + target_list = [] + + for pred, target in zip(preds, targets): + pred_list.append(' '.join(jieba.cut(pred))) + target_list.append(' '.join(jieba.cut(target))) + + _, _, F = score(pred_list, target_list, lang="zh", verbose=True) + + bert_score["bert_score"] = F.mean().item() + + return bert_score + + +def calculate_precision_recall_f1(preds: list, targets: list) -> dict: + """Precision, Recall and F1-Score Calculation + + The calculation of precision, recall and f1-score is realized by counting + the number of overlaps between the preds and targets. 
The comparison length is + limited by the shorter one of preds and targets. This design is mainly + considered for classification and extraction categories. + """ + precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0} + precision_scores = [] + recall_scores = [] + f1_scores = [] + + for pred, target in zip(preds, targets): + pred_list = [char for char in pred] + target_list = [char for char in target] + + target_labels = [1] * min(len(target_list), len(pred_list)) + pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))] + + precision_scores.append(precision_score(target_labels, pred_labels, zero_division=0)) + recall_scores.append(recall_score(target_labels, pred_labels, zero_division=0)) + f1_scores.append(f1_score(target_labels, pred_labels, zero_division=0)) + + precision_recall_f1["precision"] = statistics.mean(precision_scores) + precision_recall_f1["recall"] = statistics.mean(recall_scores) + precision_recall_f1["f1_score"] = statistics.mean(f1_scores) + + return precision_recall_f1 + + +def precision(preds: list, targets: list) -> dict: + """Calculate Precision Metric + (designed for classification and extraction categories) + + Precision is calculated by counting the number of overlaps between the preds and targets. + """ + precision = {"precision": 0} + precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"] + return precision + + +def recall(preds: list, targets: list) -> dict: + """Calculate Recall Metric + (designed for classification and extraction categories) + + Recall is calculated by counting the number of overlaps between the preds and targets. + """ + recall = {"recall": 0} + recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"] + return recall + + +def F1_score(preds: list, targets: list) -> dict: + """Calculate F1-score Metric + (designed for classification and extraction categories) + + The F1-score is calculated by counting the number of overlaps between the preds and targets. 
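A worked example of the character-position comparison implemented in `calculate_precision_recall_f1` above (the strings are made up). Because every target label is 1, false positives cannot occur, so precision is 1.0 whenever at least one position matches, while recall reflects the fraction of matching positions.

```python
from sklearn.metrics import f1_score, precision_score, recall_score

# Made-up pred/target pair: positions are compared up to the shorter length,
# and 3 of the 4 characters match ("大" vs. "天" differs).
pred_list, target_list = list("北京大学"), list("北京天学")
n = min(len(pred_list), len(target_list))
target_labels = [1] * n                                                # [1, 1, 1, 1]
pred_labels = [int(pred_list[i] == target_list[i]) for i in range(n)]  # [1, 1, 0, 1]

print(precision_score(target_labels, pred_labels, zero_division=0))  # 1.0 (no false positives by construction)
print(recall_score(target_labels, pred_labels, zero_division=0))     # 0.75 (3 of 4 positions match)
print(f1_score(target_labels, pred_labels, zero_division=0))         # ~0.857
```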
+ """ + f1 = {"f1_score": 0} + f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"] + return f1 diff --git a/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_cn.json b/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_cn.json new file mode 100644 index 000000000000..ca66afd7e464 --- /dev/null +++ b/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_cn.json @@ -0,0 +1,6 @@ +{ + "id": 1, + "system_prompt": "你是一个检查回答质量的好助手。", + "prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n", + "prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。" +} diff --git a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json new file mode 100644 index 000000000000..ee6caae32091 --- /dev/null +++ b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json @@ -0,0 +1,179 @@ +{ + "brainstorming": { + "id": 1, + "category": "brainstorming", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "creativity": "创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。", + "practicality": "实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。", + "correctness": "正确性(1-5):答案应该符合常识、生活实际等等。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "creativity": "1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。\n2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。\n3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。\n4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。\n\n创意性:", + "practicality": "1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。\n2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。\n3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。\n4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。\n\n实用性:", + "correctness": "1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。\n2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。\n3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。\n4. 根据答案的正确性,给出一个1到5的评分。如果答案存在明显的错误或不合理之处,则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等,则应给出一个较高的评分。\n\n正确性:" + }, + "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "chat": { + "id": 2, + "category": "chat", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "naturalness": "自然(1-5):答案是否自然,并且符合问题给定的身份。", + "engagingness": "参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。", + "reasonableness": "合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 
检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "naturalness": "1. 阅读题目,确定题目提供的身份信息。\n2. 检查答案内容是否符合题目给定的身份。\n3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。\n\n自然:", + "engagingness": "1. 阅读题目,确定对话的语境和背景。\n2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。\n3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。\n\n参与感:", + "reasonableness": "1. 阅读题目,确定对话的主题以及问题期望的回答方向。\n2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。\n3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。\n\n合理性:" + }, + "prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "classification": { + "id": 3, + "category": "classification", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "correctness": "正确性(1-5):答案是否正确。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:" + }, + "prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "closed_qa": { + "id": 4, + "category": "closed_qa", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "correctness": "正确性(1-5):答案是否正确。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:" + }, + "prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "extraction": { + "id": 5, + "category": "extraction", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "correctness": "准确性(1-5):回答应该准确无误地提取出所需信息,不应该包含任何错误或误导性信息。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 
检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "correctness": "1. 仔细阅读问题并确定需要从材料中提取的信息。\n2. 仔细阅读回答并确保它涵盖了所有需要提取的信息。\n3. 使用所提供的材料来验证回答的准确性。如果回答不准确或包含错误或误导性信息,则无法给出高分。\n4. 检查回答是否包含所有要求提取的信息,不要漏掉任何重要细节。\n5. 根据回答的准确性和完整性,给出一个介于1和5之间的分数,5分表示回答非常准确且完整,1分表示回答几乎没有提取出所需信息。\n\n准确性:" + }, + "prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "generation": { + "id": 6, + "category": "generation", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "diversity": "多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "diversity": "1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。\n2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。\n3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。\n4. 检查回答的合理性和适度,看看回答是否夸张或离题。\n5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。\n\n多样性:" + }, + "prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "open_qa": { + "id": 7, + "category": "open_qa", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "correctness": "正确性(1-5):答案是否正确。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:" + }, + "prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "rewriting": { + "id": 8, + "category": "rewriting", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "correctness": "正确性(1-5):答案是否正确。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 
根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:" + }, + "prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "roleplay": { + "id": 9, + "category": "roleplay", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "fidelity": "保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。", + "creativity": "创意性(1-5):角色扮演问题的回答需要具有一定创意,但同时需要遵守角色的设定。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "fidelity": "1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。\n2. 阅读题目的请求,确认回答请求时需要注意的细节。\n3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。\n4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。\n\n保真度:", + "creativity": "1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。\n2. 评估回答是否具有独特的思路和建议,是否能够给提问者带来新的想法和启示。\n3. 对比回答中的创意和该角色的设定,评估回答是否遵守了该角色的设定和基本特征。\n4. 对回答的质量进行总体评估,并结合以上评估结果给出创意性的评分,范围从1到5分,其中1分表示回答缺乏创意,5分表示回答具有独特的思路和建议,并且能够遵守该角色的设定。\n\n创意性:" + }, + "prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "summarization": { + "id": 10, + "category": "summarization", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "correctness": "准确性(1-5):回答应该准确无误地总结出材料的重点。", + "conciseness": "简明扼要(1-5):答案是否简明扼要,没有冗余内容。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "correctness": "1. 仔细阅读问题给的材料,理解其内容和要点。\n2. 评估回答是否准确地总结出原始材料的重点。\n3. 评估回答是否包含原始材料中的所有关键信息。\n4. 根据以上步骤,给出一个1-5的分数,其中1表示回答不能准确地总结出材料的重点,5表示回答完全准确地总结出材料的重点。\n\n准确性:", + "conciseness": "1. 阅读题目,提取出材料的重点。\n2. 阅读该总结,并注意其中的主要观点和信息。\n3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。\n4. 检查总结是否包含与主要观点无关的信息或冗余信息。\n5.确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。\n6.给总结打出1-5的分数,其中5表示总结简明扼要,没有冗余内容,而1表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。\n\n简明扼要:" + }, + "prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + }, + "general": { + "id": 11, + "category": "general", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。", + "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。", + "correctness": "正确性(1-5):答案是否正确。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 
检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:", + "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:", + "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:" + }, + "prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + } +} diff --git a/applications/Chat/evaluate/requirements.txt b/applications/Chat/evaluate/requirements.txt new file mode 100644 index 000000000000..b0301c2f17f8 --- /dev/null +++ b/applications/Chat/evaluate/requirements.txt @@ -0,0 +1,10 @@ +jieba +bert-score +rouge_chinese +scikit-metrics +nltk +openai +seaborn +pandas +matplotlib +numpy diff --git a/applications/Chat/evaluate/sample/questions.json b/applications/Chat/evaluate/sample/questions.json deleted file mode 100644 index e9ef9f8b1c66..000000000000 --- a/applications/Chat/evaluate/sample/questions.json +++ /dev/null @@ -1,9 +0,0 @@ -[ - { - "id": 0, - "instruction": "Help me summarize the following news?", - "input": "National Commercial Bank (NCB), Saudi Arabia's largest lender by assets, agreed to buy rival Samba Financial Group for $15 billion in the biggest banking takeover this year.NCB will pay 28.45 riyals ($7.58) for each Samba share, according to a statement on Sunday, valuing it at about 55.7 billion riyals. NCB will offer 0.739 new shares for each Samba share, at the lower end of the 0.736-0.787 ratio the banks set when they signed an initial framework agreement in June.The offer is a 3.5% premium to Samba's Oct. 8 closing price of 27.50 riyals and about 24% higher than the level the shares traded at before the talks were made public. Bloomberg News first reported the merger discussions.The new bank will have total assets of more than $220 billion, creating the Gulf region's third-largest lender. The entity's $46 billion market capitalization nearly matches that of Qatar National Bank QPSC, which is still the Middle East's biggest lender with about $268 billion of assets.", - "output": "NCB to pay 28.45 riyals for each Samba share. Deal will create Gulf region's third-largest lender", - "category": "closed qa" - } -] \ No newline at end of file diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py index 692ee007c080..517c0a1c351e 100644 --- a/applications/Chat/evaluate/utils.py +++ b/applications/Chat/evaluate/utils.py @@ -2,10 +2,6 @@ import json import os -import torch.distributed as dist - -def is_rank_0() -> bool: - return not dist.is_initialized() or dist.get_rank() == 0 def _make_w_io_base(f, mode: str): if not isinstance(f, io.IOBase): @@ -15,11 +11,13 @@ def _make_w_io_base(f, mode: str): f = open(f, mode=mode) return f + def _make_r_io_base(f, mode: str): if not isinstance(f, io.IOBase): f = open(f, mode=mode) return f + def jdump(obj, f, mode="w", indent=4, default=str): """Dump a str or dictionary to a file in json format. 
Args: @@ -38,6 +36,7 @@ def jdump(obj, f, mode="w", indent=4, default=str): raise ValueError(f"Unexpected type: {type(obj)}") f.close() + def jload(f, mode="r"): """Load a .json file into a dictionary.""" f = _make_r_io_base(f, mode) @@ -45,9 +44,20 @@ def jload(f, mode="r"): f.close() return jdict + def get_json_list(file_path): with open(file_path, 'r') as f: json_list = [] for line in f: json_list.append(json.loads(line)) return json_list + + +def get_data_per_category(data, categories): + data_per_category = {category: [] for category in categories} + for item in data: + category = item["category"] + if category in categories: + data_per_category[category].append(item) + + return data_per_category diff --git a/colossalai/auto_parallel/passes/meta_info_prop.py b/colossalai/auto_parallel/passes/meta_info_prop.py index bc0960483980..0673b767de7b 100644 --- a/colossalai/auto_parallel/passes/meta_info_prop.py +++ b/colossalai/auto_parallel/passes/meta_info_prop.py @@ -148,7 +148,7 @@ def node_handler(self, node: Node) -> None: graph_info.fwd_tmp = buffer_tensors graph_info.fwd_out = output_tensors - # fetch other memory informations + # fetch other memory information memory_cost = meta_info.memory_cost graph_info.fwd_mem_tmp = memory_cost.fwd.temp graph_info.fwd_mem_out = memory_cost.fwd.activation diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py index e154105b672d..112ee194b4ec 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py @@ -155,7 +155,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li Convert the sharding spec from the logical shape to the physical shape. """ # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy, input_name=str( @@ -221,7 +221,7 @@ def post_process(self, strategy: ShardingStrategy): Convert the sharding spec from the logical shape to the physical shape. """ # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy, input_name=str( diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py index 59091dab519f..ea541e434009 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py @@ -23,7 +23,7 @@ def _update_sharding_spec_for_transposed_weight_for_linear(strategy: ShardingStr weight_name: str) -> ShardingStrategy: """ This function is a helper function used by both module node handler and function node handler. This function will - convert the sharding spec for the transposed weight to the correct partititon spec. 
+ convert the sharding spec for the transposed weight to the correct partition spec. Args: strategy (ShardingStrategy): the strategy generated by the strategy generator. @@ -197,7 +197,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name='weight') # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at dim 0 to one of the first few dimensions of the input strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy, input_name=str(self.node.args[0]), @@ -267,7 +267,7 @@ def post_process(self, strategy: ShardingStrategy): strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name=str(self.node.args[1])) # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at dim 0 to one of the first few dimensions of the input strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy, input_name=str(self.node.args[0]), diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py index f3c9d0cbf826..fa51114a5c94 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py @@ -48,8 +48,8 @@ def get_matmul_type(input_dim: int, other_dim: int): Determine which type of matmul operation should be executed for the given tensor dimensions. Args: - input_dim (int): the number of dimensions for the input tenosr - other_dim (int): the number of dimensions for the other tenosr + input_dim (int): the number of dimensions for the input tensor + other_dim (int): the number of dimensions for the other tensor """ if input_dim == 1 and other_dim == 1: matmul_type = MatMulType.DOT @@ -206,7 +206,7 @@ def _remove_sharding_on_broadcast_dim(key, strategy): # e.g. 
[1, 2, 4] x [4, 4, 8] -> [4, 2, 8] # the dim 0 of [1, 2, 4] is multiplied to 4 tensor_shape[dim_idx] = 1 - elif broadcast_type == BroadcastType.PADDDING: + elif broadcast_type == BroadcastType.PADDING: # if the dim is padded # we remove its sharding tensor_shape[dim_idx] = None @@ -268,13 +268,13 @@ def _update_sharding_spec(key, strategy, physical_batch_dim): dim_partition_dict = sharding_spec.dim_partition_dict entire_shape = sharding_spec.entire_shape - # upddate the dimension index for the matrix dimensions + # update the dimension index for the matrix dimensions if 2 in dim_partition_dict: dim_partition_dict[len(self.batch_dims_before_view) + 1] = dim_partition_dict.pop(2) if 1 in dim_partition_dict: dim_partition_dict[len(self.batch_dims_before_view)] = dim_partition_dict.pop(1) - # map the logical batch dim to phyiscal batch dim + # map the logical batch dim to physical batch dim if 0 in dim_partition_dict: batch_dim_shard = dim_partition_dict.pop(0) dim_partition_dict[physical_batch_dim] = batch_dim_shard @@ -414,7 +414,7 @@ def _get_logical_shape_for_dot(self): def _get_logical_shape_for_mm(self): """ - We need to handle the input tensor for a matrix-matrix multiplcation as the input + We need to handle the input tensor for a matrix-matrix multiplication as the input tensor can be a 1D or 2D tensor. If it is a 1D tensor, 1 will be prepended to its shape (e.g. [4] -> [1, 4]). """ diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py index d3d09a9dcf65..4262d76173e4 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py @@ -212,7 +212,7 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV return self.strategies_vector def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, List[ShardingStrategy]]: - # tranform the strategy generated + # transform the strategy generated # e.g. to process the sharding strategy for the transposed weights return strategy diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py index 79b69acb25b3..416dc9c29cad 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py @@ -44,7 +44,7 @@ def update_compute_cost(self, strategy: ShardingStrategy): ''' Compute the computation cost per device with this specific strategy. - Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' # TODO: a constant coefficient need to be added. # 1D: (L) * N * Cin diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py index c2154b3104d3..e605a68a326b 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py @@ -38,9 +38,9 @@ def update_compute_cost(self, strategy: ShardingStrategy): ''' Compute the computation cost per device with this specific strategy. 
- Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' - # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size. # 1D: (L) * N * Cout * Cin * kernel # 2D: (H * W) * N * Cout * Cin * kernel # 3D: (H * W * D) * N * Cout * Cin * kernel diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py index fbb6070f7e82..65b173bbf65d 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py @@ -34,9 +34,9 @@ def update_compute_cost(self, strategy: ShardingStrategy): ''' Compute the computation cost per device with this specific strategy. - Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' - # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size. # TODO: a constant coefficient need to be added. sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device() diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py index 9df6d2fbfa12..b7db42f8f67e 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py @@ -17,7 +17,7 @@ class NormalPoolStrategyGenerator(StrategyGenerator): """ NormalPoolStrategyGenerator is a generic class to generate strategies for pool operation like MaxPoolxd. The reason we call this normal pool is AvgPoolxd and MaxPoolxd are taking the kernel size element from image, - and reduce them depening on the operation type. + and reduce them depending on the operation type. """ def validate(self) -> bool: @@ -35,9 +35,9 @@ def update_compute_cost(self, strategy: ShardingStrategy) -> TrainCycleItem: ''' Compute the computation cost per device with this specific strategy. - Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' - # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size. 
# 1D: (Lout) * N * C * kernel # 2D: (H * W) * N * Cout * Cin * kernel # 3D: (H * W * D) * N * Cout * Cin * kernel diff --git a/colossalai/auto_parallel/tensor_shard/utils/broadcast.py b/colossalai/auto_parallel/tensor_shard/utils/broadcast.py index 28aa551328d7..307348ea1eaf 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/broadcast.py +++ b/colossalai/auto_parallel/tensor_shard/utils/broadcast.py @@ -21,7 +21,7 @@ class BroadcastType(Enum): EQUAL = auto() - PADDDING = auto() + PADDING = auto() MULTIPLE = auto() @@ -69,18 +69,18 @@ def get_broadcast_dim_info(logical_shape, physical_shape): for i in range(logical_num_dims): # get the trailing dim size logical_dim_idx = logical_num_dims - i - 1 - phyiscal_dim_idx = physical_num_dims - i - 1 + physical_dim_idx = physical_num_dims - i - 1 logical_dim_size = logical_shape[logical_dim_idx] - if phyiscal_dim_idx >= 0: - physical_dim_size = physical_shape[phyiscal_dim_idx] + if physical_dim_idx >= 0: + physical_dim_size = physical_shape[physical_dim_idx] if physical_dim_size == logical_dim_size: logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.EQUAL elif physical_dim_size == 1 and physical_dim_size != logical_dim_size: logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.MULTIPLE else: - logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDDING + logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDING return logical_dim_broadcast_info @@ -117,7 +117,7 @@ def recover_sharding_spec_for_broadcast_shape(logical_sharding_spec: ShardingSpe for shape_dim, mesh_dim in logical_dim_partition.items(): logical_broadcast_type = logical_dim_broadcast_info[shape_dim] - if logical_broadcast_type == BroadcastType.PADDDING or logical_broadcast_type == BroadcastType.MULTIPLE: + if logical_broadcast_type == BroadcastType.PADDING or logical_broadcast_type == BroadcastType.MULTIPLE: removed_dims.extend(mesh_dim) else: # get the corresponding physical dim diff --git a/colossalai/auto_parallel/tensor_shard/utils/factory.py b/colossalai/auto_parallel/tensor_shard/utils/factory.py index 05331e560001..347c10aa102d 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/factory.py +++ b/colossalai/auto_parallel/tensor_shard/utils/factory.py @@ -30,7 +30,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic """ if isinstance(input_, Node): - assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data' + assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data' meta_tensor = input_._meta_data assert meta_tensor is not None, "The given node's _meta_data attribute is None" shape = meta_tensor.shape diff --git a/colossalai/auto_parallel/tensor_shard/utils/reshape.py b/colossalai/auto_parallel/tensor_shard/utils/reshape.py index a32a14bf7d57..d0ebbd7e8b1b 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/reshape.py +++ b/colossalai/auto_parallel/tensor_shard/utils/reshape.py @@ -6,12 +6,12 @@ class PreviousStatus(Enum): """ - This class shows the status of previous comparision. + This class shows the status of previous comparison. """ RESET = 0 - # ORIGIN means the dimension size of original tensor is larger in the previous comparision. + # ORIGIN means the dimension size of original tensor is larger in the previous comparison. ORIGIN = 1 - # TGT means the dimension size of target tensor is larger in the previous comparision. + # TGT means the dimension size of target tensor is larger in the previous comparison. 
TGT = 2 @@ -91,7 +91,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D tgt_index += 1 if previous_label == PreviousStatus.TGT: - # if the target dimension size is larger in the previous comparision, which means + # if the target dimension size is larger in the previous comparison, which means # the origin dimension size has already accumulated larger than target dimension size, so # we need to offload the origin dims and tgt dims into the reshape_mapping_dict. reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims) @@ -111,7 +111,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D origin_index += 1 if previous_label == PreviousStatus.ORIGIN: - # if the origin element is larger in the previous comparision, which means + # if the origin element is larger in the previous comparison, which means # the target element has already accumulated larger than origin element, so # we need to offload the origin dims and tgt dims into the reshape_mapping_dict. reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims) @@ -139,7 +139,7 @@ def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]], Rule: For a sharded dimension of input tensor, if it is not the minimum element of the input tuple, the function will return false. - To illustrate this issue, there are two cases to analyse: + To illustrate this issue, there are two cases to analyze: 1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal operation without distributed tensor. 2. sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py index 11a7e62ff37c..a1080fda1541 100644 --- a/colossalai/autochunk/trace_flow.py +++ b/colossalai/autochunk/trace_flow.py @@ -366,8 +366,8 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): # find non chunk inputs chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx) - # reassgin reshape size, some size may have changed due to chunk - chunk_info = self._reassgin_reshape_size(chunk_info) + # reassign reshape size, some size may have changed due to chunk + chunk_info = self._reassign_reshape_size(chunk_info) return chunk_info @@ -428,10 +428,10 @@ def _update_chunk_info(self, chunk_info: Dict, new_all_node_info: Dict, output: chunk_info["outputs_dim"].append(output_dim) return True - def _reassgin_reshape_size(self, chunk_info): + def _reassign_reshape_size(self, chunk_info): """ Some shape args in reshape may have changed due to chunk - reassgin those changed shape + reassign those changed shape """ chunk_region = chunk_info["region"] reshape_size = {} diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py index 8e6cd3e29bea..fbe0741b8827 100644 --- a/colossalai/autochunk/trace_indice.py +++ b/colossalai/autochunk/trace_indice.py @@ -397,7 +397,7 @@ def _assign_conv2d_indice(self, node: Node, node_idx: int) -> None: input_node = node.args[0] assert len(get_node_shape(input_node)) == 4 - # assgin index + # assign index self._assign_indice_as_input(node, node_idx, input_node) self._del_dim(node_idx, 1) self._add_dim(node_idx, 1) @@ -415,7 +415,7 @@ def _assign_interpolate_indice(self, node: Node, node_idx: int) -> None: assert node.kwargs['size'] is None assert len(get_node_shape(node)) == 4 - # assgin index + # assign index self._assign_indice_as_input(node, node_idx) self._mark_computation(node, 
node_idx, [-1, -2]) diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py index 6f2adaf03074..61d912157449 100644 --- a/colossalai/booster/booster.py +++ b/colossalai/booster/booster.py @@ -23,27 +23,28 @@ class Booster: training with different precision, accelerator, and plugin. Examples: - >>> colossalai.launch(...) - >>> plugin = GeminiPlugin(stage=3, ...) - >>> booster = Booster(precision='fp16', plugin=plugin) - >>> - >>> model = GPT2() - >>> optimizer = Adam(model.parameters()) - >>> dataloader = Dataloader(Dataset) - >>> lr_scheduler = LinearWarmupScheduler() - >>> criterion = GPTLMLoss() - >>> - >>> model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader) - >>> - >>> for epoch in range(max_epochs): - >>> for input_ids, attention_mask in dataloader: - >>> outputs = model(input_ids, attention_mask) - >>> loss = criterion(outputs.logits, input_ids) - >>> booster.backward(loss, optimizer) - >>> optimizer.step() - >>> lr_scheduler.step() - >>> optimizer.zero_grad() - + ```python + colossalai.launch(...) + plugin = GeminiPlugin(stage=3, ...) + booster = Booster(precision='fp16', plugin=plugin) + + model = GPT2() + optimizer = Adam(model.parameters()) + dataloader = Dataloader(Dataset) + lr_scheduler = LinearWarmupScheduler() + criterion = GPTLMLoss() + + model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader) + + for epoch in range(max_epochs): + for input_ids, attention_mask in dataloader: + outputs = model(input_ids, attention_mask) + loss = criterion(outputs.logits, input_ids) + booster.backward(loss, optimizer) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + ``` Args: device (str or torch.device): The device to run the training. Default: 'cuda'. @@ -130,6 +131,12 @@ def boost( return model, optimizer, criterion, dataloader, lr_scheduler def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None: + """Backward pass. + + Args: + loss (torch.Tensor): The loss to be backpropagated. + optimizer (Optimizer): The optimizer to be updated. + """ # TODO: implement this method with plugin optimizer.backward(loss) @@ -146,6 +153,14 @@ def execute_pipeline(self, pass def no_sync(self, model: nn.Module) -> contextmanager: + """Context manager to disable gradient synchronization across DP process groups. + + Args: + model (nn.Module): The model to be disabled gradient synchronization. + + Returns: + contextmanager: Context to disable gradient synchronization. + """ assert self.plugin is not None, f'no_sync is only enabled when a plugin is provided and the plugin supports no_sync.' assert self.plugin.support_no_sync, f'The plugin {self.plugin.__class__.__name__} does not support no_sync.' return self.plugin.no_sync(model) @@ -181,7 +196,7 @@ def save_model(self, If true, the checkpoint will be a folder. Otherwise, it will be a single file. Defaults to False. size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024. """ - self.checkpoint_io.save_model(model, checkpoint, prefix, shard, size_per_shard) + self.checkpoint_io.save_model(model, checkpoint=checkpoint, shard=shard, size_per_shard=size_per_shard) def load_optimizer(self, optimizer: Optimizer, checkpoint: str): """Load optimizer from checkpoint. 
diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py
index 266a750734b1..e184271e932a 100644
--- a/colossalai/booster/mixed_precision/fp16_apex.py
+++ b/colossalai/booster/mixed_precision/fp16_apex.py
@@ -1,5 +1,38 @@
+from typing import Any, Optional, Union
+
+import torch
+
 from .mixed_precision_base import MixedPrecision
 
 
 class FP16ApexMixedPrecision(MixedPrecision):
-    pass
+    """
+    Precision for mixed precision training in FP16 using apex AMP.
+
+    Args:
+        opt_level (str, optional, default="O1"): Pure or mixed precision optimization level. Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail in the Apex AMP documentation.
+        cast_model_type (torch.dtype, optional, default=None): Casts your model’s parameters and buffers to the desired type.
+        patch_torch_functions (bool, optional, default=None): Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32.
+        keep_batchnorm_fp32 (bool or str, optional, default=None): To enhance precision and enable cudnn batchnorm (which improves performance), it’s often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16.
+        master_weights (bool, optional, default=None): Maintain FP32 master weights to accompany any FP16 model weights. FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients.
+        loss_scale (float or str, optional, default=None): If loss_scale is a float value, use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic", adaptively adjust the loss scale over time. Dynamic loss scale adjustments are performed by Amp automatically.
+        cast_model_outputs (torch.dtype, optional, default=None): Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level.
+        num_losses (int, optional, default=1): Option to tell AMP in advance how many losses/backward passes you plan to use. When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per loss/backward pass, which can improve stability. If num_losses is left to 1, Amp will still support multiple losses/backward passes, but use a single global loss scale for all of them.
+        verbosity (int, default=1): Set to 0 to suppress Amp-related output.
+        min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored.
+        max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling. If dynamic loss scaling is not used, max_loss_scale is ignored.
+ """ + + def __init__(self, + opt_level: Optional[str] = "O1", + cast_model_type: torch.dtype = None, + patch_torch_functions: bool = None, + keep_batchnorm_fp32: Union[bool, str] = None, + master_weights: bool = None, + loss_scale: Union[float, str] = None, + cast_model_outputs: Any = None, + num_losses: Optional[int] = 1, + verbosity: int = 1, + min_loss_scale: float = None, + max_loss_scale: float = 2.**24) -> None: + pass diff --git a/colossalai/booster/mixed_precision/fp16_naive.py b/colossalai/booster/mixed_precision/fp16_naive.py index ef1ec1f42d70..5d0d815257f3 100644 --- a/colossalai/booster/mixed_precision/fp16_naive.py +++ b/colossalai/booster/mixed_precision/fp16_naive.py @@ -2,4 +2,25 @@ class FP16NaiveMixedPrecision(MixedPrecision): - pass + """ + Precision for mixed precision training in FP16 using naive AMP. + + Args: + log_num_zeros_in_grad(bool): return number of zeros in the gradients. + initial_scale(int): initial scale of gradient scaler. + growth_factor(int): the growth rate of loss scale. + backoff_factor(float): the decrease rate of loss scale. + hysteresis(int): delay shift in dynamic loss scaling. + max_scale(int): maximum loss scale allowed. + verbose(bool): if set to `True`, will print debug info. + """ + + def __init__(self, + log_num_zeros_in_grad: bool, + initial_scale: int, + growth_factor: int, + backoff_factor: float, + hysteresis: int, + max_scale: int, + verbose: bool = None) -> None: + pass diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index bb3124642ccf..adbf4803eefe 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -179,7 +179,7 @@ class GeminiPlugin(DPPluginBase): Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte. - If the aggregate size of parameters is still samller than the minimum chunk size, + If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk. memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer. 
gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward) diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py index 0daefa9fff53..8d534ea4c061 100644 --- a/colossalai/booster/plugin/torch_fsdp_plugin.py +++ b/colossalai/booster/plugin/torch_fsdp_plugin.py @@ -1,34 +1,23 @@ +from pathlib import Path from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union import torch import torch.nn as nn +import warnings from packaging import version from torch.distributed import ProcessGroup -if version.parse(torch.__version__) >= version.parse('1.12.0') and version.parse( - torch.__version__) < version.parse('2.0.0'): +if version.parse(torch.__version__) >= version.parse('1.12.0'): from torch.distributed.fsdp import FullStateDictConfig from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp import StateDictType from torch.distributed.fsdp.fully_sharded_data_parallel import ( BackwardPrefetch, CPUOffload, - MixedPrecision, - ShardingStrategy, - ) -elif version.parse(torch.__version__) >= version.parse('2.0.0'): - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp._init_utils import ProcessGroupType - from torch.distributed.fsdp.api import ( - BackwardPrefetch, - CPUOffload, - FullOptimStateDictConfig, FullStateDictConfig, MixedPrecision, ShardingStrategy, - StateDictType, ) - from torch.distributed.fsdp.wrap import _FSDPPolicy else: raise RuntimeError("FSDP is not supported while torch version under 1.12.0.") @@ -36,7 +25,7 @@ from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.utils.data import DataLoader -from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO +from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO, utils from colossalai.cluster import DistCoordinator from colossalai.interface import ModelWrapper, OptimizerWrapper @@ -51,102 +40,71 @@ def __init__(self) -> None: super().__init__() self.coordinator = DistCoordinator() - def __set_model_optim_state( - self, - model, - state_dict_type, - state_dict_config, - optim_state_dict_config, - ): - return FSDP.set_state_dict_type(model, state_dict_type, state_dict_config, optim_state_dict_config) - - def load_sharded_model(self, model: nn.Module, checkpoint: str): - - # TODO(jishaomin): implement this method as it can be supported by Huggingface model - raise NotImplementedError("Torch FSDP sharded model checkpoint is not supported yet.") - - def load_sharded_optimizer(self, model: nn.Module, optimizer: Optimizer, checkpoint: str): - - # TODO(jishaomin): implement this method as it can be supported by Huggingface model - raise NotImplementedError("Torch FSDP sharded model checkpoint is not supported yet.") - - def save_sharded_model(self, model: nn.Module, checkpoint: str): - - # TODO(jishaomin): implement this method as it can be supported by Huggingface model - raise NotImplementedError("Torch FSDP sharded model checkpoint is not supported yet.") - - def save_sharded_optimizer(self, model: nn.Module, optimizer: Optimizer, checkpoint: str): + def load_unsharded_model(self, model: nn.Module, checkpoint: str, strict: bool): + checkpoint = utils.load_state_dict(checkpoint) + model.load_state_dict(checkpoint) - # TODO(jishaomin): implement this method as it can be supported by Huggingface model - raise NotImplementedError("Torch FSDP sharded model checkpoint is not supported yet.") + 
def load_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: Path): + checkpoint = utils.load_state_dict(checkpoint) + fsdp_model = optimizer.unwrap_model() + sharded_osd = FSDP.scatter_full_optim_state_dict(checkpoint, fsdp_model) + optimizer.load_state_dict(sharded_osd) - def load_unsharded_model(self, model: nn.Module, checkpoint: str): + def save_unsharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, use_safetensors: bool): """ - Load model from checkpoint with automatic unwrapping. + Save model to checkpoint but only on master process. """ # the model should be unwrapped in self.load_model via ModelWrapper.unwrap + cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, cfg): + full_model_state = model.state_dict() + utils.save_state_dict(full_model_state, checkpoint_file_path=checkpoint, use_safetensors=use_safetensors) - if version.parse(torch.__version__) >= version.parse('1.12.0') and version.parse( - torch.__version__) < version.parse('2.0.0'): - full_state_dict = self.load_state_dict(checkpoint) - elif version.parse(torch.__version__) >= version.parse('2.0.0'): - full_state_dict = self.load_state_dict(checkpoint) - self.__set_model_optim_state(model, StateDictType.FULL_STATE_DICT, FullStateDictConfig(rank0_only=True)) - full_state_dict = model.state_dict() - else: - raise RuntimeError("FSDP is not supported while torch version under 1.12.0.") - - model.load_state_dict(full_state_dict) - - def load_unsharded_optimizer(self, model: nn.Module, optim: Optimizer, checkpoint: str): + def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool): """ - Load Optimizer from checkpoint with automatic unwrapping. + Save optimizer to checkpoint but only on master process. """ + assert isinstance(optimizer, FSDPOptimizerWrapper) + fsdp_model = optimizer.unwrap_model() + full_optimizer_state = FSDP.full_optim_state_dict(fsdp_model, optim=optimizer, rank0_only=True) + utils.save_state_dict(full_optimizer_state, checkpoint_file_path=checkpoint, use_safetensors=False) - if version.parse(torch.__version__) >= version.parse('1.12.0') and version.parse( - torch.__version__) < version.parse('2.0.0'): - optim_full_state_dict = self.load_state_dict(checkpoint) - elif version.parse(torch.__version__) >= version.parse('2.0.0'): - optim_full_state_dict = self.load_state_dict(checkpoint) - FSDP.full_optim_state_dict_to_load(optim_full_state_dict, model, optim) - else: - raise RuntimeError("FSDP is not supported while torch version under 1.12.0.") - - optim.load_state_dict(optim_full_state_dict) - - def save_unsharded_model(self, model: nn.Module, checkpoint: str): + def save_sharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, variant: Optional[str], + size_per_shard: int, use_safetensors: bool): """ Save model to checkpoint but only on master process. """ - # the model should be unwrapped in self.load_model via ModelWrapper.unwrap + raise NotImplementedError("Sharded model checkpoint is not supported yet.") + + def load_sharded_model(self, + model: nn.Module, + checkpoint_index_file: Path, + strict: bool = False, + use_safetensors: bool = False, + load_sub_module: bool = True): + """ + Load model to checkpoint but only on master process. 
+ """ + raise NotImplementedError("Sharded model checkpoint is not supported yet.") - if version.parse(torch.__version__) >= version.parse('1.12.0') and version.parse( - torch.__version__) < version.parse('2.0.0'): - cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, cfg): - model_state_dict = model.state_dict() - elif version.parse(torch.__version__) >= version.parse('2.0.0'): - self.__set_model_optim_state(model, StateDictType.FULL_STATE_DICT, FullStateDictConfig(rank0_only=True)) - model_state_dict = model.state_dict() - else: - raise RuntimeError("FSDP is not supported while torch version under 1.12.0.") - self.save_checkpoint(model_state_dict, checkpoint) - - def save_unsharded_optimizer(self, model: nn.Module, optimizer: Optimizer, checkpoint: str): + def save_sharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool): """ Save optimizer to checkpoint but only on master process. """ + raise NotImplementedError("Sharded optimizer checkpoint is not supported yet.") - if version.parse(torch.__version__) >= version.parse('1.12.0') and version.parse( - torch.__version__) < version.parse('2.0.0'): - optim_state_dict = FSDP.full_optim_state_dict(model=model, optim=optimizer) - elif version.parse(torch.__version__) >= version.parse('2.0.0'): - self.__set_model_optim_state(model, StateDictType.FULL_STATE_DICT, - FullOptimStateDictConfig(rank0_only=True)) - optim_state_dict = FSDP.optim_state_dict(model, optimizer) - else: - raise RuntimeError("FSDP is not supported while torch version under 1.12.0.") - self.save_checkpoint(optim_state_dict, checkpoint) + def load_sharded_optimizer(self, optimizer: Optimizer, index_file_path: str, prefix: str, size_per_shard: int): + """ + Load optimizer to checkpoint but only on master process. + """ + raise NotImplementedError("Sharded optimizer checkpoint is not supported yet.") + + def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str): + """ + Save model to checkpoint but only on master process. + """ + if self.coordinator.is_master(): + super().save_lr_scheduler(lr_scheduler, checkpoint) class TorchFSDPModel(ModelWrapper): @@ -156,7 +114,17 @@ def __init__(self, module: nn.Module, *args, **kwargs) -> None: self.module = FSDP(module, *args, **kwargs) def unwrap(self): - return self.module.module + return self.module + + +class FSDPOptimizerWrapper(OptimizerWrapper): + + def __init__(self, optimizer: Optimizer, model: nn.Module): + self.model = model + super().__init__(optimizer) + + def unwrap_model(self) -> nn.Module: + return self.model class TorchFSDPPlugin(DPPluginBase): @@ -178,8 +146,7 @@ class TorchFSDPPlugin(DPPluginBase): See https://pytorch.org/docs/stable/fsdp.html for details. 
""" - if version.parse(torch.__version__) >= version.parse('1.12.0') and version.parse( - torch.__version__) < version.parse('2.0.0'): + if version.parse(torch.__version__) >= version.parse('1.12.0'): def __init__( self, @@ -191,7 +158,6 @@ def __init__( mixed_precision: Optional[MixedPrecision] = None, ignored_modules: Optional[Iterable[torch.nn.Module]] = None, param_init_fn: Optional[Callable[[nn.Module], None]] = None, - device_id: Optional[Union[int, torch.device]] = None, sync_module_states: bool = False, ): super().__init__() @@ -203,42 +169,7 @@ def __init__( mixed_precision=mixed_precision, ignored_modules=ignored_modules, param_init_fn=param_init_fn, - device_id=device_id, sync_module_states=sync_module_states) - elif version.parse(torch.__version__) >= version.parse('2.0.0'): - - def __init__( - self, - process_group: ProcessGroupType = None, - sharding_strategy: Optional[ShardingStrategy] = None, - cpu_offload: Optional[CPUOffload] = None, - auto_wrap_policy: Optional[Union[Callable, _FSDPPolicy]] = None, - backward_prefetch: Optional[BackwardPrefetch] = BackwardPrefetch.BACKWARD_PRE, - mixed_precision: Optional[MixedPrecision] = None, - ignored_modules: Optional[Iterable[torch.nn.Module]] = None, - param_init_fn: Optional[Callable[[nn.Module], None]] = None, - device_id: Optional[Union[int, torch.device]] = None, - sync_module_states: bool = False, - forward_prefetch: bool = False, - limit_all_gathers: bool = False, - use_orig_params: bool = False, - ignored_parameters: Optional[Iterable[torch.nn.Parameter]] = None, - ): - super().__init__() - self.fsdp_kwargs = dict(process_group=process_group, - sharding_strategy=sharding_strategy, - cpu_offload=cpu_offload, - auto_wrap_policy=auto_wrap_policy, - backward_prefetch=backward_prefetch, - mixed_precision=mixed_precision, - ignored_modules=ignored_modules, - param_init_fn=param_init_fn, - device_id=device_id, - sync_module_states=sync_module_states, - forward_prefetch=forward_prefetch, - limit_all_gathers=limit_all_gathers, - use_orig_params=use_orig_params, - ignored_parameters=ignored_parameters) else: raise RuntimeError("FSDP is not supported while torch version under 1.12.0.") @@ -269,14 +200,19 @@ def configure( lr_scheduler: LRScheduler = None, ) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader]]: - model = model.cuda() # wrap the model with PyTorch FSDP - model = TorchFSDPModel(model, **self.fsdp_kwargs) + fsdp_model = TorchFSDPModel(model, device_id=torch.cuda.current_device(), **self.fsdp_kwargs) + + if len(optimizer.param_groups) > 1: + warnings.warn( + 'TorchFSDPPlugin does not support optimizer that use multi param groups. The results may not be as expected if used.' 
+ ) + optimizer.__init__(fsdp_model.parameters(), **optimizer.defaults) - if not isinstance(optimizer, OptimizerWrapper): - optimizer = OptimizerWrapper(optimizer) + if not isinstance(optimizer, FSDPOptimizerWrapper): + optimizer = FSDPOptimizerWrapper(optimizer, fsdp_model) - return model, optimizer, criterion, dataloader, lr_scheduler + return fsdp_model, optimizer, criterion, dataloader, lr_scheduler def control_checkpoint_io(self) -> bool: return True diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py index 96a883fdb42a..2cc9c3faa12b 100644 --- a/colossalai/checkpoint_io/general_checkpoint_io.py +++ b/colossalai/checkpoint_io/general_checkpoint_io.py @@ -1,26 +1,26 @@ -from pathlib import Path +import gc +import logging +import os from functools import reduce +from pathlib import Path +from typing import Iterator, Optional, OrderedDict, Tuple import torch.nn as nn from torch.optim import Optimizer -import logging -import os -import gc -from typing import Optional, Iterator, OrderedDict, Tuple from .checkpoint_io_base import CheckpointIO from .index_file import CheckpointIndexFile from .utils import ( - has_index_file, - load_state_dict, - save_state_dict, + get_base_filenames, + get_shard_filename, + has_index_file, is_safetensors_available, - shard_checkpoint, load_shard_state_dict, + load_state_dict, load_state_dict_into_model, - get_shard_filename, - get_base_filenames - ) + save_state_dict, + shard_checkpoint, +) __all__ = ['GeneralCheckpointIO'] @@ -29,6 +29,7 @@ class GeneralCheckpointIO(CheckpointIO): """ Checkpoint IO """ + def load_unsharded_model(self, model: nn.Module, checkpoint: str, strict: bool): checkpoint = load_state_dict(checkpoint) model.load_state_dict(checkpoint, strict=strict) @@ -69,19 +70,23 @@ def save_unsharded_optimizer( # TODO(FrankLeeeee): handle distributed tensors save_state_dict(optimizer.state_dict(), checkpoint, use_safetensors=False) - - def save_sharded_model(self, model: nn.Module, checkpoint_path: str, gather_dtensor:bool = False, - variant: Optional[str] = None, max_shard_size: int = 1024, use_safetensors: bool = False): - """ + def save_sharded_model(self, + model: nn.Module, + checkpoint_path: str, + gather_dtensor: bool = False, + variant: Optional[str] = None, + max_shard_size: int = 1024, + use_safetensors: bool = False): + """ implement this method as it can be supported by Huggingface model, save shard model, save model to multiple files """ if os.path.isfile(checkpoint_path): logging.error(f"Provided path ({checkpoint_path}) should be a directory, not a file") return - + Path(checkpoint_path).mkdir(parents=True, exist_ok=True) - + # shard checkpoint state_dict = model.state_dict() state_dict_shard = shard_checkpoint(state_dict, max_shard_size=max_shard_size) @@ -95,21 +100,22 @@ def save_sharded_model(self, model: nn.Module, checkpoint_path: str, gather_dten total_size = total_size + shard_pair[1] for key in shard.keys(): index_file.append_weight_map(key, shard_file) - + checkpoint_file_path = os.path.join(checkpoint_path, shard_file) save_state_dict(shard, checkpoint_file_path, use_safetensors) - + index_file.append_meta_data("total_size", total_size) index_file.write_index_file(save_index_file) - logging.info( - f"The model is going to be split to checkpoint shards. " - f"You can find where each parameters has been saved in the " - f"index located at {save_index_file}." 
- ) - - - def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, strict: bool = False, - use_safetensors: bool = False, load_sub_module: bool = True): + logging.info(f"The model is going to be split to checkpoint shards. " + f"You can find where each parameters has been saved in the " + f"index located at {save_index_file}.") + + def load_sharded_model(self, + model: nn.Module, + checkpoint_index_file: Path, + strict: bool = False, + use_safetensors: bool = False, + load_sub_module: bool = True): """ load shard model, load model from multiple files """ @@ -119,7 +125,7 @@ def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, stri if use_safetensors and not is_safetensors_available(): raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.") - + # read checkpoint index file ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) checkpoint_files, _ = ckpt_index_file.get_checkpoint_fileanames() @@ -134,10 +140,7 @@ def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, stri if strict: remain_keys = reduce(lambda a, b: a & b, map(set, missing_keys)) if len(remain_keys) > 0: - error_msgs = 'Missing key(s) in state_dict: {}. '.format( - ', '.join('"{}"'.format(k) for k in missing_keys)) + error_msgs = 'Missing key(s) in state_dict: {}. '.format(', '.join( + '"{}"'.format(k) for k in missing_keys)) raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - self.__class__.__name__, "\n\t".join(error_msgs))) - - - + self.__class__.__name__, "\n\t".join(error_msgs))) diff --git a/colossalai/cli/launcher/__init__.py b/colossalai/cli/launcher/__init__.py index 8d9ec147d401..808e4e84574f 100644 --- a/colossalai/cli/launcher/__init__.py +++ b/colossalai/cli/launcher/__init__.py @@ -28,7 +28,7 @@ type=str, default=None, help= - "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ," + "Specify computing devices to NOT use during execution. Mutually exclusive with --include. 
Formatting is the same as --include," " only effective when used with --hostfile.") @click.option("--num_nodes", type=int, diff --git a/colossalai/cli/launcher/hostinfo.py b/colossalai/cli/launcher/hostinfo.py index 065cbc37101f..d1b88b229fb8 100644 --- a/colossalai/cli/launcher/hostinfo.py +++ b/colossalai/cli/launcher/hostinfo.py @@ -38,7 +38,7 @@ def is_host_localhost(hostname: str, port: str = None) -> None: # socket.getfqdn("127.0.0.1") does not return localhost # on some users' machines - # thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0 + # thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0 if hostname in ("localhost", "127.0.0.1", "0.0.0.0"): return True diff --git a/colossalai/cli/launcher/multinode_runner.py b/colossalai/cli/launcher/multinode_runner.py index a51e1e371f13..85b241e96292 100644 --- a/colossalai/cli/launcher/multinode_runner.py +++ b/colossalai/cli/launcher/multinode_runner.py @@ -114,7 +114,7 @@ def recv_from_all(self) -> dict: Receive messages from all hosts Returns: - msg_from_node (dict): a dictionry which contains messages from each node + msg_from_node (dict): a dictionary which contains messages from each node """ msg_from_node = dict() diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py index 6411b4302e95..027a10aa898b 100644 --- a/colossalai/cli/launcher/run.py +++ b/colossalai/cli/launcher/run.py @@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None: # receive the stop status msg_from_node = runner.recv_from_all() - # printe node status + # print node status click.echo("\n====== Stopping All Nodes =====") for hostname, msg in msg_from_node.items(): click.echo(f"{hostname}: {msg}") diff --git a/colossalai/cluster/dist_coordinator.py b/colossalai/cluster/dist_coordinator.py index 99dde810e112..3ee364ec3364 100644 --- a/colossalai/cluster/dist_coordinator.py +++ b/colossalai/cluster/dist_coordinator.py @@ -181,7 +181,7 @@ def on_master_only(self, process_group: ProcessGroup = None): """ is_master = self.is_master(process_group) - # define an inner functiuon + # define an inner function def decorator(func): @functools.wraps(func) diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py index af2b10928c6f..f4e6cfffbcdf 100644 --- a/colossalai/device/alpha_beta_profiler.py +++ b/colossalai/device/alpha_beta_profiler.py @@ -197,7 +197,7 @@ def get_max_nbytes(process_group: Tuple[int], pg_handler: dist.ProcessGroup): dist.broadcast_object_list(broadcast_list, src=process_group[0]) alpha_beta_dict[process_group] = tuple(broadcast_list) - # add symmetry pair to the apha_beta_dict + # add symmetry pair to the alpha_beta_dict symmetry_ab_dict = {} for process_group, alpha_beta_pair in alpha_beta_dict.items(): symmetry_process_group = (process_group[1], process_group[0]) @@ -381,7 +381,7 @@ def _extract_alpha_beta(pg, pg_handler): first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group) second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group) mesh_alpha = [first_latency, second_latency] - # The beta values have been enlarged by 1e10 times temporarilly because the computation cost + # The beta values have been enlarged by 1e10 times temporarily because the computation cost # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in future. 
mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth] diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py index 38175fe0941c..9fc301a26559 100644 --- a/colossalai/engine/schedule/_pipeline_schedule.py +++ b/colossalai/engine/schedule/_pipeline_schedule.py @@ -152,9 +152,9 @@ def _get_data_slice(self, data, offset): raise TypeError(f"Expected data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}") def load_micro_batch(self): - mciro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset) + micro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset) self.microbatch_offset += self.microbatch_size - return self._move_to_device(mciro_batch_data) + return self._move_to_device(micro_batch_data) def pre_processing(self, engine): from colossalai.zero.legacy import ShardedModelV2 diff --git a/colossalai/engine/schedule/_pipeline_schedule_v2.py b/colossalai/engine/schedule/_pipeline_schedule_v2.py index 28c58bd82b5c..89e45c7aacec 100644 --- a/colossalai/engine/schedule/_pipeline_schedule_v2.py +++ b/colossalai/engine/schedule/_pipeline_schedule_v2.py @@ -84,7 +84,7 @@ def forward_backward_step(self, 'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.' self.load_batch(data_iter) - # num_warmup_microbatches is the step when not all the processers are working + # num_warmup_microbatches is the step when not all the processes are working num_warmup_microbatches = \ (gpc.get_world_size(ParallelMode.PIPELINE) - gpc.get_local_rank(ParallelMode.PIPELINE) - 1) diff --git a/colossalai/fx/codegen/activation_checkpoint_codegen.py b/colossalai/fx/codegen/activation_checkpoint_codegen.py index 5a72cb9ca923..33b164800262 100644 --- a/colossalai/fx/codegen/activation_checkpoint_codegen.py +++ b/colossalai/fx/codegen/activation_checkpoint_codegen.py @@ -523,7 +523,7 @@ def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, # append code text to body for idx, node in enumerate(node_list): # if this is the first node of the ckpt region - # append the ckpt function defition + # append the ckpt function definition if idx in start_idx: label = start_idx.index(idx) ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label]) diff --git a/colossalai/fx/passes/adding_split_node_pass.py b/colossalai/fx/passes/adding_split_node_pass.py index 2c7b842b530c..245ba5d776da 100644 --- a/colossalai/fx/passes/adding_split_node_pass.py +++ b/colossalai/fx/passes/adding_split_node_pass.py @@ -206,7 +206,7 @@ def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int): def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int): """ - In avgnode_split_pass, simpliy split graph by node number. + In avgnode_split_pass, simply split graph by node number. 
""" mod_graph = gm.graph avg_num_node = len(mod_graph.nodes) // pp_size diff --git a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py index f28d65e2668a..4571bd93a790 100644 --- a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py +++ b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py @@ -16,7 +16,7 @@ def apply(*args, **kwargs): return shape_consistency_manager.apply(*args, **kwargs) -def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh): +def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh): mod_graph = gm.graph nodes = tuple(mod_graph.nodes) diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py index 2b4a8749cfd7..ab203dfd7440 100644 --- a/colossalai/fx/passes/meta_info_prop.py +++ b/colossalai/fx/passes/meta_info_prop.py @@ -31,7 +31,7 @@ class TensorMetadata(NamedTuple): numel: int is_tensor: bool # TODO: we can add a list of sharding spec here, and record the sharding - # behaviour by appending sharding spec into list. + # behavior by appending sharding spec into list. def _extract_tensor_metadata(result: torch.Tensor) -> TensorMetadata: diff --git a/colossalai/fx/passes/passes_for_gpt2_test.py b/colossalai/fx/passes/passes_for_gpt2_test.py index abc1a089e9a9..efdd34a01fe0 100644 --- a/colossalai/fx/passes/passes_for_gpt2_test.py +++ b/colossalai/fx/passes/passes_for_gpt2_test.py @@ -230,7 +230,7 @@ def record_cross_partition_use(def_node: torch.fx.node.Node, use_partition.partitions_dependent_on.setdefault(def_partition_name) node_process_list = list(m.graph.nodes) - # split nodes into parititons + # split nodes into partitions while node_process_list: node = node_process_list.pop(0) orig_nodes[node.name] = node @@ -277,7 +277,7 @@ def record_cross_partition_use(def_node: torch.fx.node.Node, if len(sorted_partitions) != len(partitions): raise RuntimeError("cycle exists between partitions!") - # add placeholders to parititons + # add placeholders to partitions for partition_name in sorted_partitions: partition = partitions[partition_name] for input in partition.inputs: diff --git a/colossalai/fx/passes/split_module.py b/colossalai/fx/passes/split_module.py index 5ce5b969cbde..61ed037ab7a1 100644 --- a/colossalai/fx/passes/split_module.py +++ b/colossalai/fx/passes/split_module.py @@ -29,8 +29,8 @@ def __repr__(self) -> str: f" nodes: {self.node_names},\n" \ f" inputs: {self.inputs},\n" \ f" outputs: {self.outputs},\n" \ - f" partitions depenent on: {self.partitions_dependent_on},\n" \ - f" parition dependents: {self.partition_dependents}" + f" partitions dependent on: {self.partitions_dependent_on},\n" \ + f" partition dependents: {self.partition_dependents}" # Creates subgraphs out of main graph diff --git a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py index 85f1553e304c..591485fdb1ca 100644 --- a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py +++ b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py @@ -51,7 +51,7 @@ def extract_kwargs_from_mod(self): For example: The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are - considered during module initilizing. 
However, we need to consider those attributes as kwargs + considered during module initializing. However, we need to consider those attributes as kwargs in F.conv2d. """ pass diff --git a/colossalai/fx/tracer/experimental.py b/colossalai/fx/tracer/experimental.py index 88b65b6188fa..22a67d1ceccc 100644 --- a/colossalai/fx/tracer/experimental.py +++ b/colossalai/fx/tracer/experimental.py @@ -295,7 +295,7 @@ class PatchedCheckpointFunction(torch.autograd.Function): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): - # signal that the current tracing occurs within activaton checkpoint part + # signal that the current tracing occurs within activation checkpoint part self.inside_torch_checkpoint_func = True out = run_function(*args) self.inside_torch_checkpoint_func = False diff --git a/colossalai/fx/tracer/tracer.py b/colossalai/fx/tracer/tracer.py index 1ae31f958975..28965a1b8e74 100644 --- a/colossalai/fx/tracer/tracer.py +++ b/colossalai/fx/tracer/tracer.py @@ -92,7 +92,7 @@ def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, pr return proxy # if graph is traced for auto parallelism module, some extra node will be added during - # graph construction to deal with the compatability between bias addition and all reduce. + # graph construction to deal with the compatibility between bias addition and all reduce. # if no extra manipulation is applied, we just pass the origin arguments to create_proxy function # to create node on computation graph @@ -208,7 +208,7 @@ def _configure_tracer_type(self, tracer_type: TracerType): self.proxy_cls = ColoProxy self.tracer_type = TracerType.META else: - raise ValueError(f"Unrecognised tracer type {tracer_type}") + raise ValueError(f"Unrecognized tracer type {tracer_type}") def _meta_data_computing(self, kind, target, args, kwargs): @@ -445,7 +445,7 @@ class PatchedCheckpointFunction(torch.autograd.Function): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): - # signal that the current tracing occurs within activaton checkpoint part + # signal that the current tracing occurs within activation checkpoint part self.inside_torch_checkpoint_func = True out = run_function(*args) self.inside_torch_checkpoint_func = False diff --git a/colossalai/kernel/cuda_native/flash_attention.py b/colossalai/kernel/cuda_native/flash_attention.py index d793815ed681..3db7374509a0 100644 --- a/colossalai/kernel/cuda_native/flash_attention.py +++ b/colossalai/kernel/cuda_native/flash_attention.py @@ -138,7 +138,7 @@ def forward(self, elif attn_mask_type == AttnMaskType.causal: # gpt style attn_bias = LowerTriangularMask() - if bias is not None: # alibi / relative position emebedding + if bias is not None: # alibi / relative position embedding assert allow_alibi, "flash attention with bias is not supported in this system." assert attn_mask_type == AttnMaskType.causal, \ "attention with bias is only supported for causal attention so far." 
diff --git a/colossalai/kernel/cuda_native/multihead_attention.py b/colossalai/kernel/cuda_native/multihead_attention.py
index 3b6470cdcbb9..69246f2f3854 100644
--- a/colossalai/kernel/cuda_native/multihead_attention.py
+++ b/colossalai/kernel/cuda_native/multihead_attention.py
@@ -43,7 +43,7 @@ class Config:
     attn_prob_dropout_ratio: float # attention score dropout ratio
     hidden_dropout_ratio: float # dropout ration before residual
     norm_first: bool # norm_first
-    fp16: bool # fp16 presion
+    fp16: bool # fp16 precision


 class MultiHeadAttention1DFunc(Function):
diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py
index aa41f57678fc..e20c08b051ed 100644
--- a/colossalai/kernel/jit/option.py
+++ b/colossalai/kernel/jit/option.py
@@ -43,7 +43,7 @@ def warmup_jit_fusion(batch_size: int,
                       seq_length: int = 512,
                       vocab_size: int = 32768,
                       dtype: torch.dtype = torch.float32):
-    """ Compilie JIT functions before the main training steps """
+    """ Compile JIT functions before the main training steps """

     embed = Embedding(vocab_size, hidden_size).to(get_current_device())
     linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_current_device())
diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 54036973e1e3..bb561a106515 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -13,7 +13,7 @@ class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.

-    Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
     * Parameters on CPU and gradients on CPU is allowed.
     * Parameters on GPU and gradients on GPU is allowed.
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 1d0fb92de499..be6311c6c29f 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -13,19 +13,19 @@ class HybridAdam(NVMeOptimizer):
     """Implements Adam algorithm.

-    Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
     * Parameters on CPU and gradients on CPU is allowed.
     * Parameters on GPU and gradients on GPU is allowed.
     * Parameters on GPU and gradients on CPU is **not** allowed.

-    `HybriadAdam` requires CUDA extensions which can be built during installation or runtime.
+    `HybridAdam` requires CUDA extensions which can be built during installation or runtime.

     This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.
     * For parameters updating on CPU, it uses CPUAdam.
     * For parameters updating on GPU, it uses FusedAdam.
-    * Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.
+    * Hybrid precision calculation of fp16 and fp32 is supported, e.g. fp32 parameters and fp16 gradients.
:class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``, or ``torch.optim.Adam`` with ``adamw_mode=False`` @@ -131,7 +131,7 @@ def step(self, closure=None, div_scale: float = -1): assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda" assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda" - # record the state by gruop and update at once + # record the state by group and update at once g_l.append(p.grad.data) p_l.append(p.data) m_l.append(state['exp_avg']) diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py index da043df368ae..a6159856dcce 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py +++ b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py @@ -20,8 +20,8 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None: return torch.cuda.current_stream().wait_stream(stream) # As mentioned in https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html, - # PyTorch uses the "caching allocator" for memroy allocation for tensors. When a tensor is - # freed, its memory is likely to be reused by newly constructed tenosrs. By default, + # PyTorch uses the "caching allocator" for memory allocation for tensors. When a tensor is + # freed, its memory is likely to be reused by newly constructed tensors. By default, # this allocator traces whether a tensor is still in use by only the CUDA stream where it # was created. When a tensor is used by additional CUDA streams, we need to call record_stream # to tell the allocator about all these streams. Otherwise, the allocator might free the @@ -294,7 +294,7 @@ def print_comm_stats(self): print( f"CPU->CUDA BWD {self._cpu_to_cuda_numel * self.elem_size_in_byte / 1e6 / elapsed} MB/s {self._cpu_to_cuda_numel / 1e6} M elem" ) - print(f'cpu_to_cuda_elpase {elapsed} sec') + print(f'cpu_to_cuda_elapse {elapsed} sec') for k, v in self._elapsed_dict.items(): print(f'{k}: {v}') diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 95b3b8014af1..8022e84dc24b 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -324,7 +324,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): norm_type = float(norm_type) # Parameters can be on CPU or CUDA - # If parameters are on CPU, disable CUDA kernerls + # If parameters are on CPU, disable CUDA kernels # Calculate norm. if norm_type == inf: diff --git a/colossalai/utils/tensor_detector/readme.md b/colossalai/utils/tensor_detector/readme.md index 840dc8f4eca6..d6852ea55b54 100644 --- a/colossalai/utils/tensor_detector/readme.md +++ b/colossalai/utils/tensor_detector/readme.md @@ -46,7 +46,7 @@ detector.detect() I have made some comments on the right of the output for your understanding. -Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memery Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly. +Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memory Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly. 
**The order of print is not equal to the order the tensor creates, but they are really close.** @@ -61,7 +61,7 @@ Note that the total `Mem` of all the tensors and parameters is not equal to `Tot + mlp.2.bias cuda:0 (32,) True torch.float32 128 B ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 27 -Totle GPU Memery Allocated on cuda:0 is 4.5 KB +Total GPU Memory Allocated on cuda:0 is 4.5 KB ------------------------------------------------------------------------------------------------------------ @@ -72,7 +72,7 @@ Totle GPU Memery Allocated on cuda:0 is 4.5 KB + Tensor cuda:0 (32,) True torch.float32 128 B # output ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 30 -Totle GPU Memery Allocated on cuda:0 is 5.5 KB +Total GPU Memory Allocated on cuda:0 is 5.5 KB ------------------------------------------------------------------------------------------------------------ @@ -82,7 +82,7 @@ Totle GPU Memery Allocated on cuda:0 is 5.5 KB + Tensor cuda:0 () True torch.float32 4 B # loss ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 32 -Totle GPU Memery Allocated on cuda:0 is 6.0 KB +Total GPU Memory Allocated on cuda:0 is 6.0 KB ------------------------------------------------------------------------------------------------------------ @@ -103,7 +103,7 @@ Totle GPU Memery Allocated on cuda:0 is 6.0 KB - Tensor cuda:0 (8,) True torch.float32 32 B # deleted activation ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 34 -Totle GPU Memery Allocated on cuda:0 is 10.0 KB +Total GPU Memory Allocated on cuda:0 is 10.0 KB ------------------------------------------------------------------------------------------------------------ @@ -117,7 +117,7 @@ Totle GPU Memery Allocated on cuda:0 is 10.0 KB + Tensor cuda:0 (32,) False torch.float32 128 B ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 36 -Totle GPU Memery Allocated on cuda:0 is 14.0 KB +Total GPU Memory Allocated on cuda:0 is 14.0 KB ------------------------------------------------------------------------------------------------------------ ``` diff --git a/colossalai/utils/tensor_detector/tensor_detector.py b/colossalai/utils/tensor_detector/tensor_detector.py index a8186f76834c..cfcd4e47b4cb 100644 --- a/colossalai/utils/tensor_detector/tensor_detector.py +++ b/colossalai/utils/tensor_detector/tensor_detector.py @@ -55,7 +55,7 @@ def get_tensor_mem(self, tensor): return self.mem_format(memory_size) def mem_format(self, real_memory_size): - # format the tensor memory into a reasonal magnitude + # format the tensor memory into a reasonable magnitude if real_memory_size >= 2**30: return str(real_memory_size / (2**30)) + ' GB' if real_memory_size >= 2**20: @@ -71,7 +71,7 @@ def collect_tensors_state(self): if (not self.include_cpu) and obj.device == torch.device('cpu'): continue self.detected.append(id(obj)) - # skip paramters we had added in __init__ when module is an instance of nn.Module for the first epoch + # skip parameters we had added in __init__ when module is an instance of nn.Module for the first epoch if 
id(obj) not in self.tensor_info:
                     name = type(obj).__name__
@@ -84,7 +84,7 @@ def collect_tensors_state(self):
                         name = par_name + ' (with grad)'
                     else:
                         # with no grad attached
-                        # there will be no new paramters created during running
+                        # there will be no new parameters created during running
                         # so it must be in saved_tensor_info
                         continue
                 # we can also marked common tensors as tensor(with grad)
@@ -155,7 +155,7 @@ def print_tensors_state(self):
             if device == torch.device('cpu'):
                 continue
             gpu_mem_alloc = self.mem_format(torch.cuda.memory_allocated(device))
-            self.info += f"Totle GPU Memery Allocated on {device} is {gpu_mem_alloc}\n"
+            self.info += f"Total GPU Memory Allocated on {device} is {gpu_mem_alloc}\n"
             self.info += LINE
             self.info += '\n\n'
         if self.show_info:
diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py
index d85df0b00476..77368d06d255 100644
--- a/colossalai/zero/gemini/chunk/manager.py
+++ b/colossalai/zero/gemini/chunk/manager.py
@@ -102,7 +102,7 @@ def access_chunk(self, chunk: Chunk) -> None:
         """
         if chunk in self.accessed_chunks:
             return
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         if chunk.device_type == 'cpu':
             chunk.shard_move(get_current_device())
         self.__add_accessed_chunk(chunk)
@@ -114,7 +114,7 @@ def release_chunk(self, chunk: Chunk) -> None:
         if chunk not in self.accessed_chunks:
             return
         if chunk.can_release:
-            self.__sub_memroy_usage(chunk.memory_usage)
+            self.__sub_memory_usage(chunk.memory_usage)
             self.__sub_accessed_chunk(chunk)
             self.__add_memory_usage(chunk.memory_usage)
@@ -123,7 +123,7 @@ def move_chunk(self, chunk: Chunk, device: torch.device, force_copy: bool = Fals
         """
         if not chunk.can_move or chunk.device_type == device.type:
             return
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.shard_move(device, force_copy)
         self.__add_memory_usage(chunk.memory_usage)
@@ -138,7 +138,7 @@ def reduce_chunk(self, chunk: Chunk) -> bool:
         """
         if not chunk.can_reduce:
             return False
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.reduce()
         self.__sub_accessed_chunk(chunk)
         self.__add_memory_usage(chunk.memory_usage)
@@ -228,11 +228,11 @@ def __get_chunk_group(self, group_name: str) -> Deque:
         return self.chunk_groups[group_name]

     def __close_one_chunk(self, chunk: Chunk):
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.close_chunk()
         self.__add_memory_usage(chunk.memory_usage)

-    def __sub_memroy_usage(self, usage: Dict[str, int]):
+    def __sub_memory_usage(self, usage: Dict[str, int]):
         for k, v in usage.items():
             self.total_mem[k] -= v
diff --git a/colossalai/zero/gemini/chunk/search_utils.py b/colossalai/zero/gemini/chunk/search_utils.py
index da58e038c879..881ceb0b3b97 100644
--- a/colossalai/zero/gemini/chunk/search_utils.py
+++ b/colossalai/zero/gemini/chunk/search_utils.py
@@ -85,7 +85,7 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
     Classify the parameters by their dp degree

     Args:
-        param_order (OrderedParamGenerator): the order of param be visied
+        param_order (OrderedParamGenerator): the order in which the parameters are visited
         strict_ddp_flag (bool, optional): whether to enable the strict ddp mode. Defaults to False.
Returns: diff --git a/colossalai/zero/gemini/memory_tracer/memory_stats.py b/colossalai/zero/gemini/memory_tracer/memory_stats.py index 9a45034ee27e..41d7e5754e96 100644 --- a/colossalai/zero/gemini/memory_tracer/memory_stats.py +++ b/colossalai/zero/gemini/memory_tracer/memory_stats.py @@ -59,7 +59,7 @@ def increase_preop_step(self, param_list: List[torch.nn.Parameter]): time step. Args: - param_list (List[torch.nn.Parameter]): a list of torch paramters. + param_list (List[torch.nn.Parameter]): a list of torch parameters. """ for p in param_list: if p not in self._param_step_dict: diff --git a/docker/Dockerfile b/docker/Dockerfile index 49ff9b344268..2c7bafd9604c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -8,14 +8,19 @@ LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/cuda-con # install torch RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch +# install ninja +RUN apt-get install -y --no-install-recommends ninja-build + # install apex RUN git clone https://github.com/NVIDIA/apex && \ cd apex && \ + git checkout 91fcaa && \ pip install packaging && \ pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./ # install colossalai -RUN git clone https://github.com/hpcaitech/ColossalAI.git \ +ARG VERSION=1 +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ && cd ./ColossalAI \ && CUDA_EXT=1 pip install -v --no-cache-dir . diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index c3deca7e9c17..1dde7a816676 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -126,9 +126,9 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的 -[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): 完整RLHF流程0门槛克隆 [ChatGPT](https://openai.com/blog/chatgpt/) -[[代码]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) -[[博客]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) +[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): 完整RLHF流程0门槛克隆 [ChatGPT](https://openai.com/blog/chatgpt/) +[[代码]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) +[[博客]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) [[在线样例]](https://www.youtube.com/watch?v=HcTiHzApHm0) [[教程]](https://www.youtube.com/watch?v=-qFBZFmOJfg) diff --git a/docs/README.md b/docs/README.md index f520608d552c..f0cb50ffe217 100644 --- a/docs/README.md +++ b/docs/README.md @@ -98,7 +98,7 @@ Lastly, if you want to skip some code, you just need to add the following annota ``` -If you have any dependency required, please add it to `requriements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for Conda. +If you have any dependency required, please add it to `requirements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for Conda. ### 💉 Auto Documentation diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 2681198191cb..0984b2dc3f28 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -1,6 +1,6 @@ # References -The Colossal-AI project aims to provide a wide array of parallelism techniques for the machine learning community in the big-model era. 
This project is inspired by quite a few reserach works, some are conducted by some of our developers and the others are research projects open-sourced by other organizations. We would like to credit these amazing projects below in the IEEE citation format. +The Colossal-AI project aims to provide a wide array of parallelism techniques for the machine learning community in the big-model era. This project is inspired by quite a few research works, some are conducted by some of our developers and the others are research projects open-sourced by other organizations. We would like to credit these amazing projects below in the IEEE citation format. ## By Our Team diff --git a/docs/sidebars.json b/docs/sidebars.json index 94f79dcd3509..8be40e4512f9 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -43,8 +43,11 @@ "label": "Features", "collapsed": true, "items": [ + "features/mixed_precision_training_with_booster", "features/mixed_precision_training", + "features/gradient_accumulation_with_booster", "features/gradient_accumulation", + "features/gradient_clipping_with_booster", "features/gradient_clipping", "features/gradient_handler", "features/zero_with_chunk", diff --git a/docs/source/en/advanced_tutorials/add_your_parallel.md b/docs/source/en/advanced_tutorials/add_your_parallel.md index be7284a7ab64..1caf58c8734e 100644 --- a/docs/source/en/advanced_tutorials/add_your_parallel.md +++ b/docs/source/en/advanced_tutorials/add_your_parallel.md @@ -56,7 +56,7 @@ follow the steps below to create a new distributed initialization. world_size: int, config: Config, data_parallel_size: int, - pipeline_parlalel_size: int, + pipeline_parallel_size: int, tensor_parallel_size: int, arg1, arg2): diff --git a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index e01caf76d2b3..d5edd135c079 100644 --- a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -121,7 +121,7 @@ Inside the initialization of Experts, the local expert number of each GPU will b ## Train Your Model -Do not to forget to use `colossalai.initialize` function in `colosalai` to add gradient handler for the engine. +Do not to forget to use `colossalai.initialize` function in `colossalai` to add gradient handler for the engine. We handle the back-propagation of MoE models for you. In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients. You can find more information about the handler `MoeGradientHandler` in colossal directory. diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 8afb6705b6ae..c1c23a355efa 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -9,16 +9,21 @@ When you only have a few GPUs for large model training tasks, **heterogeneous tr ## Usage -At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini. Set attribute of zero model_config, i.e., tensor_placement_policy='auto'. - -``` -zero = dict( - model_config=dict( - tensor_placement_policy='auto', - shard_strategy=BucketTensorShardStrategy() - ), - optimizer_config=dict( - ...) 
+At present, Gemini is compatible with the ZeRO parallel mode, and it is really simple to use: inject the features of `GeminiPlugin` into the training components with `booster`. For more instructions on `booster`, please refer to [**usage of booster**](../basics/booster_api.md).
+
+```python
+from torchvision.models import resnet18
+from colossalai.booster import Booster
+from colossalai.zero import ColoInitContext
+from colossalai.booster.plugin import GeminiPlugin
+from colossalai.nn.optimizer import HybridAdam
+plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5)
+booster = Booster(plugin=plugin)
+ctx = ColoInitContext()
+with ctx:
+    model = resnet18()
+optimizer = HybridAdam(model.parameters(), lr=1e-3)
+criterion = lambda x: x.mean()
+model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
 )
 ```
@@ -86,3 +91,5 @@ The important duty of MSC is to adjust the tensor layout position. For example,
 In the warmup stage, since we haven't finished a complete iteration yet, we don't know actual memory occupation. At this time, we limit the upper bound of memory usage of the model data. For example, only 30% of the GPU memory can be used. This ensures that we can successfully complete the warmup state. In the non-warmup stage, we need to use the memory information of non-model data collected in the warm-up stage to reserve the peak memory required by the computing device for the next Period, which requires us to move some model tensors. In order to avoid frequent replacement of the same tensor in and out of the CPU-GPU, causing a phenomenon similar to [cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science)). Using the iterative characteristics of DNN training, we design the OPT cache swap out strategy. Specifically, in the warmup stage, we record the sampling time required by each tensor computing device. If we need to expel some HOLD tensors, we will choose the latest tensor needed on this device as the victim.
+
+
diff --git a/docs/source/en/advanced_tutorials/opt_service.md b/docs/source/en/advanced_tutorials/opt_service.md
index a43ec7fdd1fe..eccfa12f9389 100644
--- a/docs/source/en/advanced_tutorials/opt_service.md
+++ b/docs/source/en/advanced_tutorials/opt_service.md
@@ -53,7 +53,7 @@ export CHECKPOINT_DIR="your_opt_checkpoint_path"
 # the ${CONFIG_DIR} must contain a server.sh file as the entry of service
 export CONFIG_DIR="config_file_path"

-docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:lastest
+docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:latest
 ```

 Then open `https://[IP-ADDRESS]:8020/docs#` in your browser to try out!
diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
index e7698e5e9d1b..22d52fb3cd1a 100644
--- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -69,7 +69,7 @@ After the forward operation of the embedding module, each word in all sequences
The embedding module
-Each transformer layer contains two blocks. The self-attention operation is called in the first block and a two-layer percepton is located in the second block.
+Each transformer layer contains two blocks. The self-attention operation is called in the first block and a two-layer perceptron is located in the second block.
@@ -175,11 +175,11 @@ In this way, users can train their models as usual. In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overhead and improve efficiency.For the details of this part, please refer to [ZeRO](../features/zero_with_chunk.md). You can combine these two parts to understand our entire training process: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) return model diff --git a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md index b26599740c5f..6adfe4f113da 100644 --- a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md @@ -195,7 +195,7 @@ def build_cifar(batch_size): ## Training ViT using pipeline -You can set the size of pipeline parallel and number of microbatches in config. `NUM_CHUNKS` is useful when using interleved-pipeline (for more details see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) ). The original batch will be split into `num_microbatches`, and each stage will load a micro batch each time. Then we will generate an approriate schedule for you to execute the pipeline training. If you don't need the output and label of model, you can set `return_output_label` to `False` when calling `trainer.fit()` which can further reduce GPU memory usage. +You can set the size of pipeline parallel and number of microbatches in config. `NUM_CHUNKS` is useful when using interleaved-pipeline (for more details see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) ). The original batch will be split into `num_microbatches`, and each stage will load a micro batch each time. Then we will generate an appropriate schedule for you to execute the pipeline training. If you don't need the output and label of model, you can set `return_output_label` to `False` when calling `trainer.fit()` which can further reduce GPU memory usage. You should `export DATA=/path/to/cifar`. diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index b2438a1cf562..a2deaeb88893 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -16,14 +16,14 @@ In this example for ViT model, Colossal-AI provides three different parallelism We will show you how to train ViT on CIFAR-10 dataset with these parallelism techniques. To run this example, you will need 2-4 GPUs. -## Tabel of Contents +## Table of Contents 1. Colossal-AI installation 2. Steps to train ViT with data parallelism 3. Steps to train ViT with pipeline parallelism 4. Steps to train ViT with tensor parallelism or hybrid parallelism ## Colossal-AI Installation -You can install Colossal-AI pacakage and its dependencies with PyPI. +You can install Colossal-AI package and its dependencies with PyPI. 
```bash pip install colossalai ``` @@ -31,7 +31,7 @@ pip install colossalai ## Data Parallelism -Data parallism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: +Data parallelism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: 1. Define a configuration file 2. Change a few lines of code in train script @@ -94,7 +94,7 @@ from torchvision import transforms from torchvision.datasets import CIFAR10 ``` -#### Lauch Colossal-AI +#### Launch Colossal-AI In train script, you need to initialize the distributed environment for Colossal-AI after your config file is prepared. We call this process `launch`. In Colossal-AI, we provided several launch methods to initialize the distributed backend. In most cases, you can use `colossalai.launch` and `colossalai.get_default_parser` to pass the parameters via command line. Besides, Colossal-AI can utilize the existing launch tool provided by PyTorch as many users are familiar with by using `colossalai.launch_from_torch`. For more details, you can view the related [documents](https://www.colossalai.org/docs/basics/launch_colossalai). @@ -613,7 +613,7 @@ NUM_MICRO_BATCHES = parallel['pipeline'] TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) ``` -Ohter configs: +Other configs: ```python # hyper parameters # BATCH_SIZE is as per GPU diff --git a/docs/source/en/basics/booster_api.md b/docs/source/en/basics/booster_api.md index 18dec4500f76..a446ff31be83 100644 --- a/docs/source/en/basics/booster_api.md +++ b/docs/source/en/basics/booster_api.md @@ -14,9 +14,9 @@ In our new design, `colossalai.booster` replaces the role of `colossalai.initial ### Plugin Plugin is an important component that manages parallel configuration (eg: The gemini plugin encapsulates the gemini acceleration solution). Currently supported plugins are as follows: -***GeminiPlugin:*** This plugin wrapps the Gemini acceleration solution, that ZeRO with chunk-based memory management. +***GeminiPlugin:*** This plugin wraps the Gemini acceleration solution, that ZeRO with chunk-based memory management. -***TorchDDPPlugin:*** This plugin wrapps the DDP acceleration solution, it implements data parallelism at the module level which can run across multiple machines. +***TorchDDPPlugin:*** This plugin wraps the DDP acceleration solution, it implements data parallelism at the module level which can run across multiple machines. ***LowLevelZeroPlugin:*** This plugin wraps the 1/2 stage of Zero Redundancy Optimizer. Stage 1 : Shards optimizer states across data parallel workers/GPUs. Stage 2 : Shards optimizer states + gradients across data parallel workers/GPUs. 
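
For readers skimming this part of the patch, the plugin descriptions above boil down to one pattern: pick a plugin, wrap it in a `Booster`, and boost the training objects. The following minimal sketch is assembled from the examples that appear elsewhere in this patch; the ResNet-18 model, SGD optimizer, and single-process launch are illustrative placeholders rather than required choices.

```python
# Minimal sketch of the plugin + Booster pattern used throughout these docs.
# Assumes the script is started with a distributed launcher,
# e.g. `colossalai run --nproc_per_node 1 train.py`.
import torch
from torchvision.models import resnet18

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin   # or GeminiPlugin / LowLevelZeroPlugin

colossalai.launch_from_torch(config=dict())

model = resnet18(num_classes=10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()

plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
# boost() returns the wrapped model, optimizer, criterion, dataloader and lr_scheduler.
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
```
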
@@ -25,24 +25,6 @@ Plugin is an important component that manages parallel configuration (eg: The ge {{ autodoc:colossalai.booster.Booster }} -{{ autodoc:colossalai.booster.Booster.boost }} - -{{ autodoc:colossalai.booster.Booster.backward }} - -{{ autodoc:colossalai.booster.Booster.no_sync }} - -{{ autodoc:colossalai.booster.Booster.save_model }} - -{{ autodoc:colossalai.booster.Booster.load_model }} - -{{ autodoc:colossalai.booster.Booster.save_optimizer }} - -{{ autodoc:colossalai.booster.Booster.load_optimizer }} - -{{ autodoc:colossalai.booster.Booster.save_lr_scheduler }} - -{{ autodoc:colossalai.booster.Booster.load_lr_scheduler }} - ## Usage In a typical workflow, you should launch distributed environment at the beginning of training script and create objects needed (such as models, optimizers, loss function, data loaders etc.) firstly, then call `colossalai.booster` to inject features into these objects, After that, you can use our booster APIs and these returned objects to continue the rest of your training processes. diff --git a/docs/source/en/basics/booster_plugins.md b/docs/source/en/basics/booster_plugins.md index 0362f095af2b..5e2586b836ad 100644 --- a/docs/source/en/basics/booster_plugins.md +++ b/docs/source/en/basics/booster_plugins.md @@ -63,6 +63,10 @@ More details can be found in [Pytorch Docs](https://pytorch.org/docs/main/genera > ⚠ This plugin is not available when torch version is lower than 1.12.0. +> ⚠ This plugin does not support save/load sharded model checkpoint now. + +> ⚠ This plugin does not support optimizer that use multi params group. + More details can be found in [Pytorch Docs](https://pytorch.org/docs/main/fsdp.html). {{ autodoc:colossalai.booster.plugin.TorchFSDPPlugin }} diff --git a/docs/source/en/basics/colotensor_concept.md b/docs/source/en/basics/colotensor_concept.md index 909c5e4d3c6f..abe470fe0794 100644 --- a/docs/source/en/basics/colotensor_concept.md +++ b/docs/source/en/basics/colotensor_concept.md @@ -2,6 +2,8 @@ Author: [Jiarui Fang](https://github.com/feifeibear), [Hongxin Liu](https://github.com/ver217) and [Haichen Huang](https://github.com/1SAA) +> ⚠️ The information on this page is outdated and will be deprecated. + **Prerequisite:** - [Colossal-AI Overview](../concepts/colossalai_overview.md) - [Distributed Training](../concepts/distributed_training.md) @@ -50,7 +52,7 @@ An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/c ## Example -Let's see an example. A ColoTensor is initialized and sharded on 8 GPUs using tp_degree=4, dp_dgree=2. And then the tensor is sharded along the last dim among the TP process groups. Finally, we reshard it along the first dim (0 dim) among the TP process groups. We encourage users to run the code and observe the shape of each tensor. +Let's see an example. A ColoTensor is initialized and sharded on 8 GPUs using tp_degree=4, dp_degree=2. And then the tensor is sharded along the last dim among the TP process groups. Finally, we reshard it along the first dim (0 dim) among the TP process groups. We encourage users to run the code and observe the shape of each tensor. ```python diff --git a/docs/source/en/basics/configure_parallelization.md b/docs/source/en/basics/configure_parallelization.md index 4ac0299eac14..fd1e72ccd45a 100644 --- a/docs/source/en/basics/configure_parallelization.md +++ b/docs/source/en/basics/configure_parallelization.md @@ -2,6 +2,8 @@ Author: Shenggui Li, Siqi Mai +> ⚠️ The information on this page is outdated and will be deprecated. 
Please check [Booster Plugins](../basics/booster_plugins.md) for more information. + **Prerequisite:** - [Distributed Training](../concepts/distributed_training.md) - [Paradigms of Parallelism](../concepts/paradigms_of_parallelism.md) diff --git a/docs/source/en/basics/define_your_config.md b/docs/source/en/basics/define_your_config.md index d2569691b7dc..048ffcacbb8f 100644 --- a/docs/source/en/basics/define_your_config.md +++ b/docs/source/en/basics/define_your_config.md @@ -2,6 +2,9 @@ Author: Guangyang Lu, Shenggui Li, Siqi Mai +> ⚠️ The information on this page is outdated and will be deprecated. Please check [Booster API](../basics/booster_api.md) for more information. + + **Prerequisite:** - [Distributed Training](../concepts/distributed_training.md) - [Colossal-AI Overview](../concepts/colossalai_overview.md) diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md index bbe32ed5a3b5..d2f99563f042 100644 --- a/docs/source/en/basics/engine_trainer.md +++ b/docs/source/en/basics/engine_trainer.md @@ -2,6 +2,8 @@ Author: Shenggui Li, Siqi Mai +> ⚠️ The information on this page is outdated and will be deprecated. Please check [Booster API](../basics/booster_api.md) for more information. + **Prerequisite:** - [Initialize Features](./initialize_features.md) diff --git a/docs/source/en/basics/initialize_features.md b/docs/source/en/basics/initialize_features.md index e768d2022ad8..b89017427476 100644 --- a/docs/source/en/basics/initialize_features.md +++ b/docs/source/en/basics/initialize_features.md @@ -2,6 +2,8 @@ Author: Shenggui Li, Siqi Mai +> ⚠️ The information on this page is outdated and will be deprecated. Please check [Booster API](../basics/booster_api.md) for more information. + **Prerequisite:** - [Distributed Training](../concepts/distributed_training.md) - [Colossal-AI Overview](../concepts/colossalai_overview.md) diff --git a/docs/source/en/basics/model_checkpoint.md b/docs/source/en/basics/model_checkpoint.md index 09d44e7c2709..70334f1c41e7 100644 --- a/docs/source/en/basics/model_checkpoint.md +++ b/docs/source/en/basics/model_checkpoint.md @@ -2,6 +2,8 @@ Author : Guangyang Lu +> ⚠️ The information on this page is outdated and will be deprecated. Please check [Booster Checkpoint](../basics/booster_checkpoint.md) for more information. + **Prerequisite:** - [Launch Colossal-AI](./launch_colossalai.md) - [Initialize Colossal-AI](./initialize_features.md) diff --git a/docs/source/en/features/3D_tensor_parallel.md b/docs/source/en/features/3D_tensor_parallel.md index b9e98eac9350..0e28f08b23c9 100644 --- a/docs/source/en/features/3D_tensor_parallel.md +++ b/docs/source/en/features/3D_tensor_parallel.md @@ -67,7 +67,7 @@ Given $P=q \times q \times q$ processors, we present the theoretical computation ## Usage -To enable 3D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallism setting as below. +To enable 3D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below. 
```python CONFIG = dict(parallel=dict( data=1, diff --git a/docs/source/en/features/cluster_utils.md b/docs/source/en/features/cluster_utils.md index 1903d64d2563..7331d5e73ae0 100644 --- a/docs/source/en/features/cluster_utils.md +++ b/docs/source/en/features/cluster_utils.md @@ -13,20 +13,4 @@ We provide a utility class `colossalai.cluster.DistCoordinator` to coordinate di {{ autodoc:colossalai.cluster.DistCoordinator }} -{{ autodoc:colossalai.cluster.DistCoordinator.is_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.is_node_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.is_last_process }} - -{{ autodoc:colossalai.cluster.DistCoordinator.print_on_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.print_on_node_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.priority_execution }} - -{{ autodoc:colossalai.cluster.DistCoordinator.destroy }} - -{{ autodoc:colossalai.cluster.DistCoordinator.block_all }} - -{{ autodoc:colossalai.cluster.DistCoordinator.on_master_only }} + diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md index ecc209fbac8d..91d89b815bf7 100644 --- a/docs/source/en/features/gradient_accumulation.md +++ b/docs/source/en/features/gradient_accumulation.md @@ -1,4 +1,4 @@ -# Gradient Accumulation +# Gradient Accumulation (Outdated) Author: Shenggui Li, Yongbin Li @@ -43,3 +43,5 @@ iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0 iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) ``` + + diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md new file mode 100644 index 000000000000..201e3bc2b643 --- /dev/null +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -0,0 +1,144 @@ +# Gradient Accumulation (Latest) + +Author: [Mingyan Jiang](https://github.com/jiangmingyan) + +**Prerequisite** +- [Define Your Configuration](../basics/define_your_config.md) +- [Training Booster](../basics/booster_api.md) + +## Introduction + +Gradient accumulation is a common way to enlarge your batch size for training. When training large-scale models, memory can easily become the bottleneck and the batch size can be very small, (e.g. 2), leading to unsatisfactory convergence. Gradient accumulation works by adding up the gradients calculated in multiple iterations, and only update the parameters in the preset iteration. + +## Usage + +It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_sync()` which returns a context manager. It accumulate gradients without synchronization, meanwhile you should not update the weights. + +## Hands-on Practice + +We now demonstrate gradient accumulation. In this example, we let the gradient accumulation size to be 4. + +### Step 1. Import libraries in train.py +Create a `train.py` and import the necessary dependencies. The version of `torch` should not be lower than 1.8.1. 
+ +```python +import os +from pathlib import Path + +import torch +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet18 +from torch.utils.data import DataLoader + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.logging import get_dist_logger +from colossalai.cluster.dist_coordinator import priority_execution +``` + +### Step 2. Initialize Distributed Environment +We then need to initialize distributed environment. For demo purpose, we uses `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md) for other initialization methods. + +```python +# initialize distributed setting +parser = colossalai.get_default_parser() +args = parser.parse_args() +# launch from torch +colossalai.launch_from_torch(config=dict()) +``` + +### Step 3. Create training components +Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. + +```python +# define the training hyperparameters +BATCH_SIZE = 128 +GRADIENT_ACCUMULATION = 4 + +# build resnet +model = resnet18(num_classes=10) + +# build dataloaders +with priority_execution(): + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) + +# build criterion +criterion = torch.nn.CrossEntropyLoss() + +# optimizer +optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) +``` + +### Step 4. Inject Feature +Create a `TorchDDPPlugin` object to instantiate a `Booster`, and boost these training components. + +```python +plugin = TorchDDPPlugin() +booster = Booster(plugin=plugin) +train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) +model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader) +``` + +### Step 5. Train with Booster +Use booster in a normal training loops, and verify gradient accumulation. `param_by_iter` is to record the distributed training information. +```python +optimizer.zero_grad() +for idx, (img, label) in enumerate(train_dataloader): + sync_context = booster.no_sync(model) + img = img.cuda() + label = label.cuda() + if idx % (GRADIENT_ACCUMULATION - 1) != 0: + with sync_context: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + else: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + optimizer.step() + optimizer.zero_grad() + + ele_1st = next(model.parameters()).flatten()[0] + param_by_iter.append(str(ele_1st.item())) + + if idx != 0 and idx % (GRADIENT_ACCUMULATION - 1) == 0: + break + + for iteration, val in enumerate(param_by_iter): + print(f'iteration {iteration} - value: {val}') + + if param_by_iter[-1] != param_by_iter[0]: + print('The parameter is only updated in the last iteration') + +``` + +### Step 6. 
Invoke Training Scripts +To verify gradient accumulation, we can just check the change of parameter values. When gradient accumulation is set, parameters are only updated in the last step. You can run the script using this command: +```shell +colossalai run --nproc_per_node 1 train.py +``` + +You will see output similar to the text below. This shows gradient is indeed accumulated as the parameter is not updated +in the first 3 steps, but only updated in the last step. + +```text +iteration 0, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) +``` + + diff --git a/docs/source/en/features/gradient_clipping.md b/docs/source/en/features/gradient_clipping.md index f606dde6c393..5a23c68e3e27 100644 --- a/docs/source/en/features/gradient_clipping.md +++ b/docs/source/en/features/gradient_clipping.md @@ -1,4 +1,4 @@ -# Gradient Clipping +# Gradient Clipping (Outdated) Author: Boxiang Wang, Haichen Huang, Yongbin Li @@ -60,3 +60,5 @@ to demonstrate gradient clipping. In this example, we set the gradient clipping ```shell python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 train_with_engine.py ``` + + diff --git a/docs/source/en/features/gradient_clipping_with_booster.md b/docs/source/en/features/gradient_clipping_with_booster.md new file mode 100644 index 000000000000..341a608a5c7b --- /dev/null +++ b/docs/source/en/features/gradient_clipping_with_booster.md @@ -0,0 +1,142 @@ +# Gradient Clipping (Latest) + +Author: [Mingyan Jiang](https://github.com/jiangmingyan) + +**Prerequisite** +- [Define Your Configuration](../basics/define_your_config.md) +- [Training Booster](../basics/booster_api.md) + +**Related Paper** +- [On the difficulty of training Recurrent Neural Networks](https://arxiv.org/abs/1211.5063) + +## Introduction + +In order to speed up training process and seek global optimum for better performance, more and more learning rate schedulers have been proposed. People turn to control learning rate to adjust descent pace during training, which makes gradient vector better to be uniformed in every step. In that case, the descent pace can be controlled as expected. As a result, gradient clipping, a technique which can normalize the gradient vector to circumscribe it in a uniformed length, becomes indispensable for those who desire their better performance of their models. + +You do not have to worry about implementing gradient clipping when using Colossal-AI, we support gradient clipping in a powerful and convenient way. All you need is just an additional command in your configuration file. + +## Why you should use gradient clipping provided by Colossal-AI + +The reason of why we do not recommend users to write gradient clipping by themselves is that naive gradient clipping may fail when applying tensor parallelism, pipeline parallelism or MoE. + +According to the illustration below, each GPU only owns a portion of parameters of the weight in a linear layer. 
To get the correct norm of the gradient of that weight, the gradient norms computed on each GPU have to be summed together. A further complication is that the bias is distributed differently from the weight, so the communication group used in the sum operation is different as well.
+
+(PS: This situation is from an old version of 2D parallelism and the implementation in the code is not the same, but it is a good example of how difficult it is to unify all communication in gradient clipping.)
+
+
+ +
Layout of parameters
+
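
To make the point about summing norms across devices concrete, here is a minimal, generic PyTorch sketch (not Colossal-AI's implementation) of how a gradient norm has to be computed when each rank holds only a shard of the parameters; the `group` argument stands in for the tensor-parallel process group and is an assumption of this sketch.

```python
import torch
import torch.distributed as dist


def sharded_grad_norm(local_params, group=None) -> torch.Tensor:
    """2-norm of gradients that are sharded across ranks.

    Each rank sums the squared norms of its own gradient shards, then the
    partial sums are all-reduced over the (tensor-parallel) process group
    before taking the square root. Assumes dist.init_process_group() has
    already been called and the parameters live on the current CUDA device.
    """
    local_sq = torch.zeros(1, device='cuda')
    for p in local_params:
        if p.grad is not None:
            local_sq += p.grad.detach().float().pow(2).sum()
    dist.all_reduce(local_sq, op=dist.ReduceOp.SUM, group=group)
    return local_sq.sqrt()
```
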
+ +Do not worry about it, since Colossal-AI have handled it for you. + +## Usage +To use gradient clipping, you can just add the following code to your configuration file, and after boosted, you can call `clip_grad_by_norm` or `clip_grad_by_value` method of optimizer, if it support clip gradients. + +## Hands-On Practice + +We now demonstrate how to use gradient clipping. In this example, we set the gradient clipping vector norm to be 1.0. + +### step 1. Import libraries in train.py +Create a `train.py` and import the necessary dependencies. + +```python +import os +from pathlib import Path + +import torch +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet34 +from tqdm import tqdm + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.logging import get_dist_logger +from colossalai.nn.lr_scheduler import CosineAnnealingLR +``` + +### Step 2. Initialize Distributed Environment +We then need to initialize distributed environment. For demo purpose, we uses `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md) +for other initialization methods. + +```python +colossalai.launch_from_torch(config=dict()) +logger = get_dist_logger() +``` + + +### Step 3. Create training components + +Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. +```python +# define training hyperparameters +NUM_EPOCHS = 200 +BATCH_SIZE = 128 +GRADIENT_CLIPPING = 0.1 +# build resnet +model = resnet34(num_classes=10) +# build dataloaders +train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) +# build criterion +criterion = torch.nn.CrossEntropyLoss() + +# optimizer +optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) + +# lr_scheduler +lr_scheduler = CosineAnnealingLR(optimizer, total_steps=NUM_EPOCHS) + +``` +### Step 4. Inject Gradient Clipping Feature + +Create a `TorchDDPPlugin` object and `Booster` object, get a data loader from plugin, then boost all training components. +```python +plugin = TorchDDPPlugin() +booster = Booster(mixed_precision='fp16', plugin=plugin) +train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) +model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model,optimizer, criterion,train_dataloader, lr_scheduler) + +``` + +### Step 5. Train with Booster +Use booster in a normal training loops. 
+```python +# verify gradient clipping +model.train() +for idx, (img, label) in enumerate(train_dataloader): + img = img.cuda() + label = label.cuda() + + model.zero_grad() + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + optimizer.clip_grad_by_norm(max_norm=GRADIENT_CLIPPING) + optimizer.step() + lr_scheduler.step() + + ele_1st = next(model.parameters()).flatten()[0] + logger.info(f'iteration {idx}, loss: {train_loss}, 1st element of parameters: {ele_1st.item()}') + + # only run for 4 iterations + if idx == 3: + break +``` + +### Step 6. Invoke Training Scripts +You can run the script using this command: + +```shell +colossalai run --nproc_per_node 1 train.py +``` + + diff --git a/docs/source/en/features/mixed_precision_training.md b/docs/source/en/features/mixed_precision_training.md index 11aa5235301a..8579d586ed5f 100644 --- a/docs/source/en/features/mixed_precision_training.md +++ b/docs/source/en/features/mixed_precision_training.md @@ -1,4 +1,4 @@ -# Auto Mixed Precision Training +# Auto Mixed Precision Training (Outdated) Author: Chuanrui Wang, Shenggui Li, Yongbin Li @@ -362,6 +362,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): Use the following command to start the training scripts. You can change `--nproc_per_node` to use a different number of GPUs. -```python +```shell python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py --config config/config_AMP_torch.py ``` + diff --git a/docs/source/en/features/mixed_precision_training_with_booster.md b/docs/source/en/features/mixed_precision_training_with_booster.md new file mode 100644 index 000000000000..e9b6f684f613 --- /dev/null +++ b/docs/source/en/features/mixed_precision_training_with_booster.md @@ -0,0 +1,251 @@ +# Auto Mixed Precision Training (Latest) + +Author: [Mingyan Jiang](https://github.com/jiangmingyan) + +**Prerequisite** +- [Define Your Configuration](../basics/define_your_config.md) +- [Training Booster](../basics/booster_api.md) + +**Related Paper** +- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794) + + +## Introduction + +AMP stands for automatic mixed precision training. +In Colossal-AI, we have incorporated different implementations of mixed precision training: + +1. torch.cuda.amp +2. apex.amp +3. naive amp + + +| Colossal-AI | support tensor parallel | support pipeline parallel | fp16 extent | +| ----------- | ----------------------- | ------------------------- | ----------- | +| AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation | +| AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 | +| AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 | + +The first two rely on the original implementation of PyTorch (version 1.6 and above) and NVIDIA Apex. +The last method is similar to Apex O2 level. +Among these methods, apex AMP is not compatible with tensor parallelism. +This is because that tensors are split across devices in tensor parallelism, thus, it is required to communicate among different processes to check if inf or nan occurs in the whole model weights. +We modified the torch amp implementation so that it is compatible with tensor parallelism now. 
+ +> ❌️ fp16 and zero are not compatible +> +> ⚠️ Pipeline only support naive AMP currently + +We recommend you to use torch AMP as it generally gives better accuracy than naive AMP if no pipeline is used. + +## Table of Contents + +In this tutorial we will cover: + +1. [AMP introduction](#amp-introduction) +2. [AMP in Colossal-AI](#amp-in-colossal-ai) +3. [Hands-on Practice](#hands-on-practice) + +## AMP Introduction + +Automatic Mixed Precision training is a mixture of FP16 and FP32 training. + +Half-precision float point format (FP16) has lower arithmetic complexity and higher compute efficiency. Besides, fp16 requires half of the storage needed by fp32 and saves memory & network bandwidth, which makes more memory available for large batch size and model size. + +However, there are other operations, like reductions, which require the dynamic range of fp32 to avoid numeric overflow/underflow. That's the reason why we introduce automatic mixed precision, attempting to match each operation to its appropriate data type, which can reduce the memory footprint and augment training efficiency. + +
+ +
Illustration of an ordinary AMP (figure from PatrickStar paper)
+
+
+## AMP in Colossal-AI
+
+We support three AMP training methods and allow the user to train with AMP with no code change. If you want to train with AMP, just assign `mixed_precision` with `fp16` when you instantiate the `Booster`. Currently the booster supports torch AMP; the other two (apex AMP, naive AMP) are still started by `colossalai.initialize`. If you need them, please refer to [this](./mixed_precision_training.md). Support for `bf16` and `fp8` will be added later.
+
+### Start with Booster
+Instantiate `Booster` with `mixed_precision="fp16"`, then you can train with torch AMP.
+
+```python
+"""
+    Mapping:
+    'fp16': torch amp
+    'fp16_apex': apex amp,
+    'bf16': bf16,
+    'fp8': fp8,
+    'fp16_naive': naive amp
+"""
+from colossalai.booster import Booster
+booster = Booster(mixed_precision='fp16',...)
+```
+
+Or you can create a `FP16TorchMixedPrecision` object, such as:
+
+```python
+from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
+mixed_precision = FP16TorchMixedPrecision(
+    init_scale=2.**16,
+    growth_factor=2.0,
+    backoff_factor=0.5,
+    growth_interval=2000)
+booster = Booster(mixed_precision=mixed_precision,...)
+```
+
+The same goes for the other AMP types.
+
+
+### Torch AMP Configuration
+
+{{ autodoc:colossalai.booster.mixed_precision.FP16TorchMixedPrecision }}
+
+### Apex AMP Configuration
+
+For this mode, we rely on the Apex implementation for mixed precision training.
+We support this plugin because it allows for finer control over the granularity of mixed precision.
+For example, O2 level (optimization level 2) will keep batch normalization in fp32.
+
+If you are looking for more details, please refer to the [Apex Documentation](https://nvidia.github.io/apex/).
+
+{{ autodoc:colossalai.booster.mixed_precision.FP16ApexMixedPrecision }}
+
+### Naive AMP Configuration
+
+In Naive AMP mode, we achieve mixed precision training while maintaining compatibility with complex tensor and pipeline parallelism.
+This AMP mode will cast all operations into fp16.
+The following code block shows the mixed precision API for this mode.
+
+{{ autodoc:colossalai.booster.mixed_precision.FP16NaiveMixedPrecision }}
+
+When using `colossalai.booster`, you are required to first instantiate a model, an optimizer and a criterion.
+The output model is converted to an AMP model with smaller memory consumption.
+If your input model is already too large to fit in a GPU, please instantiate your model weights in `dtype=torch.float16`.
+Otherwise, try smaller models or check out more parallel training techniques!
+
+
+## Hands-on Practice
+
+Now we will introduce the use of AMP with Colossal-AI. In this practice, we will use Torch AMP as an example.
+
+### Step 1. Import libraries in train.py
+
+Create a `train.py` and import the necessary dependencies. Remember to install `scipy` and `timm` by running
+`pip install timm scipy`.
+
+```python
+import os
+from pathlib import Path
+
+import torch
+from timm.models import vit_base_patch16_224
+from titans.utils import barrier_context
+from torchvision import datasets, transforms
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import TorchDDPPlugin
+from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+```
+
+### Step 2. Initialize Distributed Environment
+
+We then need to initialize the distributed environment. For demo purposes, we use `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md)
+for other initialization methods.
+
+```python
+# initialize distributed setting
+parser = colossalai.get_default_parser()
+args = parser.parse_args()
+
+# launch from torch
+colossalai.launch_from_torch(config=dict())
+
+```
+
+### Step 3. Create training components
+
+Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is
+obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])`
+to a path on your machine. Data will be automatically downloaded to the root path.
+
+```python
+# define the constants
+NUM_EPOCHS = 2
+BATCH_SIZE = 128
+
+# build model
+model = vit_base_patch16_224(drop_rate=0.1)
+
+# build dataloader
+train_dataset = datasets.Caltech101(
+    root=Path(os.environ['DATA']),
+    download=True,
+    transform=transforms.Compose([
+        transforms.Resize(256),
+        transforms.RandomResizedCrop(224),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        Gray2RGB(),
+        transforms.Normalize([0.5, 0.5, 0.5],
+                             [0.5, 0.5, 0.5])
+    ]))
+
+# build optimizer
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=0.1)
+
+# build loss
+criterion = torch.nn.CrossEntropyLoss()
+
+# lr_scheduler
+lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=NUM_EPOCHS)
+```
+
+### Step 4. Inject AMP Feature
+
+Create a `MixedPrecision` object (if needed) and a `TorchDDPPlugin` object, then call `booster.boost` to convert the training components to run with FP16.
+
+```python
+plugin = TorchDDPPlugin()
+train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
+booster = Booster(mixed_precision='fp16', plugin=plugin)
+
+# if you need to customize the config, do like this
+# >>> from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
+# >>> mixed_precision = FP16TorchMixedPrecision(
+# >>>     init_scale=2.**16,
+# >>>     growth_factor=2.0,
+# >>>     backoff_factor=0.5,
+# >>>     growth_interval=2000)
+# >>> plugin = TorchDDPPlugin()
+# >>> booster = Booster(mixed_precision=mixed_precision, plugin=plugin)
+
+# boost model, optimizer, criterion, dataloader, lr_scheduler
+model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model, optimizer, criterion, train_dataloader, lr_scheduler)
+```
+
+### Step 5. Train with Booster
+
+Use the booster in a normal training loop.
+
+```python
+model.train()
+for epoch in range(NUM_EPOCHS):
+    for img, label in train_dataloader:
+        img = img.cuda()
+        label = label.cuda()
+        optimizer.zero_grad()
+        output = model(img)
+        loss = criterion(output, label)
+        booster.backward(loss, optimizer)
+        optimizer.step()
+        lr_scheduler.step()
+```
+
+### Step 6. Invoke Training Scripts
+
+Use the following command to start the training scripts. You can change `--nproc_per_node` to use a different number of GPUs.
+
+```shell
+colossalai run --nproc_per_node 1 train.py
+```
+
diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md
index 4374da3c9c45..6ed6f2dee5d6 100644
--- a/docs/source/en/features/nvme_offload.md
+++ b/docs/source/en/features/nvme_offload.md
@@ -53,7 +53,7 @@ It's compatible with all parallel methods in ColossalAI.
 
 > ⚠ It only offloads optimizer states on CPU. This means it only affects CPU training or Zero/Gemini with offloading.
 
-## Exampls
+## Examples
 
 Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`.
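
As a quick reference before the hunks below: in these GPT examples, NVMe offloading is configured entirely on the optimizer rather than on the plugin. The following is a minimal sketch; the argument names are assumptions based on the `HybridAdam` NVMe offload options shown elsewhere in this patch, `model` is assumed to be built already, and the offload directory is a placeholder path that should live on an NVMe drive.

```python
from colossalai.nn.optimizer import HybridAdam

# offload all CPU-resident optimizer states to NVMe under ./offload (placeholder path)
optimizer = HybridAdam(model.parameters(), lr=1e-3,
                       nvme_offload_fraction=1.0,
                       nvme_offload_dir='./offload')
```

Setting `nvme_offload_fraction=0.0` keeps the optimizer states in CPU memory, while `1.0` offloads all of them to NVMe.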
@@ -78,8 +78,9 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel import colossalai from colossalai.nn.optimizer import HybridAdam -from colossalai.zero import zero_model_wrapper, zero_optim_wrapper from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin ``` Then we define a loss function: @@ -192,17 +193,23 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0): optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction) print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B') - gemini_config = dict(strict_ddp_mode=True, device=torch.cuda.current_device(), - placement_policy='cpu', pin_memory=True, hidden_dim=config.n_embd) - model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config) - optimizer = zero_optim_wrapper(model, optimizer, initial_scale=2**5) + plugin = GeminiPlugin( + strict_ddp_mode=True, + device=torch.cuda.current_device(), + placement_policy='cpu', + pin_memory=True, + hidden_dim=config.n_embd, + initial_scale=2**5 + ) + booster = Booster(plugin) + model, optimizer, criterion, _* = booster.boost(model, optimizer, criterion) start = time.time() for step in range(3): data = get_data(4, 128, config.vocab_size) outputs = model(**data) loss = criterion(outputs.logits, data['input_ids']) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() optimizer.zero_grad() print(f'[{step}] loss: {loss.item():.3f}') diff --git a/docs/source/en/features/pipeline_parallel.md b/docs/source/en/features/pipeline_parallel.md index ac49863b3c71..30654b0b0195 100644 --- a/docs/source/en/features/pipeline_parallel.md +++ b/docs/source/en/features/pipeline_parallel.md @@ -156,4 +156,4 @@ trainer.fit(train_dataloader=train_dataloader, display_progress=True) ``` -We use `2` pipeline stages and the batch will be splitted into `4` micro batches. +We use `2` pipeline stages and the batch will be split into `4` micro batches. diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index a105831a5409..d6f6f611a64c 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -3,7 +3,7 @@ Author: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY) **Prerequisite:** -- [Define Your Configuration](../basics/define_your_config.md) +- [Train with booster](../basics/booster_api.md) **Example Code** @@ -72,7 +72,7 @@ chunk_manager = init_chunk_manager(model=module, gemini_manager = GeminiManager(placement_policy, chunk_manager) ``` -`hidden_dim` is the hidden dimension of DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. `min_chunk_size_mb` is the the minimum chunk size in MegaByte. If the aggregate size of parameters is still samller than the minimum chunk size, all parameters will be compacted into one small chunk. +`hidden_dim` is the hidden dimension of DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. `min_chunk_size_mb` is the the minimum chunk size in MegaByte. If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk. Initialization of the optimizer. 
```python @@ -97,6 +97,7 @@ For simplicity, we just use randomly generated data here. First we only need to import `GPT2LMHeadModel` from `Huggingface transformers` to define our model, which does not require users to define or modify the model, so that users can use it more conveniently. +Define a GPT model: ```python class GPTLMModel(nn.Module): @@ -182,34 +183,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): split_param_single_dim_tp1d(-1, param, pg) ``` -Define a model which uses Gemini + ZeRO DDP: - -```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): - cai_version = colossalai.__version__ - if version.parse(cai_version) > version.parse("0.1.10"): - from colossalai.nn.parallel import GeminiDDP - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placememt_policy, - pin_memory=True, - search_range_mb=32) - elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): - from colossalai.gemini import ChunkManager, GeminiManager - chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) - chunk_manager = ChunkManager(chunk_size, - pg, - enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) - model = ZeroDDP(model, gemini_manager) - else: - raise NotImplemented(f"CAI version {cai_version} is not supported") - return model -``` - -As we pre-train GPT in this example, we just use a simple language model loss. - Write a function to get random inputs: ```python @@ -219,9 +192,15 @@ def get_data(batch_size, seq_len, vocab_size): return input_ids, attention_mask ``` -Finally, we can define our training loop: +Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss: ```python +from torch.optim import Adam + +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin + def main(): args = parse_args() BATCH_SIZE = 8 @@ -232,22 +211,23 @@ def main(): # build criterion criterion = GPTLMLoss() + optimizer = Adam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) - default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None + default_dist_spec = ShardSpec([-1], [args.tp_degree]) # build GPT model with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg): model = gpt2_medium(checkpoint=True) pg = default_pg # Tensor Parallelism (TP) tensor_parallelize(model, pg) + # Gemini + ZeRO DP, Note it must be used after TP - model = gemini_zero_dpp(model, pg, args.placement) - # build optimizer - optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5) - numel = sum([p.numel() for p in model.parameters()]) - get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) + plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + torch.cuda.synchronize() model.train() for n in range(NUM_STEPS): @@ -256,10 +236,12 @@ def main(): optimizer.zero_grad() outputs = model(input_ids, attn_mask) loss = criterion(outputs, input_ids) - optimizer.backward(loss) + booster.backward(loss, optimizer) 
optimizer.step() torch.cuda.synchronize() ``` > ⚠️ Note: If you want to use the Gemini module, please do not use the [Gradient Accumulation](../features/gradient_accumulation.md) we mentioned before。 The complete example can be found on [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt). + + diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md index b626edb19e8e..6fc4ce2c922a 100644 --- a/docs/source/en/get_started/installation.md +++ b/docs/source/en/get_started/installation.md @@ -48,5 +48,20 @@ If you don't want to install and enable CUDA kernel fusion (compulsory installat pip install . ``` +For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory. + +```bash +# clone the repository +git clone https://github.com/hpcaitech/ColossalAI.git +cd ColossalAI + +# download the cub library +wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip +unzip 1.8.0.zip +cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + +# install +CUDA_EXT=1 pip install . +``` diff --git a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md index 4825a6fa1d6c..059eb014affd 100644 --- a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md +++ b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md @@ -48,7 +48,7 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管 world_size: int, config: Config, data_parallel_size: int, - pipeline_parlalel_size: int, + pipeline_parallel_size: int, tensor_parallel_size: int, arg1, arg2): diff --git a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index 456878caa147..276fcc2619e0 100644 --- a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -122,7 +122,7 @@ Inside the initialization of Experts, the local expert number of each GPU will b ## Train Your Model -Do not to forget to use `colossalai.initialize` function in `colosalai` to add gradient handler for the engine. +Do not to forget to use `colossalai.initialize` function in `colossalai` to add gradient handler for the engine. We handle the back-propagation of MoE models for you. In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients. You can find more information about the handler `MoeGradientHandler` in colossal directory. diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index 2bf0a9c98c3f..594823862de1 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -8,21 +8,21 @@ ## 用法 -目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单,在训练策略的配置文件里设置zero的model_config属性tensor_placement_policy='auto' - -``` -zero = dict( - model_config=dict( - reduce_scatter_bucket_size_mb=25, - fp32_reduce_scatter=False, - gradient_predivide_factor=1.0, - tensor_placement_policy="auto", - shard_strategy=TensorShardStrategy(), - ... - ), - optimizer_config=dict( - ... 
- ) +目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单:使用booster将`GeminiPlugin`中的特性注入到训练组件中。更多`booster`介绍请参考[booster使用](../basics/booster_api.md)。 + +```python +from torchvision.models import resnet18 +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin +plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) +booster = Booster(plugin=plugin) +ctx = ColoInitContext() +with ctx: + model = resnet18() +optimizer = HybridAdam(model.parameters(), lr=1e-3) +criterion = lambda x: x.mean() +model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) ) ``` @@ -48,7 +48,7 @@ zero = dict(
-ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefuleTensorMgr(STM)。 +ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefulTensorMgr(STM)。 我们利用了深度学习网络训练过程的迭代特性。我们将迭代分为warmup和non-warmup两个阶段,开始时的一个或若干迭代步属于预热阶段,其余的迭代步属于正式阶段。在warmup阶段我们为MSC收集信息,而在non-warmup阶段STM入去MSC收集的信息来移动tensor,以达到最小化CPU-GPU数据移动volume的目的。 @@ -75,7 +75,7 @@ STM管理所有model data tensor的信息。在模型的构造过程中,Coloss 我们在算子的开始和结束计算时,触发内存采样操作,我们称这个时间点为**采样时刻(sampling moment)**,两个采样时刻之间的时间我们称为**period**。计算过程是一个黑盒,由于可能分配临时buffer,内存使用情况很复杂。但是,我们可以较准确的获取period的系统最大内存使用。非模型数据的使用可以通过两个统计时刻之间系统最大内存使用-模型内存使用获得。 -我们如何设计采样时刻呢。我们选择preOp的model data layout adjust之前。如下图所示。我们采样获得上一个period的system memory used,和下一个period的model data memoy used。并行策略会给MSC的工作造成障碍。如图所示,比如对于ZeRO或者Tensor Parallel,由于Op计算前需要gather模型数据,会带来额外的内存需求。因此,我们要求在模型数据变化前进行采样系统内存,这样在一个period内,MSC会把preOp的模型变化内存捕捉。比如在period 2-3内,我们考虑的tensor gather和shard带来的内存变化。 +我们如何设计采样时刻呢。我们选择preOp的model data layout adjust之前。如下图所示。我们采样获得上一个period的system memory used,和下一个period的model data memory used。并行策略会给MSC的工作造成障碍。如图所示,比如对于ZeRO或者Tensor Parallel,由于Op计算前需要gather模型数据,会带来额外的内存需求。因此,我们要求在模型数据变化前进行采样系统内存,这样在一个period内,MSC会把preOp的模型变化内存捕捉。比如在period 2-3内,我们考虑的tensor gather和shard带来的内存变化。 尽管可以将采样时刻放在其他位置,比如排除gather buffer的变动新信息,但是会给造成麻烦。不同并行方式Op的实现有差异,比如对于Linear Op,Tensor Parallel中gather buffer的分配在Op中。而对于ZeRO,gather buffer的分配是在PreOp中。将放在PreOp开始时采样有利于将两种情况统一。 @@ -94,3 +94,5 @@ MSC的重要职责是在调整tensor layout位置,比如在上图S2时刻, 在non-warmup阶段,我们需要利用预热阶段采集的非模型数据内存信息,预留出下一个Period在计算设备上需要的峰值内存,这需要我们移动出一些模型张量。 为了避免频繁在CPU-GPU换入换出相同的tensor,引起类似[cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science))的现象。我们利用DNN训练迭代特性,设计了OPT cache换出策略。具体来说,在warmup阶段,我们记录每个tensor被计算设备需要的采样时刻。如果我们需要驱逐一些HOLD tensor,那么我们选择在本设备上最晚被需要的tensor作为受害者。 + + diff --git a/docs/source/zh-Hans/advanced_tutorials/opt_service.md b/docs/source/zh-Hans/advanced_tutorials/opt_service.md index a213584fd41d..1f8324a53ecb 100644 --- a/docs/source/zh-Hans/advanced_tutorials/opt_service.md +++ b/docs/source/zh-Hans/advanced_tutorials/opt_service.md @@ -52,7 +52,7 @@ export CHECKPOINT_DIR="your_opt_checkpoint_path" # the ${CONFIG_DIR} must contain a server.sh file as the entry of service export CONFIG_DIR="config_file_path" -docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:lastest +docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:latest ``` 接下来,您就可以在您的浏览器中打开 `https://[IP-ADDRESS]:8020/docs#` 进行测试。 diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md index f3c6247c38e4..c4131e593437 100644 --- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -159,11 +159,11 @@ for mn, module in model.named_modules(): 在我们最新示例中还定义了一个Gemini + ZeRO DDP 的模型从而减小开销,提升效率。这一部分的详细内容可以参考[ZeRO](../features/zero_with_chunk.md),你可以将这两部分内容结合起来看从而理解我们整个训练流程: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, 
device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) return model diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 6dc5eccf4421..e2f2c90a3791 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -477,7 +477,7 @@ def build_cifar(batch_size): return train_dataloader, test_dataloader -# craete dataloaders +# create dataloaders train_dataloader , test_dataloader = build_cifar() # create loss function criterion = CrossEntropyLoss(label_smoothing=0.1) @@ -492,7 +492,7 @@ lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, #### 启动 Colossal-AI 引擎 ```python -# intiailize +# initialize engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, diff --git a/docs/source/zh-Hans/basics/booster_api.md b/docs/source/zh-Hans/basics/booster_api.md index 5410cc213fd2..1bb5fd69bd15 100644 --- a/docs/source/zh-Hans/basics/booster_api.md +++ b/docs/source/zh-Hans/basics/booster_api.md @@ -25,24 +25,6 @@ Booster插件是管理并行配置的重要组件(eg:gemini插件封装了ge {{ autodoc:colossalai.booster.Booster }} -{{ autodoc:colossalai.booster.Booster.boost }} - -{{ autodoc:colossalai.booster.Booster.backward }} - -{{ autodoc:colossalai.booster.Booster.no_sync }} - -{{ autodoc:colossalai.booster.Booster.save_model }} - -{{ autodoc:colossalai.booster.Booster.load_model }} - -{{ autodoc:colossalai.booster.Booster.save_optimizer }} - -{{ autodoc:colossalai.booster.Booster.load_optimizer }} - -{{ autodoc:colossalai.booster.Booster.save_lr_scheduler }} - -{{ autodoc:colossalai.booster.Booster.load_lr_scheduler }} - ## 使用方法及示例 在使用colossalai训练时,首先需要在训练脚本的开头启动分布式环境,并创建需要使用的模型、优化器、损失函数、数据加载器等对象。之后,调用`colossalai.booster` 将特征注入到这些对象中,您就可以使用我们的booster API去进行您接下来的训练流程。 diff --git a/docs/source/zh-Hans/basics/booster_plugins.md b/docs/source/zh-Hans/basics/booster_plugins.md index b15ceb1e3ad5..5bd88b679000 100644 --- a/docs/source/zh-Hans/basics/booster_plugins.md +++ b/docs/source/zh-Hans/basics/booster_plugins.md @@ -63,6 +63,10 @@ Zero-2 不支持局部梯度累积。如果您坚持使用,虽然可以积累 > ⚠ 如果 torch 版本低于 1.12.0,此插件将不可用。 +> ⚠ 该插件现在还不支持保存/加载分片的模型 checkpoint。 + +> ⚠ 该插件现在还不支持使用了multi params group的optimizer。 + 更多详细信息,请参阅 [Pytorch 文档](https://pytorch.org/docs/main/fsdp.html). {{ autodoc:colossalai.booster.plugin.TorchFSDPPlugin }} diff --git a/docs/source/zh-Hans/basics/colotensor_concept.md b/docs/source/zh-Hans/basics/colotensor_concept.md index d6a332df2e9c..ab2413e990f7 100644 --- a/docs/source/zh-Hans/basics/colotensor_concept.md +++ b/docs/source/zh-Hans/basics/colotensor_concept.md @@ -2,6 +2,8 @@ Author: [Jiarui Fang](https://github.com/feifeibear), [Hongxin Liu](https://github.com/ver217) and [Haichen Huang](https://github.com/1SAA) +> ⚠️ 此页面上的信息已经过时并将被废弃。 + **Prerequisite:** - [Colossal-AI Overview](../concepts/colossalai_overview.md) - [Distributed Training](../concepts/distributed_training.md) @@ -51,7 +53,7 @@ ColoTensor 包含额外的属性[ColoTensorSpec](https://colossalai.readthedocs. 
## Example -让我们看一个例子。 使用 tp_degree=4, dp_dgree=2 在 8 个 GPU 上初始化并Shard一个ColoTensor。 然后tensor被沿着 TP 进程组中的最后一个维度进行分片。 最后,我们沿着 TP 进程组中的第一个维度(dim 0)对其进行重新Shard。 我们鼓励用户运行代码并观察每个张量的形状。 +让我们看一个例子。 使用 tp_degree=4, dp_degree=2 在 8 个 GPU 上初始化并Shard一个ColoTensor。 然后tensor被沿着 TP 进程组中的最后一个维度进行分片。 最后,我们沿着 TP 进程组中的第一个维度(dim 0)对其进行重新Shard。 我们鼓励用户运行代码并观察每个张量的形状。 ```python diff --git a/docs/source/zh-Hans/basics/configure_parallelization.md b/docs/source/zh-Hans/basics/configure_parallelization.md index eb4b38f48ddb..0c2a66572d60 100644 --- a/docs/source/zh-Hans/basics/configure_parallelization.md +++ b/docs/source/zh-Hans/basics/configure_parallelization.md @@ -2,6 +2,8 @@ 作者: Shenggui Li, Siqi Mai +> ⚠️ 此页面上的信息已经过时并将被废弃。请在[Booster插件](../basics/booster_plugins.md)页面查阅更新。 + **预备知识:** - [分布式训练](../concepts/distributed_training.md) - [并行技术](../concepts/paradigms_of_parallelism.md) diff --git a/docs/source/zh-Hans/basics/define_your_config.md b/docs/source/zh-Hans/basics/define_your_config.md index d7e49cbf23de..720e75805e8d 100644 --- a/docs/source/zh-Hans/basics/define_your_config.md +++ b/docs/source/zh-Hans/basics/define_your_config.md @@ -2,6 +2,8 @@ 作者: Guangyang Lu, Shenggui Li, Siqi Mai +> ⚠️ 此页面上的信息已经过时并将被废弃。请在[Booster API](../basics/booster_api.md)页面查阅更新。 + **预备知识:** - [分布式训练](../concepts/distributed_training.md) - [Colossal-AI 总览](../concepts/colossalai_overview.md) diff --git a/docs/source/zh-Hans/basics/engine_trainer.md b/docs/source/zh-Hans/basics/engine_trainer.md index a7519bfca14f..a35bd87c44e1 100644 --- a/docs/source/zh-Hans/basics/engine_trainer.md +++ b/docs/source/zh-Hans/basics/engine_trainer.md @@ -2,6 +2,8 @@ 作者: Shenggui Li, Siqi Mai +> ⚠️ 此页面上的信息已经过时并将被废弃。请在[Booster API](../basics/booster_api.md)页面查阅更新。 + **预备知识:** - [初始化功能](./initialize_features.md) diff --git a/docs/source/zh-Hans/basics/initialize_features.md b/docs/source/zh-Hans/basics/initialize_features.md index 67ea114b42b2..1c28d658e1bc 100644 --- a/docs/source/zh-Hans/basics/initialize_features.md +++ b/docs/source/zh-Hans/basics/initialize_features.md @@ -2,6 +2,8 @@ 作者: Shenggui Li, Siqi Mai +> ⚠️ 此页面上的信息已经过时并将被废弃。请在[Booster API](../basics/booster_api.md)页面查阅更新。 + **预备知识:** - [分布式训练](../concepts/distributed_training.md) - [Colossal-AI 总览](../concepts/colossalai_overview.md) diff --git a/docs/source/zh-Hans/basics/model_checkpoint.md b/docs/source/zh-Hans/basics/model_checkpoint.md index cec12d451989..a5374b7509c9 100644 --- a/docs/source/zh-Hans/basics/model_checkpoint.md +++ b/docs/source/zh-Hans/basics/model_checkpoint.md @@ -1,7 +1,9 @@ -# 模型检查点 +# 模型Checkpoint 作者 : Guangyang Lu +> ⚠️ 此页面上的信息已经过时并将被废弃。请在[Booster Checkpoint](../basics/booster_checkpoint.md)页面查阅更新。 + **预备知识:** - [Launch Colossal-AI](./launch_colossalai.md) - [Initialize Colossal-AI](./initialize_features.md) @@ -13,9 +15,9 @@ ## 简介 -本教程将介绍如何保存和加载模型检查点。 +本教程将介绍如何保存和加载模型Checkpoint。 -为了充分利用Colossal-AI的强大并行策略,我们需要修改模型和张量,可以直接使用 `torch.save` 或者 `torch.load` 保存或加载模型检查点。在Colossal-AI中,我们提供了应用程序接口实现上述同样的效果。 +为了充分利用Colossal-AI的强大并行策略,我们需要修改模型和张量,可以直接使用 `torch.save` 或者 `torch.load` 保存或加载模型Checkpoint。在Colossal-AI中,我们提供了应用程序接口实现上述同样的效果。 但是,在加载时,你不需要使用与存储相同的保存策略。 @@ -24,7 +26,7 @@ ### 保存 有两种方法可以使用Colossal-AI训练模型,即使用engine或使用trainer。 -**注意我们只保存 `state_dict`.** 因此,在加载检查点时,需要首先定义模型。 +**注意我们只保存 `state_dict`.** 因此,在加载Checkpoint时,需要首先定义模型。 #### 同 engine 保存 diff --git a/docs/source/zh-Hans/features/cluster_utils.md b/docs/source/zh-Hans/features/cluster_utils.md index ca787a869041..f54a72c63a66 100644 --- a/docs/source/zh-Hans/features/cluster_utils.md +++ 
b/docs/source/zh-Hans/features/cluster_utils.md @@ -13,20 +13,4 @@ {{ autodoc:colossalai.cluster.DistCoordinator }} -{{ autodoc:colossalai.cluster.DistCoordinator.is_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.is_node_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.is_last_process }} - -{{ autodoc:colossalai.cluster.DistCoordinator.print_on_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.print_on_node_master }} - -{{ autodoc:colossalai.cluster.DistCoordinator.priority_execution }} - -{{ autodoc:colossalai.cluster.DistCoordinator.destroy }} - -{{ autodoc:colossalai.cluster.DistCoordinator.block_all }} - -{{ autodoc:colossalai.cluster.DistCoordinator.on_master_only }} + diff --git a/docs/source/zh-Hans/features/gradient_accumulation.md b/docs/source/zh-Hans/features/gradient_accumulation.md index e21e5fcd43d8..fc8b29bbe8f1 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation.md +++ b/docs/source/zh-Hans/features/gradient_accumulation.md @@ -1,4 +1,4 @@ -# 梯度累积 +# 梯度累积 (旧版本) 作者: Shenggui Li, Yongbin Li @@ -38,3 +38,4 @@ iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0 iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) ``` + diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md new file mode 100644 index 000000000000..a8422060f0ea --- /dev/null +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -0,0 +1,146 @@ +# 梯度累积 (新版本) + +作者: [Mingyan Jiang](https://github.com/jiangmingyan) + +**前置教程** +- [定义配置文件](../basics/define_your_config.md) +- [训练中使用Booster](../basics/booster_api.md) + +## 引言 + +梯度累积是一种常见的增大训练 batch size 的方式。 在训练大模型时,内存经常会成为瓶颈,并且 batch size 通常会很小(如2),这导致收敛性无法保证。梯度累积将多次迭代的梯度累加,并仅在达到预设迭代次数时更新参数。 + +## 使用 + +在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个上下文管理器,在该上下文管理器下取消同步并且累积梯度。 + +## 实例 + +我们将介绍如何使用梯度累积。在这个例子中,梯度累积次数被设置为4。 + +### 步骤 1. 在 train.py 导入相关库 +创建train.py并导入必要依赖。 `torch` 的版本应不低于1.8.1。 + +```python +import os +from pathlib import Path + +import torch +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet18 + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.logging import get_dist_logger +from colossalai.cluster.dist_coordinator import priority_execution +``` + +### 步骤 2. 初始化分布式环境 + +我们需要初始化分布式环境。为了快速演示,我们使用`launch_from_torch`。你可以参考 [Launch Colossal-AI](../basics/launch_colossalai.md)使用其他初始化方法。 + +```python +# initialize distributed setting +parser = colossalai.get_default_parser() +args = parser.parse_args() + +# launch from torch +colossalai.launch_from_torch(config=dict()) + +``` + +### 步骤 3. 
创建训练组件 + +构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 或 `Path(os.environ['DATA'])`,在你的机器上设置路径。数据将会被自动下载到该路径。 + +```python +# define the training hyperparameters +BATCH_SIZE = 128 +GRADIENT_ACCUMULATION = 4 + +# build resnet +model = resnet18(num_classes=10) + +# build dataloaders +with priority_execution(): + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) + +# build criterion +criterion = torch.nn.CrossEntropyLoss() + +# optimizer +optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) +``` + +### 步骤 4. 注入特性 +创建一个`TorchDDPPlugin`对象,并作为参实例化`Booster`, 调用`booster.boost`注入特性。 + +```python +plugin = TorchDDPPlugin() +booster = Booster(plugin=plugin) +train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) +model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader) +``` + +### 步骤 5. 使用booster训练 +使用booster构建一个普通的训练循环,验证梯度累积。 `param_by_iter` 记录分布训练的信息。 +```python +optimizer.zero_grad() +for idx, (img, label) in enumerate(train_dataloader): + sync_context = booster.no_sync(model) + img = img.cuda() + label = label.cuda() + if idx % (GRADIENT_ACCUMULATION - 1) != 0: + with sync_context: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + else: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + optimizer.step() + optimizer.zero_grad() + + ele_1st = next(model.parameters()).flatten()[0] + param_by_iter.append(str(ele_1st.item())) + + if idx != 0 and idx % (GRADIENT_ACCUMULATION - 1) == 0: + break + + for iteration, val in enumerate(param_by_iter): + print(f'iteration {iteration} - value: {val}') + + if param_by_iter[-1] != param_by_iter[0]: + print('The parameter is only updated in the last iteration') + +``` + +### 步骤 6. 
启动训练脚本 +为了验证梯度累积,我们可以只检查参数值的变化。当设置梯度累加时,仅在最后一步更新参数。您可以使用以下命令运行脚本: +```shell +colossalai run --nproc_per_node 1 train.py +``` + +你将会看到类似下方的文本输出。这展现了梯度虽然在前3个迭代中被计算,但直到最后一次迭代,参数才被更新。 + +```text +iteration 0, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) +``` + + diff --git a/docs/source/zh-Hans/features/gradient_clipping.md b/docs/source/zh-Hans/features/gradient_clipping.md index 203f66a3fea2..2f62c31766a6 100644 --- a/docs/source/zh-Hans/features/gradient_clipping.md +++ b/docs/source/zh-Hans/features/gradient_clipping.md @@ -1,4 +1,4 @@ -# 梯度裁剪 +# 梯度裁剪(旧版本) 作者: Boxiang Wang, Haichen Huang, Yongbin Li @@ -49,3 +49,5 @@ clip_grad_norm = 1.0 ```shell python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 train_with_engine.py ``` + + diff --git a/docs/source/zh-Hans/features/gradient_clipping_with_booster.md b/docs/source/zh-Hans/features/gradient_clipping_with_booster.md new file mode 100644 index 000000000000..3c61356dd0d5 --- /dev/null +++ b/docs/source/zh-Hans/features/gradient_clipping_with_booster.md @@ -0,0 +1,140 @@ +# 梯度裁剪 (新版本) + +作者: [Mingyan Jiang](https://github.com/jiangmingyan) + +**前置教程** +- [定义配置文件](../basics/define_your_config.md) +- [booster使用](../basics/booster_api.md) + +**相关论文** +- [On the difficulty of training Recurrent Neural Networks](https://arxiv.org/abs/1211.5063) + +## 引言 + +为了加快训练过程和寻求全局最优以获得更好的性能,越来越多的学习率调度器被提出。人们通过控制学习率来调整训练中的下降速度。这使得梯度向量在每一步都能更好地统一。在这种情况下,下降速度可以按预期被控制。 +因此,梯度裁剪,一种可以将梯度向量归一化,以将其限制在统一长度的技术,对于那些希望模型性能更好的人来说是不可或缺的。 + +在使用 Colossal-AI 时,你不必担心实现梯度剪裁,我们以一种有效而方便的方式支持梯度剪裁。你所需要的只是在你的配置文件中增加一个命令。 + +## 为什么应该使用 Colossal-AI 中的梯度裁剪 + +我们不建议用户自己编写梯度剪裁,因为朴素的梯度剪裁在应用张量并行、流水线并行、MoE 等功能时可能会失败。 + +根据下图,每个 GPU 只拥有线性层中权重的一部分参数。为了得到线性层权重的梯度向量的正确范数,每个 GPU 中的每个梯度向量的范数应该相加。更复杂的是,偏置的分布不同于权重的分布。通信组在求和运算中有所不同。 + +(注: 这种情况是旧版本的 2D 并行,在代码中的实现是不一样的。但这是一个很好的例子,能够说明在梯度剪裁中统一所有通信的困难。) + +
+ +
参数分布
+
+ +不用担心它,因为 Colossal-AI 已经为你处理好。 + +### 使用 +要使用梯度裁剪,只需在使用booster注入特性之后,调用optimizer的`clip_grad_by_norm`或者`clip_grad_by_value`函数即可进行梯度裁剪。 + +### 实例 + +下面我们将介绍如何使用梯度裁剪,在本例中,我们将梯度裁剪范数设置为1.0。 + +### 步骤 1. 在训练中导入相关库 +创建`train.py`并导入相关库。 + +```python +import os +from pathlib import Path + +import torch +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet34 +from tqdm import tqdm + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.logging import get_dist_logger +from colossalai.nn.lr_scheduler import CosineAnnealingLR +``` + +### 步骤 2. 初始化分布式环境 +我们需要初始化分布式环境. 为了快速演示,我们使用`launch_from_torch`. 您可以参考 [Launch Colossal-AI](../basics/launch_colossalai.md) + +```python +colossalai.launch_from_torch(config=dict()) +logger = get_dist_logger() +``` + +### 步骤 3. 创建训练组件 + +构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 或 `Path(os.environ['DATA'])`在你的机器上设置路径。数据将会被自动下载到该路径。 +```python +# define training hyperparameters +NUM_EPOCHS = 200 +BATCH_SIZE = 128 +GRADIENT_CLIPPING = 0.1 +# build resnet +model = resnet34(num_classes=10) +# build dataloaders +train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) +# build criterion +criterion = torch.nn.CrossEntropyLoss() + +# optimizer +optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) + +# lr_scheduler +lr_scheduler = CosineAnnealingLR(optimizer, total_steps=NUM_EPOCHS) + +``` +### 步骤 4. 注入梯度裁剪特性 + +创建`TorchDDPPlugin`对象并初始化`Booster`, 使用booster注入相关特性。 +```python +plugin = TorchDDPPlugin() +booster = Booster(plugin=plugin) +train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) +model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model,optimizer, criterion,train_dataloader, lr_scheduler) + +``` + +### 步骤 5. 使用booster训练 +使用booster进行训练。 +```python +# verify gradient clipping +model.train() +for idx, (img, label) in enumerate(train_dataloader): + img = img.cuda() + label = label.cuda() + + model.zero_grad() + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + optimizer.clip_grad_by_norm(max_norm=GRADIENT_CLIPPING) + optimizer.step() + lr_scheduler.step() + + ele_1st = next(model.parameters()).flatten()[0] + logger.info(f'iteration {idx}, loss: {train_loss}, 1st element of parameters: {ele_1st.item()}') + + # only run for 4 iterations + if idx == 3: + break +``` + +### 步骤 6. 
启动训练脚本 +你可以使用以下命令运行脚本: + +```shell +colossalai run --nproc_per_node 1 train.py +``` + diff --git a/docs/source/zh-Hans/features/mixed_precision_training.md b/docs/source/zh-Hans/features/mixed_precision_training.md index c9db3a59c1c3..4628b09cd910 100644 --- a/docs/source/zh-Hans/features/mixed_precision_training.md +++ b/docs/source/zh-Hans/features/mixed_precision_training.md @@ -1,4 +1,4 @@ -# 自动混合精度训练 (AMP) +# 自动混合精度训练 (旧版本) 作者: Chuanrui Wang, Shenggui Li, Yongbin Li @@ -203,7 +203,7 @@ Naive AMP 的默认参数: - initial_scale(int): gradient scaler 的初始值 - growth_factor(int): loss scale 的增长率 - backoff_factor(float): loss scale 的下降率 -- hysterisis(int): 动态 loss scaling 的延迟偏移 +- hysteresis(int): 动态 loss scaling 的延迟偏移 - max_scale(int): loss scale 的最大允许值 - verbose(bool): 如果被设为`True`,将打印调试信息 @@ -339,6 +339,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): 使用下列命令启动训练脚本,你可以改变 `--nproc_per_node` 以使用不同数量的 GPU。 -```python +```shell python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py --config config/config_AMP_torch.py ``` + diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md new file mode 100644 index 000000000000..187aef1a6c4a --- /dev/null +++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md @@ -0,0 +1,235 @@ +# 自动混合精度训练 (新版本) + +作者: [Mingyan Jiang](https://github.com/jiangmingyan) + +**前置教程** +- [定义配置文件](../basics/define_your_config.md) +- [booster使用](../basics/booster_api.md) + +**相关论文** +- [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794) + + +## 引言 + +AMP 代表自动混合精度训练。 +在 Colossal-AI 中, 我们结合了混合精度训练的不同实现: + +1. torch.cuda.amp +2. apex.amp +3. naive amp + + +| Colossal-AI | 支持张量并行 | 支持流水并行 | fp16范围 | +| ----------- | ----------------------- | ------------------------- | ----------- | +| AMP_TYPE.TORCH | ✅ | ❌ | 在前向和反向传播期间,模型参数、激活和梯度向下转换至fp16 | +| AMP_TYPE.APEX | ❌ | ❌ | 更细粒度,我们可以选择 opt_level O0, O1, O2, O3 | +| AMP_TYPE.NAIVE | ✅ | ✅ | 模型参数、前向和反向操作,全都向下转换至fp16 | + +前两个依赖于 PyTorch (1.6及以上) 和 NVIDIA Apex 的原始实现。最后一种方法类似 Apex O2。在这些方法中,Apex-AMP 与张量并行不兼容。这是因为张量是以张量并行的方式在设备之间拆分的,因此,需要在不同的进程之间进行通信,以检查整个模型权重中是否出现inf或nan。我们修改了torch amp实现,使其现在与张量并行兼容。 + +> ❌️ fp16与ZeRO不兼容 +> +> ⚠️ 流水并行目前仅支持naive amp + +我们建议使用 torch AMP,因为在不使用流水并行时,它通常比 NVIDIA AMP 提供更好的准确性。 + +## 目录 + +在本教程中,我们将介绍: + +1. [AMP 介绍](#amp-介绍) +2. [Colossal-AI 中的 AMP](#colossal-ai-中的-amp) +3. [练习实例](#实例) + +## AMP 介绍 + +自动混合精度训练是混合 FP16 和 FP32 训练。 + +半精度浮点格式(FP16)具有较低的算法复杂度和较高的计算效率。此外,FP16 仅需要 FP32 所需的一半存储空间,并节省了内存和网络带宽,从而为大 batch size 和大模型提供了更多内存。 + +然而,还有其他操作,如缩减,需要 FP32 的动态范围,以避免数值溢出/下溢。因此,我们引入自动混合精度,尝试将每个操作与其相应的数据类型相匹配,这可以减少内存占用并提高训练效率。 + +
+ +
AMP 示意图 (图片来自 PatrickStar 论文)
+
+ +## Colossal-AI 中的 AMP + +我们支持三种 AMP 训练方法,并允许用户在没有改变代码的情况下使用 AMP 进行训练。booster支持amp特性注入,如果您要使用混合精度训练,则在创建booster实例时指定`mixed_precision`参数,我们现已支持torch amp,apex amp, naive amp(现已移植torch amp至booster,apex amp, naive amp仍由`colossalai.initialize`方式启动,如您需使用,请[参考](./mixed_precision_training.md);后续将会拓展`bf16`,`pf8`的混合精度训练. + +#### booster启动方式 +您可以在创建booster实例时,指定`mixed_precision="fp16"`即使用torch amp。 + +```python +""" + 初始化映射关系如下: + 'fp16': torch amp + 'fp16_apex': apex amp, + 'bf16': bf16, + 'fp8': fp8, + 'fp16_naive': naive amp +""" +from colossalai import Booster +booster = Booster(mixed_precision='fp16',...) +``` + +或者您可以自定义一个`FP16TorchMixedPrecision`对象,如 + +```python +from colossalai.mixed_precision import FP16TorchMixedPrecision +mixed_precision = FP16TorchMixedPrecision( + init_scale=2.**16, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=2000) +booster = Booster(mixed_precision=mixed_precision,...) +``` + +其他类型的amp使用方式也是一样的。 + +### Torch AMP 配置 + +{{ autodoc:colossalai.booster.mixed_precision.FP16TorchMixedPrecision }} + +### Apex AMP 配置 + +对于这种模式,我们依靠 Apex 实现混合精度训练。我们支持这个插件,因为它允许对混合精度的粒度进行更精细的控制。 +例如, O2 水平 (优化器水平2) 将保持 batch normalization 为 FP32。 + +如果你想了解更多细节,请参考 [Apex Documentation](https://nvidia.github.io/apex/)。 + +{{ autodoc:colossalai.booster.mixed_precision.FP16ApexMixedPrecision }} + +### Naive AMP 配置 + +在 Naive AMP 模式中, 我们实现了混合精度训练,同时保持了与复杂张量和流水并行的兼容性。该 AMP 模式将所有操作转为 FP16 。下列代码块展示了该模式的booster启动方式。 + +{{ autodoc:colossalai.booster.mixed_precision.FP16NaiveMixedPrecision }} + +当使用`colossalai.booster`时, 首先需要实例化一个模型、一个优化器和一个标准。将输出模型转换为内存消耗较小的 AMP 模型。如果您的输入模型已经太大,无法放置在 GPU 中,请使用`dtype=torch.float16`实例化你的模型。或者请尝试更小的模型,或尝试更多的并行化训练技术! + +## 实例 + +下面我们将展现如何在 Colossal-AI 使用 AMP。在该例程中,我们使用 Torch AMP. + +### 步骤 1. 在 train.py 导入相关库 + +创建`train.py`并导入必要依赖. 请记得通过命令`pip install timm scipy`安装`scipy`和`timm`。 + +```python +import os +from pathlib import Path + +import torch +from timm.models import vit_base_patch16_224 +from titans.utils import barrier_context +from torchvision import datasets, transforms + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.logging import get_dist_logger +from colossalai.nn.lr_scheduler import LinearWarmupLR +``` + +### 步骤 2. 初始化分布式环境 + +我们需要初始化分布式环境。为了快速演示,我们使用`launch_from_torch`。你可以参考 [Launch Colossal-AI](../basics/launch_colossalai.md) +使用其他初始化方法。 + +```python +# 初始化分布式设置 +parser = colossalai.get_default_parser() +args = parser.parse_args() + +# launch from torch +colossalai.launch_from_torch(config=dict()) + +``` + +### 步骤 3. 创建训练组件 + +构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 或 `Path(os.environ['DATA'])` +在你的机器上设置路径。数据将会被自动下载到该路径。 + +```python +# define the constants +NUM_EPOCHS = 2 +BATCH_SIZE = 128 +# build model +model = vit_base_patch16_224(drop_rate=0.1) + +# build dataloader +train_dataset = datasets.Caltech101( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + Gray2RGB(), + transforms.Normalize([0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]) + ])) + +# build optimizer +optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=0.1) + +# build loss +criterion = torch.nn.CrossEntropyLoss() + +# lr_scheduelr +lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=NUM_EPOCHS) +``` + +### 步骤 4. 
插入 AMP +创建一个MixedPrecision对象(如果需要)及torchDDPPlugin对象,调用 `colossalai.boost` 将所有训练组件转为为FP16模式. + +```python +plugin = TorchDDPPlugin() +train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) +booster = Booster(mixed_precision='fp16', plugin=plugin) + +# if you need to customize the config, do like this +# >>> from colossalai.mixed_precision import FP16TorchMixedPrecision +# >>> mixed_precision = FP16TorchMixedPrecision( +# >>> init_scale=2.**16, +# >>> growth_factor=2.0, +# >>> backoff_factor=0.5, +# >>> growth_interval=2000) +# >>> plugin = TorchDDPPlugin() +# >>> booster = Booster(mixed_precision=mixed_precision, plugin=plugin) + +# boost model, optimizer, criterion, dataloader, lr_scheduler +model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(model, optimizer, criterion, dataloader, lr_scheduler) +``` + +### 步骤 5. 使用 booster 训练 + +使用booster构建一个普通的训练循环。 + +```python +model.train() +for epoch in range(NUM_EPOCHS): + for img, label in enumerate(train_dataloader): + img = img.cuda() + label = label.cuda() + optimizer.zero_grad() + output = model(img) + loss = criterion(output, label) + booster.backward(loss, optimizer) + optimizer.step() + lr_scheduler.step() +``` + +### 步骤 6. 启动训练脚本 + +使用下列命令启动训练脚本,你可以改变 `--nproc_per_node` 以使用不同数量的 GPU。 + +```shell +colossalai run --nproc_per_node 1 train.py +``` + diff --git a/docs/source/zh-Hans/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md index fd75ed1f5b3e..1feb9dde5725 100644 --- a/docs/source/zh-Hans/features/nvme_offload.md +++ b/docs/source/zh-Hans/features/nvme_offload.md @@ -53,9 +53,8 @@ optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, n > ⚠ 它只会卸载在 CPU 上的优化器状态。这意味着它只会影响 CPU 训练或者使用卸载的 Zero/Gemini。 -## Exampls +## Examples -Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`. 
首先让我们从两个简单的例子开始 -- 用不同的方法训练 GPT。这些例子依赖`transformers`。 我们首先应该安装依赖: @@ -77,8 +76,9 @@ from transformers.models.gpt2.configuration_gpt2 import GPT2Config from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel import colossalai from colossalai.nn.optimizer import HybridAdam -from colossalai.zero import zero_model_wrapper, zero_optim_wrapper from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin ``` 然后我们定义一个损失函数: @@ -182,16 +182,24 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0): criterion = GPTLMLoss() optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction) print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B') - gemini_config = dict(strict_ddp_mode=True, device=torch.cuda.current_device(), - placement_policy='cpu', pin_memory=True, hidden_dim=config.n_embd) - model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config) - optimizer = zero_optim_wrapper(model, optimizer, initial_scale=2**5) + + plugin = GeminiPlugin( + strict_ddp_mode=True, + device=torch.cuda.current_device(), + placement_policy='cpu', + pin_memory=True, + hidden_dim=config.n_embd, + initial_scale=2**5 + ) + booster = Booster(plugin) + model, optimizer, criterion, _* = booster.boost(model, optimizer, criterion) + start = time.time() for step in range(3): data = get_data(4, 128, config.vocab_size) outputs = model(**data) loss = criterion(outputs.logits, data['input_ids']) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() optimizer.zero_grad() print(f'[{step}] loss: {loss.item():.3f}') diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 72403bf610a4..9030464ddf9a 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -4,7 +4,7 @@ **前置教程:** -- [定义配置文件](../basics/define_your_config.md) +- [booster使用](../basics/booster_api.md) **示例代码** @@ -97,6 +97,8 @@ optimizer.step() 首先我们只需要引入`Huggingface transformers` 的 `GPT2LMHeadModel`来定义我们的模型,不需要用户进行模型的定义与修改,方便用户使用。 +定义GPT模型: + ```python class GPTLMModel(nn.Module): @@ -182,34 +184,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): split_param_single_dim_tp1d(-1, param, pg) ``` -定义一个使用 Gemini + ZeRO DDP 的模型: - -```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): - cai_version = colossalai.__version__ - if version.parse(cai_version) > version.parse("0.1.10"): - from colossalai.nn.parallel import GeminiDDP - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placememt_policy, - pin_memory=True, - search_range_mb=32) - elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): - from colossalai.gemini import ChunkManager, GeminiManager - chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) - chunk_manager = ChunkManager(chunk_size, - pg, - enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) - model = ZeroDDP(model, gemini_manager) - else: - raise NotImplemented(f"CAI version {cai_version} is not supported") - return model -``` - -由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数。 - 写一个获得随机输入的函数: ```python @@ -219,9 +193,16 @@ def get_data(batch_size, seq_len, vocab_size): return 
input_ids, attention_mask ``` -最后,我们可以定义我们的训练循环: + +最后,使用booster注入 Gemini + ZeRO DDP 特性, 并定义训练循环。由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数: ```python +from torch.optim import Adam + +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin + def main(): args = parse_args() BATCH_SIZE = 8 @@ -232,22 +213,23 @@ def main(): # build criterion criterion = GPTLMLoss() + optimizer = Adam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) - default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None + default_dist_spec = ShardSpec([-1], [args.tp_degree]) # build GPT model with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg): model = gpt2_medium(checkpoint=True) pg = default_pg # Tensor Parallelism (TP) tensor_parallelize(model, pg) + # Gemini + ZeRO DP, Note it must be used after TP - model = gemini_zero_dpp(model, pg, args.placement) - # build optimizer - optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5) - numel = sum([p.numel() for p in model.parameters()]) - get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) + plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + torch.cuda.synchronize() model.train() for n in range(NUM_STEPS): @@ -256,10 +238,12 @@ def main(): optimizer.zero_grad() outputs = model(input_ids, attn_mask) loss = criterion(outputs, input_ids) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() torch.cuda.synchronize() ``` > ⚠️ 注意:如果你使用Gemini模块的话,请不要使用我们之前提到过的[梯度累加](../features/gradient_accumulation.md)。 完整的例子代码可以在 [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt). 获得。 + + diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md index e0d726c74f64..a6c88672b907 100755 --- a/docs/source/zh-Hans/get_started/installation.md +++ b/docs/source/zh-Hans/get_started/installation.md @@ -47,4 +47,20 @@ CUDA_EXT=1 pip install . pip install . ``` +如果您在使用CUDA 10.2,您仍然可以从源码安装ColossalAI。但是您需要手动下载cub库并将其复制到相应的目录。 + +```bash +# clone the repository +git clone https://github.com/hpcaitech/ColossalAI.git +cd ColossalAI + +# download the cub library +wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip +unzip 1.8.0.zip +cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + +# install +CUDA_EXT=1 pip install . 
+``` + diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py index e6159e1058b9..d07febea0a84 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -340,12 +340,12 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=64) return model diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py index 1b2fc778d5ed..6715b473a567 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py @@ -342,12 +342,12 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=64) return model diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh index 6079d5ed615b..0ddfd3a6211c 100644 --- a/examples/language/gpt/gemini/test_ci.sh +++ b/examples/language/gpt/gemini/test_ci.sh @@ -3,7 +3,7 @@ $(cd `dirname $0`;pwd) export TRAIN_STEP=4 for MODEL_TYPE in "gpt2_medium"; do - for DISTPLAN in "colossalai"; do + for DISTPLAN in "CAI_Gemini"; do for BATCH_SIZE in 2; do for GPUNUM in 1 4; do for TPDEGREE in 1 2; do diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index b2a7fa36d021..92751c7e2f47 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -11,11 +11,13 @@ from torch.nn.parallel import DistributedDataParallel as DDP import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.optimizer import HybridAdam from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper +from colossalai.zero import ColoInitContext CAI_VERSION = colossalai.__version__ @@ -236,23 +238,6 @@ def main(): tensor_parallelize(model, tp_pg) # asign running configurations - gemini_config = None - if args.distplan.startswith("CAI_ZeRO"): - optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True) - elif args.distplan == "CAI_Gemini": - gemini_config = dict(strict_ddp_mode=args.tp_degree == 1, - device=get_current_device(), - placement_policy=args.placement, - pin_memory=True, - hidden_dim=model.config.n_embd, - search_range_mb=128) - optim_config = 
dict(gpu_margin_mem_ratio=0.) - else: - raise RuntimeError - - # build a highly optimized gpu/cpu optimizer - optimizer = HybridAdam(model.parameters(), lr=1e-3) - if args.distplan == "CAI_ZeRO1": zero_stage = 1 elif args.distplan == "CAI_ZeRO2": @@ -262,22 +247,42 @@ def main(): else: raise RuntimeError - # wrap your model and optimizer - model = zero_model_wrapper(model, zero_stage, gemini_config) - optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config) + plugin = None + if args.distplan.startswith("CAI_ZeRO"): + plugin = LowLevelZeroPlugin(stage=zero_stage, + reduce_bucket_size_in_m=12 * 1024 * 1024, + overlap_communication=True, + verbose=True) + elif args.distplan == "CAI_Gemini": + plugin = GeminiPlugin(device=get_current_device(), + placement_policy=args.placement, + pin_memory=True, + strict_ddp_mode=args.tp_degree == 1, + search_range_mb=128, + hidden_dim=model.config.n_embd, + gpu_margin_mem_ratio=0.) + else: + raise RuntimeError + + # build a highly optimized gpu/cpu optimizer + optimizer = HybridAdam(model.parameters(), lr=1e-3) logger.info(get_mem_info(prefix='After init optim, '), ranks=[0]) elif args.distplan.startswith("Pytorch"): assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples." model = model_builder(args.model_type)(checkpoint=True).cuda() - model = DDP(model) + plugin = TorchDDPPlugin() if args.distplan.endswith("DDP"): optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) elif args.distplan.endswith("ZeRO"): from torch.distributed.optim import ZeroRedundancyOptimizer optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3) + else: raise RuntimeError + # wrap your model and optimizer + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) # model is shared after TP numel = get_model_size(model) @@ -305,13 +310,7 @@ def train_step(): fwd_end = time() fwd_time = fwd_end - start logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0]) - - if args.distplan.startswith("CAI"): - optimizer.backward(loss) - elif args.distplan.startswith("Pytorch"): - loss.backward() - else: - raise RuntimeError + booster.backward(loss, optimizer) torch.cuda.synchronize() bwd_end = time() diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py index 7923e4fc855d..b16da1c7744a 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -102,23 +102,23 @@ def get_model_size(model: nn.Module): # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): cai_version = colossalai.__version__ if version.parse(cai_version) > version.parse("0.1.10"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): from colossalai.gemini import ChunkManager, GeminiManager chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) + gemini_manager = GeminiManager(placement_policy, chunk_manager) chunk_manager = ChunkManager(chunk_size, pg, enable_distributed_storage=True, -
init_device=GeminiManager.get_default_device(placement_policy)) + init_device=GeminiManager.get_default_device(placement_policy)) model = ZeroDDP(model, gemini_manager) else: raise NotImplemented(f"CAI version {cai_version} is not supported") diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md index 933026166d3f..0664d41fd359 100644 --- a/examples/tutorial/README.md +++ b/examples/tutorial/README.md @@ -29,9 +29,9 @@ quickly deploy large AI model training and inference, reducing large AI model tr - Fine-tuning and Inference for OPT [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/opt) [[video]](https://www.youtube.com/watch?v=jbEFNVzl67Y) - Optimized AlphaFold [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/tutorial/fastfold) [[video]](https://www.youtube.com/watch?v=-zP13LfJP7w) - Optimized Stable Diffusion [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion) [[video]](https://www.youtube.com/watch?v=8KHeUjjc-XQ) - - ColossalChat: Cloning ChatGPT with a Complete RLHF Pipeline -[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) -[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) + - ColossalChat: Cloning ChatGPT with a Complete RLHF Pipeline +[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) +[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) [[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0) [[video]](https://www.youtube.com/watch?v=-qFBZFmOJfg) diff --git a/op_builder/utils.py b/op_builder/utils.py index 2dbd976fbcbb..cb528eea66a1 100644 --- a/op_builder/utils.py +++ b/op_builder/utils.py @@ -110,7 +110,7 @@ def get_pytorch_version() -> List[int]: torch_version = torch.__version__.split('+')[0] TORCH_MAJOR = int(torch_version.split('.')[0]) TORCH_MINOR = int(torch_version.split('.')[1]) - TORCH_PATCH = int(torch_version.split('.')[2]) + TORCH_PATCH = int(torch_version.split('.')[2], 16) return TORCH_MAJOR, TORCH_MINOR, TORCH_PATCH diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 55edb1b6a512..6895113bc637 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,6 +1,7 @@ diffusers fbgemm-gpu==0.2.0 pytest +coverage==7.2.3 git+https://github.com/hpcaitech/pytest-testmon torchvision transformers diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py index 12562095c153..44767f051fdd 100644 --- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py @@ -1,10 +1,6 @@ -from contextlib import nullcontext - import pytest import torch -import torch.distributed as dist from packaging import version -from torch import nn from torch.optim import SGD import colossalai @@ -19,6 +15,7 @@ from tests.kit.model_zoo import model_zoo +# test basic fsdp function def run_fn(model_fn, data_gen_fn, output_transform_fn): plugin = TorchFSDPPlugin() booster = Booster(plugin=plugin) diff --git a/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py new file mode 100644 index 000000000000..2b6090bb1e29 --- /dev/null +++ b/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py @@ -0,0 +1,113
@@ +import pytest +import torch +from packaging import version +from torch import nn +from torch.optim import SGD +from torchvision.models import resnet18 +from utils import shared_tempdir + +import colossalai +from colossalai.booster import Booster + +if version.parse(torch.__version__) >= version.parse('1.12.0'): + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from colossalai.booster.plugin import TorchFSDPPlugin + +from colossalai.testing import rerun_if_address_is_in_use, spawn, check_state_dict_equal + + +def compare_nested_dict(dict1, dict2): + for key in dict1: + if key in dict2: + if type(dict1[key]) is dict: + assert type(dict2[key]) is dict + diff = compare_nested_dict(dict1[key], dict2[key]) + if not diff: + return diff + elif type(dict1[key]) is list: + assert type(dict2[key]) is list + for i, val in enumerate(dict1[key]): + if isinstance(val, torch.Tensor): + if not torch.equal(dict1[key][i], dict2[key][i]): + return False + elif val != dict2[key][i]: + return False + elif type(dict1[key]) is torch.Tensor: + assert type(dict2[key]) is torch.Tensor + if not torch.equal(dict1[key], dict2[key]): + return False + else: + if dict1[key] != dict2[key]: + return False + else: + return False + return True + + +def check_torch_fsdp_ckpt(): + model = resnet18() + plugin = TorchFSDPPlugin() + booster = Booster(plugin=plugin) + optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9) + criterion = lambda x: x.mean() + fsdp_model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + + inputs = torch.randn(4, 3, 224, 224) + outputs = None + + def run_model(): + nonlocal outputs + outputs = fsdp_model(inputs) + optimizer.zero_grad() + criterion(outputs).backward() + optimizer.step() + + with shared_tempdir() as tempdir: + model_ckpt_path = f"{tempdir}/model" + optim_ckpt_path = f"{tempdir}/optimizer" + + run_model() + + booster.save_model(fsdp_model, model_ckpt_path, shard=False) + booster.save_optimizer(optimizer, optim_ckpt_path, shard=False) + + full_msd = fsdp_model.state_dict() + #full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer) + sharded_osd = optimizer.state_dict() + import copy + sharded_osd = copy.deepcopy(sharded_osd) + + run_model() + + full_msd_updated = fsdp_model.state_dict() + #full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True) + sharded_osd_updated = optimizer.state_dict() + + assert not compare_nested_dict(sharded_osd, sharded_osd_updated) + assert not compare_nested_dict(full_msd_updated, full_msd) + outputs_first = fsdp_model(inputs) + assert criterion(outputs_first) != criterion(outputs) + + booster.load_model(fsdp_model, model_ckpt_path) + booster.load_optimizer(optimizer, optim_ckpt_path) + + full_msd_restore = fsdp_model.state_dict() + #full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True) + sharded_osd_restore = optimizer.state_dict() + + assert compare_nested_dict(sharded_osd, sharded_osd_restore) + assert compare_nested_dict(full_msd_restore, full_msd) + outputs_sec = fsdp_model(inputs) + assert criterion(outputs_sec) == criterion(outputs) + + +def run_dist(rank, world_size, port): + # init dist env + colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') + check_torch_fsdp_ckpt() + + +@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason="requires torch1.12 or higher") +@rerun_if_address_is_in_use() +def test_torch_fsdp_ckpt(): + spawn(run_dist, 2) diff --git 
a/tests/test_utils/test_lazy_init/utils.py b/tests/test_utils/test_lazy_init/lazy_init_utils.py similarity index 100% rename from tests/test_utils/test_lazy_init/utils.py rename to tests/test_utils/test_lazy_init/lazy_init_utils.py diff --git a/tests/test_utils/test_lazy_init/test_distribute.py b/tests/test_utils/test_lazy_init/test_distribute.py index c15b055e8361..fd91e7e912b5 100644 --- a/tests/test_utils/test_lazy_init/test_distribute.py +++ b/tests/test_utils/test_lazy_init/test_distribute.py @@ -15,7 +15,7 @@ from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor except: pass -from utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed +from lazy_init_utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed from tests.kit.model_zoo import model_zoo diff --git a/tests/test_utils/test_lazy_init/test_models.py b/tests/test_utils/test_lazy_init/test_models.py index 4a0217b31a97..f828b23a94c4 100644 --- a/tests/test_utils/test_lazy_init/test_models.py +++ b/tests/test_utils/test_lazy_init/test_models.py @@ -1,5 +1,5 @@ import pytest -from utils import SUPPORT_LAZY, check_lazy_init +from lazy_init_utils import SUPPORT_LAZY, check_lazy_init from tests.kit.model_zoo import model_zoo diff --git a/version.txt b/version.txt index a45be4627678..0d91a54c7d43 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.2.8 +0.3.0
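
For readers following the example migrations in this patch: the common thread across `train_gpt_demo.py`, the dreambooth scripts, and the GPT tutorial doc is the switch from the old `zero_model_wrapper` / `GeminiAdamOptimizer` path to the plugin-based booster API. The snippet below is a minimal sketch of that pattern, not code from the patch itself: it assumes a distributed environment launched via `torchrun` and uses a placeholder two-layer model in place of the GPT-2 / PaLM builders used by the real examples.

```python
import torch
import torch.nn as nn

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device

# Expects torchrun-style env vars (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT).
colossalai.launch_from_torch(config={})

# Placeholder model; the real examples build gpt2_medium / PaLM here instead.
model = nn.Sequential(nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 2))
optimizer = HybridAdam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Gemini + ZeRO DDP via the plugin, as in the updated examples.
plugin = GeminiPlugin(placement_policy='cuda', initial_scale=2**5)
booster = Booster(plugin=plugin)
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

model.train()
for _ in range(10):
    data = torch.randn(8, 128, device=get_current_device())
    labels = torch.randint(0, 2, (8,), device=get_current_device())
    optimizer.zero_grad()
    loss = criterion(model(data), labels)
    # Replaces optimizer.backward(loss) (Gemini/ZeRO) and loss.backward() (DDP).
    booster.backward(loss, optimizer)
    optimizer.step()
```

The behavioural change mirrored throughout the hunks above is that `booster.backward(loss, optimizer)` handles both the Gemini/ZeRO and DDP cases, so the training loop no longer needs to branch on the chosen distplan.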
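Similarly, the new `tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py` drives unsharded save/load through the booster. The following is a trimmed, hypothetical version of that flow for reference only: it keeps the `resnet18`/SGD setup from the test but drops the multi-process spawn and the state-dict comparison, uses placeholder `/tmp` paths instead of the test's shared temporary directory, and assumes PyTorch >= 1.12 with an environment launched via `torchrun`.

```python
import torch
from torch.optim import SGD
from torchvision.models import resnet18

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchFSDPPlugin

# Expects torchrun-style env vars; TorchFSDPPlugin requires torch >= 1.12.
colossalai.launch_from_torch(config={})

plugin = TorchFSDPPlugin()
booster = Booster(plugin=plugin)

model = resnet18()
optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9)
criterion = lambda x: x.mean()
fsdp_model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

# One training step so the optimizer has state worth checkpointing.
outputs = fsdp_model(torch.randn(4, 3, 224, 224))
optimizer.zero_grad()
criterion(outputs).backward()
optimizer.step()

# Unsharded (single-file) save/load, matching the calls in the new test.
booster.save_model(fsdp_model, "/tmp/fsdp_model", shard=False)
booster.save_optimizer(optimizer, "/tmp/fsdp_optimizer", shard=False)
booster.load_model(fsdp_model, "/tmp/fsdp_model")
booster.load_optimizer(optimizer, "/tmp/fsdp_optimizer")
```

In the test itself the round trip is additionally verified by comparing the FSDP model and optimizer `state_dict()` contents before and after restoring.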