diff --git a/.compatibility b/.compatibility index c8ac4083d2a2..32da32be5521 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1,3 @@ 1.12.0-11.3.0 -1.11.0-11.3.0 -1.10.1-11.3.0 +1.13.0-11.6.0 +2.0.0-11.7.0 diff --git a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py index 16b8957c1d88..d8f6c8fe309e 100644 --- a/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py +++ b/.github/workflows/scripts/generate_leaderboard_and_send_to_lark.py @@ -38,7 +38,7 @@ def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title: def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, int]: """ - Retrive the issue/PR comments made by our members in the last 7 days. + Retrieve the issue/PR comments made by our members in the last 7 days. Args: github_token (str): GitHub access token for API calls @@ -89,7 +89,7 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, def get_discussion_comments(github_token, since) -> Dict[str, int]: """ - Retrive the discussion comments made by our members in the last 7 days. + Retrieve the discussion comments made by our members in the last 7 days. This is only available via the GitHub GraphQL API. Args: @@ -194,7 +194,7 @@ def _call_graphql_api(query): discussion_updated_at = datetime.strptime(discussion['updatedAt'], "%Y-%m-%dT%H:%M:%SZ") # check if the updatedAt is within the last 7 days - # if yes, add it to dicussion_numbers + # if yes, add it to discussion_numbers if discussion_updated_at > since: if discussion['authorAssociation'] != 'MEMBER': discussion_numbers.append(discussion['number']) @@ -207,14 +207,14 @@ def _call_graphql_api(query): # update cursor cursor = edges[-1]['cursor'] - # get the dicussion comments and replies made by our member + # get the discussion comments and replies made by our member user_engagement_count = {} - for dicussion_number in discussion_numbers: + for discussion_number in discussion_numbers: cursor = None num_per_request = 10 while True: - query = _generate_comment_reply_count_for_discussion(dicussion_number, num_per_request, cursor) + query = _generate_comment_reply_count_for_discussion(discussion_number, num_per_request, cursor) data = _call_graphql_api(query) # get the comments @@ -249,7 +249,7 @@ def _call_graphql_api(query): reply = reply_edge['node'] if reply['authorAssociation'] == 'MEMBER': # check if the updatedAt is within the last 7 days - # if yes, add it to dicussion_numbers + # if yes, add it to discussion_numbers reply_updated_at = datetime.strptime(reply['updatedAt'], "%Y-%m-%dT%H:%M:%SZ") if reply_updated_at > since: member_name = reply['author']['login'] diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md index e3510e3522f6..077193b63ce0 100644 --- a/applications/Chat/evaluate/README.md +++ b/applications/Chat/evaluate/README.md @@ -12,12 +12,13 @@ pip install -r requirements.txt ## Evaluation Pipeline -The whole evaluation pipeline consists of two methods: +The whole evaluation pipeline consists of three methods: 1. `GPT Evaluation`: evaluates model predictions using GPT models. * Compare the performance of two different models (battle). * Rate the model according to pre-defined metrics using prompting design. 2. `Automatic Evaluation`: evaluates model predictions using automatic metrics. +3. `UniEval`: evaluates model predictions using UniEval models (English only). 
### Evaluation Category @@ -75,7 +76,9 @@ GPT evaluation uses GPT models to evaluate the prediction of different models an GPT models evaluate the quality of model predictions based on the given prompt words and gives a score between 1-5. -> **NOTE:** Even for the same metric, the details of its prompt words and CoT(Chain-of-Thought) can differ based on which category you want to evaluate. For example, prompt words for metric `correctness` showed here is "The answer should be in line with common sense, life experience, etc."(this is for category `brainstorming`), but for category `extraction`, prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT(Chain-of-Thought) in `prompt/evaluation_prompt`. +> **NOTE 1:** Even for the same metric, the details of its prompt words and CoT (Chain-of-Thought) can differ based on which category you want to evaluate. For example, the prompt words for the metric `correctness` shown here are "The answer should be in line with common sense, life experience, etc." (this is for category `brainstorming`), but for category `extraction`, the prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT (Chain-of-Thought) in `prompt/evaluation_prompt`. + +> **NOTE 2:** To add customized metrics, you can refer to [FAQ](#faq). #### Automatic Evaluation @@ -85,7 +88,7 @@ There are two ways to obtain reference answers: * For instruction coming from human-designed problems, the reference answers are generated by GPT-3.5, such as roleplay, chat. * For instruction related with classic NLP problems, the reference answers are collected from open-sourced dataset with target answers, such as classification, extraction, summarization. -There are 5 types of automatic evaluation metrics listed in the table below: +There are 6 types of automatic evaluation metrics listed in the table below: | Automatic Evaluation Metric | Description | | :---------------------------------: | :----------------------------------------------------------- | @@ -94,6 +97,25 @@ There are 5 types of automatic evaluation metrics listed in the table below: | Distinct | Measure the diversity of generation text by counting the unique n-grams. | | BERTScore | Measure the semantic similarity between tokens of predictions and references with BERT. | | Precision<br>Recall<br>F1 Score | Measure the number of overlaps between prediction and reference (design for classification and extraction categories). | +| CHRF | Measure the similarity of character n-grams between prediction and reference. | + +#### UniEval Evaluation + +UniEval converts all evaluation tasks of different dimensions (metrics) into Boolean QA problems and utilizes the model to answer with “Yes” or “No”. Compared with similarity-based metrics such as ROUGE and BLEU, UniEval can achieve a more comprehensive evaluation. In addition, UniEval also demonstrates its ability to transfer to unseen dimensions and tasks. + +In our evaluation pipeline, two pre-trained UniEval evaluators are used. One is [unieval-sum](https://huggingface.co/MingZhong/unieval-sum) and the other is [unieval-dialog](https://huggingface.co/MingZhong/unieval-dialog). The two models can be used for three tasks: `summarization`, `dialogue` and `data2text`. Each task has different evaluation dimensions. + +| UniEval Model | Task | Dimension (Metric) | | :------------: | :----------------- | :--- | | unieval-sum | summarization | coherence: whether the summary is coherent<br>consistency: whether the claim is consistent with the given document<br>fluency: whether the paragraph is fluent<br>relevance: whether the summary is relevant to the reference | +| unieval-sum | data2text | naturalness: whether the utterance is fluent<br>informativeness: whether the utterance is informative according to the reference | +| unieval-dialog | dialogue | naturalness: whether the response is natural in the dialogue<br>coherence: whether the response is coherent in the dialogue history<br>understandability: whether the response is understandable in the dialogue | + +> **NOTE 1:** Task "data2text" uses the same model as task "summarization". + +> **NOTE 2:** In the UniEval paper, the `unieval-sum` model demonstrates the best transfer ability, so you can evaluate your customized metric with this model. Details of adding customized metrics can be found in [FAQ](#faq). + +> **NOTE 3:** We do not include all of the metrics provided in UniEval in our pipeline, because the data structure and content of the instructions we want to evaluate are not suitable for direct use of some UniEval metrics. ## Evaluation Process @@ -215,19 +237,26 @@ The following is an example of a Chinese GPT evaluation prompt. In an evaluation #### Configuration -The following is an example of a Chinese config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics and automatic metrics in key `GPT` and `Metrics`. You can find an example Chinese config file in `config`. +The following is an example of an English config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics, automatic metrics and UniEval metrics in the keys `GPT`, `Metrics` and `UniEval` (English only). You can find an example English config file in `config`. ```json { - "language": "cn", + "language": "en", + "path_for_UniEval": { + "summarization": "path to unieval-sum model", + "dialogue": "path to unieval-dialog model", + "data2text": "path to unieval-sum model" + }, "category": { "brainstorming": { "GPT": ["relevance", "creativity", "practicality", "correctness"], - "Metrics": ["Distinct"] + "Metrics": ["Distinct"], + "UniEval": ["summarization-fluency", "data2text-naturalness", "data2text-informativeness"] }, "chat": { "GPT": [ "relevance", "naturalness", "engagingness", "reasonableness"], - "Metrics": ["Distinct"] + "Metrics": ["Distinct"], + "UniEval": ["dialogue-naturalness", "dialogue-coherence", "dialogue-understandability"] } } } @@ -235,27 +264,33 @@ The following is an example of a Chinese config file. The configuration file can `"language"`: the language used to evaluate the model capability. We only support Chinese `"cn"` for now. +`"path_for_UniEval"`: the paths to the UniEval models, one per task. + `"category"`: the category/categories needed to evaluate the model capability. `"GPT"`: the metrics you want to use for GPT evaluation. `"Metrics"`: the metrics you want to use for automatic metrics evaluation. +`"UniEval"`: the metrics you want to use for UniEval metrics evaluation. The metric has to be in the `"{task}-{metric}"` format, because different tasks share metric names such as naturalness and coherence. + +You can remove a key such as `"Metrics"` to skip evaluating answers with its corresponding evaluation metrics. + You can create your config file based on available settings listed in following table. 
-| "category" | "GPT" | "Metrics" | -| :--------------: | :---------------------: | :---------: | -| "brainstorming" | "language organization" | "BLEU" | -| "chat" | "relevance" | "ROUGE" | -| "classification" | "creativity" | "Distinct" | -| "closed_qa" | "practicality" | "BERTScore" | -| "extraction" | "correctness" | "Precision" | -| "generation" | "naturalness" | "Recall" | -| "open_qa" | "engagingness" | "F1 score" | -| "rewriting" | "reasonableness" | | -| "roleplay" | "diversity" | | -| "summarization" | "fidelity" | | -| | "conciseness" | | +| "category" | "GPT" | "Metrics" | "UniEval" | +| :--------------: | :---------------------: | :---------: | :--------------------------: | +| "brainstorming" | "language organization" | "BLEU" | "dialogue-naturalness" | +| "chat" | "relevance" | "ROUGE" | "dialogue-coherence" | +| "classification" | "creativity" | "Distinct" | "dialogue-understandability" | +| "closed_qa" | "practicality" | "BERTScore" | "data2text-naturalness" | +| "extraction" | "correctness" | "Precision" | "data2text-informativeness" | +| "generation" | "naturalness" | "Recall" | "summarization-coherence" | +| "open_qa" | "engagingness" | "F1 score" | "summarization-consistency" | +| "rewriting" | "reasonableness" | "CHRF" | "summarization-fluency" | +| "roleplay" | "diversity" | | "summarization-relevance" | +| "summarization" | "fidelity" | | | +| | "conciseness" | | | > **NOTE:** For categories which don't have standard answers such as `brainstorming`, you should avoid using automatic metrics such as `BLEU` and `ROUGE` which are based on similarity measures and you should use `Distinct` instead in your config file. @@ -290,23 +325,36 @@ For example, if you want to add a new metric `persuasiveness` into category `bra "id": 1, "category": "brainstorming", "metrics": { - "persuasiveness": "说服力(1-5):XXX" + "persuasiveness": "persuasiveness(1-5):a short description for persuasiveness" }, "CoT": { - "persuasiveness": "XXX\n\n说服力:" + "persuasiveness": "CoT for persuasiveness\n\npersuasiveness:" }, - "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + "prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" } } ``` +
How can I add a new UniEval evaluation metric? + +For example, if you want to add a new metric `persuasiveness` into task `data2text`, you should add a Boolean QA question about the metric in function `add_question` in `unieval/utils.py`. Note that how effectively the model can evaluate this metric is unknown, so you may need some experiments to test whether the model is capable of evaluating it. + +```python +if task == 'data2text': + if dimension == 'persuasiveness': + cur_input = 'question: Is this a persuasive utterance? utterance: ' + output[i] +``` + +
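+Once the question above is in place, a quick way to sanity-check the new dimension is to run it through the pipeline's own helpers from the `evaluate` directory. A minimal sketch, with a hypothetical model path and a single made-up sample:
+
+```python
+import unieval
+
+# One model prediction, its source instruction and a reference answer.
+predictions = ["Adopting the new plan will cut costs while improving quality."]
+sources = ["Give a persuasive argument for the new plan."]
+references = ["A persuasive argument supporting the new plan."]
+
+data = unieval.convert_data_to_unieval_format(predictions, sources, references)
+
+# The data2text task uses the unieval-sum checkpoint; the model loads on cuda:0 by default.
+evaluator = unieval.get_evaluator("data2text", model_name_or_path="path to unieval-sum model")
+scores = evaluator.evaluate(data, "brainstorming", dims=["persuasiveness"], overall=False)
+print(unieval.calculate_average_score(scores))
+```
+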
+ ## To Do - [x] Add evaluation for English capability -- [ ] Support UniEval +- [x] Support UniEval - [x] Support GPT-4 evaluation +- [ ] Support GPT evaluation with reference in the prompt ## Citations @@ -327,4 +375,13 @@ For example, if you want to add a new metric `persuasiveness` into category `bra archivePrefix={arXiv}, primaryClass={cs.CL} } + +@misc{zhong2022unified, + title={Towards a Unified Multi-Dimensional Evaluator for Text Generation}, + author={Ming Zhong and Yang Liu and Da Yin and Yuning Mao and Yizhu Jiao and Pengfei Liu and Chenguang Zhu and Heng Ji and Jiawei Han}, + year={2022}, + eprint={2210.07197}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} ``` diff --git a/applications/Chat/evaluate/config/config_cn.json b/applications/Chat/evaluate/config/config_cn.json index a8c7ea8a3135..cf647f79bbf8 100644 --- a/applications/Chat/evaluate/config/config_cn.json +++ b/applications/Chat/evaluate/config/config_cn.json @@ -34,7 +34,8 @@ "Metrics": [ "Precision", "Recall", - "F1 score" + "F1 score", + "CHRF" ] }, "closed_qa": { @@ -46,7 +47,8 @@ "Metrics": [ "BLEU", "ROUGE", - "BERTScore" + "BERTScore", + "CHRF" ] }, "extraction": { @@ -58,7 +60,8 @@ "Metrics": [ "Precision", "Recall", - "F1 score" + "F1 score", + "CHRF" ] }, "generation": { @@ -116,7 +119,8 @@ "Metrics": [ "BLEU", "ROUGE", - "BERTScore" + "BERTScore", + "CHRF" ] } } diff --git a/applications/Chat/evaluate/config/config_en.json b/applications/Chat/evaluate/config/config_en.json index 5b6272b97084..014c61d93a54 100644 --- a/applications/Chat/evaluate/config/config_en.json +++ b/applications/Chat/evaluate/config/config_en.json @@ -1,5 +1,10 @@ { "language": "en", + "path_for_UniEval": { + "summarization": "path to unieval-sum", + "dialogue": "path to unieval-dialog", + "data2text": "path to unieval-sum" + }, "category": { "brainstorming": { "GPT": [ @@ -11,6 +16,11 @@ ], "Metrics": [ "Distinct" + ], + "UniEval": [ + "summarization-fluency", + "data2text-naturalness", + "data2text-informativeness" ] }, "chat": { @@ -23,6 +33,14 @@ ], "Metrics": [ "Distinct" + ], + "UniEval": [ + "summarization-fluency", + "dialogue-naturalness", + "dialogue-coherence", + "dialogue-understandability", + "data2text-naturalness", + "data2text-informativeness" ] }, "classification": { @@ -34,7 +52,13 @@ "Metrics": [ "Precision", "Recall", - "F1 score" + "F1 score", + "CHRF" + ], + "UniEval": [ + "summarization-fluency", + "data2text-naturalness", + "data2text-informativeness" ] }, "closed_qa": { @@ -46,7 +70,13 @@ "Metrics": [ "BLEU", "ROUGE", - "BERTScore" + "BERTScore", + "CHRF" + ], + "UniEval": [ + "summarization-fluency", + "data2text-naturalness", + "data2text-informativeness" ] }, "extraction": { @@ -58,7 +88,13 @@ "Metrics": [ "Precision", "Recall", - "F1 score" + "F1 score", + "CHRF" + ], + "UniEval": [ + "summarization-fluency", + "data2text-naturalness", + "data2text-informativeness" ] }, "generation": { @@ -71,6 +107,11 @@ "BLEU", "ROUGE", "BERTScore" + ], + "UniEval": [ + "summarization-fluency", + "data2text-naturalness", + "data2text-informativeness" ] }, "open_qa": { @@ -81,6 +122,11 @@ ], "Metrics": [ "Distinct" + ], + "UniEval": [ + "summarization-fluency", + "data2text-naturalness", + "data2text-informativeness" ] }, "rewriting": { @@ -93,6 +139,11 @@ "BLEU", "ROUGE", "BERTScore" + ], + "UniEval": [ + "summarization-fluency", + "data2text-naturalness", + "data2text-informativeness" ] }, "roleplay": { @@ -104,6 +155,11 @@ ], "Metrics": [ "Distinct" + ], + "UniEval": [ + "summarization-fluency", + 
"data2text-naturalness", + "data2text-informativeness" ] }, "summarization": { @@ -116,7 +172,16 @@ "Metrics": [ "BLEU", "ROUGE", - "BERTScore" + "BERTScore", + "CHRF" + ], + "UniEval": [ + "summarization-coherence", + "summarization-consistency", + "summarization-fluency", + "summarization-relevance", + "data2text-naturalness", + "data2text-informativeness" ] } } diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py index 8388d95f748a..180ef438cc43 100644 --- a/applications/Chat/evaluate/eval.py +++ b/applications/Chat/evaluate/eval.py @@ -40,7 +40,7 @@ def main(args): # initialize evaluator evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model, - config["language"]) + config["language"], config.get("path_for_UniEval", None)) if len(args.model_name_list) == 2: answers1 = jload(args.answer_file_list[0]) answers2 = jload(args.answer_file_list[1]) diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py index 0bf55ca80d7c..6bb8cdb29431 100644 --- a/applications/Chat/evaluate/evaluator.py +++ b/applications/Chat/evaluate/evaluator.py @@ -4,6 +4,7 @@ import gpt_evaluate import metrics import pandas as pd +import unieval from utils import analyze_automatic_results, get_data_per_category, save_automatic_results @@ -15,13 +16,15 @@ class Evaluator(object): """ def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any], - gpt_model: str, language: str) -> None: + gpt_model: str, language: str, path_for_UniEval: Dict[str, str]) -> None: self.params = params self.battle_prompt = battle_prompt self.gpt_evaluation_prompt = gpt_evaluation_prompt self.gpt_model = gpt_model self.language = language + self.path_for_UniEval = path_for_UniEval self.automatic_metric_stats = dict() + self.unieval_metric_stats = dict() self.gpt_evaluation_results = dict() self.battle_results = [] @@ -47,16 +50,18 @@ def switch(metric, language): return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language) elif metric == "ROUGE": return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language) - elif (metric == "Distinct"): + elif metric == "Distinct": return metrics.distinct_score(preds=predicts_list, language=language) - elif (metric == "BERTScore"): + elif metric == "BERTScore": return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language) - elif (metric == "Precision"): + elif metric == "Precision": return metrics.precision(preds=predicts_list, targets=targets_list, language=language) - elif (metric == "Recall"): + elif metric == "Recall": return metrics.recall(preds=predicts_list, targets=targets_list, language=language) - elif (metric == "F1 score"): + elif metric == "F1 score": return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language) + elif metric == "CHRF": + return metrics.chrf_score(preds=predicts_list, targets=targets_list, language=language) else: raise ValueError(f"Unexpected metric") @@ -69,6 +74,9 @@ def switch(metric, language): print(f"Category {category} specified in your config doesn't have corresponding answers!") continue + if self.params[category].get("Metrics", None) is None: + continue + category_metrics = self.params[category]["Metrics"] self.automatic_metric_stats[category] = {} @@ -80,12 +88,68 @@ def switch(metric, language): for metric in category_metrics: self.automatic_metric_stats[category].update(switch(metric=metric, 
language=self.language)) + # UniEval evaluation + # self.unieval_metric_stats's key is "task" instead of "category". + # Iterating over "task" first avoids loading models repeatedly because one task corresponds to one UniEval model. + # If the key were "category", models would be loaded multiple times across categories because the user may require different tasks (models) to evaluate one category. + for category in self.params: + if len(answers_per_category[category]) == 0: + print(f"Category {category} specified in your config doesn't have corresponding answers!") + continue + + if self.params[category].get("UniEval", None) is None: + continue + + if self.params[category]["UniEval"] and self.language == "cn": + raise Exception( + "UniEval doesn't support Chinese! Please remove the UniEval config from your Chinese config file.") + + category_metrics = self.params[category]["UniEval"] + + for task, metric in [tuple(category_metric.split("-")) for category_metric in category_metrics]: + if self.unieval_metric_stats.get(task, None) is None: + self.unieval_metric_stats[task] = {category: {metric: 0}} + elif self.unieval_metric_stats[task].get(category, None) is None: + self.unieval_metric_stats[task][category] = {metric: 0} + else: + self.unieval_metric_stats[task][category][metric] = 0 + + for task in self.unieval_metric_stats: + if self.path_for_UniEval is None: + raise Exception("Please specify the path for the UniEval model in the config file!") + + if self.path_for_UniEval.get(task, None) is None: + raise Exception(f"Please specify the model path for task {task} in the config file!") + + print(f"Load UniEval model for task {task}.") + + uni_evaluator = unieval.get_evaluator(task, model_name_or_path=self.path_for_UniEval[task]) + for category in self.unieval_metric_stats[task]: + targets_list = [ + target["target"] if target["target"] else target["output"] + for target in targets_per_category[category] + ] + predicts_list = [answer["output"] for answer in answers_per_category[category]] + sources_list = [answer["instruction"] + answer["input"] for answer in answers_per_category[category]] + + data = unieval.convert_data_to_unieval_format(predicts_list, sources_list, targets_list) + scores = uni_evaluator.evaluate(data, + category, + dims=list(self.unieval_metric_stats[task][category].keys()), + overall=False) + avg_scores = unieval.calculate_average_score(scores) + + self.unieval_metric_stats[task][category].update(avg_scores) + # gpt evaluation for category in self.params: if len(answers_per_category[category]) == 0: print(f"Category {category} specified in your config doesn't have corresponding answers!") continue + if self.params[category].get("GPT", None) is None: + continue + category_metrics = self.params[category]["GPT"] prompt = self.gpt_evaluation_prompt.get(category, None) @@ -106,29 +170,43 @@ def save(self, path: str, model_name_list: List[str]) -> None: save_path = os.path.join(path, "gpt_evaluate", "battle_results") gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path) else: - # Save evaluation results for automatic metrics - automatic_base_save_path = os.path.join(path, "automatic_results") - automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results") - - save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path) - - # Save charts and csv. 
- automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses") - analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path) - - # Save evaluation results for GPT evaluation metrics. - gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results") - gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results") - - all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results, - gpt_evaluation_results_save_path) - - # Start to calculate scores and save statistics. - gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics") - gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations, - gpt_evaluation_statistics_save_path) - - # Save charts and csv. - gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses") - gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path, - gpt_evaluation_analyses_save_path) + if self.automatic_metric_stats: + # Save evaluation results for automatic metrics + automatic_base_save_path = os.path.join(path, "automatic_results") + automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results") + + save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path) + + # Save charts and csv. + automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses") + analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path) + + if self.unieval_metric_stats: + # Save evaluation results for UniEval metrics + unieval_base_save_path = os.path.join(path, "unieval_results") + unieval_results_save_path = os.path.join(unieval_base_save_path, "evaluation_results") + + unieval.save_unieval_results(model_name_list[0], self.unieval_metric_stats, unieval_results_save_path) + + # Save charts and csv. + unieval_analyses_save_path = os.path.join(unieval_base_save_path, "evaluation_analyses") + unieval.analyze_unieval_results(unieval_results_save_path, unieval_analyses_save_path) + + if self.gpt_evaluation_results: + # Save evaluation results for GPT evaluation metrics. + gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results") + gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results") + + all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], + self.gpt_evaluation_results, + gpt_evaluation_results_save_path) + + # Start to calculate scores and save statistics. + gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics") + gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations, + gpt_evaluation_statistics_save_path) + + # Save charts and csv. 
+ gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses") + gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path, + gpt_evaluation_analyses_save_path) diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/Chat/evaluate/gpt_evaluate.py index b433500dfa04..6702526ac5e6 100644 --- a/applications/Chat/evaluate/gpt_evaluate.py +++ b/applications/Chat/evaluate/gpt_evaluate.py @@ -599,7 +599,7 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> N for category in tqdm.tqdm( frame_per_category.keys(), - desc=f"category: ", + desc=f"GPT evaluation: ", total=len(frame_per_category.keys()), ): data = pd.DataFrame(frame_per_category[category]) diff --git a/applications/Chat/evaluate/metrics.py b/applications/Chat/evaluate/metrics.py index 031f6fa83926..e220226ec041 100644 --- a/applications/Chat/evaluate/metrics.py +++ b/applications/Chat/evaluate/metrics.py @@ -4,6 +4,7 @@ import jieba from bert_score import score from nltk.translate.bleu_score import sentence_bleu +from nltk.translate.chrf_score import sentence_chrf from rouge_chinese import Rouge as Rouge_cn from rouge_score import rouge_scorer as Rouge_en from sklearn.metrics import f1_score, precision_score, recall_score @@ -40,6 +41,27 @@ def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, return bleu_scores +def chrf_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: + """Calculate the CHRF Score Metric at the sentence level. + """ + chrf_score = {"chrf": 0} + cumulative_chrf = [] + + for pred, target in zip(preds, targets): + if language == "cn": + pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split() + target_list = ' '.join(jieba.cut(preprocessing_text(target))).split() + elif language == "en": + pred_list = preprocessing_text(pred).split() + target_list = preprocessing_text(target).split() + + cumulative_chrf.append(sentence_chrf(target_list, pred_list)) + + chrf_score["chrf"] = statistics.mean(cumulative_chrf) + + return chrf_score + + def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]: """Calculate Chinese ROUGE Score Metric diff --git a/applications/Chat/evaluate/unieval/__init__.py b/applications/Chat/evaluate/unieval/__init__.py new file mode 100644 index 000000000000..dad8d6ad09fa --- /dev/null +++ b/applications/Chat/evaluate/unieval/__init__.py @@ -0,0 +1,12 @@ +from .evaluator import get_evaluator +from .utils import ( + analyze_unieval_results, + calculate_average_score, + convert_data_to_unieval_format, + save_unieval_results, +) + +__all__ = [ + 'get_evaluator', 'convert_data_to_unieval_format', 'calculate_average_score', 'save_unieval_results', + 'analyze_unieval_results' +] diff --git a/applications/Chat/evaluate/unieval/evaluator.py b/applications/Chat/evaluate/unieval/evaluator.py new file mode 100644 index 000000000000..385425e4a576 --- /dev/null +++ b/applications/Chat/evaluate/unieval/evaluator.py @@ -0,0 +1,330 @@ +# MIT License + +# Copyright (c) 2022 Ming Zhong + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above 
copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import numpy as np +from nltk import sent_tokenize + +from .scorer import UniEvaluator +from .utils import add_question + + +class SumEvaluator: + + def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): + """ Set up evaluator for text summarization """ + self.scorer = UniEvaluator( + model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + self.task = 'summarization' + self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance'] + + def evaluate(self, data, category, dims=None, overall=True): + """ + Get the scores of all the given dimensions + + category: The category to be evaluated. + + dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate + four dimensions: coherence, consistency, fluency, relevance. + + overall: indicates whether the overall score is to be calculated. + Overall score can be customized to a combination of scores based on different + dimensions. The default here is the average score of all the given dimensions. + """ + n_data = len(data) + eval_scores = [{} for _ in range(n_data)] + + if dims == None: + eval_dims = self.dimensions + else: + assert isinstance(dims, list) + eval_dims = dims + + for dim in eval_dims: + # Calculate average sentence-level scores for 'consistency' and 'fluency' + if dim == 'consistency' or dim == 'fluency': + src_list, output_list = [], [] + n_sents = [] # the number of sentences in each generated summary + for i in range(n_data): + source = data[i]['source'] + system_outputs = sent_tokenize(data[i]['system_output']) + n_sents.append(len(system_outputs)) + for j in range(len(system_outputs)): + src_list.append(source) + output_list.append(system_outputs[j]) + input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task) + sent_score = self.scorer.score(input_list, self.task, category, dim) + + # Get average score for each sample + start_idx = 0 + score = [] + for cur_n_sent in n_sents: + score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent) + start_idx += cur_n_sent + + # Calculate summary-level score for 'coherence' and 'relevance' + elif dim == 'coherence' or dim == 'relevance': + src_list, output_list, ref_list = [], [], [] + for i in range(n_data): + src_list.append(data[i]['source']) + output_list.append(data[i]['system_output']) + if dim == 'relevance': + ref_list.append(data[i]['reference']) + input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task) + score = self.scorer.score(input_list, self.task, category, dim) + + # Please customize other dimensions here for summarization + else: + raise NotImplementedError('The input format for this dimension is still undefined. 
\ + Please customize it first.') + + for i in range(n_data): + eval_scores[i][dim] = score[i] + + # Customize your overall score here. + if overall == True: + for i in range(n_data): + eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values())) + + return eval_scores + + +class DialogEvaluator: + + def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): + """ Set up evaluator for dialogues """ + self.scorer = UniEvaluator( + model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + self.task = 'dialogue' + self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability'] + + def evaluate(self, data, category, dims=None, overall=True): + """ + Get the scores of all the given dimensions + + category: The category to be evaluated. + + dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate + five dimensions: naturalness, coherence, engagingness, groundedness and understandability. + + overall: indicates whether the overall score is to be calculated. + Overall score can be customized to a combination of scores based on different + dimensions. The default here is the average score of all the given dimensions. + """ + n_data = len(data) + eval_scores = [{} for _ in range(n_data)] + + if dims == None: + eval_dims = self.dimensions + else: + assert isinstance(dims, list) + eval_dims = dims + + for dim in eval_dims: + # Calculate summation score for 'engagingness' + if dim == 'engagingness': + src_list, output_list, context_list = [], [], [] + n_sents = [] # the number of sentences in each generated response + for i in range(n_data): + source = data[i]['source'] + context = data[i]['context'] + system_outputs = sent_tokenize(data[i]['system_output']) + n_sents.append(len(system_outputs)) + for j in range(len(system_outputs)): + src_list.append(source) + context_list.append(context) + output_list.append(system_outputs[j]) + input_list = add_question(dimension=dim, + output=output_list, + src=src_list, + context=context_list, + task=self.task) + sent_score = self.scorer.score(input_list, self.task, category, dim) + + # Get the summation score for each sample + start_idx = 0 + score = [] + for cur_n_sent in n_sents: + score.append(sum(sent_score[start_idx:start_idx + cur_n_sent])) + start_idx += cur_n_sent + + # Calculate turn-level score for other dimensions + elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']: + src_list, output_list, context_list = [], [], [] + for i in range(n_data): + src_list.append(data[i]['source']) + output_list.append(data[i]['system_output']) + context_list.append(data[i]['context']) + input_list = add_question(dimension=dim, + output=output_list, + src=src_list, + context=context_list, + task=self.task) + score = self.scorer.score(input_list, self.task, category, dim) + + # Please customize other dimensions here for dialogue + else: + raise NotImplementedError('The input format for this dimension is still undefined. \ + Please customize it first.') + + for i in range(n_data): + eval_scores[i][dim] = score[i] + + # Customize your overall score here. 
+ if overall == True: + for i in range(n_data): + eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values())) + + return eval_scores + + +class D2tEvaluator: + + def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): + """ Set up evaluator for data-to-text """ + self.scorer = UniEvaluator( + model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + self.task = 'data2text' + self.dimensions = ['naturalness', 'informativeness'] + + def evaluate(self, data, category, dims=None, overall=True): + """ + Get the scores of all the given dimensions + + category: The category to be evaluated. + + dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate + two dimensions: naturalness and informativeness. + + overall: indicates whether the overall score is to be calculated. + Overall score can be customized to a combination of scores based on different + dimensions. The default here is the average score of all the given dimensions. + """ + n_data = len(data) + eval_scores = [{} for _ in range(n_data)] + + if dims == None: + eval_dims = self.dimensions + else: + assert isinstance(dims, list) + eval_dims = dims + + for dim in eval_dims: + output_list, ref_list = [], [] + for i in range(n_data): + output_list.append(data[i]['system_output']) + ref_list.append(data[i]['reference']) + + input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task) + score = self.scorer.score(input_list, self.task, category, dim) + + for i in range(n_data): + eval_scores[i][dim] = score[i] + + # Customize your overall score here. + if overall == True: + for i in range(n_data): + eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values())) + + return eval_scores + + +class FactEvaluator: + + def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): + """ Set up evaluator for factual consistency detection """ + self.scorer = UniEvaluator( + model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + self.task = 'fact' + self.dim = 'consistency' + + def evaluate(self, data, category): + """ + Get the factual consistency score (only 1 dimension for this task) + + category: The category to be evaluated. 
+ """ + n_data = len(data) + eval_scores = [{} for _ in range(n_data)] + + # Calculate average sentence-level scores for facutal consistency + src_list, output_list = [], [] + n_sents = [] # the number of sentences in the claim + for i in range(n_data): + source = data[i]['source'] + system_outputs = sent_tokenize(data[i]['system_output']) + n_sents.append(len(system_outputs)) + for j in range(len(system_outputs)): + src_list.append(source) + output_list.append(system_outputs[j]) + input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task) + sent_score = self.scorer.score(input_list, self.task, category, dim) + + # Get average score for each sample + start_idx = 0 + score = [] + for cur_n_sent in n_sents: + score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent) + start_idx += cur_n_sent + + for i in range(n_data): + eval_scores[i][self.dim] = score[i] + + return eval_scores + + +def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None): + assert task in ['summarization', 'dialogue', 'data2text', 'fact'] + if task == 'summarization': + return SumEvaluator(model_name_or_path=model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + elif task == 'dialogue': + return DialogEvaluator(model_name_or_path=model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + elif task == 'data2text': + return D2tEvaluator(model_name_or_path=model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + elif task == 'fact': + return FactEvaluator(model_name_or_path=model_name_or_path, + max_length=max_length, + device=device, + cache_dir=cache_dir) + else: + raise NotImplementedError('Other tasks are not implemented, \ + please customize specific tasks here.') diff --git a/applications/Chat/evaluate/unieval/scorer.py b/applications/Chat/evaluate/unieval/scorer.py new file mode 100644 index 000000000000..2c70bb9f6ded --- /dev/null +++ b/applications/Chat/evaluate/unieval/scorer.py @@ -0,0 +1,101 @@ +# MIT License + +# Copyright (c) 2022 Ming Zhong + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import torch +import torch.nn as nn +from tqdm import tqdm +from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer + + +class UniEvaluator: + + def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): + """ Set up model """ + self.device = device + self.max_length = max_length + + self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir) + self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir) + self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir) + + self.model.eval() + self.model.to(device) + + self.softmax = nn.Softmax(dim=1) + + self.pos_id = self.tokenizer("Yes")["input_ids"][0] + self.neg_id = self.tokenizer("No")["input_ids"][0] + + def score(self, inputs, task, category, dim, batch_size=8): + """ + Get scores for the given samples. + final_score = positive_score / (positive_score + negative_score) + """ + + # The implementation of "forward" in T5 still requires decoder_input_ids. + # Therefore, we construct a random one-word target sequence. + # The content of the target has no effect on the final scores. + tgts = ["No" for _ in range(len(inputs))] + + pos_score_list, neg_score_list = [], [] + for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "): + src_list = inputs[i:i + batch_size] + tgt_list = tgts[i:i + batch_size] + try: + with torch.no_grad(): + encoded_src = self.tokenizer(src_list, + max_length=self.max_length, + truncation=True, + padding=True, + return_tensors='pt') + encoded_tgt = self.tokenizer(tgt_list, + max_length=self.max_length, + truncation=True, + padding=True, + return_tensors='pt') + + src_tokens = encoded_src['input_ids'].to(self.device) + src_mask = encoded_src['attention_mask'].to(self.device) + + tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1) + + output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens) + logits = output.logits.view(-1, self.model.config.vocab_size) + + pos_score = self.softmax(logits)[:, self.pos_id] # Yes + neg_score = self.softmax(logits)[:, self.neg_id] # No + + cur_pos_score = [x.item() for x in pos_score] + cur_neg_score = [x.item() for x in neg_score] + pos_score_list += cur_pos_score + neg_score_list += cur_neg_score + + except RuntimeError: + print(f'source: {src_list}') + print(f'target: {tgt_list}') + exit(0) + + score_list = [] + for i in range(len(pos_score_list)): + score_list.append(pos_score_list[i] / (pos_score_list[i] + neg_score_list[i])) + + return score_list diff --git a/applications/Chat/evaluate/unieval/utils.py b/applications/Chat/evaluate/unieval/utils.py new file mode 100644 index 000000000000..a77505faa0d2 --- /dev/null +++ b/applications/Chat/evaluate/unieval/utils.py @@ -0,0 +1,248 @@ +# MIT License + +# Copyright (c) 2022 Ming Zhong + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +from typing import Dict + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +import tqdm + + +def add_question(dimension, output, src=None, ref=None, context=None, task=None): + """ + Add questions to generate input in Bool-QA format for UniEval. + + dimension: specific dimension to be evaluated + src: source input for different NLG tasks. For example, source document for summarization + and dialogue history for dialogue response generation. + output: output text generated by the models + ref: human-annotated ground truth + context: the context needed to evaluate several specific dimensions. For example, + additional factual information when evaluating engagingness and groundedness in dialogues. + """ + + input_with_question = [] + for i in range(len(output)): + # For summarization + if task == 'summarization': + if dimension == 'fluency': + cur_input = 'question: Is this a fluent paragraph? paragraph: ' + output[i] + elif dimension == 'coherence': + cur_input = 'question: Is this a coherent summary to the document? summary: ' + output[ + i] + ' document: ' + src[i] + elif dimension == 'consistency': + cur_input = 'question: Is this claim consistent with the document? claim: ' + output[ + i] + ' document: ' + src[i] + elif dimension == 'relevance': + cur_input = 'question: Is this summary relevant to the reference? summary: ' + output[ + i] + ' reference: ' + ref[i] + else: + raise NotImplementedError( + 'The input format for this dimension is still undefined. Please customize it first.') + # For dialogues + elif task == 'dialogue': + if dimension == 'naturalness': + cur_input = 'question: Is this a natural response in the dialogue? response: ' + output[i] + elif dimension == 'coherence': + cur_input = 'question: Is this a coherent response given the dialogue history? response: '\ + + output[i] + ' dialogue history: ' + src[i] + elif dimension == 'engagingness': + cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? response: '\ + + output[i] + ' dialogue history: ' + src[i] + ' fact: ' + context[i] + elif dimension == 'groundedness': + cur_input = 'question: Is this response consistent with knowledge in the fact? response: '\ + + output[i] + ' fact: ' + context[i] + elif dimension == 'understandability': + cur_input = 'question: Is this an understandable response in the dialogue? response: ' + output[i] + else: + raise NotImplementedError( + 'The input format for this dimension is still undefined. Please customize it first.') + # For data-to-text + elif task == 'data2text': + if dimension == 'naturalness': + cur_input = 'question: Is this a fluent utterance? utterance: ' + output[i] + elif dimension == 'informativeness': + cur_input = 'question: Is this sentence informative according to the reference? sentence: '\ + + output[i] + ' reference: ' + ref[i] + else: + raise NotImplementedError( + 'The input format for this dimension is still undefined. 
Please customize it first.') + # For factual consistency detection + elif task == 'fact': + if dimension == 'consistency': + cur_input = 'question: Is this claim consistent with the document? claim: ' + output[ + i] + ' document: ' + src[i] + else: + raise NotImplementedError('No other dimensions for the factual consistency detection task.') + # For new customized tasks + else: + raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.') + input_with_question.append(cur_input) + return input_with_question + + +def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None): + """ + Convert the data into UniEval's format. + + output_list: a list of model outputs + + src_list: source input for different NLG tasks. For example, source document for summarization + and dialogue history for dialogue response generation + ref_list: human-annotated ground truth + """ + json_data = [] + for i in range(len(output_list)): + cur = {} + cur['system_output'] = output_list[i] + if src_list is not None: + cur['source'] = src_list[i] + if ref_list is not None: + cur['reference'] = ref_list[i] + cur['context'] = "" + json_data.append(cur) + return json_data + + +def calculate_average_score(scores): + """ + Calculate average scores for different metrics + + scores: a list of scores for different metrics for each answer + + """ + metrics = {metric: 0 for metric in scores[0]} + + for score in scores: + for metric in score: + metrics[metric] += score[metric] + + for metric in metrics: + metrics[metric] /= len(scores) + + return metrics + + +def save_unieval_results(model_name: str, unieval_metric_stats: Dict[str, Dict], save_path: str) -> None: + """ + Save UniEval evaluation results of different categories for one model. + + """ + + if not os.path.exists(save_path): + os.makedirs(save_path) + + unieval_metric_stats_per_category = {} + for task, category_stat in unieval_metric_stats.items(): + for category, metric_stat in category_stat.items(): + if unieval_metric_stats_per_category.get(category, None) is None: + unieval_metric_stats_per_category[category] = {} + for metric, score in metric_stat.items(): + unieval_metric_stats_per_category[category][f"{metric}-{task}"] = score + + automatic_df = pd.DataFrame(unieval_metric_stats_per_category) + automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True) + + +def read_unieval_results(results_path: str, file_name: str) -> Dict[str, Dict]: + """ + Read a csv file and return a dictionary which stores scores per metric. + + """ + + results = pd.read_csv(os.path.join(results_path, file_name), index_col=0) + + results_dict = {metric: {} for metric in list(results.index)} + for i, metric in enumerate(results_dict.keys()): + for j, category in enumerate(list(results.columns)): + if pd.isnull(results.iloc[i][j]): + continue + results_dict[metric][category] = results.iloc[i][j] + + return results_dict + + +def analyze_unieval_results(results_path: str, save_path: str) -> None: + """ + Analyze and visualize all csv files in the given folder. + + """ + + if not os.path.exists(results_path): + raise Exception(f'The given directory "{results_path}" doesn\'t exist! 
No results found!') + + all_statistics = {} + + for file_name in os.listdir(results_path): + if file_name.endswith("_results.csv"): + model_name = file_name.split("_results.csv")[0] + all_statistics[model_name] = read_unieval_results(results_path, file_name) + + if len(list(all_statistics.keys())) == 0: + raise Exception(f'There are no csv files in the given directory "{results_path}"!') + + frame_all = {"model": [], "category": [], "metric": [], "score": []} + frame_per_metric = {} + for model_name, model_statistics in all_statistics.items(): + for metric, metric_statistics in model_statistics.items(): + if frame_per_metric.get(metric) is None: + frame_per_metric[metric] = {"model": [], "category": [], "score": []} + + for category, category_score in metric_statistics.items(): + frame_all["model"].append(model_name) + frame_all["category"].append(category) + frame_all["metric"].append(metric) + frame_all["score"].append(category_score) + + frame_per_metric[metric]["model"].append(model_name) + frame_per_metric[metric]["category"].append(category) + frame_per_metric[metric]["score"].append(category_score) + + if not os.path.exists(save_path): + os.makedirs(save_path) + + frame_all = pd.DataFrame(frame_all) + frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv")) + + for metric in tqdm.tqdm( + frame_per_metric.keys(), + desc=f"UniEval metrics: ", + total=len(frame_per_metric.keys()), + ): + data = pd.DataFrame(frame_per_metric[metric]) + + sns.set() + fig = plt.figure(figsize=(16, 10)) + + fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True) + fig.set_title( + f"Comparison between Different Models for Metric {metric.split('-')[0].title()} in Task {metric.split('-')[1].title()}" + ) + plt.xlabel("Evaluation Category") + plt.ylabel("Score") + + figure = fig.get_figure() + figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400) + + plt.close() diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py index 1f4069386fcd..fefe25f5e764 100644 --- a/applications/Chat/evaluate/utils.py +++ b/applications/Chat/evaluate/utils.py @@ -199,7 +199,7 @@ def analyze_automatic_results(results_path: str, save_path: str) -> None: for metric in tqdm.tqdm( frame_per_metric.keys(), - desc=f"metric: ", + desc=f"automatic metrics: ", total=len(frame_per_metric.keys()), ): data = pd.DataFrame(frame_per_metric[metric]) diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 5d3f3e5530cb..dc0df0517508 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -238,7 +238,7 @@ def initialize(model: nn.Module, loaded into gpc.config. Args: - model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model. + model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model. optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`): Your optimizer instance. criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance. 
diff --git a/colossalai/zero/gemini/memory_tracer/utils.py b/colossalai/zero/gemini/memory_tracer/utils.py index 6962c058110e..65f6ba775139 100644 --- a/colossalai/zero/gemini/memory_tracer/utils.py +++ b/colossalai/zero/gemini/memory_tracer/utils.py @@ -7,7 +7,7 @@ def colo_model_optimizer_usage(optim) -> Tuple[int, int]: """Trace the optimizer memory usage Args: - optim (ShardedOptimV2): an instance of ShardedOptimver + optim (ShardedOptimV2): an instance of ShardedOptimizer Returns: Tuple[int, int]: cuda/cpu memory usage in Byte diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/zero/legacy/init_ctx/init_context.py index a3fa46b38b5a..84e2d2f4f8e1 100644 --- a/colossalai/zero/legacy/init_ctx/init_context.py +++ b/colossalai/zero/legacy/init_ctx/init_context.py @@ -46,7 +46,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses): """A context to initialize model. 1. Convert the model to fp16. - 2. The paramaters of the module are adapted to type ShardedParameter. + 2. The parameters of the module are adapted to type ShardedParameter. 3. Shard the param and grad according to flags. Args: diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py index be3842beb208..e7064277fb3c 100644 --- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py +++ b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py @@ -69,7 +69,7 @@ class ShardedModelV2(nn.Module): If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well. Note that 'auto' policy can only work well when no other processes use CUDA during your training. Defaults to 'cuda'. - gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0. + gradient_predivide_factor (Optional[float], optional): Gradient is divided by this value before reduce-scatter. Defaults to 1.0. reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad. Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation. In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad). @@ -205,7 +205,7 @@ def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> N exit(0) """ if self._use_memory_tracer: - self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0]) + self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0]) if gpc.get_global_rank() == 0: with open(filename, 'w+') as f: f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n') @@ -385,7 +385,7 @@ def _save_grad(self, param: Parameter, grad: torch.Tensor): # make parameters point to gradient assert param.colo_attr.saved_grad.is_null( - ), 'Gradien accumulation is not supported when reuse_fp16_shard=True' + ), 'Gradient accumulation is not supported when reuse_fp16_shard=True' param.colo_attr.grad_payload_reset(grad.data) # release the memory of param diff --git a/colossalai/zero/low_level/_utils.py b/colossalai/zero/low_level/_utils.py index afc98e7a7f54..218f7603bc54 100644 --- a/colossalai/zero/low_level/_utils.py +++ b/colossalai/zero/low_level/_utils.py @@ -261,7 +261,7 @@ def sync_param(flat_tensor, tensor_list): share the same memory space. 
This function will update the tensor list so that they point to the same value. - :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor lsit + :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor list :param tensor_list: A list of tensors corresponding to the flattened tensor :type flat_tensor: torch.Tensor :type tensor_list: List[torch.Tensor] diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index d4d03e5b5fcd..ee03c0f0ae15 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -207,8 +207,8 @@ def __init__( for param in self._working_param_groups[group_id]: self._param_store.set_param_reduction_state(param, False) - # intialize communication stream for - # communication-compuation overlapping + # initialize communication stream for + # communication-computation overlapping if self._overlap_communication: self._comm_stream = torch.cuda.Stream() @@ -269,7 +269,7 @@ def _partition_param_list(self, param_list): params_per_rank = [[] for _ in range(self._world_size)] numel_per_rank = [0 for _ in range(self._world_size)] - # partititon the parameters in a greedy fashion + # partition the parameters in a greedy fashion sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True) for param in sorted_params: # allocate this parameter to the rank with @@ -297,7 +297,7 @@ def _attach_reduction_hook(self): param_group = self._working_param_groups[group_id] for param in param_group: if param.requires_grad: - # determines the reduction destionation rank + # determines the reduction destination rank # this is only valid for stage 2 # dst_rank = None means using all-reduce # else using reduce diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md index 22d52fb3cd1a..978ac32fc78e 100644 --- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -141,16 +141,16 @@ for mn, module in model.named_modules(): if 'mlp.c_fc' in mn: if 'weight' in pn or 'bias' in pn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice # keep the shape of the output from c_fc param.compute_spec.set_output_replicate(False) elif 'mlp.c_proj' in mn: if 'weight' in pn: split_param_row_tp1d(param, pg) # row slice elif 'wte' in mn or 'wpe' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice elif 'c_attn' in mn or 'c_proj' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice ``` The modified model is illustrated below. 
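The tutorial above calls `split_param_col_tp1d` and `split_param_row_tp1d` without defining them. A plausible minimal implementation, assuming the `ShardSpec`/`ComputeSpec` tensor-parallel API used elsewhere in this changeset (see `test_vit.py` below), is:

```python
from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, ProcessGroup, ShardSpec


def split_param_single_dim_tp1d(dim: int, param, pg: ProcessGroup):
    # shard `param` along `dim` across the tensor-parallel ranks of `pg`;
    # callers typically wrap these calls in `with DistSpecManager.no_grad():`
    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
    param.set_process_group(pg)
    param.set_tensor_spec(*spec)


def split_param_row_tp1d(param, pg: ProcessGroup):
    split_param_single_dim_tp1d(0, param, pg)     # row slice: shard dim 0


def split_param_col_tp1d(param, pg: ProcessGroup):
    split_param_single_dim_tp1d(-1, param, pg)    # column slice: shard the last dim
```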
diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md index c4131e593437..b4e0d18a2647 100644 --- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -126,16 +126,16 @@ for mn, module in model.named_modules(): if 'mlp.c_fc' in mn: if 'weight' in pn or 'bias' in pn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice # keep the shape of the output from c_fc param.compute_spec.set_output_replicate(False) elif 'mlp.c_proj' in mn: if 'weight' in pn: split_param_row_tp1d(param, pg) # row slice elif 'wte' in mn or 'wpe' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice elif 'c_attn' in mn or 'c_proj' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice ``` 修改后的模型如下图所示。 diff --git a/examples/community/roberta/README.md b/examples/community/roberta/README.md index 8aefa327a4b4..000fce63f35f 100644 --- a/examples/community/roberta/README.md +++ b/examples/community/roberta/README.md @@ -44,7 +44,7 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr ## 3. Finetune -The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transfomers from Hugging Face to finetune downstream application. +The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from Hugging Face to finetune downstream application. ## Contributors The example is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution! diff --git a/examples/community/roberta/preprocessing/README.md b/examples/community/roberta/preprocessing/README.md index 17cc2f4dc22c..2ed747541280 100644 --- a/examples/community/roberta/preprocessing/README.md +++ b/examples/community/roberta/preprocessing/README.md @@ -25,10 +25,10 @@ Firstly, each file has multiple documents, and each document contains multiple s In this example, split 200G Corpus into 100 shard, and each shard is about 2G. The size of the shard is memory-dependent, taking into account the number of servers, the memory used by the tokenizer, and the memory used by the multi-process training to read the shard (n data parallel requires n\*shard_size memory). **To sum up, data preprocessing and model pretraining requires fighting with hardware, not just GPU.** ```python -python sentence_split.py --input_path /orginal_corpus --output_path /shard --shard 100 +python sentence_split.py --input_path /original_corpus --output_path /shard --shard 100 # This step takes a short time ``` -* `--input_path`: all original corpus, e.g., /orginal_corpus/0.json /orginal_corpus/1.json ... +* `--input_path`: all original corpus, e.g., /original_corpus/0.json /original_corpus/1.json ... * `--output_path`: all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ... 
* `--shard`: Number of shard, e.g., 10, 50, or 100
@@ -76,7 +76,7 @@ make
 * `--input_path`: location of all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--output_path`: location of all h5 with token_id, input_mask, segment_ids and masked_lm_positions, e.g., /h5/0.h5, /h5/1.h5 ...
-* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenzier.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
+* `--tokenizer_path`: tokenizer path containing the Hugging Face tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenizer.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
 * `--backend`: python or c++, **specifies c++ can obtain faster preprocess speed**
 * `--dupe_factor`: specifies how many times the preprocessor repeats to create the input from the same article/document
 * `--worker`: number of process
diff --git a/examples/community/roberta/pretraining/README.md b/examples/community/roberta/pretraining/README.md
index c248fc1f5708..8abe48aa6c0e 100644
--- a/examples/community/roberta/pretraining/README.md
+++ b/examples/community/roberta/pretraining/README.md
@@ -13,7 +13,7 @@ bash run_pretrain.sh
 * `--bert_config`: config.json which represent model
 * `--mlm`: model type of backbone, bert or deberta_v2
-2. if resume training from earylier checkpoint, run the script below.
+2. If resuming training from an earlier checkpoint, run the script below.
 ```shell
 bash run_pretrain_resume.sh
diff --git a/examples/community/roberta/pretraining/arguments.py b/examples/community/roberta/pretraining/arguments.py
index 40210c4b1be7..e0702ceb59b0 100644
--- a/examples/community/roberta/pretraining/arguments.py
+++ b/examples/community/roberta/pretraining/arguments.py
@@ -46,7 +46,7 @@ def parse_args():
 type=int,
 default=1,
 help="This param makes sure that a certain task is repeated for this time steps to \
- optimise on the back propogation speed with APEX's DistributedDataParallel")
+ optimize on the back propagation speed with APEX's DistributedDataParallel")
 parser.add_argument("--max_predictions_per_seq",
 "--max_pred",
 default=80,
@@ -73,12 +73,12 @@ def parse_args():
 help="location of saving checkpoint, which contains model and optimizer")
 parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
 parser.add_argument('--vscode_debug', action='store_true', help="use vscode to debug")
- parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoin")
+ parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoint")
 parser.add_argument(
 '--load_optimizer_lr',
 default='',
 type=str,
- help="location of checkpoint, which contains optimerzier, learning rate, epoch, shard and global_step")
+ help="location of checkpoint, which contains optimizer, learning rate, epoch, shard and global_step")
 parser.add_argument('--resume_train', action='store_true', help="whether resume training from a early checkpoint")
 parser.add_argument('--mlm', default='bert', type=str, help="model type, bert or deberta")
 parser.add_argument('--checkpoint_activations', action='store_true', help="whether to use gradient checkpointing")
diff --git a/examples/community/roberta/pretraining/model/bert.py b/examples/community/roberta/pretraining/model/bert.py
index a5da1bea6f65..abdf925d0540
100644 --- a/examples/community/roberta/pretraining/model/bert.py +++ b/examples/community/roberta/pretraining/model/bert.py @@ -327,7 +327,7 @@ def forward( attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key attention_scores = attention_scores / math.sqrt(self.attention_head_size) diff --git a/examples/community/roberta/pretraining/run_pretraining.py b/examples/community/roberta/pretraining/run_pretraining.py index 9a6ffc1c5661..a72bdf775644 100644 --- a/examples/community/roberta/pretraining/run_pretraining.py +++ b/examples/community/roberta/pretraining/run_pretraining.py @@ -78,7 +78,7 @@ def main(): default_pg=shard_pg): config, model, numel = get_model(args, logger) - # asign running configurations + # assign running configurations gemini_config = None if args.distplan.startswith("CAI_ZeRO"): optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True) diff --git a/examples/community/roberta/pretraining/utils/exp_util.py b/examples/community/roberta/pretraining/utils/exp_util.py index 0cdb56bad031..4a2c9d8a47ad 100644 --- a/examples/community/roberta/pretraining/utils/exp_util.py +++ b/examples/community/roberta/pretraining/utils/exp_util.py @@ -97,7 +97,7 @@ def throughput_calculator(numel, args, config, iteration_time, total_iterations, def synchronize(): if not torch.distributed.is_available(): return - if not torch.distributed.is_intialized(): + if not torch.distributed.is_initialized(): return world_size = torch.distributed.get_world_size() if world_size == 1: diff --git a/examples/community/roberta/pretraining/utils/global_vars.py b/examples/community/roberta/pretraining/utils/global_vars.py index 7b0c5a2be73d..9eef19e71614 100644 --- a/examples/community/roberta/pretraining/utils/global_vars.py +++ b/examples/community/roberta/pretraining/utils/global_vars.py @@ -110,7 +110,7 @@ def write(self, names, writer, iteration, normalizer=1.0, reset=False): """Write timers to a tensorboard writer""" # currently when using add_scalars, # torch.utils.add_scalars makes each timer its own run, which - # polutes the runs list, so we just add each as a scalar + # pollutes the runs list, so we just add each as a scalar assert normalizer > 0.0 for name in names: value = self.timers[name].elapsed(reset=reset) / normalizer diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md index 7c117d841e24..ba4c1a71034a 100644 --- a/examples/images/dreambooth/README.md +++ b/examples/images/dreambooth/README.md @@ -37,7 +37,7 @@ The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Mode ## Training -We provide the script `colossalai.sh` to run the training task with colossalai. Meanwhile, we also provided traditional training process of dreambooth, `dreambooth.sh`, for possible comparation. For instance, the script of training process for [stable-diffusion-v1-4] model can be modified into: +We provide the script `colossalai.sh` to run the training task with colossalai. 
Meanwhile, we also provide the traditional training process of dreambooth, `dreambooth.sh`, for comparison. For instance, the training script for the [stable-diffusion-v1-4] model can be modified into:

```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
@@ -92,6 +92,29 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
 --placement="cuda"
```
+## New API
+We have modified our previous implementation of Dreambooth with our new Booster API, which offers a more flexible, efficient, and user-friendly way to train your model. You can find the new API in `train_dreambooth_colossalai.py`.
+We also offer a shell script, `test_ci.sh`, that runs through all of the supported Booster plugins.
+For more information about the Booster API, you can refer to https://colossalai.org/docs/basics/booster_api/.
+
+## Performance
+
+| Strategy | #GPU | Batch Size | GPU RAM (GB) | Speedup |
+|:--------------:|:----:|:----------:|:------------:|:-------:|
+| Traditional | 1 | 16 | OOM | \ |
+| Traditional | 1 | 8 | 61.81 | 1 |
+| torch_ddp | 4 | 16 | OOM | \ |
+| torch_ddp | 4 | 8 | 41.97 | 0.97 |
+| gemini | 4 | 16 | 53.29 | \ |
+| gemini | 4 | 8 | 29.36 | 2.00 |
+| low_level_zero | 4 | 16 | 52.80 | \ |
+| low_level_zero | 4 | 8 | 28.87 | 2.02 |
+
+The evaluation was performed on 4 Nvidia A100 GPUs with 80GB memory each, with GPUs 0 & 1 and GPUs 2 & 3 connected via NVLink.
+We finetuned the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model at 512x512 resolution on the [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset and compared
+the memory cost and the throughput of the plugins.
+
+
 ## Inference
 Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. `--instance_prompt="a photo of sks dog" ` in the above example) in your prompt.
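The inference step mentioned above can look like the following minimal sketch, assuming a UNet checkpoint saved by the training script (`diffusion_pytorch_model.bin` plus the copied `config.json`); the paths and prompt are illustrative:

```python
import torch
from diffusers import StableDiffusionPipeline, UNet2DConditionModel

# load the finetuned UNet saved by the training run, then plug it into the base pipeline
unet = UNet2DConditionModel.from_pretrained("./weight_output", torch_dtype=torch.float16)
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", unet=unet, torch_dtype=torch.float16
).to("cuda")

# include the identifier used during finetuning in the prompt
image = pipe("a photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0]
image.save("dog-bucket.png")
```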
diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh index 227d8b8bdb04..db4562dbc921 100755 --- a/examples/images/dreambooth/colossalai.sh +++ b/examples/images/dreambooth/colossalai.sh @@ -1,22 +1,18 @@ -export MODEL_NAME= -export INSTANCE_DIR= -export CLASS_DIR="path-to-class-images" -export OUTPUT_DIR="path-to-save-model" - -HF_DATASETS_OFFLINE=1 -TRANSFORMERS_OFFLINE=1 +HF_DATASETS_OFFLINE=1 +TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 -torchrun --nproc_per_node 2 --master_port=25641 train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --instance_prompt="a photo of a dog" \ +torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ + --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ + --instance_data_dir="/data/dreambooth/Teyvat/data" \ + --output_dir="./weight_output" \ + --instance_prompt="a picture of a dog" \ --resolution=512 \ + --plugin="gemini" \ --train_batch_size=1 \ - --gradient_accumulation_steps=1 \ --learning_rate=5e-6 \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ --num_class_images=200 \ - --placement="cuda" \ + --test_run=True \ + --placement="auto" \ diff --git a/examples/images/dreambooth/dreambooth.sh b/examples/images/dreambooth/dreambooth.sh index e063bc8279c5..f6b8f5e1b87e 100644 --- a/examples/images/dreambooth/dreambooth.sh +++ b/examples/images/dreambooth/dreambooth.sh @@ -1,7 +1,7 @@ python train_dreambooth.py \ - --pretrained_model_name_or_path= ## Your Model Path \ - --instance_data_dir= ## Your Training Input Pics Path \ - --output_dir="path-to-save-model" \ + --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ + --instance_data_dir="/data/dreambooth/Teyvat/data" \ + --output_dir="./weight_output" \ --instance_prompt="a photo of a dog" \ --resolution=512 \ --train_batch_size=1 \ diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index e69de29bb2d1..21f45adae2a0 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -xe +pip install -r requirements.txt + +HF_DATASETS_OFFLINE=1 +TRANSFORMERS_OFFLINE=1 +DIFFUSERS_OFFLINE=1 + +# "torch_ddp" "torch_ddp_fp16" "low_level_zero" +for plugin in "gemini"; do + torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ + --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ + --instance_data_dir="/data/dreambooth/Teyvat/data" \ + --output_dir="./weight_output" \ + --instance_prompt="a picture of a dog" \ + --resolution=512 \ + --plugin=$plugin \ + --train_batch_size=1 \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --test_run=True \ + --num_class_images=200 \ + --placement="auto" # "cuda" +done diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py index d07febea0a84..888b28de8306 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -4,6 +4,7 @@ import os from pathlib import Path from typing import Optional +import shutil import torch import torch.nn.functional as F @@ -21,9 +22,12 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, 
get_dist_logger +from colossalai.nn.optimizer import HybridAdam from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext, GeminiAdamOptimizer +from colossalai.zero import ColoInitContext from colossalai.zero.gemini import get_static_torch_model +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin disable_existing_loggers() logger = get_dist_logger() @@ -58,6 +62,13 @@ def parse_args(input_args=None): required=True, help="Path to pretrained model or model identifier from huggingface.co/models.", ) + parser.add_argument( + "--externel_unet_path", + type=str, + default=None, + required=False, + help="Path to the externel unet model.", + ) parser.add_argument( "--revision", type=str, @@ -187,12 +198,19 @@ def parse_args(input_args=None): parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument("--test_run", default=False, help="Whether to use a smaller dataset for test run.") parser.add_argument( "--hub_model_id", type=str, default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") parser.add_argument( "--logging_dir", type=str, @@ -250,6 +268,7 @@ def __init__( class_prompt=None, size=512, center_crop=False, + test=False, ): self.size = size self.center_crop = center_crop @@ -260,6 +279,8 @@ def __init__( raise ValueError("Instance images root doesn't exists.") self.instance_images_path = list(Path(instance_data_root).iterdir()) + if test: + self.instance_images_path = self.instance_images_path[:10] self.num_instance_images = len(self.instance_images_path) self.instance_prompt = instance_prompt self._length = self.num_instance_images @@ -339,18 +360,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: return f"{organization}/{model_id}" -# Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP - - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=64) - return model - - def main(args): if args.seed is None: colossalai.launch_from_torch(config={}) @@ -392,7 +401,7 @@ def main(args): images = pipeline(example["prompt"]).images for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() + hash_image = hashlib.sha256(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) @@ -452,12 +461,18 @@ def main(args): revision=args.revision, ) - logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) - with ColoInitContext(device=get_current_device()): + + if args.externel_unet_path is None: + logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, - low_cpu_mem_usage=False) + 
subfolder="unet", + revision=args.revision, + low_cpu_mem_usage=False) + else: + logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0]) + unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path, + revision=args.revision, + low_cpu_mem_usage=False) vae.requires_grad_(False) text_encoder.requires_grad_(False) @@ -468,10 +483,22 @@ def main(args): if args.scale_lr: args.learning_rate = args.learning_rate * args.train_batch_size * world_size - unet = gemini_zero_dpp(unet, args.placement) + # Use Booster API to use Gemini/Zero with ColossalAI + + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy=args.placement, strict_ddp_mode=True, initial_scale=2 ** 5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) + + booster = Booster(plugin=plugin, **booster_kwargs) # config optimizer for colossalai zero - optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) + optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) # load noise_scheduler noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") @@ -486,6 +513,7 @@ def main(args): tokenizer=tokenizer, size=args.resolution, center_crop=args.center_crop, + test=args.test_run ) def collate_fn(examples): @@ -554,6 +582,8 @@ def collate_fn(examples): # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler) + # Train! 
total_batch_size = args.train_batch_size * world_size @@ -642,36 +672,24 @@ def collate_fn(examples): if global_step % args.save_steps == 0: torch.cuda.synchronize() - torch_unet = get_static_torch_model(unet) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) if local_rank == 0: - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=torch_unet, - revision=args.revision, - ) - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - pipeline.save_pretrained(save_path) + if not os.path.exists(os.path.join(save_path, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) if global_step >= args.max_train_steps: break - torch.cuda.synchronize() - unet = get_static_torch_model(unet) + booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin")) + logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}") if local_rank == 0: - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unet, - revision=args.revision, - ) - - pipeline.save_pretrained(args.output_dir) - logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0]) - + if not os.path.exists(os.path.join(args.output_dir, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py index 6715b473a567..dce65ff514b7 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py @@ -4,6 +4,7 @@ import os from pathlib import Path from typing import Optional +import shutil import torch import torch.nn.functional as F @@ -23,9 +24,12 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.nn.optimizer import HybridAdam from colossalai.utils import get_current_device from colossalai.zero import ColoInitContext, GeminiAdamOptimizer from colossalai.zero.gemini import get_static_torch_model +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin disable_existing_loggers() logger = get_dist_logger() @@ -60,6 +64,13 @@ def parse_args(input_args=None): required=True, help="Path to pretrained model or model identifier from huggingface.co/models.", ) + parser.add_argument( + "--externel_unet_path", + type=str, + default=None, + required=False, + help="Path to the externel unet model.", + ) parser.add_argument( "--revision", type=str, @@ -195,6 +206,12 @@ def parse_args(input_args=None): default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") parser.add_argument( "--logging_dir", type=str, @@ -341,18 +358,6 @@ def 
get_full_repo_name(model_id: str, organization: Optional[str] = None, token: return f"{organization}/{model_id}" -# Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP - - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=64) - return model - - def main(args): if args.seed is None: colossalai.launch_from_torch(config={}) @@ -394,7 +399,7 @@ def main(args): images = pipeline(example["prompt"]).images for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() + hash_image = hashlib.sha256(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) @@ -454,32 +459,42 @@ def main(args): revision=args.revision, ) - logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) - with ColoInitContext(device=get_current_device()): + + if args.externel_unet_path is None: + logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, - low_cpu_mem_usage=False) - unet.requires_grad_(False) - - # Set correct lora layers - lora_attn_procs = {} - for name in unet.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim) - - unet.set_attn_processor(lora_attn_procs) - lora_layers = AttnProcsLayers(unet.attn_processors) + subfolder="unet", + revision=args.revision, + low_cpu_mem_usage=False) + else: + logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0]) + unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path, + revision=args.revision, + low_cpu_mem_usage=False) + unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, + subfolder="unet", + revision=args.revision, + low_cpu_mem_usage=False) + unet.requires_grad_(False) + + # Set correct lora layers + lora_attn_procs = {} + for name in unet.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + + lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim) + + unet.set_attn_processor(lora_attn_procs) + lora_layers = AttnProcsLayers(unet.attn_processors) vae.requires_grad_(False) text_encoder.requires_grad_(False) @@ -490,10 +505,22 @@ def main(args): 
if args.scale_lr: args.learning_rate = args.learning_rate * args.train_batch_size * world_size - unet = gemini_zero_dpp(unet, args.placement) + # Use Booster API to use Gemini/Zero with ColossalAI + + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) + + booster = Booster(plugin=plugin, **booster_kwargs) # config optimizer for colossalai zero - optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) + optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) # load noise_scheduler noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") @@ -576,6 +603,8 @@ def collate_fn(examples): # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler) + # Train! total_batch_size = args.train_batch_size * world_size @@ -664,27 +693,24 @@ def collate_fn(examples): if global_step % args.save_steps == 0: torch.cuda.synchronize() - torch_unet = get_static_torch_model(unet) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) if local_rank == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - torch_unet = torch_unet.to(torch.float32) - torch_unet.save_attn_procs(save_path) + if not os.path.exists(os.path.join(save_path, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) if global_step >= args.max_train_steps: break - torch.cuda.synchronize() - torch_unet = get_static_torch_model(unet) + booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin")) + logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}") if local_rank == 0: - torch_unet = torch_unet.to(torch.float32) - torch_unet.save_attn_procs(save_path) - logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0]) - + if not os.path.exists(os.path.join(args.output_dir, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/images/vit/README.md b/examples/images/vit/README.md index 4423d85d19e0..7c4147b76457 100644 --- a/examples/images/vit/README.md +++ b/examples/images/vit/README.md @@ -1,61 +1,28 @@ -# Vision Transformer with ColoTensor +## Overview -# Overview +Vision Transformer is a class of Transformer model tailored for computer vision tasks. It was first proposed in paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) and achieved SOTA results on various tasks at that time. -In this example, we will run Vision Transformer with ColoTensor. 
+In our example, we are using pretrained weights of ViT loaded from HuggingFace.
+We adapt the ViT training code to ColossalAI by leveraging the [Booster API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin, LowLevelZeroPlugin, and GeminiPlugin.
-We use model **ViTForImageClassification** from Hugging Face [Link](https://huggingface.co/docs/transformers/model_doc/vit) for unit test.
-You can change world size or decide whether use DDP in our code.
+## Run Demo
-We use model **vision_transformer** from timm [Link](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) for training example.
-
-(2022/6/28) The default configuration now supports 2DP+2TP with gradient accumulation and checkpoint support. Zero is not supported at present.
-
-# Requirement
-
-Install colossalai version >= 0.1.11
-
-## Unit test
-To run unit test, you should install pytest, transformers with:
-```shell
-pip install pytest transformers
+By running the following script:
+```bash
+bash run_demo.sh
 ```
+You will finetune a [ViT-base](https://huggingface.co/google/vit-base-patch16-224) model on this [dataset](https://huggingface.co/datasets/beans), with more than 8000 images of bean leaves. This dataset is for an image classification task, with 3 labels: ['angular_leaf_spot', 'bean_rust', 'healthy'].
-## Training example
-To run training example with ViT-S, you should install **NVIDIA DALI** from [Link](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html) for dataloader support.
-You also need to install timm and titans for model/dataloader support with:
-```shell
-pip install timm titans
+The script can be modified if you want to try another set of hyperparameters or change to another ViT model of a different size.
 ```
-### Data preparation
-You can download the ImageNet dataset from the [ImageNet official website](https://www.image-net.org/download.php). You should get the raw images after downloading the dataset. As we use **NVIDIA DALI** to read data, we use the TFRecords dataset instead of raw Imagenet dataset. This offers better speedup to IO. If you don't have TFRecords dataset, follow [imagenet-tools](https://github.com/ver217/imagenet-tools) to build one.
+The demo code is adapted from this [blog](https://huggingface.co/blog/fine-tune-vit).
-Before you start training, you need to set the environment variable `DATA` so that the script knows where to fetch the data for DALI dataloader.
-```shell
-export DATA=/path/to/ILSVRC2012
-```
-# How to run
+## Run Benchmark
-## Unit test
-In your terminal
-```shell
-pytest test_vit.py
+You can benchmark the ViT model by running the following script:
+```bash
+bash run_benchmark.sh
 ```
-
-This will evaluate models with different **world_size** and **use_ddp**.
-
-## Training example
-Modify the settings in run.sh according to your environment.
-For example, if you set `--nproc_per_node=8` in `run.sh` and `TP_WORLD_SIZE=2` in your config file,
-data parallel size will be automatically calculated as 4.
-Thus, the parallel strategy is set to 4DP+2TP.
-
-Then in your terminal
-```shell
-sh run.sh
-```
-
-This will start ViT-S training with ImageNet.
+The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your own set of hyperparameters for testing.
\ No newline at end of file diff --git a/examples/images/vit/args.py b/examples/images/vit/args.py new file mode 100644 index 000000000000..e4a873a9eb52 --- /dev/null +++ b/examples/images/vit/args.py @@ -0,0 +1,124 @@ +from colossalai import get_default_parser + +def parse_demo_args(): + + parser = get_default_parser() + parser.add_argument( + "--model_name_or_path", + type=str, + default="google/vit-base-patch16-224", + help="Path to pretrained model or model identifier from huggingface.co/models." + ) + parser.add_argument( + "--output_path", + type=str, + default="./output_model.bin", + help="The path of your saved model after finetuning." + ) + parser.add_argument( + "--plugin", + type=str, + default="gemini", + help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'." + ) + parser.add_argument( + "--num_epoch", + type=int, + default=3, + help="Number of epochs." + ) + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="Batch size (per dp group) for the training dataloader." + ) + parser.add_argument( + "--learning_rate", + type=float, + default=3e-4, + help="Initial learning rate (after the potential warmup period) to use." + ) + parser.add_argument( + "--warmup_ratio", + type=float, + default=0.3, + help="Ratio of warmup steps against total training steps." + ) + parser.add_argument( + "--weight_decay", + type=float, + default=0.1, + help="Weight decay to use." + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="A seed for reproducible training." + ) + + args = parser.parse_args() + return args + +def parse_benchmark_args(): + + parser = get_default_parser() + + parser.add_argument( + "--model_name_or_path", + type=str, + default="google/vit-base-patch16-224", + help="Path to a pretrained model or model identifier from huggingface.co/models." + ) + parser.add_argument( + "--plugin", + type=str, + default="gemini", + help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'." + ) + parser.add_argument( + "--batch_size", + type=int, + default=8, + help="Batch size (per dp group) for the training dataloader." + ) + parser.add_argument( + "--num_labels", + type=int, + default=10, + help="Number of labels for classification." + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use." + ) + parser.add_argument( + "--weight_decay", + type=float, + default=0.0, + help="Weight decay to use." + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=20, + help="Total number of training steps to perform." + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="A seed for reproducible training." + ) + parser.add_argument( + "--mem_cap", + type=int, + default=0, + help="Limit on the usage of space for each GPU (in GB)." 
+ ) + args = parser.parse_args() + + return args \ No newline at end of file diff --git a/examples/images/vit/configs/vit_1d_tp2.py b/examples/images/vit/configs/vit_1d_tp2.py deleted file mode 100644 index fbf399f2e50d..000000000000 --- a/examples/images/vit/configs/vit_1d_tp2.py +++ /dev/null @@ -1,32 +0,0 @@ -from colossalai.amp import AMP_TYPE - -# hyperparameters -# BATCH_SIZE is as per GPU -# global batch size = BATCH_SIZE x data parallel size -BATCH_SIZE = 256 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -# model config -IMG_SIZE = 224 -PATCH_SIZE = 16 -HIDDEN_SIZE = 384 -DEPTH = 12 -NUM_HEADS = 6 -MLP_RATIO = 4 -NUM_CLASSES = 1000 -CHECKPOINT = False -SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token - -USE_DDP = True -TP_WORLD_SIZE = 2 -TP_TYPE = 'row' -parallel = dict(tensor=dict(mode="1d", size=TP_WORLD_SIZE),) - -fp16 = dict(mode=AMP_TYPE.NAIVE) -clip_grad_norm = 1.0 -gradient_accumulation = 8 - -LOG_PATH = "./log" diff --git a/examples/images/vit/configs/vit_1d_tp2_ci.py b/examples/images/vit/configs/vit_1d_tp2_ci.py deleted file mode 100644 index e491e4ada45e..000000000000 --- a/examples/images/vit/configs/vit_1d_tp2_ci.py +++ /dev/null @@ -1,32 +0,0 @@ -from colossalai.amp import AMP_TYPE - -# hyperparameters -# BATCH_SIZE is as per GPU -# global batch size = BATCH_SIZE x data parallel size -BATCH_SIZE = 8 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 -NUM_EPOCHS = 3 -WARMUP_EPOCHS = 1 - -# model config -IMG_SIZE = 224 -PATCH_SIZE = 16 -HIDDEN_SIZE = 32 -DEPTH = 2 -NUM_HEADS = 4 -MLP_RATIO = 4 -NUM_CLASSES = 10 -CHECKPOINT = False -SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token - -USE_DDP = True -TP_WORLD_SIZE = 2 -TP_TYPE = 'row' -parallel = dict(tensor=dict(mode="1d", size=TP_WORLD_SIZE),) - -fp16 = dict(mode=AMP_TYPE.NAIVE) -clip_grad_norm = 1.0 -gradient_accumulation = 2 - -LOG_PATH = "./log_ci" diff --git a/examples/images/vit/data.py b/examples/images/vit/data.py new file mode 100644 index 000000000000..00fde707b173 --- /dev/null +++ b/examples/images/vit/data.py @@ -0,0 +1,32 @@ +import torch +from torch.utils.data import Dataset +from datasets import load_dataset + +class BeansDataset(Dataset): + + def __init__(self, image_processor, split='train'): + + super().__init__() + self.image_processor = image_processor + self.ds = load_dataset('beans')[split] + self.label_names = self.ds.features['labels'].names + self.num_labels = len(self.label_names) + self.inputs = [] + for example in self.ds: + self.inputs.append(self.process_example(example)) + + def __len__(self): + return len(self.inputs) + + def __getitem__(self, idx): + return self.inputs[idx] + + def process_example(self, example): + input = self.image_processor(example['image'], return_tensors='pt') + input['labels'] = example['labels'] + return input + + +def beans_collator(batch): + return {'pixel_values': torch.cat([data['pixel_values'] for data in batch], dim=0), + 'labels': torch.tensor([data['labels'] for data in batch], dtype=torch.int64)} diff --git a/examples/images/vit/requirements.txt b/examples/images/vit/requirements.txt index 1f69794ebe70..edad87ca380f 100644 --- a/examples/images/vit/requirements.txt +++ b/examples/images/vit/requirements.txt @@ -1,8 +1,6 @@ colossalai >= 0.1.12 torch >= 1.8.1 numpy>=1.24.1 -timm>=0.6.12 -titans>=0.0.7 tqdm>=4.61.2 -transformers>=4.25.1 -nvidia-dali-cuda110>=1.8.0 --extra-index-url https://developer.download.nvidia.com/compute/redist +transformers>=4.20.0 +datasets \ No newline at end of file 
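To make the moving parts of the new ViT example concrete, here is a condensed, illustrative sketch of how `data.py` and a Booster plugin fit together (the real logic lives in `vit_train_demo.py`/`vit_benchmark.py`; the processor class and hyperparameters here are assumptions):

```python
import colossalai
import torch
from torch.utils.data import DataLoader
from transformers import ViTForImageClassification, ViTImageProcessor

from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

from data import BeansDataset, beans_collator

colossalai.launch_from_torch(config={})

# build the beans dataset and a matching classification head
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
train_set = BeansDataset(image_processor, split="train")
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=train_set.num_labels,
    ignore_mismatched_sizes=True,
)
optimizer = HybridAdam(model.parameters(), lr=3e-4)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=beans_collator)

# pick a training strategy via a plugin, then let the Booster wrap everything
booster = Booster(plugin=GeminiPlugin())
model, optimizer, _, train_loader, _ = booster.boost(model, optimizer, dataloader=train_loader)

model.train()
for batch in train_loader:
    batch = {k: v.cuda() for k, v in batch.items()}
    loss = model(**batch).loss          # HF models return a loss when labels are given
    booster.backward(loss, optimizer)   # plugin-aware backward (e.g. for Gemini/ZeRO)
    optimizer.step()
    optimizer.zero_grad()
```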
diff --git a/examples/images/vit/run.sh b/examples/images/vit/run.sh deleted file mode 100644 index 84fe58f11a6a..000000000000 --- a/examples/images/vit/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -export DATA=/data/scratch/imagenet/tf_records -export OMP_NUM_THREADS=4 - -# resume -# CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \ -# --nproc_per_node 4 train.py \ -# --config configs/vit_1d_tp2.py \ -# --resume_from checkpoint/epoch_10 \ -# --master_port 29598 | tee ./out 2>&1 - -# train -CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \ ---nproc_per_node 4 train.py \ ---config configs/vit_1d_tp2.py \ ---master_port 29598 | tee ./out 2>&1 diff --git a/examples/images/vit/run_benchmark.sh b/examples/images/vit/run_benchmark.sh new file mode 100644 index 000000000000..2487bf81ee2b --- /dev/null +++ b/examples/images/vit/run_benchmark.sh @@ -0,0 +1,27 @@ +set -xe +pip install -r requirements.txt + +export BS=8 +export MEMCAP=0 +export GPUNUM=1 + +for BS in 8 32 128 +do +for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" +do +for GPUNUM in 1 4 +do + +MODEL_PATH="google/vit-base-patch16-224" +torchrun \ + --standalone \ + --nproc_per_node ${GPUNUM} \ + vit_benchmark.py \ + --model_name_or_path ${MODEL_PATH} \ + --mem_cap ${MEMCAP} \ + --plugin ${PLUGIN} \ + --batch_size ${BS} + +done +done +done diff --git a/examples/images/vit/run_demo.sh b/examples/images/vit/run_demo.sh new file mode 100644 index 000000000000..2d140dd6e423 --- /dev/null +++ b/examples/images/vit/run_demo.sh @@ -0,0 +1,44 @@ +set -xe +pip install -r requirements.txt + +# model name or path +MODEL="google/vit-base-patch16-224" + +# path for saving model +OUTPUT_PATH="./output_model.bin" + +# plugin(training strategy) +# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini" +PLUGIN="gemini" + +# number of gpus to use +GPUNUM=4 + +# batch size per gpu +BS=16 + +# learning rate +LR="2e-4" + +# number of epoch +EPOCH=3 + +# weight decay +WEIGHT_DECAY=0.05 + +# ratio of warmup steps +WARMUP_RATIO=0.3 + +# run the script for demo +torchrun \ + --standalone \ + --nproc_per_node ${GPUNUM} \ + vit_train_demo.py \ + --model_name_or_path ${MODEL} \ + --output_path ${OUTPUT_PATH} \ + --plugin ${PLUGIN} \ + --batch_size ${BS} \ + --num_epoch ${EPOCH} \ + --learning_rate ${LR} \ + --weight_decay ${WEIGHT_DECAY} \ + --warmup_ratio ${WARMUP_RATIO} diff --git a/examples/images/vit/test_ci.sh b/examples/images/vit/test_ci.sh index 41d25ee23521..8606015c0397 100644 --- a/examples/images/vit/test_ci.sh +++ b/examples/images/vit/test_ci.sh @@ -1,9 +1,19 @@ -export OMP_NUM_THREADS=4 - +set -xe pip install -r requirements.txt -# train -colossalai run \ ---nproc_per_node 4 train.py \ ---config configs/vit_1d_tp2_ci.py \ ---dummy_data +BS=8 +for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" +do +for GPUNUM in 1 4 +do + +torchrun \ + --standalone \ + --nproc_per_node ${GPUNUM} \ + vit_benchmark.py \ + --model_name_or_path "google/vit-base-patch16-224" \ + --plugin ${PLUGIN} \ + --batch_size ${BS} + +done +done diff --git a/examples/images/vit/test_vit.py b/examples/images/vit/test_vit.py deleted file mode 100644 index c0ae35bca871..000000000000 --- a/examples/images/vit/test_vit.py +++ /dev/null @@ -1,160 +0,0 @@ -import os -import random - -import numpy as np -import pytest -import torch -from torch.nn.parallel import DistributedDataParallel as DDP -from vit import get_training_components - -import colossalai -from colossalai.context import ParallelMode -from colossalai.context.parallel_mode import 
ParallelMode -from colossalai.core import global_context as gpc -from colossalai.nn.parallel.data_parallel import ColoDDP -from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, ProcessGroup, ShardSpec -from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.utils.cuda import get_current_device -from colossalai.zero import ColoInitContext - - -def set_seed(seed): - random.seed(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.backends.cudnn.deterministic = True - - -def tensor_equal(A, B): - return torch.allclose(A, B, rtol=1e-3, atol=1e-1) - - -def tensor_shard_equal(tensor: torch.Tensor, shard: torch.Tensor): - assert tensor.ndim == shard.ndim - if tensor.shape == shard.shape: - return tensor_equal(tensor, shard) - else: - dims_not_eq = torch.nonzero(torch.tensor(tensor.shape) != torch.tensor(shard.shape)) - if dims_not_eq.numel() == 1: - # 1D shard - dim = dims_not_eq.item() - world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D) - rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D) - return tensor_equal(tensor.chunk(world_size, dim)[rank], shard) - else: - raise - - -# Only for all Linear, it's 1d_row split because Linear will be transposed when calculating. -# But for other layers, it's 1d_col split. -# Layernorm is not supported for now. -# patch_embeddings.projection has nn.Conv2d -# https://github.com/huggingface/transformers/blob/dcb08b99f44919425f8ba9be9ddcc041af8ec25e/src/transformers/models/vit/modeling_vit.py#L182 -def init_1d_row_for_linear_weight_spec(model, world_size: int): - pg = ProcessGroup(tp_degree=world_size) - spec = (ShardSpec([-1], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) - with DistSpecManager.no_grad(): - for n, p in model.named_parameters(): - if 'weight' in n and 'layernorm' not in n and 'embeddings.patch_embeddings.projection.weight' not in n: - p.set_process_group(pg) - p.set_tensor_spec(*spec) - - -# Similarly, it's col split for Linear but row split for others. 
-def init_1d_col_for_linear_weight_bias_spec(model, world_size: int): - pg = ProcessGroup(tp_degree=world_size) - spec = (ShardSpec([0], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) - with DistSpecManager.no_grad(): - for n, p in model.named_parameters(): - if ('weight' in n - or 'bias' in n) and 'layernorm' not in n and 'embeddings.patch_embeddings.projection' not in n: - p.set_process_group(pg) - p.set_tensor_spec(*spec) - - -def check_param_equal(model, torch_model): - for p, torch_p in zip(model.parameters(), torch_model.parameters()): - assert tensor_shard_equal(torch_p, p) - - -def check_grad_equal(model, torch_model): - for p, torch_p in zip(model.parameters(), torch_model.parameters()): - if (torch_p.grad.shape == p.grad.shape): - assert torch.allclose(torch_p.grad, p.grad, rtol=1e-3, atol=2.0) == True - else: - dims_not_eq = torch.nonzero(torch.tensor(torch_p.grad.shape) != torch.tensor(p.grad.shape)) - dim = dims_not_eq.item() - world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D) - rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D) - assert torch.allclose(torch_p.grad.chunk(world_size, dim)[rank], p.grad, rtol=1e-3, atol=2.0) == True - - -def run_vit(init_spec_func, use_ddp): - model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_training_components() - with ColoInitContext(device=get_current_device()): - model = model_builder() - model = model.cuda() - torch_model = model_builder().cuda() - if use_ddp: - model = ColoDDP(model) - torch_model = DDP(torch_model, - device_ids=[gpc.get_global_rank()], - process_group=gpc.get_group(ParallelMode.DATA)) - for torch_p, p in zip(torch_model.parameters(), model.parameters()): - torch_p.data.copy_(p) - - world_size = torch.distributed.get_world_size() - init_spec_func(model, world_size) - - check_param_equal(model, torch_model) - model.train() - torch_model.train() - set_seed(gpc.get_local_rank(ParallelMode.DATA)) - - optimizer = optimizer_class(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0) - torch_optimizer = optimizer_class(torch_model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0) - - for i, image_dict in enumerate(train_dataloader): - if use_ddp: - model.zero_grad() - else: - optimizer.zero_grad() - logits = model(image_dict['pixel_values']) - torch_logits = torch_model(image_dict['pixel_values']) - assert tensor_equal(torch_logits.logits, logits.logits) - loss = criterion(logits.logits, image_dict['label']) - torch_loss = criterion(torch_logits.logits, image_dict['label']) - if use_ddp: - model.backward(loss) - else: - loss.backward() - torch_loss.backward() - check_grad_equal(model, torch_model) - optimizer.step() - torch_optimizer.step() - check_param_equal(model, torch_model) - break - - -def run_dist(rank, world_size, port, use_ddp): - if use_ddp and world_size == 1: - return - tp_world_size = world_size // 2 if use_ddp else world_size - config = dict(parallel=dict(tensor=dict(mode="1d", size=tp_world_size),)) - colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') - run_vit(init_1d_row_for_linear_weight_spec, use_ddp) - run_vit(init_1d_col_for_linear_weight_bias_spec, use_ddp) - - -@pytest.mark.dist -@pytest.mark.parametrize('world_size', [1, 4]) -@pytest.mark.parametrize('use_ddp', [False, True]) -@rerun_if_address_is_in_use() -def test_vit(world_size, use_ddp): - spawn(run_dist, world_size, use_ddp=use_ddp) - - -if __name__ == '__main__': - test_vit(1, False) diff --git 
a/examples/images/vit/train.py b/examples/images/vit/train.py deleted file mode 100644 index b42cf2bedc6b..000000000000 --- a/examples/images/vit/train.py +++ /dev/null @@ -1,174 +0,0 @@ -import os - -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -from timm.models.vision_transformer import _create_vision_transformer -from titans.dataloader.imagenet import build_dali_imagenet -from tqdm import tqdm -from vit import DummyDataLoader - -import colossalai -from colossalai.core import global_context as gpc -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn import CrossEntropyLoss -from colossalai.nn._ops import * -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.nn.optimizer import HybridAdam -from colossalai.nn.parallel.data_parallel import ColoDDP -from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, ProcessGroup, ShardSpec -from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext - - -def init_1d_row_for_linear_weight_spec(model, world_size: int): - pg = ProcessGroup(tp_degree=world_size) - spec = (ShardSpec([-1], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) - with DistSpecManager.no_grad(): - for n, p in model.named_parameters(): - if 'weight' in n and 'norm' not in n and 'patch_embed.proj.weight' not in n: - p.set_process_group(pg) - p.set_tensor_spec(*spec) - - -# Similarly, it's col split for Linear but row split for others. -def init_1d_col_for_linear_weight_bias_spec(model, world_size: int): - pg = ProcessGroup(tp_degree=world_size) - spec = (ShardSpec([0], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) - with DistSpecManager.no_grad(): - for n, p in model.named_parameters(): - if ('weight' in n or 'bias' in n) and 'norm' not in n and ('patch_embed.proj.weight' not in n - and 'patch_embed.proj.bias' not in n): - p.set_process_group(pg) - p.set_tensor_spec(*spec) - - -def init_spec_func(model, tp_type): - world_size = torch.distributed.get_world_size() - if tp_type == 'row': - init_1d_row_for_linear_weight_spec(model, world_size) - elif tp_type == 'col': - init_1d_col_for_linear_weight_bias_spec(model, world_size) - else: - raise NotImplemented - - -def train_imagenet(): - - parser = colossalai.get_default_parser() - parser.add_argument('--resume_from', default=False, action='store_true') - parser.add_argument('--dummy_data', default=False, action='store_true') - - args = parser.parse_args() - colossalai.launch_from_torch(config=args.config) - use_ddp = gpc.config.USE_DDP - - disable_existing_loggers() - - logger = get_dist_logger() - if hasattr(gpc.config, 'LOG_PATH'): - if gpc.get_global_rank() == 0: - log_path = gpc.config.LOG_PATH - if not os.path.exists(log_path): - os.mkdir(log_path) - logger.log_to_file(log_path) - - logger.info('Build data loader', ranks=[0]) - if not args.dummy_data: - root = os.environ['DATA'] - train_dataloader, test_dataloader = build_dali_imagenet(root, - train_batch_size=gpc.config.BATCH_SIZE, - test_batch_size=gpc.config.BATCH_SIZE) - else: - train_dataloader = DummyDataLoader(length=10, - batch_size=gpc.config.BATCH_SIZE, - category=gpc.config.NUM_CLASSES, - image_size=gpc.config.IMG_SIZE, - return_dict=False) - test_dataloader = DummyDataLoader(length=5, - batch_size=gpc.config.BATCH_SIZE, - category=gpc.config.NUM_CLASSES, - image_size=gpc.config.IMG_SIZE, - return_dict=False) - - logger.info('Build model', ranks=[0]) - - model_kwargs = 
dict(img_size=gpc.config.IMG_SIZE, - patch_size=gpc.config.PATCH_SIZE, - embed_dim=gpc.config.HIDDEN_SIZE, - depth=gpc.config.DEPTH, - num_heads=gpc.config.NUM_HEADS, - mlp_ratio=gpc.config.MLP_RATIO, - num_classes=gpc.config.NUM_CLASSES, - drop_rate=0.1, - attn_drop_rate=0.1, - weight_init='jax') - - with ColoInitContext(device=get_current_device()): - model = _create_vision_transformer('vit_small_patch16_224', pretrained=False, **model_kwargs) - init_spec_func(model, gpc.config.TP_TYPE) - - world_size = torch.distributed.get_world_size() - model = ColoDDP(module=model, process_group=ProcessGroup(tp_degree=world_size)) - logger.info('Build criterion, optimizer, lr_scheduler', ranks=[0]) - optimizer = HybridAdam(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY) - - criterion = CrossEntropyLoss() - lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, - total_steps=gpc.config.NUM_EPOCHS, - warmup_steps=gpc.config.WARMUP_EPOCHS) - - start_epoch = 0 - if args.resume_from: - load_model = torch.load(args.resume_from + '_model.pth') - start_epoch = load_model['epoch'] - model.load_state_dict(load_model['model']) - load_optim = torch.load(args.resume_from + '_optim_rank_{}.pth'.format(dist.get_rank())) - optimizer.load_state_dict(load_optim['optim']) - - for epoch in range(start_epoch, gpc.config.NUM_EPOCHS): - model.train() - for index, (x, y) in tqdm(enumerate(train_dataloader), total=len(train_dataloader), leave=False): - x, y = x.cuda(), y.cuda() - output = model(x) - loss = criterion(output, y) - loss = loss / gpc.config.gradient_accumulation - if use_ddp: - model.backward(loss) - else: - loss.backward() - if (index + 1) % gpc.config.gradient_accumulation == 0: - optimizer.step() - if use_ddp: - model.zero_grad() - else: - optimizer.zero_grad() - - logger.info( - f"Finish Train Epoch [{epoch+1}/{gpc.config.NUM_EPOCHS}] loss: {loss.item():.3f} lr: {optimizer.state_dict()['param_groups'][0]['lr']}", - ranks=[0]) - - model.eval() - test_loss = 0 - correct = 0 - test_sum = 0 - with torch.no_grad(): - for index, (x, y) in tqdm(enumerate(test_dataloader), total=len(test_dataloader), leave=False): - x, y = x.cuda(), y.cuda() - output = model(x) - test_loss += F.cross_entropy(output, y, reduction='sum').item() - pred = output.argmax(dim=1, keepdim=True) - correct += pred.eq(y.view_as(pred)).sum().item() - test_sum += y.size(0) - - test_loss /= test_sum - logger.info( - f"Finish Test Epoch [{epoch+1}/{gpc.config.NUM_EPOCHS}] loss: {test_loss:.3f} Accuracy: [{correct}/{test_sum}]({correct/test_sum:.3f})", - ranks=[0]) - - lr_scheduler.step() - - -if __name__ == '__main__': - train_imagenet() diff --git a/examples/images/vit/vit.py b/examples/images/vit/vit.py deleted file mode 100644 index f22e8ea90cec..000000000000 --- a/examples/images/vit/vit.py +++ /dev/null @@ -1,95 +0,0 @@ -from abc import ABC, abstractmethod - -import torch -import torch.nn as nn -from transformers import ViTConfig, ViTForImageClassification - -from colossalai.utils.cuda import get_current_device - - -class DummyDataGenerator(ABC): - - def __init__(self, length=10): - self.length = length - - @abstractmethod - def generate(self): - pass - - def __iter__(self): - self.step = 0 - return self - - def __next__(self): - if self.step < self.length: - self.step += 1 - return self.generate() - else: - raise StopIteration - - def __len__(self): - return self.length - - -class DummyDataLoader(DummyDataGenerator): - - def __init__(self, length=10, batch_size=4, channel=3, category=8, 
image_size=224, return_dict=True): - super().__init__(length) - self.batch_size = batch_size - self.channel = channel - self.category = category - self.image_size = image_size - self.return_dict = return_dict - - def generate(self): - image_dict = {} - image_dict['pixel_values'] = torch.rand( - self.batch_size, self.channel, self.image_size, self.image_size, device=get_current_device()) * 2 - 1 - image_dict['label'] = torch.randint(self.category, (self.batch_size,), - dtype=torch.int64, - device=get_current_device()) - if not self.return_dict: - return image_dict['pixel_values'], image_dict['label'] - return image_dict - - -class ViTCVModel(nn.Module): - - def __init__(self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - image_size=224, - patch_size=16, - num_channels=3, - num_labels=8, - checkpoint=False): - super().__init__() - self.checkpoint = checkpoint - self.model = ViTForImageClassification( - ViTConfig(hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - image_size=image_size, - patch_size=patch_size, - num_channels=num_channels, - num_labels=num_labels)) - if checkpoint: - self.model.gradient_checkpointing_enable() - - def forward(self, pixel_values): - return self.model(pixel_values=pixel_values) - - -def vit_base_s(checkpoint=True): - return ViTCVModel(checkpoint=checkpoint) - - -def vit_base_micro(checkpoint=True): - return ViTCVModel(hidden_size=32, num_hidden_layers=2, num_attention_heads=4, checkpoint=checkpoint) - - -def get_training_components(): - trainloader = DummyDataLoader() - testloader = DummyDataLoader() - return vit_base_micro, trainloader, testloader, torch.optim.Adam, torch.nn.functional.cross_entropy diff --git a/examples/images/vit/vit_benchmark.py b/examples/images/vit/vit_benchmark.py new file mode 100644 index 000000000000..11d480bba65f --- /dev/null +++ b/examples/images/vit/vit_benchmark.py @@ -0,0 +1,129 @@ +import time + +import torch +import transformers +from transformers import ViTConfig, ViTForImageClassification +import tqdm + +import colossalai +from colossalai.nn.optimizer import HybridAdam +from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.utils import get_current_device +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator + +from args import parse_benchmark_args + +def format_num(num: int, bytes=False): + """Scale bytes to its proper format, e.g. 
1253656 => '1.20MB'""" + factor = 1024 if bytes else 1000 + suffix = "B" if bytes else "" + for unit in ["", " K", " M", " G", " T", " P"]: + if num < factor: + return f"{num:.2f}{unit}{suffix}" + num /= factor + + +def get_data(batch_size, num_labels, num_channels=3, height=224, width=224): + pixel_values = torch.randn(batch_size, num_channels, height, width, device=torch.cuda.current_device(), dtype=torch.float) + labels = torch.randint(0, num_labels, (batch_size, ), device=torch.cuda.current_device(), dtype=torch.int64) + return pixel_values, labels + + +def colo_memory_cap(size_in_GB): + from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device + cuda_capacity = colo_device_memory_capacity(get_current_device()) + if size_in_GB * (1024**3) < cuda_capacity: + colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity) + print(f"Limiting GPU memory usage to {size_in_GB} GB") + + +def main(): + + args = parse_benchmark_args() + + # Launch ColossalAI + colossalai.launch_from_torch(config={}, seed=args.seed) + coordinator = DistCoordinator() + world_size = coordinator.world_size + + # Manage loggers + disable_existing_loggers() + logger = get_dist_logger() + if coordinator.is_master(): + transformers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + + # Whether to set a limit on memory capacity + if args.mem_cap > 0: + colo_memory_cap(args.mem_cap) + + # Build ViT model + config = ViTConfig.from_pretrained(args.model_name_or_path) + model = ViTForImageClassification(config) + logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0]) + + # Enable gradient checkpointing + model.gradient_checkpointing_enable() + + # Set plugin + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(device=get_current_device(), + placement_policy='cpu', + pin_memory=True, + strict_ddp_mode=True, + initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + logger.info(f"Set plugin as {args.plugin}", ranks=[0]) + + # Set optimizer + optimizer = HybridAdam(model.parameters(), lr=(args.learning_rate * world_size)) + + # Set booster + booster = Booster(plugin=plugin, **booster_kwargs) + model, optimizer, _, _, _ = booster.boost(model, optimizer) + + + # Start training.
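+    # The loop below benchmarks on purely synthetic batches: throughput is reported as +    # (world_size * max_train_steps * batch_size) / elapsed seconds, and peak memory is +    # read from torch.cuda.max_memory_allocated on the current device.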
+ logger.info(f"Start testing", ranks=[0]) + progress_bar = tqdm.tqdm(total=args.max_train_steps, desc="Training Step", disable=not coordinator.is_master()) + + torch.cuda.synchronize() + model.train() + start_time = time.time() + + for _ in range(args.max_train_steps): + + pixel_values, labels = get_data(args.batch_size, args.num_labels, 3, 224, 224) + optimizer.zero_grad() + outputs = model(pixel_values=pixel_values, labels=labels) + loss = outputs['loss'] + booster.backward(loss, optimizer) + optimizer.step() + + torch.cuda.synchronize() + progress_bar.update(1) + + # Compute Statistics + end_time = time.time() + throughput = "{:.4f}".format((world_size * args.max_train_steps * args.batch_size) / (end_time - start_time)) + max_mem = format_num(torch.cuda.max_memory_allocated(device=torch.cuda.current_device()), bytes=True) + + logger.info(f"Testing finished, " + f"batch size per gpu: {args.batch_size}, " + f"plugin: {args.plugin}, " + f"throughput: {throughput}, " + f"maximum memory usage per gpu: {max_mem}.", + ranks=[0]) + + +if __name__ == "__main__": + main() diff --git a/examples/images/vit/vit_train_demo.py b/examples/images/vit/vit_train_demo.py new file mode 100644 index 000000000000..3a739f10b5d0 --- /dev/null +++ b/examples/images/vit/vit_train_demo.py @@ -0,0 +1,177 @@ +import torch +import torch.distributed as dist +import transformers +from transformers import ViTConfig, ViTForImageClassification, ViTImageProcessor +from tqdm import tqdm + +import colossalai +from colossalai.nn.optimizer import HybridAdam +from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR +from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.utils import get_current_device +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator + +from args import parse_demo_args +from data import BeansDataset, beans_collator + + +def move_to_cuda(batch, device): + return {k: v.to(device) for k, v in batch.items()} + + +def train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator): + + torch.cuda.synchronize() + model.train() + + with tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()) as pbar: + + for batch in pbar: + + # Foward + optimizer.zero_grad() + batch = move_to_cuda(batch, torch.cuda.current_device()) + outputs = model(**batch) + loss = outputs['loss'] + + # Backward + booster.backward(loss, optimizer) + optimizer.step() + lr_scheduler.step() + + # Print batch loss + pbar.set_postfix({'loss': loss.item()}) + + +@torch.no_grad() +def evaluate_model(epoch, model, eval_dataloader, num_labels, coordinator): + + model.eval() + accum_loss = torch.zeros(1, device=get_current_device()) + total_num = torch.zeros(1, device=get_current_device()) + accum_correct = torch.zeros(1, device=get_current_device()) + + for batch in eval_dataloader: + batch = move_to_cuda(batch, torch.cuda.current_device()) + outputs = model(**batch) + val_loss, logits = outputs[:2] + accum_loss += (val_loss / len(eval_dataloader)) + if num_labels > 1: + preds = torch.argmax(logits, dim=1) + elif num_labels == 1: + preds = logits.squeeze() + + labels = batch["labels"] + total_num += batch["labels"].shape[0] + accum_correct += (torch.sum(preds == labels)) + + dist.all_reduce(accum_loss) + dist.all_reduce(total_num) + dist.all_reduce(accum_correct) + avg_loss = "{:.4f}".format(accum_loss.item()) + accuracy = 
"{:.4f}".format(accum_correct.item() / total_num.item()) + if coordinator.is_master(): + print(f"Evaluation result for epoch {epoch + 1}: \ + average_loss={avg_loss}, \ + accuracy={accuracy}.") + + + + +def main(): + + args = parse_demo_args() + + # Launch ColossalAI + colossalai.launch_from_torch(config={}, seed=args.seed) + coordinator = DistCoordinator() + world_size = coordinator.world_size + + # Manage loggers + disable_existing_loggers() + logger = get_dist_logger() + if coordinator.is_master(): + transformers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + + # Prepare Dataset + image_processor = ViTImageProcessor.from_pretrained(args.model_name_or_path) + train_dataset = BeansDataset(image_processor, split='train') + eval_dataset = BeansDataset(image_processor, split='validation') + + + # Load pretrained ViT model + config = ViTConfig.from_pretrained(args.model_name_or_path) + config.num_labels = train_dataset.num_labels + config.id2label = {str(i): c for i, c in enumerate(train_dataset.label_names)} + config.label2id = {c: str(i) for i, c in enumerate(train_dataset.label_names)} + model = ViTForImageClassification.from_pretrained(args.model_name_or_path, + config=config, + ignore_mismatched_sizes=True) + logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0]) + + # Enable gradient checkpointing + model.gradient_checkpointing_enable() + + # Set plugin + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(device=get_current_device(), + placement_policy='cpu', + pin_memory=True, + strict_ddp_mode=True, + initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + logger.info(f"Set plugin as {args.plugin}", ranks=[0]) + + # Prepare dataloader + train_dataloader = plugin.prepare_dataloader(train_dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + collate_fn=beans_collator) + eval_dataloader = plugin.prepare_dataloader(eval_dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + collate_fn=beans_collator) + + # Set optimizer + optimizer = HybridAdam(model.parameters(), lr=(args.learning_rate * world_size), weight_decay=args.weight_decay) + + # Set lr scheduler + total_steps = len(train_dataloader) * args.num_epoch + num_warmup_steps = int(args.warmup_ratio * total_steps) + lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, + total_steps=(len(train_dataloader) * args.num_epoch), + warmup_steps=num_warmup_steps) + + # Set booster + booster = Booster(plugin=plugin, **booster_kwargs) + model, optimizer, _, train_dataloader, lr_scheduler = booster.boost(model=model, + optimizer=optimizer, + dataloader=train_dataloader, + lr_scheduler=lr_scheduler) + + # Finetuning + logger.info(f"Start finetuning", ranks=[0]) + for epoch in range(args.num_epoch): + train_epoch(epoch, model, optimizer, lr_scheduler, train_dataloader, booster, coordinator) + evaluate_model(epoch, model, eval_dataloader, eval_dataset.num_labels, coordinator) + logger.info(f"Finish finetuning", ranks=[0]) + + # Save the finetuned model + booster.save_model(model, args.output_path) + logger.info(f"Saving model checkpoint to {args.output_path}", ranks=[0]) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/language/bert/README.md 
b/examples/language/bert/README.md index c845a5c50387..81c3f03fffca 100644 --- a/examples/language/bert/README.md +++ b/examples/language/bert/README.md @@ -1,6 +1,6 @@ ## Overview -This directory includes two parts: Using the Booster API fintune Huggingface Bert and AlBert models and benchmarking Bert and AlBert models with different Booster Plugin. +This directory includes two parts: Using the Booster API finetune Huggingface Bert and AlBert models and benchmarking Bert and AlBert models with different Booster Plugin. ## Finetune ``` diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index 92751c7e2f47..4b78624f0110 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -162,7 +162,7 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): # shard it w.r.t tp pattern if 'mlp.c_fc' in mn: if 'weight' in pn or 'bias' in pn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice # keep the shape of the output from c_fc param.compute_spec.set_output_replicate(False) else: @@ -173,9 +173,9 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): else: param.set_dist_spec(ReplicaSpec()) elif 'wte' in mn or 'wpe' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice elif 'c_attn' in mn or 'c_proj' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice else: param.set_dist_spec(ReplicaSpec()) param.visited = True @@ -237,7 +237,7 @@ def main(): if args.tp_degree > 1: tensor_parallelize(model, tp_pg) - # asign running configurations + # assign running configurations if args.distplan == "CAI_ZeRO1": zero_stage = 1 elif args.distplan == "CAI_ZeRO2": diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py index 6369b9f8c5a1..d825ae92a285 100644 --- a/examples/language/gpt/titans/model/embed.py +++ b/examples/language/gpt/titans/model/embed.py @@ -305,7 +305,7 @@ def forward(ctx, vocab_parallel_logits, target): @staticmethod def backward(ctx, grad_output): - # Retreive tensors from the forward path. + # Retrieve tensors from the forward path. softmax, target_mask, masked_target_1d = ctx.saved_tensors # All the inputs have softmax as their gradient. diff --git a/examples/language/opt/README.md b/examples/language/opt/README.md index c2fd254571c7..37e1ff4d9008 100644 --- a/examples/language/opt/README.md +++ b/examples/language/opt/README.md @@ -19,15 +19,35 @@ Meta recently released [Open Pretrained Transformer (OPT)](https://github.com/fa The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates fine-tuning Casual Language Modelling at low cost. -We are using the pre-training weights of the OPT model provided by Hugging Face Hub on the raw WikiText-2 (no tokens were replaced before -the tokenization). This training script is adapted from the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling). ## Our Modifications -We adapt the OPT training code to ColossalAI by leveraging Gemini and ZeRO DDP. -## Quick Start -You can launch training by using the following bash script +We are using the pre-training weights of the OPT model provided by Hugging Face Hub on the raw WikiText-2 (no tokens were replaced before +the tokenization). 
+ +We adapt the OPT training code to ColossalAI by leveraging the [Booster API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin, LowLevelZeroPlugin, and GeminiPlugin (the usage pattern they share is sketched after the diff below). + +## Run Demo +By running the following script: ```bash bash run_demo.sh ``` You will finetune a [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) model on this [dataset](https://huggingface.co/datasets/hugginglearners/netflix-shows), which contains more than 8000 comments on Netflix shows. + +The script can be modified if you want to try another set of hyperparameters or switch to an OPT model of a different size. + +The demo code is adapted from this [blog](https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475) and the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling). + + + +## Run Benchmark + +You can run a benchmark for the OPT model by running the following script: ```bash bash run_benchmark.sh ``` The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also modify this script to configure your own set of hyperparameters for testing. + + + diff --git a/examples/language/opt/args.py b/examples/language/opt/args.py new file mode 100644 index 000000000000..16730be7ebea --- /dev/null +++ b/examples/language/opt/args.py @@ -0,0 +1,120 @@ +from colossalai import get_default_parser + + +def parse_demo_args(): + + parser = get_default_parser() + parser.add_argument( + "--model_name_or_path", + type=str, + default="facebook/opt-350m", + help="Path to pretrained model or model identifier from huggingface.co/models." + ) + parser.add_argument( + "--output_path", + type=str, + default="./output_model.bin", + help="The path of your saved model after finetuning." + ) + parser.add_argument( + "--plugin", + type=str, + default="gemini", + help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'." + ) + parser.add_argument( + "--num_epoch", + type=int, + default=10, + help="Number of epochs." + ) + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="Batch size (per dp group) for the training dataloader." + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use." + ) + parser.add_argument( + "--warmup_ratio", + type=float, + default=0.1, + help="Ratio of warmup steps against total training steps." + ) + parser.add_argument( + "--weight_decay", + type=float, + default=0.01, + help="Weight decay to use." + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="A seed for reproducible training." + ) + + args = parser.parse_args() + return args + + + +def parse_benchmark_args(): + + parser = get_default_parser() + parser.add_argument( + "--model_name_or_path", + type=str, + default="facebook/opt-125m", + help="Path to pretrained model or model identifier from huggingface.co/models." + ) + parser.add_argument( + "--plugin", + type=str, + default="gemini", + help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'."
+ ) + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="Batch size (per dp group) for the training dataloader." + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use." + ) + parser.add_argument( + "--weight_decay", + type=float, + default=0.0, + help="Weight decay to use." + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=20, + help="Total number of training steps to perform." + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="A seed for reproducible training." + ) + parser.add_argument( + "--mem_cap", + type=int, + default=0, + help="Limit on the usage of space for each GPU (in GB)." + ) + args = parser.parse_args() + + return args \ No newline at end of file diff --git a/examples/language/opt/benchmark.sh b/examples/language/opt/benchmark.sh deleted file mode 100644 index 0d04b5e9b33c..000000000000 --- a/examples/language/opt/benchmark.sh +++ /dev/null @@ -1,21 +0,0 @@ -export BS=16 -export MEMCAP=0 -export MODEL="6.7b" -export GPUNUM=1 - -for MODEL in "6.7b" "13b" "1.3b" -do -for GPUNUM in 8 1 -do -for BS in 16 24 32 8 -do -for MEMCAP in 0 40 -do -pkill -9 torchrun -pkill -9 python - -env BS=$BS MEM_CAP=$MEMCAP MODEL=$MODEL GPUNUM=$GPUNUM bash ./run_gemini.sh -done -done -done -done diff --git a/examples/language/opt/data.py b/examples/language/opt/data.py new file mode 100644 index 000000000000..6cfffb5fc95b --- /dev/null +++ b/examples/language/opt/data.py @@ -0,0 +1,37 @@ +import torch +from torch.utils.data import Dataset +from datasets import load_dataset + + +class NetflixDataset(Dataset): + + def __init__(self, tokenizer): + + super().__init__() + + self.tokenizer = tokenizer + self.input_ids = [] + self.attn_masks = [] + self.labels = [] + self.txt_list = netflix_descriptions = load_dataset("hugginglearners/netflix-shows", split="train")['description'] + self.max_length = max([len(self.tokenizer.encode(description)) for description in netflix_descriptions]) + + for txt in self.txt_list: + encodings_dict = self.tokenizer('' + txt + '', + truncation=True, + max_length=self.max_length, + padding="max_length") + self.input_ids.append(torch.tensor(encodings_dict['input_ids'])) + self.attn_masks.append(torch.tensor(encodings_dict['attention_mask'])) + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, idx): + return self.input_ids[idx], self.attn_masks[idx] + + +def netflix_collator(data): + return {'input_ids': torch.stack([x[0] for x in data]), + 'attention_mask': torch.stack([x[1] for x in data]), + 'labels': torch.stack([x[0] for x in data])} diff --git a/examples/language/opt/opt_benchmark.py b/examples/language/opt/opt_benchmark.py new file mode 100755 index 000000000000..2d69036b50c6 --- /dev/null +++ b/examples/language/opt/opt_benchmark.py @@ -0,0 +1,137 @@ +import time + +import torch +import transformers +from transformers import AutoConfig, OPTForCausalLM +from transformers.utils.versions import require_version +import tqdm + +import colossalai +from colossalai.nn.optimizer import HybridAdam +from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.tensor import ProcessGroup, ShardSpec +from colossalai.utils import get_current_device +from colossalai.zero import ColoInitContext +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator + 
+from args import parse_benchmark_args + +require_version("transformers>=4.20.0", "To fix: pip install -r requirements.txt") + + +def format_num(num: int, bytes=False): + """Scale bytes to its proper format, e.g. 1253656 => '1.20MB'""" + factor = 1024 if bytes else 1000 + suffix = "B" if bytes else "" + for unit in ["", " K", " M", " G", " T", " P"]: + if num < factor: + return f"{num:.2f}{unit}{suffix}" + num /= factor + + +def get_data(batch_size, seq_len, vocab_size): + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device()) + attention_mask = torch.ones_like(input_ids) + return input_ids, attention_mask + + +def colo_memory_cap(size_in_GB): + from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device + cuda_capacity = colo_device_memory_capacity(get_current_device()) + if size_in_GB * (1024**3) < cuda_capacity: + colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity) + print(f"Limiting GPU memory usage to {size_in_GB} GB") + + +def main(): + + args = parse_benchmark_args() + + # Launch ColossalAI + colossalai.launch_from_torch(config={}, seed=args.seed) + coordinator = DistCoordinator() + world_size = coordinator.world_size + + # Manage loggers + disable_existing_loggers() + logger = get_dist_logger() + if coordinator.is_master(): + transformers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + + # Whether to set a limit on memory capacity + if args.mem_cap > 0: + colo_memory_cap(args.mem_cap) + + # Build OPT model + config = AutoConfig.from_pretrained(args.model_name_or_path) + model = OPTForCausalLM(config=config) + logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0]) + + # Enable gradient checkpointing + model.gradient_checkpointing_enable() + + # Set plugin + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(device=get_current_device(), + placement_policy='cpu', + pin_memory=True, + strict_ddp_mode=True, + initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + logger.info(f"Set plugin as {args.plugin}", ranks=[0]) + + # Set optimizer + optimizer = HybridAdam(model.parameters(), lr=args.learning_rate) + + # Set booster + booster = Booster(plugin=plugin, **booster_kwargs) + model, optimizer, _, _, _ = booster.boost(model, optimizer) + + SEQ_LEN = 1024 + VOCAB_SIZE = 50257 + + # Start training.
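+    # The loop below trains on random token ids of shape (batch_size, SEQ_LEN); passing +    # input_ids as labels makes OPTForCausalLM compute the shifted causal-LM loss itself, +    # and use_cache=False disables the generation-time key/value cache.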
+ logger.info(f"Start testing", ranks=[0]) + progress_bar = tqdm.tqdm(total=args.max_train_steps, desc="Training Step", disable=not coordinator.is_master()) + + torch.cuda.synchronize() + model.train() + start_time = time.time() + + for _ in range(args.max_train_steps): + + input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE) + optimizer.zero_grad() + outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False) + loss = outputs['loss'] + booster.backward(loss, optimizer) + optimizer.step() + + torch.cuda.synchronize() + progress_bar.update(1) + + # Compute Statistics + end_time = time.time() + throughput = "{:.4f}".format((world_size * args.max_train_steps * args.batch_size) / (end_time - start_time)) + max_mem = format_num(torch.cuda.max_memory_allocated(device=torch.cuda.current_device()), bytes=True) + + logger.info(f"Testing finished, " + f"batch size per gpu: {args.batch_size}, " + f"plugin: {args.plugin}, " + f"throughput: {throughput}, " + f"maximum memory usage per gpu: {max_mem}.", + ranks=[0]) + + +if __name__ == "__main__": + main() diff --git a/examples/language/opt/opt_train_demo.py b/examples/language/opt/opt_train_demo.py new file mode 100644 index 000000000000..fa7feca9c9a9 --- /dev/null +++ b/examples/language/opt/opt_train_demo.py @@ -0,0 +1,142 @@ +import time + +import torch +import datasets +import transformers +from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer +from transformers import get_linear_schedule_with_warmup +from transformers.utils.versions import require_version +from tqdm import tqdm + +import colossalai +from colossalai.nn.optimizer import HybridAdam +from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.tensor import ProcessGroup, ShardSpec +from colossalai.utils import get_current_device +from colossalai.zero import ColoInitContext +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator + +from args import parse_demo_args +from data import NetflixDataset, netflix_collator + +require_version("datasets>=1.8.0", "To fix: pip install -r requirements.txt") +require_version("transformers>=4.20.0", "To fix: pip install -r requirements.txt") + + +def move_to_cuda(batch, device): + return {k: v.to(device) for k, v in batch.items()} + + +def train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator): + + torch.cuda.synchronize() + model.train() + + with tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()) as pbar: + + for batch in pbar: + + # Forward + optimizer.zero_grad() + batch = move_to_cuda(batch, torch.cuda.current_device()) + + outputs = model(use_cache=False, **batch) + loss = outputs['loss'] + + # Backward + booster.backward(loss, optimizer) + optimizer.step() + lr_scheduler.step() + + # Print batch loss + pbar.set_postfix({'loss': loss.item()}) + + +def main(): + + args = parse_demo_args() + + # Launch ColossalAI + colossalai.launch_from_torch(config={}, seed=args.seed) + coordinator = DistCoordinator() + world_size = coordinator.world_size + + # Manage loggers + disable_existing_loggers() + logger = get_dist_logger() + if coordinator.is_master(): + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # Build OPT model + 
config = AutoConfig.from_pretrained(args.model_name_or_path) + model = OPTForCausalLM.from_pretrained(args.model_name_or_path, config=config) + logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0]) + + # Enable gradient checkpointing + model.gradient_checkpointing_enable() + + # Set plugin + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(device=get_current_device(), + placement_policy='cpu', + pin_memory=True, + strict_ddp_mode=True, + initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + logger.info(f"Set plugin as {args.plugin}", ranks=[0]) + + # Prepare tokenizer and dataloader + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + dataset = NetflixDataset(tokenizer) + dataloader = plugin.prepare_dataloader(dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + collate_fn=netflix_collator) + + # Set optimizer + optimizer = HybridAdam(model.parameters(), + lr=(args.learning_rate * world_size), + weight_decay=args.weight_decay) + + # Set lr scheduler + total_steps = len(dataloader) * args.num_epoch + num_warmup_steps = int(args.warmup_ratio * total_steps) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=len(dataloader) * args.num_epoch + ) + + # Set booster + booster = Booster(plugin=plugin, **booster_kwargs) + model, optimizer, _, dataloader, lr_scheduler = booster.boost(model=model, + optimizer=optimizer, + dataloader=dataloader, + lr_scheduler=lr_scheduler) + + # Start finetuning + logger.info(f"Start finetuning", ranks=[0]) + for epoch in range(args.num_epoch): + train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator) + + # Finish training and evaluate + logger.info(f"Finish finetuning", ranks=[0]) + booster.save_model(model, args.output_path) + logger.info(f"Saving model checkpoint to {args.output_path}", ranks=[0]) + + +if __name__ == "__main__": + main() diff --git a/examples/language/opt/requirements.txt b/examples/language/opt/requirements.txt index 137a69e80498..4422216e6a1c 100644 --- a/examples/language/opt/requirements.txt +++ b/examples/language/opt/requirements.txt @@ -1,2 +1,4 @@ colossalai >= 0.1.12 torch >= 1.8.1 +datasets >= 1.8.0 +transformers >= 4.20.0 \ No newline at end of file diff --git a/examples/language/opt/run_benchmark.sh b/examples/language/opt/run_benchmark.sh new file mode 100644 index 000000000000..76c5e8601989 --- /dev/null +++ b/examples/language/opt/run_benchmark.sh @@ -0,0 +1,30 @@ +set -xe +pip install -r requirements.txt + +export BS=32 +export MEMCAP=0 +export GPUNUM=1 + +# acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b` +export MODEL="125m" + +for BS in 8 32 128 +do +for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" +do +for GPUNUM in 1 4 +do + +MODEL_PATH="facebook/opt-${MODEL}" +torchrun \ + --standalone \ + --nproc_per_node ${GPUNUM} \ + opt_benchmark.py \ + --model_name_or_path ${MODEL_PATH} \ + --mem_cap ${MEMCAP} \ + --plugin ${PLUGIN} \ + --batch_size ${BS} + +done +done +done diff --git a/examples/language/opt/run_demo.sh b/examples/language/opt/run_demo.sh new file mode 100644 index 000000000000..0c9759c34039 --- /dev/null +++ b/examples/language/opt/run_demo.sh @@ -0,0 +1,44 @@ +set -xe
+pip install -r requirements.txt + +# model name or path +MODEL="facebook/opt-350m" + +# path for saving model +OUTPUT_PATH="./output_model.bin" + +# plugin (training strategy) +# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini" +PLUGIN="gemini" + +# number of gpus to use +GPUNUM=4 + +# batch size per gpu +BS=16 + +# learning rate +LR="5e-5" + +# number of epochs +EPOCH=10 + +# weight decay +WEIGHT_DECAY=0.01 + +# ratio of warmup steps +WARMUP_RATIO=0.1 + +# run the script for demo +torchrun \ + --standalone \ + --nproc_per_node ${GPUNUM} \ + opt_train_demo.py \ + --model_name_or_path ${MODEL} \ + --output_path ${OUTPUT_PATH} \ + --plugin ${PLUGIN} \ + --batch_size ${BS} \ + --num_epoch ${EPOCH} \ + --learning_rate ${LR} \ + --weight_decay ${WEIGHT_DECAY} \ + --warmup_ratio ${WARMUP_RATIO} diff --git a/examples/language/opt/run_gemini.sh b/examples/language/opt/run_gemini.sh deleted file mode 100644 index 73f231292a13..000000000000 --- a/examples/language/opt/run_gemini.sh +++ /dev/null @@ -1,28 +0,0 @@ -set -x -export BS=${BS:-16} -export MEMCAP=${MEMCAP:-0} -# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b` -export MODEL=${MODEL:-"125m"} -export GPUNUM=${GPUNUM:-1} -export USE_SHARD_INIT=${USE_SHARD_INIT:-"false"} - -# make directory for logs -mkdir -p ./logs - -if [ ${USE_SHARD_INIT} = "true" ]; then - USE_SHARD_INIT="--shardinit" -else - USE_SHARD_INIT="" -fi - -export MODLE_PATH="facebook/opt-${MODEL}" - -# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 -torchrun \ - --nproc_per_node ${GPUNUM} \ - --master_port 19198 \ - train_gemini_opt.py \ - --mem_cap ${MEMCAP} \ - --model_name_or_path ${MODLE_PATH} \ - ${USE_SHARD_INIT} \ - --batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log diff --git a/examples/language/opt/test_ci.sh b/examples/language/opt/test_ci.sh index 317f602cda3c..fa14f52b70d2 100644 --- a/examples/language/opt/test_ci.sh +++ b/examples/language/opt/test_ci.sh @@ -1,4 +1,19 @@ -for GPUNUM in 2 1 +set -xe +pip install -r requirements.txt + +BS=4 +for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" do -env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh +for GPUNUM in 1 4 +do + +torchrun \ + --standalone \ + --nproc_per_node ${GPUNUM} \ + opt_benchmark.py \ + --model_name_or_path "facebook/opt-125m" \ + --plugin ${PLUGIN} \ + --batch_size ${BS} + +done done diff --git a/examples/language/opt/train_gemini_opt.py b/examples/language/opt/train_gemini_opt.py deleted file mode 100755 index 3614b689de26..000000000000 --- a/examples/language/opt/train_gemini_opt.py +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) -on a text file or a dataset without using HuggingFace Trainer.
- -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=text-generation -""" -# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. - -import time -from functools import partial - -import datasets -import torch -import torch.distributed as dist -import transformers -from transformers import CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, OPTForCausalLM -from transformers.utils.versions import require_version - -import colossalai -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.tensor import ProcessGroup, ShardSpec -from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, GeminiDDP - - -def get_data(batch_size, seq_len, vocab_size): - input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device()) - attention_mask = torch.ones_like(input_ids) - return input_ids, attention_mask - - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -def get_time_stamp(): - torch.cuda.synchronize() - return time.time() - - -def get_tflops(model_numel, batch_size, seq_len, step_time): - return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) - - -def parse_args(): - parser = colossalai.get_default_parser() - parser.add_argument( - "--model_name_or_path", - type=str, - help="Path to pretrained model or model identifier from huggingface.co/models.", - required=True, - ) - parser.add_argument( - "--config_name", - type=str, - default=None, - help="Pretrained config name or path if not the same as model_name", - ) - parser.add_argument( - "--batch_size", - type=int, - default=8, - help="Batch size (per dp group) for the training dataloader.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-5, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") - parser.add_argument( - "--max_train_steps", - type=int, - default=20, - help="Total number of training steps to perform.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--model_type", - type=str, - default=None, - help="Model type to use if training from scratch.", - choices=MODEL_TYPES, - ) - parser.add_argument( - "--shardinit", - action="store_true", - help="Initialize the model with tensor parallel", - ) - parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap") - parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu") - args = parser.parse_args() - - return args - - -def colo_memory_cap(size_in_GB): - from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device - cuda_capacity = colo_device_memory_capacity(get_current_device()) - if size_in_GB * (1024**3) < cuda_capacity: - colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity) - print("Using {} GB of GPU memory".format(size_in_GB)) - - -def main(): - args = parse_args() - disable_existing_loggers() - colossalai.launch_from_torch({}) - logger = get_dist_logger() - is_main_process = 
dist.get_rank() == 0 - - if is_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - - if args.mem_cap > 0: - colo_memory_cap(args.mem_cap) - - # If passed along, set the training seed now. - if args.seed is not None: - torch.mannul_seed(args.seed) - logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}") - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - if args.config_name: - config = AutoConfig.from_pretrained(args.config_name) - elif args.model_name_or_path: - config = AutoConfig.from_pretrained(args.model_name_or_path) - else: - config = CONFIG_MAPPING[args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - logger.info("Model config has been created", ranks=[0]) - - if args.init_in_cpu: - init_dev = torch.device('cpu') - else: - init_dev = get_current_device() - - # shard init parameters - if args.shardinit: - logger.info("Sharding initialization !", ranks=[0]) - else: - logger.info("Skipping sharding initialization", ranks=[0]) - - world_size = torch.distributed.get_world_size() - shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None - default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None - - # build model - if args.model_name_or_path is None: - logger.info("Train a new model from scratch", ranks=[0]) - with ColoInitContext(device=init_dev, - dtype=torch.half, - default_dist_spec=default_dist_spec, - default_pg=shard_pg): - model = OPTForCausalLM(config) - else: - logger.info("Finetune a pre-trained model", ranks=[0]) - with ColoInitContext(device=init_dev, - dtype=torch.half, - default_dist_spec=default_dist_spec, - default_pg=shard_pg): - model = OPTForCausalLM.from_pretrained(args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - local_files_only=False) - - # enable gradient checkpointing - model.gradient_checkpointing_enable() - - numel = sum([p.numel() for p in model.parameters()]) - PLACEMENT_POLICY = 'cpu' - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=PLACEMENT_POLICY, - pin_memory=True, - strict_ddp_mode=args.shardinit) - optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0) - - SEQ_LEN = 1024 - VOCAB_SIZE = 50257 - - get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN) - - model.train() - for step in range(args.max_train_steps): - st_time = time.time() - input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE) - - outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False) - loss = outputs['loss'] - optimizer.backward(loss) - - optimizer.step() - optimizer.zero_grad() - torch.cuda.synchronize() - step_time = time.time() - st_time - step_tflops = get_tflops_func(step_time) - - logger.info("step {} finished, Tflops {}".format(step, step_tflops), ranks=[0]) - - logger.info("Training finished", ranks=[0]) - - -if __name__ == "__main__": - main() diff --git a/examples/language/palm/train.py 
b/examples/language/palm/train.py index 62062e8bd272..a0600db1bc5b 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -140,15 +140,15 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): continue param.set_dist_spec(ReplicaSpec()) if 'net.0' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice elif 'to_q' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice elif 'to_kv' in mn: split_param_row_tp1d(param, pg) # row slice elif 'to_out' in mn: split_param_row_tp1d(param, pg) # row slice elif '1.1' in mn: - split_param_col_tp1d(param, pg) # colmn slice + split_param_col_tp1d(param, pg) # column slice elif '1.2' in mn: split_param_row_tp1d(param, pg) # row slice else: diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py index 7470327a65b6..6cc4c8ef370d 100644 --- a/tests/kit/model_zoo/registry.py +++ b/tests/kit/model_zoo/registry.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from typing import Callable -__all__ = ['ModelZooRegistry', 'ModelAttributem', 'model_zoo'] +__all__ = ['ModelZooRegistry', 'ModelAttribute', 'model_zoo'] @dataclass @@ -37,7 +37,7 @@ def register(self, >>> model_zoo = ModelZooRegistry() >>> model_zoo.register('resnet18', resnet18, resnet18_data_gen) >>> # Run the model - >>> data = resnresnet18_data_gen() # do not input any argument + >>> data = resnet18_data_gen() # do not input any argument >>> model = resnet18() # do not input any argument >>> out = model(**data) diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py index b47b3508ad1b..c3ceef4c7adf 100644 --- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py +++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_batch_norm_handler.py @@ -27,7 +27,7 @@ def check_bn_module_handler(rank, world_size, port): # the index of bn node in computation graph node_index = 1 # the total number of bn strategies without sync bn mode - # TODO: add sync bn stategies after related passes ready + # TODO: add sync bn strategies after related passes ready strategy_number = 4 numerical_test_for_node_strategy(model=model, device_mesh=device_mesh, diff --git a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py index 5259455d2179..1703d5ded2f2 100644 --- a/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py +++ b/tests/test_auto_parallel/test_tensor_shard/test_node_handler/test_output_handler.py @@ -43,14 +43,14 @@ def test_output_handler(output_option): output_strategies_vector = StrategiesVector(output_node) # build handler - otuput_handler = OutputHandler(node=output_node, + output_handler = OutputHandler(node=output_node, device_mesh=device_mesh, strategies_vector=output_strategies_vector, output_option=output_option) - otuput_handler.register_strategy(compute_resharding_cost=False) + output_handler.register_strategy(compute_resharding_cost=False) # check operation data mapping - mapping = otuput_handler.get_operation_data_mapping() + mapping = output_handler.get_operation_data_mapping() for name, op_data in mapping.items(): op_data: OperationData @@ -59,7 +59,7 @@ def test_output_handler(output_option): assert 
mapping['output'].name == "output" assert mapping['output'].type == OperationDataType.OUTPUT - strategy_name_list = [val.name for val in otuput_handler.strategies_vector] + strategy_name_list = [val.name for val in output_handler.strategies_vector] if output_option == 'distributed': assert "Distributed Output" in strategy_name_list else: diff --git a/tests/test_tensor/test_dtensor/test_layout_converter.py b/tests/test_tensor/test_dtensor/test_layout_converter.py index 5f56decb5e5d..5c3da5f2b9ff 100644 --- a/tests/test_tensor/test_dtensor/test_layout_converter.py +++ b/tests/test_tensor/test_dtensor/test_layout_converter.py @@ -137,7 +137,7 @@ def check_layout_converting(rank, world_size, port): assert comm_action_sequence[2].shard_dim == 0 assert comm_action_sequence[2].logical_process_axis == 1 - # checkout chached_spec_pairs_transform_path + # checkout cached_spec_pairs_transform_path assert layout_converter.cached_solution[('[R, S01, R]', '[S01, R, R]')][0] == transform_path assert layout_converter.cached_solution[('[R, S01, R]', '[S01, R, R]')][1] == comm_action_sequence
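Taken together, the new `vit_benchmark.py`, `vit_train_demo.py`, `opt_benchmark.py`, and `opt_train_demo.py` scripts converge on one pattern: build a plugin, wrap the model and optimizer with `Booster.boost`, and route the backward pass through `booster.backward`. The sketch below distills that pattern; the `nn.Linear` toy model, the random batch, and the script name in the comment are placeholders rather than code from this diff, and it assumes launching via `torchrun` so that `launch_from_torch` can read the distributed environment.

```python
import colossalai
import torch
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.nn.optimizer import HybridAdam

# Launch with: torchrun --standalone --nproc_per_node 1 booster_sketch.py
colossalai.launch_from_torch(config={}, seed=42)

model = torch.nn.Linear(32, 2).cuda()                 # placeholder model
optimizer = HybridAdam(model.parameters(), lr=1e-3)

# Swap in GeminiPlugin(...) or LowLevelZeroPlugin(...) for the other strategies.
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
model, optimizer, _, _, _ = booster.boost(model, optimizer)

x = torch.randn(4, 32, device=torch.cuda.current_device())
loss = model(x).sum()
booster.backward(loss, optimizer)                     # replaces loss.backward()
optimizer.step()
optimizer.zero_grad()
```

Calling `backward` through the booster is what lets a plugin intercept gradient handling (loss scaling, ZeRO partitioning, Gemini's chunk management) without any change to the surrounding training loop.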