From 7d7791724ea2c574f9d871e7ae1f7f8836341d33 Mon Sep 17 00:00:00 2001
From: YeAnbang <yab0576@163.com>
Date: Tue, 21 Nov 2023 14:49:32 +0800
Subject: [PATCH 01/23] Add dpo. Fix sft, ppo, lora. Refactor all

---
 applications/Chat/.gitignore                  |    2 +
 applications/Chat/LICENSE                     |  327 ++++
 .../benchmarks/benchmark_opt_lora_dummy.py    |   10 +-
 applications/Chat/coati/adapter/adapter.py    |    0
 applications/Chat/coati/dataset/__init__.py   |   27 +-
 .../Chat/coati/dataset/conversation.py        |   51 +-
 .../Chat/coati/dataset/dpo_dataset_utils.py   |   51 +
 applications/Chat/coati/dataset/loader.py     |  359 ++++
 .../Chat/coati/dataset/prompt_dataset.py      |   19 +-
 .../Chat/coati/dataset/reward_dataset.py      |   76 +-
 .../Chat/coati/dataset/sft_dataset.py         |   71 +-
 .../dataset/spliced_and_tokenized_dataset.py  |  465 ++++++
 applications/Chat/coati/dataset/utils.py      |   56 +
 .../Chat/coati/experience_buffer/utils.py     |    5 +-
 .../Chat/coati/experience_maker/base.py       |    3 +-
 .../Chat/coati/experience_maker/naive.py      |   53 +-
 applications/Chat/coati/models/__init__.py    |   17 +-
 applications/Chat/coati/models/actor.py       |   28 +
 applications/Chat/coati/models/base.py        |   29 +
 .../Chat/coati/models/base/__init__.py        |   27 -
 applications/Chat/coati/models/base/actor.py  |   33 -
 applications/Chat/coati/models/base/critic.py |   34 -
 .../Chat/coati/models/base/reward_model.py    |   46 -
 .../Chat/coati/models/bloom/__init__.py       |    5 -
 .../Chat/coati/models/bloom/bloom_actor.py    |   36 -
 .../Chat/coati/models/bloom/bloom_critic.py   |   36 -
 .../Chat/coati/models/bloom/bloom_rm.py       |   36 -
 .../Chat/coati/models/chatglm/__init__.py     |    3 -
 .../coati/models/chatglm/chatglm_actor.py     |   31 -
 .../coati/models/chatglm/chatglm_tokenizer.py |  442 -----
 .../models/chatglm/configuration_chatglm.py   |  101 --
 .../coati/models/chatglm/modeling_chatglm.py  | 1477 -----------------
 applications/Chat/coati/models/critic.py      |   38 +
 applications/Chat/coati/models/generation.py  |    6 +-
 .../Chat/coati/models/gpt/__init__.py         |    5 -
 .../Chat/coati/models/gpt/gpt_actor.py        |   38 -
 .../Chat/coati/models/gpt/gpt_critic.py       |   37 -
 applications/Chat/coati/models/gpt/gpt_rm.py  |   37 -
 .../Chat/coati/models/llama/__init__.py       |    5 -
 .../Chat/coati/models/llama/llama_actor.py    |   38 -
 .../Chat/coati/models/llama/llama_critic.py   |   36 -
 .../Chat/coati/models/llama/llama_rm.py       |   37 -
 applications/Chat/coati/models/lora.py        |   20 -
 applications/Chat/coati/models/loss.py        |   94 +-
 .../Chat/coati/models/opt/__init__.py         |    5 -
 .../Chat/coati/models/opt/opt_actor.py        |   37 -
 .../Chat/coati/models/opt/opt_critic.py       |   37 -
 applications/Chat/coati/models/opt/opt_rm.py  |   36 -
 .../Chat/coati/models/reward_model.py         |   48 +
 applications/Chat/coati/models/utils.py       |  135 +-
 applications/Chat/coati/trainer/__init__.py   |    8 +-
 applications/Chat/coati/trainer/base.py       |   34 +-
 applications/Chat/coati/trainer/dpo.py        |  327 ++++
 applications/Chat/coati/trainer/ppo.py        |   86 +-
 applications/Chat/coati/trainer/rm.py         |  265 ++-
 applications/Chat/coati/trainer/sft.py        |  155 +-
 applications/Chat/coati/trainer/utils.py      |   11 +
 applications/Chat/coati/utils/__init__.py     |    5 +
 .../Chat/coati/utils/accumulative_meter.py    |   32 +
 applications/Chat/coati/utils/ckpt_io.py      |   88 +
 .../Chat/coati/utils/flash_attention_patch.py |  211 +++
 applications/Chat/examples/=2.0.0,            |    0
 applications/Chat/examples/README.md          |  156 +-
 .../community/peft/train_peft_prompts.py      |    6 +-
 .../examples/community/peft/train_peft_sft.py |    2 +-
 applications/Chat/examples/config_file        |   27 +
 applications/Chat/examples/hostfile           |    1 +
 applications/Chat/examples/inference.py       |    2 +-
 applications/Chat/examples/out.txt            | 1319 +++++++++++++++
 applications/Chat/examples/train_dpo.py       |  306 ++++
 applications/Chat/examples/train_dpo.sh       |   72 +
 .../{train_prompts.py => train_ppo.py}        |  103 +-
 applications/Chat/examples/train_ppo.sh       |   49 +
 applications/Chat/examples/train_prompts.sh   |   25 -
 .../Chat/examples/train_reward_model.py       |  379 +++--
 applications/Chat/examples/train_rm.sh        |   63 +-
 applications/Chat/examples/train_sft.py       |  378 +++--
 applications/Chat/examples/train_sft.sh       |   42 +-
 applications/Chat/requirements.txt            |    2 +-
 applications/Chat/tests/test_dataset.py       |   23 +-
 applications/Chat/tests/test_experience.py    |   11 +-
 applications/Chat/tests/test_lora.py          |   73 +
 applications/Chat/tests/test_models.py        |   14 +-
 applications/Chat/tests/test_train.sh         |   46 +-
 84 files changed, 5530 insertions(+), 3436 deletions(-)
 create mode 100644 applications/Chat/coati/adapter/adapter.py
 create mode 100644 applications/Chat/coati/dataset/dpo_dataset_utils.py
 create mode 100644 applications/Chat/coati/dataset/loader.py
 create mode 100644 applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
 create mode 100644 applications/Chat/coati/models/actor.py
 create mode 100644 applications/Chat/coati/models/base.py
 delete mode 100644 applications/Chat/coati/models/base/__init__.py
 delete mode 100644 applications/Chat/coati/models/base/actor.py
 delete mode 100644 applications/Chat/coati/models/base/critic.py
 delete mode 100644 applications/Chat/coati/models/base/reward_model.py
 delete mode 100644 applications/Chat/coati/models/bloom/__init__.py
 delete mode 100644 applications/Chat/coati/models/bloom/bloom_actor.py
 delete mode 100644 applications/Chat/coati/models/bloom/bloom_critic.py
 delete mode 100644 applications/Chat/coati/models/bloom/bloom_rm.py
 delete mode 100644 applications/Chat/coati/models/chatglm/__init__.py
 delete mode 100644 applications/Chat/coati/models/chatglm/chatglm_actor.py
 delete mode 100644 applications/Chat/coati/models/chatglm/chatglm_tokenizer.py
 delete mode 100644 applications/Chat/coati/models/chatglm/configuration_chatglm.py
 delete mode 100644 applications/Chat/coati/models/chatglm/modeling_chatglm.py
 create mode 100644 applications/Chat/coati/models/critic.py
 delete mode 100644 applications/Chat/coati/models/gpt/__init__.py
 delete mode 100644 applications/Chat/coati/models/gpt/gpt_actor.py
 delete mode 100644 applications/Chat/coati/models/gpt/gpt_critic.py
 delete mode 100644 applications/Chat/coati/models/gpt/gpt_rm.py
 delete mode 100644 applications/Chat/coati/models/llama/__init__.py
 delete mode 100644 applications/Chat/coati/models/llama/llama_actor.py
 delete mode 100644 applications/Chat/coati/models/llama/llama_critic.py
 delete mode 100644 applications/Chat/coati/models/llama/llama_rm.py
 delete mode 100644 applications/Chat/coati/models/opt/__init__.py
 delete mode 100644 applications/Chat/coati/models/opt/opt_actor.py
 delete mode 100644 applications/Chat/coati/models/opt/opt_critic.py
 delete mode 100644 applications/Chat/coati/models/opt/opt_rm.py
 create mode 100644 applications/Chat/coati/models/reward_model.py
 create mode 100644 applications/Chat/coati/trainer/dpo.py
 create mode 100644 applications/Chat/coati/utils/__init__.py
 create mode 100644 applications/Chat/coati/utils/accumulative_meter.py
 create mode 100644 applications/Chat/coati/utils/ckpt_io.py
 create mode 100644 applications/Chat/coati/utils/flash_attention_patch.py
 create mode 100644 applications/Chat/examples/=2.0.0,
 create mode 100644 applications/Chat/examples/config_file
 create mode 100644 applications/Chat/examples/hostfile
 create mode 100644 applications/Chat/examples/out.txt
 create mode 100644 applications/Chat/examples/train_dpo.py
 create mode 100755 applications/Chat/examples/train_dpo.sh
 rename applications/Chat/examples/{train_prompts.py => train_ppo.py} (71%)
 create mode 100755 applications/Chat/examples/train_ppo.sh
 delete mode 100755 applications/Chat/examples/train_prompts.sh
 create mode 100644 applications/Chat/tests/test_lora.py

diff --git a/applications/Chat/.gitignore b/applications/Chat/.gitignore
index 5fa068105e26..6e464c960e24 100644
--- a/applications/Chat/.gitignore
+++ b/applications/Chat/.gitignore
@@ -144,5 +144,7 @@ docs/.build
 
 # wandb log
 example/wandb/
+example/logs/
+example/output/
 
 examples/awesome-chatgpt-prompts/
diff --git a/applications/Chat/LICENSE b/applications/Chat/LICENSE
index 0528c89ea9ec..b3eb43520a6f 100644
--- a/applications/Chat/LICENSE
+++ b/applications/Chat/LICENSE
@@ -200,3 +200,330 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+   ## Some of colossal-ai's code is derived from others projects, which is subject to the following copyright notice:
+
+   Copyright 2021 The Alpa team.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+         https://github.com/alpa-projects/alpa/blob/979a45a3e6187df941ef4a4c4c6eea664527d68d/LICENSE
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   -------------------------------------------------
+
+   Copyright 2018-2020 Philippe Tillet
+   Copyright 2020-2022 OpenAI
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files
+   (the "Software"), to deal in the Software without restriction,
+   including without limitation the rights to use, copy, modify, merge,
+   publish, distribute, sublicense, and/or sell copies of the Software,
+   and to permit persons to whom the Software is furnished to do so,
+   subject to the following conditions:
+
+   ---------------- LICENSE FOR Microsoft Deepspeed ----------------
+
+   MIT License
+
+   Copyright (c) Microsoft Corporation.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE
+
+   ---------------- LICENSE FOR NVIDIA Megatron-LM ----------------
+
+   Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of NVIDIA CORPORATION nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+   PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR NVIDIA Apex ----------------
+
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+   3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR Facebook Fairscale ----------------
+
+   Copyright (c) Facebook, Inc. and its affiliates
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+      and IDIAP Research Institute nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR Flash Attention ----------------
+
+   BSD 3-Clause License
+
+   Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+   * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR Facebook xFormers ----------------
+
+   From xFormers:
+
+   Copyright (c) Facebook, Inc. and its affiliates
+
+
+   ===
+
+   BSD 3-Clause License
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+      and IDIAP Research Institute nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+
+   ---------------- LICENSE FOR VLLM TEAM ----------------
+
+   from VLLM TEAM:
+
+      Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+         https://github.com/vllm-project/vllm/blob/main/LICENSE
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   ---------------- LICENSE FOR LIGHTLLM TEAM ----------------
+
+   from LIGHTLLM TEAM:
+
+      Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+         https://github.com/ModelTC/lightllm/blob/main/LICENSE
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   ---------------- LICENSE FOR AutoGPTQ ----------------
+
+   From AutoGPTQ:
+
+   MIT License
+
+   Copyright (c) 2023 潘其威(William)
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+   ---------------- LICENSE FOR exllama ----------------
+
+   From exllama:
+
+   MIT License
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+
+   ---------------- LICENSE FOR torch-int ----------------
+
+   MIT License
+
+   Copyright (c) 2022 Guangxuan Xiao
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+
+   ---------------- LICENSE FOR smoothquant ----------------
+
+   MIT License
+
+   Copyright (c) 2022 MIT HAN Lab
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 0d0e2a7d34f5..8338abf54ced 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -76,9 +76,11 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static",initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_gemini_cpu":
-        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+        strategy = GeminiStrategy(
+            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
+        )
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif args.strategy == "colossalai_zero2_cpu":
@@ -155,7 +157,9 @@ def main(args):
         initial_model,
         actor_optim,
         critic_optim,
+        None,
         tokenizer=tokenizer,
+        rm_model_tokenizer=tokenizer,
         ptx_coef=0,
         train_batch_size=args.train_batch_size,
         offload_inference_models=args.offload_inference_models,
@@ -173,6 +177,7 @@ def main(args):
         num_episodes=args.num_episodes,
         num_update_steps=args.num_update_steps,
         num_collect_steps=args.num_collect_steps,
+        save_per_num_episodes=args.save_per_num_episodes,
     )
 
     print_rank_0(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
@@ -199,6 +204,7 @@ def main(args):
     parser.add_argument("--num_collect_steps", type=int, default=8)
     parser.add_argument("--num_update_steps", type=int, default=1)
     parser.add_argument("--train_batch_size", type=int, default=8)
+    parser.add_argument("--save_per_num_episodes", type=int, default=1000)
     parser.add_argument("--experience_batch_size", type=int, default=8)
     parser.add_argument("--lora_rank", type=int, default=0)
     parser.add_argument("--cuda_mem_frac", type=float, default=1.0)
diff --git a/applications/Chat/coati/adapter/adapter.py b/applications/Chat/coati/adapter/adapter.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/Chat/coati/dataset/__init__.py b/applications/Chat/coati/dataset/__init__.py
index 599b57609775..2d276b6c6964 100644
--- a/applications/Chat/coati/dataset/__init__.py
+++ b/applications/Chat/coati/dataset/__init__.py
@@ -1,13 +1,24 @@
-from .prompt_dataset import PromptDataset
-from .reward_dataset import HhRlhfDataset, RmStaticDataset
-from .sft_dataset import SFTDataset, SupervisedDataset
+# from .prompt_dataset import PromptDataset
+# from .reward_dataset import PreferenceDataset  # HhRlhfDataset, RmStaticDataset
+# from .sft_dataset import SFTDataset, SupervisedDataset
+from .loader import (
+    DataCollatorForPreferenceDataset,
+    DataCollatorForSupervisedDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
 from .utils import is_rank_0
 
 __all__ = [
-    "RmStaticDataset",
-    "HhRlhfDataset",
-    "SFTDataset",
-    "SupervisedDataset",
-    "PromptDataset",
+    # "PreferenceDataset",
+    # "SFTDataset",
+    # "SupervisedDataset",
+    # "PromptDataset",
     "is_rank_0",
+    "DataCollatorForPreferenceDataset",
+    "DataCollatorForSupervisedDataset",
+    "StatefulDistributedSampler",
+    "load_tokenized_dataset",
+    "setup_distributed_dataloader",
 ]
diff --git a/applications/Chat/coati/dataset/conversation.py b/applications/Chat/coati/dataset/conversation.py
index f2180d96b0d3..be27ff7bc817 100644
--- a/applications/Chat/coati/dataset/conversation.py
+++ b/applications/Chat/coati/dataset/conversation.py
@@ -18,7 +18,7 @@
 
 
 class SeparatorStyle(Enum):
-    ADD_EOS_TOKEN = auto()
+    ADD_BOS_EOS_TOKEN = auto()
 
 
 @dataclasses.dataclass
@@ -27,19 +27,35 @@ class Conversation:
     roles: List[str]
     messages: List[List[str]]
     offset: int
-    sep_style: SeparatorStyle = SeparatorStyle.ADD_EOS_TOKEN
-    sep: str = "</s>"
+    sep_style: SeparatorStyle
+    seps: List[str]
 
-    skip_next: bool = False
+    def clear(self):
+        self.messages = []
 
-    def get_prompt(self):
-        if self.sep_style == SeparatorStyle.ADD_EOS_TOKEN:
+    def get_prompt(self, length: int = None):
+        if length is None:
+            length = len(self.messages)
+
+        if self.sep_style == SeparatorStyle.ADD_BOS_EOS_TOKEN:
+            ret = self.system
+            for role, message in self.messages[0:length]:
+                if message:
+                    ret += role + ": " + self.seps[0] + message + self.seps[1]
+                else:
+                    ret += role + ": " + self.seps[0]
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+    def save_prompt(self):
+        if self.sep_style == SeparatorStyle.ADD_BOS_EOS_TOKEN:
             ret = self.system
             for role, message in self.messages:
                 if message:
-                    ret += role + ": " + message + self.sep
+                    ret += role + ": " + self.seps[0] + message + self.seps[1] + "\n"
                 else:
-                    ret += role + ": "
+                    ret += role + ": " + self.seps[0]
             return ret
         else:
             raise ValueError(f"Invalid style: {self.sep_style}")
@@ -47,15 +63,6 @@ def get_prompt(self):
     def append_message(self, role, message):
         self.messages.append([role, message])
 
-    def to_gradio_chatbot(self):
-        ret = []
-        for i, (role, msg) in enumerate(self.messages[self.offset :]):
-            if i % 2 == 0:
-                ret.append([msg, None])
-            else:
-                ret[-1][-1] = msg
-        return ret
-
     def copy(self):
         return Conversation(
             system=self.system,
@@ -63,7 +70,7 @@ def copy(self):
             messages=[[x, y] for x, y in self.messages],
             offset=self.offset,
             sep_style=self.sep_style,
-            sep=self.sep,
+            seps=self.seps,
         )
 
     def dict(self):
@@ -72,7 +79,7 @@ def dict(self):
             "roles": self.roles,
             "messages": self.messages,
             "offset": self.offset,
-            "sep": self.sep,
+            "seps": self.seps,
         }
 
 
@@ -80,10 +87,10 @@ def dict(self):
     system="A chat between a curious human and an artificial intelligence assistant. "
     "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
     roles=("Human", "Assistant"),
-    messages=(),
+    messages=[],
     offset=0,
-    sep_style=SeparatorStyle.ADD_EOS_TOKEN,
-    sep="</s>",
+    sep_style=SeparatorStyle.ADD_BOS_EOS_TOKEN,
+    seps=["<s>", "</s>"],
 )
 
 default_conversation = conv
diff --git a/applications/Chat/coati/dataset/dpo_dataset_utils.py b/applications/Chat/coati/dataset/dpo_dataset_utils.py
new file mode 100644
index 000000000000..7aa4c2389c5a
--- /dev/null
+++ b/applications/Chat/coati/dataset/dpo_dataset_utils.py
@@ -0,0 +1,51 @@
+from typing import Any, Dict, List, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import LlamaForCausalLM
+
+
+def get_log_probability(logits: torch.Tensor, labels: torch.Tensor):
+    log_probs = F.log_softmax(logits, dim=-1)
+    log_probs_labels = log_probs.gather(dim=-1, index=labels.unsqueeze(-1))
+    return log_probs_labels.squeeze(-1)
+
+
+def get_reference_model_reward(
+    data_point: Dict[str, Any],
+    model: LlamaForCausalLM,
+) -> Dict[str, Union[int, str, List[int]]]:
+    (
+        chosen_input_ids,
+        chosen_attention_mask,
+        chosen_loss_mask,
+        rejected_input_ids,
+        rejected_attention_mask,
+        rejected_loss_mask,
+    ) = (
+        data_point["chosen_input_ids"],
+        data_point["chosen_attention_mask"],
+        data_point["chosen_loss_mask"],
+        data_point["rejected_input_ids"],
+        data_point["rejected_attention_mask"],
+        data_point["rejected_loss_mask"],
+    )
+    with torch.no_grad():
+        current_device = torch.cuda.current_device()
+        chosen_logits = model(
+            input_ids=torch.tensor([chosen_input_ids]).to(current_device),
+            attention_mask=torch.tensor([chosen_attention_mask]).to(current_device),
+        ).logits
+        chosen_logits = get_log_probability(chosen_logits, torch.tensor([chosen_input_ids]).to(current_device)).cpu()
+        rejected_logits = model(
+            input_ids=torch.tensor([rejected_input_ids]).to(current_device),
+            attention_mask=torch.tensor([rejected_attention_mask]).to(current_device),
+        ).logits
+        rejected_logits = get_log_probability(
+            rejected_logits, torch.tensor([rejected_input_ids]).to(current_device)
+        ).cpu()
+
+        data_point["chosen_reward"] = (chosen_logits * torch.tensor(chosen_loss_mask)).sum(-1)
+        data_point["rejected_reward"] = (rejected_logits * torch.tensor(rejected_loss_mask)).sum(-1)
+
+    return data_point
diff --git a/applications/Chat/coati/dataset/loader.py b/applications/Chat/coati/dataset/loader.py
new file mode 100644
index 000000000000..167cba1423c0
--- /dev/null
+++ b/applications/Chat/coati/dataset/loader.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import math
+import os
+import random
+from dataclasses import dataclass
+from typing import Callable, Dict, Iterator, List, Optional, Sequence, Union
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from coati.dataset.utils import chuncate_sequence, pad_to_max_len
+from datasets import Dataset as HFDataset
+from datasets import dataset_dict, load_from_disk
+from torch.distributed import ProcessGroup
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.utils.data import ConcatDataset, DataLoader, Dataset, DistributedSampler
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
+PathType = Union[str, os.PathLike]
+
+
+def load_tokenized_dataset(
+    dataset_paths: Union[PathType, List[PathType]], mode: str = "train", **kwargs
+) -> Optional[DatasetType]:
+    """
+    Load pre-tokenized dataset.
+    Each instance of dataset is a dictionary with
+    `{'input_ids': List[int], 'labels': List[int], sequence: str}` format.
+    """
+    mode_map = kwargs.get("mode_map", {"train": "train", "dev": "validation", "test": "test"})
+    assert mode in tuple(mode_map), f"Unsupported mode {mode}, it must be in {tuple(mode_map)}"
+
+    if isinstance(dataset_paths, (str, os.PathLike)):
+        dataset_paths = [dataset_paths]
+
+    datasets = []  # `List[datasets.dataset_dict.Dataset]`
+    for ds_path in dataset_paths:
+        ds_path = os.path.abspath(ds_path)
+        assert os.path.exists(ds_path), f"Not existed file path {ds_path}"
+        ds_dict = load_from_disk(dataset_path=ds_path, keep_in_memory=False)
+        if isinstance(ds_dict, HFDataset):
+            datasets.append(ds_dict)
+        else:
+            if mode_map[mode] in ds_dict:
+                datasets.append(ds_dict[mode_map[mode]])
+    if len(datasets) == 0:
+        return None
+    if len(datasets) == 1:
+        return datasets.pop()
+    return ConcatDataset(datasets=datasets)
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+    """
+    Collate instances for supervised dataset.
+    Each instance is a tokenized dictionary with fields
+    `input_ids`(List[int]), `labels`(List[int]) and `sequence`(str).
+    """
+
+    tokenizer: PreTrainedTokenizer
+    max_length: int = 4096
+    ignore_index: int = -100
+
+    def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
+        """
+
+        Args:
+            instances (`Sequence[Dict[str, List[int]]]`):
+                Mini-batch samples, each sample is stored in an individual dictionary.
+
+        Returns:
+            (`Dict[str, torch.Tensor]`): Contains the following `torch.Tensor`:
+                `input_ids`: `torch.Tensor` of shape (bsz, max_len);
+                `attention_mask`: `torch.BoolTensor` of shape (bsz, max_len);
+                `labels`: `torch.Tensor` of shape (bsz, max_len), which contains `IGNORE_INDEX`.
+        """
+        assert isinstance(self.tokenizer.pad_token_id, int) and self.tokenizer.pad_token_id >= 0, (
+            f"`{self.tokenizer.__class__.__name__}.pad_token_id` must be a valid non-negative integer index value, "
+            f"but now `{self.tokenizer.pad_token_id}`"
+        )
+
+        # `List[torch.Tensor]`
+        batch_input_ids = [
+            torch.LongTensor(instance["input_ids"][: self.max_length])
+            if len(instance["input_ids"]) > self.max_length
+            else torch.LongTensor(instance["input_ids"])
+            for instance in instances
+        ]
+        batch_labels = [
+            torch.LongTensor(instance["labels"][: self.max_length])
+            if len(instance["labels"]) > self.max_length
+            else torch.LongTensor(instance["labels"])
+            for instance in instances
+        ]
+
+        if self.tokenizer.padding_side == "right":
+            input_ids = torch.nn.utils.rnn.pad_sequence(
+                sequences=batch_input_ids,
+                batch_first=True,
+                padding_value=self.tokenizer.pad_token_id,
+            )  # (bsz, max_len)
+            labels = torch.nn.utils.rnn.pad_sequence(
+                sequences=batch_labels,
+                batch_first=True,
+                padding_value=self.ignore_index,
+            )  # (bsz, max_len)
+            # pad to max
+            to_pad = self.max_length - input_ids.size(1)
+            input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
+            labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
+        elif self.tokenizer.padding_side == "left":
+            reversed_input_ids = [seq.flip(dims=(0,)) for seq in batch_input_ids]
+            reversed_input_ids = torch.nn.utils.rnn.pad_sequence(
+                sequences=reversed_input_ids,
+                batch_first=True,
+                padding_value=self.tokenizer.pad_token_id,
+            )  # (bsz, max_len)
+            input_ids = torch.flip(reversed_input_ids, dims=(1,))  # (bsz, max_len)
+            reversed_labels = [seq.flip(dims=(0,)) for seq in batch_labels]
+            reversed_labels = torch.nn.utils.rnn.pad_sequence(
+                sequences=reversed_labels,
+                batch_first=True,
+                padding_value=self.ignore_index,
+            )  # (bsz, max_len)
+            labels = torch.flip(reversed_labels, dims=(1,))  # (bsz, max_len)
+        else:
+            raise RuntimeError(
+                f"`{self.tokenizer.__class__.__name__}.padding_side` can only be `left` or `right`, "
+                f"but now `{self.tokenizer.padding_side}`"
+            )
+
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)  # `torch.BoolTensor`, (bsz, max_len)
+
+        return dict(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+
+
+@dataclass
+class DataCollatorForPreferenceDataset(object):
+    """
+    Collate instances for supervised dataset.
+    Each instance is a tokenized dictionary with fields
+    `input_ids`(List[int]), `labels`(List[int]) and `sequence`(str).
+    """
+
+    tokenizer: PreTrainedTokenizer
+    max_length: int = 4096
+
+    def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
+        """
+
+        Args:
+            instances (`Sequence[Dict[str, List[int]]]`):
+                Mini-batch samples, each sample is stored in an individual dictionary.
+
+        Returns:
+            (`Dict[str, torch.Tensor]`): Contains the following `torch.Tensor`:
+                `input_ids`: `torch.Tensor` of shape (bsz, max_len);
+                `attention_mask`: `torch.BoolTensor` of shape (bsz, max_len);
+                `labels`: `torch.Tensor` of shape (bsz, max_len), which contains `IGNORE_INDEX`.
+        """
+        assert isinstance(self.tokenizer.pad_token_id, int) and self.tokenizer.pad_token_id >= 0, (
+            f"`{self.tokenizer.__class__.__name__}.pad_token_id` must be a valid non-negative integer index value, "
+            f"but now `{self.tokenizer.pad_token_id}`"
+        )
+
+        (
+            chosen_input_ids,
+            chosen_attention_mask,
+            chosen_loss_mask,  # [batch_size * seq_len]
+            reject_input_ids,
+            reject_attention_mask,
+            reject_loss_mask,
+        ) = (
+            chuncate_sequence([ins["chosen_input_ids"] for ins in instances], self.max_length, torch.int64),
+            chuncate_sequence([ins["chosen_attention_mask"] for ins in instances], self.max_length, torch.bool),
+            chuncate_sequence([ins["chosen_loss_mask"] for ins in instances], self.max_length, torch.bool),
+            chuncate_sequence([ins["rejected_input_ids"] for ins in instances], self.max_length, torch.int64),
+            chuncate_sequence([ins["rejected_attention_mask"] for ins in instances], self.max_length, torch.bool),
+            chuncate_sequence([ins["rejected_loss_mask"] for ins in instances], self.max_length, torch.bool),
+        )
+
+        padding_side = self.tokenizer.padding_side
+
+        (
+            chosen_input_ids,
+            chosen_attention_mask,
+            chosen_loss_mask,
+            reject_input_ids,
+            reject_attention_mask,
+            reject_loss_mask,
+        ) = (
+            pad_to_max_len(chosen_input_ids, self.max_length, self.tokenizer.pad_token_id, padding_side=padding_side),
+            pad_to_max_len(chosen_attention_mask, self.max_length, False, padding_side=padding_side),
+            pad_to_max_len(chosen_loss_mask, self.max_length, False, padding_side=padding_side),
+            pad_to_max_len(reject_input_ids, self.max_length, self.tokenizer.pad_token_id, padding_side=padding_side),
+            pad_to_max_len(reject_attention_mask, self.max_length, False, padding_side=padding_side),
+            pad_to_max_len(reject_loss_mask, self.max_length, False, padding_side=padding_side),
+        )
+        return dict(
+            chosen_input_ids=chosen_input_ids,
+            chosen_attention_mask=chosen_attention_mask,
+            chosen_loss_mask=chosen_loss_mask,
+            reject_input_ids=reject_input_ids,
+            reject_attention_mask=reject_attention_mask,
+            reject_loss_mask=reject_loss_mask,
+        )
+
+
+class StatefulDistributedSampler(DistributedSampler):
+    """
+    Stateful distributed sampler for multi-stage training.
+    """
+
+    def __init__(
+        self,
+        dataset: DatasetType,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 0,
+        drop_last: bool = False,
+        use_tp: Optional[bool] = False,
+    ) -> None:
+        if not use_tp:
+            super().__init__(
+                dataset=dataset,
+                num_replicas=num_replicas,
+                rank=rank,
+                shuffle=shuffle,
+                seed=seed,
+                drop_last=drop_last,
+            )
+        else:
+            # adapted from https://github.com/pytorch/pytorch/blob/4979f9c0d72490970e2019bb1d2284f83d93f76b/torch/utils/data/distributed.py#L62
+            # TODO: support tp_group>1
+            num_replicas = 1
+            if rank is None:
+                rank = dist.get_rank()
+            if rank < 0:
+                raise ValueError(f"Invalid rank {rank}, rank should be in the interval [0, 0]")
+            self.dataset = dataset
+            self.num_replicas = num_replicas
+            self.rank = rank
+            self.epoch = 0
+            self.drop_last = drop_last
+            # If the dataset length is evenly divisible by # of replicas, then there
+            # is no need to drop any data, since the dataset will be split equally.
+            if self.drop_last and len(self.dataset) % self.num_replicas != 0:  # type: ignore[arg-type]
+                # Split to nearest available length that is evenly divisible.
+                # This is to ensure each rank receives the same amount of data when
+                # using this Sampler.
+                self.num_samples = math.ceil(
+                    (len(self.dataset) - self.num_replicas) / self.num_replicas  # type: ignore[arg-type]
+                )
+            else:
+                self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
+            self.total_size = self.num_samples * self.num_replicas
+            self.shuffle = shuffle
+            self.seed = seed
+        self.start_index = 0
+        self.use_tp = use_tp
+
+    def __iter__(self) -> Iterator:
+        if self.use_tp:
+            # TODO Add support for tp_group not equal to 1
+            pass
+            # adpated from https://github.com/pytorch/pytorch/blob/4979f9c0d72490970e2019bb1d2284f83d93f76b/torch/utils/data/distributed.py#L96
+            if self.shuffle:
+                # deterministically shuffle based on epoch and seed
+                g = torch.Generator()
+                g.manual_seed(self.seed + self.epoch)
+                indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
+            else:
+                indices = list(range(len(self.dataset)))  # type: ignore[arg-type]
+
+            if not self.drop_last:
+                # add extra samples to make it evenly divisible
+                padding_size = self.total_size - len(indices)
+                if padding_size <= len(indices):
+                    indices += indices[:padding_size]
+                else:
+                    indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
+            else:
+                # remove tail of data to make it evenly divisible.
+                indices = indices[: self.total_size]
+            assert len(indices) == self.total_size
+
+            # subsample
+            indices = indices[
+                : self.total_size : self.num_replicas
+            ]  # num_replicas=tp_group=1, we only support tp_group==1 for now
+            assert len(indices) == self.num_samples
+
+            return iter(indices)
+
+        else:
+            iterator = super().__iter__()
+            indices = list(iterator)
+            indices = indices[self.start_index :]
+            return iter(indices)
+
+    def __len__(self) -> int:
+        return self.num_samples - self.start_index
+
+    def set_start_index(self, start_index: int) -> None:
+        self.start_index = start_index
+
+
+def setup_distributed_dataloader(
+    dataset: DatasetType,
+    batch_size: int = 1,
+    shuffle: bool = False,
+    seed: int = 1024,
+    drop_last: bool = False,
+    pin_memory: bool = False,
+    num_workers: int = 0,
+    collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
+    process_group: Optional[ProcessGroup] = None,
+    use_tp: Optional[bool] = False,
+    **kwargs,
+) -> DataLoader:
+    """
+    Setup dataloader for distributed training.
+    """
+    _kwargs = kwargs.copy()
+    process_group = process_group or _get_default_group()
+    sampler = StatefulDistributedSampler(
+        dataset=dataset,
+        num_replicas=process_group.size() if not use_tp else 1,
+        rank=process_group.rank(),
+        shuffle=shuffle,
+        seed=seed,
+        drop_last=drop_last,
+        use_tp=use_tp,
+    )
+
+    # Deterministic dataloader
+    def seed_worker(worker_id: int) -> None:
+        worker_seed = seed
+        np.random.seed(worker_seed)
+        torch.manual_seed(worker_seed)
+        random.seed(worker_seed)
+
+    return DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=collate_fn,
+        pin_memory=pin_memory,
+        drop_last=drop_last,
+        worker_init_fn=seed_worker,
+        **_kwargs,
+    )
diff --git a/applications/Chat/coati/dataset/prompt_dataset.py b/applications/Chat/coati/dataset/prompt_dataset.py
index 17120e6064b5..521b496290db 100644
--- a/applications/Chat/coati/dataset/prompt_dataset.py
+++ b/applications/Chat/coati/dataset/prompt_dataset.py
@@ -7,11 +7,21 @@
 
 from colossalai.logging import get_dist_logger
 
-from .utils import jload
+from .utils import jload, read_string_by_schema
 
 
 class PromptDataset(Dataset):
-    """Dataset for supervised fine-tuning."""
+    """Dataset for supervised fine-tuning.
+
+    Args:
+        dataset: dataset for supervised model
+        tokenizer: tokenizer for supervised model
+        max_datasets_size: number of examples to use from the dataset
+        max_length: max length of input
+        verbose: whether to display the first two item in the dataset
+        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
+             e.g. person.name.first will access data['person']['name']['first']
+    """
 
     def __init__(
         self,
@@ -19,6 +29,7 @@ def __init__(
         tokenizer: transformers.PreTrainedTokenizer,
         max_datasets_size: int = None,
         max_length: int = 96,
+        dataset_schema: Dict[str, str] = {"instruction": "instruction"},
     ):
         super(PromptDataset, self).__init__()
         self.keyed_prompt = defaultdict(list)
@@ -31,7 +42,9 @@ def __init__(
             self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
             list_data_dict = list_data_dict[:max_datasets_size]
 
-        instructions = [data_dict["instruction"] for data_dict in list_data_dict]
+        instructions = [
+            read_string_by_schema(data_dict, dataset_schema["instruction"]) + "\n" for data_dict in list_data_dict
+        ]
         tokens = tokenizer(
             instructions, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
         )
diff --git a/applications/Chat/coati/dataset/reward_dataset.py b/applications/Chat/coati/dataset/reward_dataset.py
index 3afcd7b69238..0f289c9fd866 100644
--- a/applications/Chat/coati/dataset/reward_dataset.py
+++ b/applications/Chat/coati/dataset/reward_dataset.py
@@ -1,54 +1,14 @@
-from typing import Callable
+from typing import Callable, Dict
 
 from torch.utils.data import Dataset
 from tqdm import tqdm
 
-from .utils import is_rank_0
+from colossalai.logging import get_dist_logger
 
+from .utils import is_rank_0, read_string_by_schema
 
-# Dahoas/rm-static
-class RmStaticDataset(Dataset):
-    """
-    Dataset for reward model
-
-    Args:
-        dataset: dataset for reward model
-        tokenizer: tokenizer for reward model
-        max_length: max length of input
-        special_token: special token at the end of sentence
-    """
-
-    def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
-        super().__init__()
-        self.end_token = tokenizer.eos_token if special_token is None else special_token
-
-        chosen = [data["prompt"] + data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
-        chosen_token = tokenizer(
-            chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
-        )
-        self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
-
-        reject = [data["prompt"] + data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
-        reject_token = tokenizer(
-            reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
-        )
-        self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
-
-    def __len__(self):
-        length = self.chosen["input_ids"].shape[0]
-        return length
-
-    def __getitem__(self, idx):
-        return (
-            self.chosen["input_ids"][idx],
-            self.chosen["attention_mask"][idx],
-            self.reject["input_ids"][idx],
-            self.reject["attention_mask"][idx],
-        )
 
-
-# Anthropic/hh-rlhf
-class HhRlhfDataset(Dataset):
+class PreferenceDataset(Dataset):
     """
     Dataset for reward model
 
@@ -57,19 +17,39 @@ class HhRlhfDataset(Dataset):
         tokenizer: tokenizer for reward model
         max_length: max length of input
         special_token: special token at the end of sentence
+        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
+             e.g. person.name.first will access data['person']['name']['first']
     """
 
-    def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
+    def __init__(
+        self,
+        dataset,
+        tokenizer: Callable,
+        max_length: int,
+        special_token=None,
+        dataset_schema: Dict[str, str] = {"prompt": "", "chosen": "chosen", "rejected": "rejected"},
+    ) -> None:
         super().__init__()
         self.end_token = tokenizer.eos_token if special_token is None else special_token
-
-        chosen = [data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+        chosen = [
+            (read_string_by_schema(data, dataset_schema["prompt"]) if "prompt" in dataset_schema else "")
+            + (read_string_by_schema(data, dataset_schema["chosen"]) if "chosen" in dataset_schema else "")
+            + self.end_token
+            for data in tqdm(dataset, disable=not is_rank_0())
+        ]
+        self.logger = get_dist_logger()
+        self.logger.info("Tokenizing inputs... This may take some time...")
         chosen_token = tokenizer(
             chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
         )
         self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
 
-        reject = [data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+        reject = [
+            (read_string_by_schema(data, dataset_schema["prompt"]) if "prompt" in dataset_schema else "")
+            + (read_string_by_schema(data, dataset_schema["rejected"]) if "rejected" in dataset_schema else "")
+            + self.end_token
+            for data in tqdm(dataset, disable=not is_rank_0())
+        ]
         reject_token = tokenizer(
             reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
         )
diff --git a/applications/Chat/coati/dataset/sft_dataset.py b/applications/Chat/coati/dataset/sft_dataset.py
index c0e257f54a07..d25ed2efe7f4 100644
--- a/applications/Chat/coati/dataset/sft_dataset.py
+++ b/applications/Chat/coati/dataset/sft_dataset.py
@@ -17,15 +17,14 @@
 
 import torch
 from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+from datasets import load_dataset
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer
 
-from colossalai.logging import get_dist_logger
+from colossalai.cluster import DistCoordinator
 
-from .utils import is_rank_0, jload
-
-logger = get_dist_logger()
+from .utils import is_rank_0, jload, read_string_by_schema
 
 IGNORE_INDEX = -100
 PROMPT_DICT = {
@@ -120,16 +119,28 @@ class SFTDataset(Dataset):
         dataset: dataset for supervised model
         tokenizer: tokenizer for supervised model
         max_length: max length of input
+        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
+             e.g. person.name.first will access data['person']['name']['first']
     """
 
-    def __init__(self, dataset: Dict, tokenizer: PreTrainedTokenizer, max_length: int = 512) -> None:
+    def __init__(
+        self,
+        dataset: Dict,
+        tokenizer: PreTrainedTokenizer,
+        max_length: int = 512,
+        dataset_schema: Dict[str, str] = {"prompt": "prompt", "completion": "completion"},
+    ) -> None:
         super().__init__()
         self.input_ids = []
+        self.coordinator = DistCoordinator()
 
-        sources = [data["prompt"] for data in dataset]
-        targets = [data["completion"] + tokenizer.eos_token for data in tqdm(dataset, disable=not is_rank_0())]
+        sources = [read_string_by_schema(data, dataset_schema["prompt"]) for data in dataset]
+        targets = [
+            read_string_by_schema(data, dataset_schema["completion"]) + tokenizer.eos_token
+            for data in tqdm(dataset, disable=not is_rank_0())
+        ]
 
-        logger.info("Tokenizing inputs... This may take some time...")
+        self.coordinator.print_on_master("Tokenizing inputs... This may take some time...")
         if isinstance(tokenizer, ChatGLMTokenizer):
             self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
                 sources, targets, tokenizer, max_length
@@ -137,7 +148,7 @@ def __init__(self, dataset: Dict, tokenizer: PreTrainedTokenizer, max_length: in
         else:
             self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
 
-        logger.info("Loaded dataset.")
+        self.coordinator.print_on_master("Loaded dataset.")
 
     def __len__(self):
         length = self.input_ids.shape[0]
@@ -151,7 +162,17 @@ def __getitem__(self, idx):
 
 
 class SupervisedDataset(Dataset):
-    """Dataset for supervised fine-tuning."""
+    """Dataset for supervised fine-tuning.
+
+    Args:
+        dataset: dataset for supervised model
+        tokenizer: tokenizer for supervised model
+        max_datasets_size: number of examples to use from the dataset
+        max_length: max length of input
+        prompt_dict: prompts for the dataset used to format prompt
+        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
+             e.g. person.name.first will access data['person']['name']['first']
+    """
 
     def __init__(
         self,
@@ -159,25 +180,35 @@ def __init__(
         tokenizer: PreTrainedTokenizer,
         max_datasets_size: Optional[int] = None,
         max_length: int = 512,
+        prompt_dict: Optional[Dict[str, str]] = PROMPT_DICT,
+        split: str = "train",
+        dataset_schema: Dict[str, str] = {"instruction": "instruction", "input": "input", "output": "output"},
     ):
         super().__init__()
-        logger.info("Loading data...")
-        list_data_dict = jload(data_path)
-        logger.info(f"Loaded {len(list_data_dict)} examples.")
+        self.coordinator = DistCoordinator()
+        self.coordinator.print_on_master("Loading data...")
+        try:
+            dataset = load_dataset(data_path)
+            list_data_dict = list(dataset[split])
+        except FileNotFoundError:
+            list_data_dict = jload(data_path)
+        self.coordinator.print_on_master(f"Loaded {len(list_data_dict)} examples.")
 
         if max_datasets_size is not None:
-            logger.info(f"Limiting dataset to {max_datasets_size} examples.")
+            self.coordinator.print_on_master(f"Limiting dataset to {max_datasets_size} examples.")
             list_data_dict = list_data_dict[:max_datasets_size]
 
-        logger.info("Formatting inputs...")
-        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
+        self.coordinator.print_on_master("Formatting inputs...")
+        prompt_input, prompt_no_input = prompt_dict["prompt_input"], prompt_dict["prompt_no_input"]
+        list_data_dict = [
+            {k: read_string_by_schema(example, dataset_schema[k]) for k in dataset_schema} for example in list_data_dict
+        ]
         sources = [
-            prompt_input.format_map(example) if "input" in example else prompt_no_input.format_map(example)
+            prompt_input.format_map(example) if example["input"] != "" else prompt_no_input.format_map(example)
             for example in list_data_dict
         ]
         targets = [example["output"] + tokenizer.eos_token for example in list_data_dict]
-
-        logger.info("Tokenizing inputs... This may take some time...")
+        self.coordinator.print_on_master("Tokenizing inputs... This may take some time...")
         if isinstance(tokenizer, ChatGLMTokenizer):
             self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
                 sources, targets, tokenizer, max_length
@@ -185,7 +216,7 @@ def __init__(
         else:
             self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
 
-        logger.info("Loaded dataset.")
+        self.coordinator.print_on_master("Loaded dataset.")
 
     def __len__(self):
         length = self.input_ids.shape[0]
diff --git a/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py b/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
new file mode 100644
index 000000000000..19bb84033cf1
--- /dev/null
+++ b/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
@@ -0,0 +1,465 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Splicing multiple pre-tokenized sequence data points
+"""
+
+import bisect
+import random
+import warnings
+from copy import deepcopy
+from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
+
+from colossal_llama2.utils.conversation import Conversation, default_conversation
+from datasets import dataset_dict
+from torch.utils.data import ConcatDataset, Dataset, IterableDataset
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+IGNORE_INDEX = -100
+
+DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
+
+
+def supervised_tokenize_pretrain(
+    data_point: Dict[str, str], tokenizer: LlamaTokenizer, ignore_index: int = None, max_length: int = 4096
+) -> Dict[str, Union[int, str, List[int]]]:
+    """
+    A tokenization function to tokenize an original pretraining data point as following:
+        {"source": "", "target": "Beijing, the capital of the People's Republic of China, ...", "category": "geography"}
+    """
+    assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+        "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+        "add <bos> and <eos> manually later"
+    )
+    if ignore_index is None:
+        ignore_index = IGNORE_INDEX
+
+    source_text = data_point["source"]  # `str`
+    target_text = data_point["target"]  # `str`
+    is_null_source = len(source_text) == 0
+
+    source_text = tokenizer.bos_token + source_text
+    target_text += tokenizer.eos_token
+    sequence_text = source_text + target_text
+
+    tokenized = tokenizer([source_text, sequence_text])["input_ids"]
+    sequence_input_ids = tokenized[1]
+    sequence_labels = deepcopy(sequence_input_ids)
+
+    source_length = len(tokenized[0])
+    if not is_null_source:
+        sequence_labels[:source_length] = [ignore_index for _ in range(source_length)]
+
+    # sequence truncation.
+    if len(sequence_input_ids) > max_length:
+        sequence_input_ids = sequence_input_ids[:max_length]
+        sequence_labels = sequence_labels[:max_length]
+
+    return dict(
+        input_ids=sequence_input_ids,
+        labels=sequence_labels,
+        seq_length=len(sequence_input_ids),
+        seq_category=data_point["category"],
+    )
+
+
+def supervised_tokenize_sft(
+    data_point: Dict[str, str],
+    tokenizer: LlamaTokenizer,
+    conversation_template: Conversation = default_conversation,
+    ignore_index: int = None,
+    max_length: int = 4096,
+) -> Dict[str, Union[int, str, List[int]]]:
+    """
+    A tokenization function to tokenize an original pretraining data point as following:
+        {"messages": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
+    """
+    assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+        "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+        "add <bos> and <eos> manually later"
+    )
+
+    assert (
+        tokenizer.bos_token == conversation_template.seps[0] and tokenizer.eos_token == conversation_template.seps[1]
+    ), "`bos_token` and `eos_token` should be the same with `conversation_template.seps`."
+
+    if ignore_index is None:
+        ignore_index = IGNORE_INDEX
+
+    messages = data_point["messages"]
+    template = deepcopy(conversation_template)
+    template.messages = []
+
+    for mess in messages:
+        from_str = mess["from"]
+        if from_str.lower() == "human":
+            from_str = template.roles[0]
+        elif from_str.lower() == "assistant":
+            from_str = template.roles[1]
+        else:
+            raise ValueError(f"Unsupported role {from_str.lower()}")
+
+        template.append_message(from_str, mess["content"])
+
+    if len(template.messages) % 2 != 0:
+        template.messages = template.messages[0:-1]
+
+    # `target_turn_index` is the number of turns which exceeds `max_length - 1` for the first time.
+    turns = [i for i in range(1, len(messages) // 2 + 1)]
+    target_turn_index = bisect.bisect_right(
+        turns,
+        max_length - 1,
+        key=lambda x: len(tokenizer([template.get_prompt(2 * x)], add_special_tokens=False)["input_ids"][0]),
+    )
+
+    # The tokenized length for first turn already exceeds `max_length - 1`.
+    if target_turn_index - 1 < 0:
+        return dict(
+            input_ids=None,
+            labels=None,
+            inputs_decode=None,
+            labels_decode=None,
+            seq_length=None,
+            seq_category=None,
+        )
+
+    target_turn = turns[target_turn_index - 1]
+    prompt = template.get_prompt(2 * target_turn)
+    tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
+
+    # Uncomment this to check whether `bisect_right` is right.
+    # if 2 * target_turn < len(template.messages):
+    #     length_to_next_turn = len(tokenizer([template.get_prompt(2*target_turn+2)], add_special_tokens=False)["input_ids"][0])
+    #     assert length_to_next_turn > max_length - 1, print(f"The length of the prompt until the next turn after tokenization is {length_to_next_turn}, which is smaller than {max_length - 1}")
+
+    template.messages = template.messages[0 : 2 * target_turn]
+
+    starts = []
+    ends = []
+    gpt_bos = False if template.messages[0][0] == template.roles[0] else True
+    gpt_eos = False if template.messages[0][0] == template.roles[0] else True
+
+    for i, token_id in enumerate(tokenized):
+        if token_id == tokenizer.bos_token_id:
+            if gpt_bos:
+                starts.append(i)
+            gpt_bos = not gpt_bos
+        elif token_id == tokenizer.eos_token_id:
+            if gpt_eos:
+                ends.append(i)
+            gpt_eos = not gpt_eos
+
+    if len(starts) != target_turn or len(ends) != target_turn:
+        print(
+            "Please check whether the tokenizer add additional `bos_token` and `eos_token`.\n\nOr the original message contains `bos_token` or `eos_token`."
+        )
+        return dict(
+            input_ids=None,
+            labels=None,
+            inputs_decode=None,
+            labels_decode=None,
+            seq_length=None,
+            seq_category=None,
+        )
+
+    tokenized = [tokenizer.bos_token_id] + tokenized
+    labels = [ignore_index] * len(tokenized)
+    for start, end in zip(starts, ends):
+        labels[start + 1 : end + 2] = tokenized[start + 1 : end + 2]
+
+    labels_decode = deepcopy(labels)
+    for i, z in enumerate(labels_decode):
+        if z == ignore_index:
+            labels_decode[i] = tokenizer.unk_token_id
+
+    # `inputs_decode` and `labels decode` can be used to check whether the tokenization method is true.
+    return dict(
+        input_ids=tokenized,
+        labels=labels,
+        inputs_decode=tokenizer.decode(tokenized),
+        labels_decode=tokenizer.decode(labels_decode),
+        seq_length=len(tokenized),
+        seq_category=data_point["category"] if "category" in data_point else "None",
+    )
+
+
+def generate_loss_mask(template: Conversation, tokenizer: Any, context_len: int):
+    target_turn = int(len(template.messages) / 2)
+    prompt = template.get_prompt(2 * target_turn)
+    tokenized = tokenizer([prompt], add_special_tokens=False)
+    input_ids = tokenized["input_ids"][0]
+    attention_mask = tokenized["attention_mask"][0]
+    starts = []
+    ends = []
+    gpt_bos = False if template.messages[0][0] == template.roles[0] else True
+    gpt_eos = False if template.messages[0][0] == template.roles[0] else True
+
+    for i, token_id in enumerate(input_ids):
+        if token_id == tokenizer.bos_token_id:
+            if gpt_bos:
+                starts.append(i)
+            gpt_bos = not gpt_bos
+        elif token_id == tokenizer.eos_token_id:
+            if gpt_eos:
+                ends.append(i)
+            gpt_eos = not gpt_eos
+
+    if len(starts) != target_turn or len(ends) != target_turn:
+        print(
+            "Please check whether the tokenizer add additional `bos_token` and `eos_token`.\n\nOr the original message contains `bos_token` or `eos_token`."
+        )
+        return dict(input_ids=None, attention_mask=None, loss_mask=None)
+
+    input_ids = [tokenizer.bos_token_id] + input_ids
+    attention_mask = [1] + attention_mask
+    loss_mask = [0 for _ in range(len(input_ids))]
+    starts = starts[context_len:]
+    ends = ends[context_len:]
+    for start, end in zip(starts, ends):
+        for i in range(start + 1, end + 2):
+            loss_mask[i] = 1 if attention_mask[i] else 0
+
+    return {"input_ids": input_ids, "attention_mask": attention_mask, "loss_mask": loss_mask}
+
+
+def tokenize_rlhf(
+    data_point: Dict[str, str],
+    tokenizer: LlamaTokenizer,
+    conversation_template: Conversation = default_conversation,
+    ignore_index: int = None,
+    max_length: int = 4096,
+) -> Dict[str, Union[int, str, List[int]]]:
+    """
+    A tokenization function to tokenize an original pretraining data point as following:
+        {"context": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}],
+        "chosen": {"from": "assistant", "content": "xxx"}, "rejected": {"from": "assistant", "content": "xxx"}}
+    """
+    assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+        "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+        "add <bos> and <eos> manually later"
+    )
+
+    assert (
+        tokenizer.bos_token == conversation_template.seps[0] and tokenizer.eos_token == conversation_template.seps[1]
+    ), "`bos_token` and `eos_token` should be the same with `conversation_template.seps`."
+
+    if ignore_index is None:
+        ignore_index = IGNORE_INDEX
+
+    context = data_point["context"]
+    template = deepcopy(conversation_template)
+    template.messages = []
+
+    for mess in context:
+        from_str = mess["from"]
+        if from_str.lower() == "human":
+            from_str = template.roles[0]
+        elif from_str.lower() == "assistant":
+            from_str = template.roles[1]
+        else:
+            raise ValueError(f"Unsupported role {from_str.lower()}")
+
+        if len(template.messages) > 0 and from_str == template.messages[-1][0]:
+            template.messages[-1][1] = str(template.messages[-1][1] + mess["content"])
+        else:
+            template.append_message(from_str, mess["content"])
+
+    if len(template.messages) % 2 != 1:
+        print("Please make sure leading context is started and ended with a line from human")
+        print(template.messages)
+        return dict(
+            chosen_input_ids=None,
+            chosen_attention_mask=None,
+            chosen_loss_mask=None,
+            rejected_input_ids=None,
+            rejected_attention_mask=None,
+            rejected_loss_mask=None,
+        )
+    round_of_context = int((len(template.messages) - 1) / 2)
+
+    assert context[-1]["from"].lower() == "human", "The last message in context should be from human."
+    chosen = deepcopy(template)
+    rejected = deepcopy(template)
+
+    for round in range(len(data_point["chosen"])):
+        from_str = data_point["chosen"][round]["from"]
+        if from_str.lower() == "human":
+            from_str = template.roles[0]
+        elif from_str.lower() == "assistant":
+            from_str = template.roles[1]
+        else:
+            raise ValueError(f"Unsupported role {from_str.lower()}")
+        chosen.append_message(from_str, data_point["chosen"][round]["content"])
+
+    for round in range(len(data_point["rejected"])):
+        from_str = data_point["rejected"][round]["from"]
+        if from_str.lower() == "human":
+            from_str = template.roles[0]
+        elif from_str.lower() == "assistant":
+            from_str = template.roles[1]
+        else:
+            raise ValueError(f"Unsupported role {from_str.lower()}")
+        rejected.append_message(from_str, data_point["rejected"][round]["content"])
+
+    (
+        chosen_input_ids,
+        chosen_attention_mask,
+        chosen_loss_mask,
+        rejected_input_ids,
+        rejected_attention_mask,
+        rejected_loss_mask,
+    ) = (None, None, None, None, None, None)
+    if (
+        len(tokenizer([chosen.get_prompt(len(chosen.messages))], add_special_tokens=False)["input_ids"][0])
+        <= max_length - 1
+        and len(tokenizer([rejected.get_prompt(len(rejected.messages))], add_special_tokens=False)["input_ids"][0])
+        <= max_length - 1
+    ):
+        chosen_data_packed = generate_loss_mask(chosen, tokenizer, round_of_context)
+        (chosen_input_ids, chosen_attention_mask, chosen_loss_mask) = (
+            chosen_data_packed["input_ids"],
+            chosen_data_packed["attention_mask"],
+            chosen_data_packed["loss_mask"],
+        )
+
+        rejected_data_packed = generate_loss_mask(rejected, tokenizer, round_of_context)
+        (rejected_input_ids, rejected_attention_mask, rejected_loss_mask) = (
+            rejected_data_packed["input_ids"],
+            rejected_data_packed["attention_mask"],
+            rejected_data_packed["loss_mask"],
+        )
+
+        return {
+            "chosen_input_ids": chosen_input_ids,
+            "chosen_attention_mask": chosen_attention_mask,
+            "chosen_loss_mask": chosen_loss_mask,
+            "rejected_input_ids": rejected_input_ids,
+            "rejected_attention_mask": rejected_attention_mask,
+            "rejected_loss_mask": rejected_loss_mask,
+        }
+    else:
+        return dict(
+            chosen_input_ids=None,
+            chosen_attention_mask=None,
+            chosen_loss_mask=None,
+            rejected_input_ids=None,
+            rejected_attention_mask=None,
+            rejected_loss_mask=None,
+        )
+
+
+class ClosedToConstantLengthSplicedDataset(IterableDataset):
+    """
+    Define an iterable dataset that returns a (close to) constant length data point spliced from multiple
+    original independent (pre-tokenized) data points.
+    """
+
+    def __init__(
+        self,
+        dataset: DSType,
+        tokenizer: PreTrainedTokenizer,
+        max_length: int = 4096,
+        num_packed_sequences: int = 8,
+        fetch_sequence_func: Callable[[Any], Tuple[List[int], List[int]]] = None,
+        input_ids_field: str = "input_ids",
+        labels_field: str = "labels",
+        infinite: bool = False,
+        shuffle: bool = True,
+        error_strict: bool = False,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.dataset = dataset
+        self.max_length = max_length
+        self.infinite = infinite
+        self.max_buffer_size = max_length * num_packed_sequences  # e.g., 4096 * 16
+        self.shuffle = shuffle
+
+        # Callable[[Dict[str, Any]], Tuple[List[int], List[int]]],
+        # A function that fetch sequence input_ids and labels from the original data point
+        if fetch_sequence_func is None:
+            self.fetch_sequence_func = lambda data_point: (data_point[input_ids_field], data_point[labels_field])
+        else:
+            self.fetch_sequence_func = fetch_sequence_func
+        self.input_ids_field = input_ids_field
+        self.labels_field = labels_field
+
+        self.error_strict = error_strict
+        self.current_size = 0  # `int`, current packed data size.
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def __iter__(self) -> Iterable[Dict[str, List[int]]]:
+        iterator = iter(self.dataset)
+        more_data_points = True
+        while more_data_points is True:
+            buffer, buffer_len = [], 0
+            while True:
+                # ending condition.
+                if buffer_len >= self.max_buffer_size:
+                    break
+                try:
+                    # `Tuple[List[int], List[int]]`
+                    seq_input_ids, seq_labels = self.fetch_sequence_func(next(iterator))
+                    buffer.append({self.input_ids_field: seq_input_ids, self.labels_field: seq_labels})
+                    buffer_len += len(buffer[-1][self.input_ids_field])
+                except StopIteration:
+                    if self.infinite is True:
+                        iterator = iter(self.dataset)
+                        warnings.warn("The dataset reached end and the iterator is reset to the start.")
+                    else:
+                        more_data_points = False
+                        break
+            examples = []  # `List[Dict[str, List[int]]]`, save buffered spliced data points.
+            spliced_input_ids, spliced_labels = [], []  # `List[int]`, `List[int]`
+            for i, data_point in enumerate(buffer):
+                # TODO(2023-09-18) check errors for each unspliced tokenized data point
+                seq_input_ids = data_point[self.input_ids_field]
+                seq_labels = data_point[self.labels_field]
+                # Handle special case:
+                # If the length of an original data point (i.e., input_ids length of a data point before splicing)
+                # exceeds `max_length`, truncate it.
+                if len(seq_input_ids) > self.max_length:
+                    truncated_seq_input_ids = seq_input_ids[: self.max_length]
+                    truncated_label_ids = seq_labels[: self.max_length]
+                    if set(truncated_label_ids) == {IGNORE_INDEX}:
+                        if self.error_strict is True:
+                            raise ValueError(
+                                f"Find an out-of-bounds length({len(seq_input_ids)}) data point "
+                                f"with all label values as {IGNORE_INDEX}."
+                            )
+                        else:
+                            warnings.warn(f"Filter an error truncated data point (labels all {IGNORE_INDEX})")
+                            continue  # Skip the current error data point.
+                    spliced_data_point = {
+                        self.input_ids_field: truncated_seq_input_ids,
+                        self.labels_field: truncated_label_ids,
+                    }
+                    examples.append(spliced_data_point)
+                    warnings.warn("Find a data point to be truncated.")
+                    continue
+
+                # Pre action judgment.
+                if len(spliced_input_ids) + len(seq_input_ids) > self.max_length:
+                    spliced_data_point = {
+                        self.input_ids_field: spliced_input_ids,
+                        self.labels_field: spliced_labels,
+                    }  # `Dict[str, List[int]]`
+                    # Update.
+                    spliced_input_ids, spliced_labels = [], []
+                    spliced_input_ids.extend(seq_input_ids)
+                    spliced_labels.extend(seq_labels)
+                    examples.append(spliced_data_point)
+                else:
+                    spliced_input_ids.extend(seq_input_ids)
+                    spliced_labels.extend(seq_labels)
+            # For residual spliced data point at the end of the data set
+            if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
+                examples.append({self.input_ids_field: spliced_input_ids, self.labels_field: spliced_labels})
+            if self.shuffle:
+                random.shuffle(examples)
+            for spliced_data_point in examples:
+                # TODO(2023-09-18): check errors for each spliced tokenized data point.
+                self.current_size += 1
+                yield spliced_data_point
diff --git a/applications/Chat/coati/dataset/utils.py b/applications/Chat/coati/dataset/utils.py
index f37fce67a7c6..1652aed4a2a0 100644
--- a/applications/Chat/coati/dataset/utils.py
+++ b/applications/Chat/coati/dataset/utils.py
@@ -1,7 +1,10 @@
 import io
 import json
+from typing import Any, Dict, List
 
+import torch
 import torch.distributed as dist
+import torch.nn.functional as F
 
 
 def is_rank_0() -> bool:
@@ -20,3 +23,56 @@ def jload(f, mode="r"):
     jdict = json.load(f)
     f.close()
     return jdict
+
+
+def read_string_by_schema(data: Dict[str, Any], schema: str) -> str:
+    """
+    Read a feild of the dataset be schema
+    Args:
+        data: Dict[str, Any]
+        schema: cascaded feild names seperated by '.'. e.g. person.name.first will access data['person']['name']['first']
+    """
+    keys = schema.split(".")
+    result = data
+    for key in keys:
+        result = result.get(key, None)
+        if result is None:
+            return ""
+    assert isinstance(result, str), f"dataset element is not a string: {result}"
+    return result
+
+
+def pad_to_max_len(
+    sequence: List[torch.Tensor], max_length: int, padding_value: int, batch_first: bool = True, padding_side="left"
+):
+    """
+    Args:
+        sequence: a batch of tensor of shape [batch_size, seq_len] if batch_first==True
+    """
+    if padding_side == "left":
+        reversed_sequence = [seq.flip(dims=(0,)) for seq in sequence]
+        padded = torch.nn.utils.rnn.pad_sequence(
+            sequences=reversed_sequence, batch_first=batch_first, padding_value=padding_value
+        )
+        to_pad = max_length - padded.size(1)
+        padded = F.pad(padded, (0, to_pad), value=padding_value)
+        return torch.flip(padded, dims=(1,))
+    elif padding_side == "right":
+        padded = torch.nn.utils.rnn.pad_sequence(
+            sequences=sequence, batch_first=batch_first, padding_value=padding_value
+        )
+        to_pad = max_length - padded.size(1)
+        return F.pad(padded, (0, to_pad), value=padding_value)
+    else:
+        raise RuntimeError(f"`padding_side` can only be `left` or `right`, " f"but now `{padding_side}`")
+
+
+def chuncate_sequence(sequence: List[torch.Tensor], max_length: int, dtype: Any):
+    """
+    Args:
+        sequence: a batch of tensor of shape [batch_size, seq_len] if batch_first==True
+    """
+    return [
+        torch.Tensor(seq[:max_length]).to(dtype) if len(seq) > max_length else torch.Tensor(seq).to(dtype)
+        for seq in sequence
+    ]
diff --git a/applications/Chat/coati/experience_buffer/utils.py b/applications/Chat/coati/experience_buffer/utils.py
index baedbebd184f..c4807d179d90 100644
--- a/applications/Chat/coati/experience_buffer/utils.py
+++ b/applications/Chat/coati/experience_buffer/utils.py
@@ -26,6 +26,7 @@ class BufferItem:
     action_log_probs: torch.Tensor
     values: torch.Tensor
     reward: torch.Tensor
+    kl: torch.Tensor
     advantages: torch.Tensor
     attention_mask: Optional[torch.LongTensor]
     action_mask: Optional[torch.BoolTensor]
@@ -34,7 +35,7 @@ class BufferItem:
 def split_experience_batch(experience: Experience) -> List[BufferItem]:
     batch_size = experience.sequences.size(0)
     batch_kwargs = [{} for _ in range(batch_size)]
-    keys = ("sequences", "action_log_probs", "values", "reward", "advantages", "attention_mask", "action_mask")
+    keys = ("sequences", "action_log_probs", "values", "reward", "kl", "advantages", "attention_mask", "action_mask")
     for key in keys:
         value = getattr(experience, key)
         if isinstance(value, torch.Tensor):
@@ -63,7 +64,7 @@ def _zero_pad_sequences(sequences: List[torch.Tensor], side: str = "left") -> to
 def make_experience_batch(items: List[BufferItem]) -> Experience:
     kwargs = {}
     to_pad_keys = set(("action_log_probs", "action_mask"))
-    keys = ("sequences", "action_log_probs", "values", "reward", "advantages", "attention_mask", "action_mask")
+    keys = ("sequences", "action_log_probs", "values", "reward", "kl", "advantages", "attention_mask", "action_mask")
     for key in keys:
         vals = [getattr(item, key) for item in items]
         if key in to_pad_keys:
diff --git a/applications/Chat/coati/experience_maker/base.py b/applications/Chat/coati/experience_maker/base.py
index 0731f6e0f97f..c98d9f9256b4 100644
--- a/applications/Chat/coati/experience_maker/base.py
+++ b/applications/Chat/coati/experience_maker/base.py
@@ -3,7 +3,7 @@
 from typing import Optional
 
 import torch
-from coati.models.base import Actor, Critic, RewardModel
+from coati.models import Actor, Critic, RewardModel
 
 
 @dataclass
@@ -28,6 +28,7 @@ class Experience:
     action_log_probs: torch.Tensor
     values: torch.Tensor
     reward: torch.Tensor
+    kl: torch.Tensor
     advantages: torch.Tensor
     attention_mask: Optional[torch.LongTensor]
     action_mask: Optional[torch.BoolTensor]
diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index 941e1994b148..f0145503df91 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn.functional as F
-from coati.models.base import Actor, Critic, RewardModel
+from coati.models import Actor, Critic, RewardModel
 from coati.models.generation import generate
 from coati.models.utils import calc_action_log_probs, compute_reward
 from transformers import PreTrainedTokenizer
@@ -20,11 +20,29 @@ def __init__(
         reward_model: RewardModel,
         initial_model: Actor,
         tokenizer: PreTrainedTokenizer,
-        kl_coef: float = 0.1,
+        rm_model_tokenizer: PreTrainedTokenizer,
+        kl_coef: float = 0.01,
+        gamma: float = 1.0,
+        lam: float = 0.95,
     ) -> None:
         super().__init__(actor, critic, reward_model, initial_model)
         self.tokenizer = tokenizer
+        self.rm_model_tokenizer = rm_model_tokenizer
         self.kl_coef = kl_coef
+        self.gamma = gamma
+        self.lam = lam
+
+    @torch.no_grad()
+    def calculate_advantage(self, value, reward, num_actions):
+        lastgaelam = 0
+        advantages_reversed = []
+        for t in reversed(range(num_actions)):
+            nextvalues = value[:, t + 1] if t < num_actions - 1 else 0.0
+            delta = reward[:, t] + self.gamma * nextvalues - value[:, t]
+            lastgaelam = delta + self.gamma * self.lam * lastgaelam
+            advantages_reversed.append(lastgaelam)
+        advantages = torch.stack(advantages_reversed[::-1], dim=1)
+        return advantages
 
     @torch.no_grad()
     def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
@@ -34,8 +52,12 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
         self.reward_model.eval()
 
         # generate sequences
+
         sequences = generate(self.actor, input_ids, self.tokenizer, **generate_kwargs)
 
+        self.actor.train()
+        self.critic.train()
+
         # calculate auxiliary tensors
         attention_mask = None
         pad_token_id = self.tokenizer.pad_token_id
@@ -57,15 +79,28 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
 
         actor_output = self.actor(sequences, attention_mask)["logits"]
         action_log_probs = calc_action_log_probs(actor_output, sequences, num_actions)
+
         base_model_output = self.initial_model(sequences, attention_mask)["logits"]
+
         base_action_log_probs = calc_action_log_probs(base_model_output, sequences, num_actions)
         value = self.critic(sequences, attention_mask)
-        r = self.reward_model(sequences, attention_mask)
-        reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
+        sequences_text = self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
+
+        sequences_rm = self.rm_model_tokenizer(
+            sequences_text, return_tensors="pt", padding="max_length", truncation=True, max_length=300
+        )
+        r = self.reward_model(
+            **{
+                "sequences": sequences_rm["input_ids"].to(dtype=torch.long, device=sequences.device),
+                "attention_mask": sequences_rm["attention_mask"].to(device=sequences.device),
+            }
+        )
+        reward, kl = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
+        value = value[:, -num_actions:] * action_mask
+        advantages = self.calculate_advantage(value, reward, num_actions)
 
-        advantage = reward - value
-        # TODO(ver217): maybe normalize adv
-        if advantage.ndim == 1:
-            advantage = advantage.unsqueeze(-1)
+        advantages = advantages.detach()
+        value = value.detach()
+        r = r.detach()
 
-        return Experience(sequences, action_log_probs, value, reward, advantage, attention_mask, action_mask)
+        return Experience(sequences, action_log_probs, value, r, kl, advantages, attention_mask, action_mask)
diff --git a/applications/Chat/coati/models/__init__.py b/applications/Chat/coati/models/__init__.py
index ad4a525b4af2..22b71ec43fcc 100644
--- a/applications/Chat/coati/models/__init__.py
+++ b/applications/Chat/coati/models/__init__.py
@@ -1,8 +1,14 @@
-from .base import Actor, Critic, RewardModel
-from .lora import LoRAModule, convert_to_lora_module
-from .loss import LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
+from .actor import Actor
+from .base import BaseModel
+from .critic import Critic
+from .generation import generate
+from .lora import convert_to_lora_module
+from .loss import DpoLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
+from .reward_model import RewardModel
+from .utils import load_checkpoint, save_checkpoint
 
 __all__ = [
+    "BaseModel",
     "Actor",
     "Critic",
     "RewardModel",
@@ -10,6 +16,9 @@
     "ValueLoss",
     "LogSigLoss",
     "LogExpLoss",
-    "LoRAModule",
     "convert_to_lora_module",
+    "save_checkpoint",
+    "load_checkpoint",
+    "DpoLoss",
+    "generate",
 ]
diff --git a/applications/Chat/coati/models/actor.py b/applications/Chat/coati/models/actor.py
new file mode 100644
index 000000000000..6db4f7f6c722
--- /dev/null
+++ b/applications/Chat/coati/models/actor.py
@@ -0,0 +1,28 @@
+from typing import Optional
+
+import torch
+from coati.models import BaseModel
+from transformers import PretrainedConfig
+
+
+class Actor(BaseModel):
+    """
+    Actor model base class.
+
+    Args:
+        pretrained (str): path to pretrained model.
+        config (PretrainedConfig): PretrainedConfig used to initiate the base model.
+    """
+
+    def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] = None) -> None:
+        super().__init__(pretrained=pretrained, config=config)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **model_kwargs,
+    ) -> torch.Tensor:
+        """Returns model output."""
+        output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
+        return output
diff --git a/applications/Chat/coati/models/base.py b/applications/Chat/coati/models/base.py
new file mode 100644
index 000000000000..afe2c6da7563
--- /dev/null
+++ b/applications/Chat/coati/models/base.py
@@ -0,0 +1,29 @@
+from typing import Optional
+
+import torch.nn as nn
+from transformers import AutoModelForCausalLM, PretrainedConfig
+
+
+class BaseModel(nn.Module):
+    """
+    Actor model base class.
+
+    Args:
+        pretrained (str): path to pretrained model.
+        config (PretrainedConfig): PretrainedConfig used to initiate the base model.
+    """
+
+    def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] = None) -> None:
+        super().__init__()
+        if pretrained is not None:
+            if config is not None:
+                # initialize with config and load weights from pretrained
+                self.model = AutoModelForCausalLM.from_pretrained(pretrained, config=config)
+            else:
+                # initialize with pretrained
+                self.model = AutoModelForCausalLM.from_pretrained(pretrained)
+        elif config is not None:
+            # initialize with config
+            self.model = AutoModelForCausalLM(config)
+        else:
+            raise ValueError("Either pretrained or config must be provided.")
diff --git a/applications/Chat/coati/models/base/__init__.py b/applications/Chat/coati/models/base/__init__.py
deleted file mode 100644
index 5c9905bb2224..000000000000
--- a/applications/Chat/coati/models/base/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from typing import Union
-
-import torch.nn as nn
-
-from .actor import Actor
-from .critic import Critic
-from .reward_model import RewardModel
-
-
-def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
-    """Get the base model of our wrapper classes.
-    For Actor, Critic and RewardModel, return ``model.model``,
-    it's usually a ``transformers.PreTrainedModel``.
-
-    Args:
-        model (nn.Module): model to get base model from
-
-    Returns:
-        nn.Module: the base model
-    """
-    assert isinstance(
-        model, (Actor, Critic, RewardModel)
-    ), f"Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first."
-    return model.model
-
-
-__all__ = ["Actor", "Critic", "RewardModel", "get_base_model"]
diff --git a/applications/Chat/coati/models/base/actor.py b/applications/Chat/coati/models/base/actor.py
deleted file mode 100644
index 8b2b81ed071c..000000000000
--- a/applications/Chat/coati/models/base/actor.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from ..lora import LoRAModule
-
-
-class Actor(LoRAModule):
-    """
-    Actor model base class.
-
-    Args:
-        model (nn.Module): Actor Model.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
-        super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
-        self.model = model
-        self.convert_to_lora()
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        **model_kwargs,
-    ) -> torch.Tensor:
-        """Returns model output."""
-        output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
-        return output
-    
diff --git a/applications/Chat/coati/models/base/critic.py b/applications/Chat/coati/models/base/critic.py
deleted file mode 100644
index 8672365f5783..000000000000
--- a/applications/Chat/coati/models/base/critic.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import torch
-import torch.nn as nn
-
-from ..lora import LoRAModule
-
-
-class Critic(LoRAModule):
-    """
-    Critic model base class.
-
-    Args:
-        model (nn.Module): Critic model.
-        value_head (nn.Module): Value head to get value.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self, model: nn.Module, value_head: nn.Module, lora_rank: int = 0, lora_train_bias: str = "none"
-    ) -> None:
-        super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
-        self.model = model
-        self.value_head = value_head
-        self.convert_to_lora()
-
-    def forward(self, sequences: torch.LongTensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        outputs = self.model(sequences, attention_mask=attention_mask)
-        last_hidden_states = outputs["last_hidden_state"]
-        sequence_lengths = torch.max(attention_mask * torch.arange(sequences.size(1), device=sequences.device), dim=1)[
-            0
-        ]
-        sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]
-        values = self.value_head(sequence_hidden_states).squeeze(1)  # ensure shape is (B, )
-        return values
diff --git a/applications/Chat/coati/models/base/reward_model.py b/applications/Chat/coati/models/base/reward_model.py
deleted file mode 100644
index e9545d1cddaf..000000000000
--- a/applications/Chat/coati/models/base/reward_model.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from ..lora import LoRAModule
-
-
-class RewardModel(LoRAModule):
-    """
-    Reward model base class.
-
-    Args:
-        model (nn.Module): Reward model.
-        value_head (nn.Module): Value head to get reward score.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        model: nn.Module,
-        value_head: Optional[nn.Module] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
-        self.model = model
-        self.convert_to_lora()
-
-        if value_head is not None:
-            if value_head.out_features != 1:
-                raise ValueError("The value head of reward model's output dim should be 1!")
-            self.value_head = value_head
-        else:
-            self.value_head = nn.Linear(model.config.n_embd, 1)
-
-    def forward(self, sequences: torch.LongTensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        outputs = self.model(sequences, attention_mask=attention_mask)
-        last_hidden_states = outputs["last_hidden_state"]
-        sequence_lengths = torch.max(attention_mask * torch.arange(sequences.size(1), device=sequences.device), dim=1)[
-            0
-        ]
-        sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]
-        values = self.value_head(sequence_hidden_states).squeeze(1)  # ensure shape is (B, )
-        return values
diff --git a/applications/Chat/coati/models/bloom/__init__.py b/applications/Chat/coati/models/bloom/__init__.py
deleted file mode 100644
index 7af199a67d3b..000000000000
--- a/applications/Chat/coati/models/bloom/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .bloom_actor import BLOOMActor
-from .bloom_critic import BLOOMCritic
-from .bloom_rm import BLOOMRM
-
-__all__ = ["BLOOMActor", "BLOOMCritic", "BLOOMRM"]
diff --git a/applications/Chat/coati/models/bloom/bloom_actor.py b/applications/Chat/coati/models/bloom/bloom_actor.py
deleted file mode 100644
index 73855a2245e7..000000000000
--- a/applications/Chat/coati/models/bloom/bloom_actor.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from typing import Optional
-
-from transformers import BloomConfig, BloomForCausalLM
-
-from ..base import Actor
-
-
-class BLOOMActor(Actor):
-    """
-    BLOOM Actor model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (BloomConfig): Model config.
-        checkpoint (bool): Enable gradient checkpointing.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: str = None,
-        config: Optional[BloomConfig] = None,
-        checkpoint: bool = False,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        if pretrained is not None:
-            model = BloomForCausalLM.from_pretrained(pretrained)
-        elif config is not None:
-            model = BloomForCausalLM(config)
-        else:
-            model = BloomForCausalLM(BloomConfig())
-        if checkpoint:
-            model.gradient_checkpointing_enable()
-        super().__init__(model, lora_rank, lora_train_bias)
diff --git a/applications/Chat/coati/models/bloom/bloom_critic.py b/applications/Chat/coati/models/bloom/bloom_critic.py
deleted file mode 100644
index b2d838f7ffc5..000000000000
--- a/applications/Chat/coati/models/bloom/bloom_critic.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers import BloomConfig, BloomModel
-
-from ..base import Critic
-
-
-class BLOOMCritic(Critic):
-    """
-    BLOOM Critic model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (BloomConfig): Model config.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: str = None,
-        config: Optional[BloomConfig] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-        **kwargs,
-    ) -> None:
-        if pretrained is not None:
-            model = BloomModel.from_pretrained(pretrained)
-        elif config is not None:
-            model = BloomModel(config)
-        else:
-            model = BloomModel(BloomConfig())
-
-        value_head = nn.Linear(model.config.hidden_size, 1)
-        super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)
diff --git a/applications/Chat/coati/models/bloom/bloom_rm.py b/applications/Chat/coati/models/bloom/bloom_rm.py
deleted file mode 100644
index c09457ddc8c7..000000000000
--- a/applications/Chat/coati/models/bloom/bloom_rm.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers import BloomConfig, BloomModel
-
-from ..base import RewardModel
-
-
-class BLOOMRM(RewardModel):
-    """
-    BLOOM Reward model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (BloomConfig): Model config.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: str = None,
-        config: Optional[BloomConfig] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        if pretrained is not None:
-            model = BloomModel.from_pretrained(pretrained)
-        elif config is not None:
-            model = BloomModel(config)
-        else:
-            model = BloomModel(BloomConfig())
-
-        value_head = nn.Linear(model.config.hidden_size, 1)
-        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.hidden_size + 1))
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/Chat/coati/models/chatglm/__init__.py b/applications/Chat/coati/models/chatglm/__init__.py
deleted file mode 100644
index 5956f5a8e91b..000000000000
--- a/applications/Chat/coati/models/chatglm/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .chatglm_actor import ChatGLMActor
-
-__all__ = ["ChatGLMActor"]
diff --git a/applications/Chat/coati/models/chatglm/chatglm_actor.py b/applications/Chat/coati/models/chatglm/chatglm_actor.py
deleted file mode 100644
index 00a61561ee47..000000000000
--- a/applications/Chat/coati/models/chatglm/chatglm_actor.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from typing import Optional
-
-from ..base import Actor
-from .configuration_chatglm import ChatGLMConfig
-from .modeling_chatglm import ChatGLMForConditionalGeneration
-
-
-class ChatGLMActor(Actor):
-    """
-    ChatGLM Actor model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (ChatGLMConfig): Model config.
-        checkpoint (bool): Enable gradient checkpointing.
-
-    do not support lora for now.
-    """
-
-    def __init__(
-        self, pretrained: str = None, config: Optional[ChatGLMConfig] = None, checkpoint: bool = False
-    ) -> None:
-        if pretrained is not None:
-            model = ChatGLMForConditionalGeneration.from_pretrained(pretrained)
-        elif config is not None:
-            model = ChatGLMForConditionalGeneration(config)
-        else:
-            model = ChatGLMForConditionalGeneration(ChatGLMConfig())
-        if checkpoint:
-            model.gradient_checkpointing_enable()
-        super().__init__(model, lora_rank=0, lora_train_bias="none")
diff --git a/applications/Chat/coati/models/chatglm/chatglm_tokenizer.py b/applications/Chat/coati/models/chatglm/chatglm_tokenizer.py
deleted file mode 100644
index 221ef044b470..000000000000
--- a/applications/Chat/coati/models/chatglm/chatglm_tokenizer.py
+++ /dev/null
@@ -1,442 +0,0 @@
-"""
-This code is copied from https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py
-"""
-"""Tokenization classes for ChatGLM."""
-import os
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-import sentencepiece as spm
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
-from transformers.utils import PaddingStrategy, logging
-
-logger = logging.get_logger(__name__)
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "THUDM/chatglm-6b": 2048,
-}
-
-
-class TextTokenizer:
-    def __init__(self, model_path):
-        self.sp = spm.SentencePieceProcessor()
-        self.sp.Load(model_path)
-        self.num_tokens = self.sp.vocab_size()
-
-    def encode(self, text):
-        return self.sp.EncodeAsIds(text)
-
-    def decode(self, ids: List[int]):
-        return self.sp.DecodeIds(ids)
-
-    def tokenize(self, text):
-        return self.sp.EncodeAsPieces(text)
-
-    def convert_tokens_to_string(self, tokens):
-        return self.sp.DecodePieces(tokens)
-
-    def convert_tokens_to_ids(self, tokens):
-        return [self.sp.PieceToId(token) for token in tokens]
-
-    def convert_token_to_id(self, token):
-        return self.sp.PieceToId(token)
-
-    def convert_id_to_token(self, idx):
-        return self.sp.IdToPiece(idx)
-
-    def __len__(self):
-        return self.num_tokens
-
-
-class SPTokenizer:
-    def __init__(
-        self,
-        vocab_file,
-        num_image_tokens=20000,
-        max_blank_length=80,
-        byte_fallback=True,
-    ):
-        assert vocab_file is not None
-        self.vocab_file = vocab_file
-        self.num_image_tokens = num_image_tokens
-        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
-        self.max_blank_length = max_blank_length
-        self.byte_fallback = byte_fallback
-        self.text_tokenizer = TextTokenizer(vocab_file)
-
-    def _get_text_tokenizer(self):
-        return self.text_tokenizer
-
-    @staticmethod
-    def get_blank_token(length: int):
-        assert length >= 2
-        return f"<|blank_{length}|>"
-
-    @staticmethod
-    def get_tab_token():
-        return f"<|tab|>"
-
-    @property
-    def num_text_tokens(self):
-        return self.text_tokenizer.num_tokens
-
-    @property
-    def num_tokens(self):
-        return self.num_image_tokens + self.num_text_tokens
-
-    @staticmethod
-    def _encode_whitespaces(text: str, max_len: int = 80):
-        text = text.replace("\t", SPTokenizer.get_tab_token())
-        for i in range(max_len, 1, -1):
-            text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
-        return text
-
-    def _preprocess(self, text: str, linebreak=True, whitespaces=True):
-        if linebreak:
-            text = text.replace("\n", "<n>")
-        if whitespaces:
-            text = self._encode_whitespaces(text, max_len=self.max_blank_length)
-        return text
-
-    def encode(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[int]:
-        """
-        @param text: Text to encode.
-        @param linebreak: Whether to encode newline (\n) in text.
-        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
-        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
-        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
-        """
-        text = self._preprocess(text, linebreak, whitespaces)
-        if not add_dummy_prefix:
-            text = "<n>" + text
-        tmp = self._get_text_tokenizer().encode(text)
-        tokens = [x + self.num_image_tokens for x in tmp]
-        return tokens if add_dummy_prefix else tokens[2:]
-
-    def postprocess(self, text):
-        text = text.replace("<n>", "\n")
-        text = text.replace(SPTokenizer.get_tab_token(), "\t")
-        for i in range(2, self.max_blank_length + 1):
-            text = text.replace(self.get_blank_token(i), " " * i)
-        return text
-
-    def decode(self, text_ids: List[int]) -> str:
-        ids = [int(_id) - self.num_image_tokens for _id in text_ids]
-        ids = [_id for _id in ids if _id >= 0]
-        text = self._get_text_tokenizer().decode(ids)
-        text = self.postprocess(text)
-        return text
-
-    def decode_tokens(self, tokens: List[str]) -> str:
-        text = self._get_text_tokenizer().convert_tokens_to_string(tokens)
-        text = self.postprocess(text)
-        return text
-
-    def tokenize(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[str]:
-        """
-        @param text: Text to encode.
-        @param linebreak: Whether to encode newline (\n) in text.
-        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
-        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
-        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
-        """
-        text = self._preprocess(text, linebreak, whitespaces)
-        if not add_dummy_prefix:
-            text = "<n>" + text
-        tokens = self._get_text_tokenizer().tokenize(text)
-        return tokens if add_dummy_prefix else tokens[2:]
-
-    def __getitem__(self, x: Union[int, str]):
-        if isinstance(x, int):
-            if x < self.num_image_tokens:
-                return "<image_{}>".format(x)
-            else:
-                return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
-        elif isinstance(x, str):
-            if x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit():
-                return int(x[7:-1])
-            else:
-                return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
-        else:
-            raise ValueError("The key should be str or int.")
-
-
-class ChatGLMTokenizer(PreTrainedTokenizer):
-    """
-    Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-    """
-
-    vocab_files_names = {"vocab_file": "ice_text.model"}
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["input_ids", "attention_mask", "position_ids"]
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=False,
-        remove_space=False,
-        bos_token="<sop>",
-        eos_token="<eop>",
-        end_token="</s>",
-        mask_token="[MASK]",
-        gmask_token="[gMASK]",
-        padding_side="left",
-        pad_token="<pad>",
-        unk_token="<unk>",
-        num_image_tokens=20000,
-        **kwargs,
-    ) -> None:
-        super().__init__(
-            do_lower_case=do_lower_case,
-            remove_space=remove_space,
-            padding_side=padding_side,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            end_token=end_token,
-            mask_token=mask_token,
-            gmask_token=gmask_token,
-            pad_token=pad_token,
-            unk_token=unk_token,
-            num_image_tokens=num_image_tokens,
-            **kwargs,
-        )
-
-        self.do_lower_case = do_lower_case
-        self.remove_space = remove_space
-        self.vocab_file = vocab_file
-
-        self.bos_token = bos_token
-        self.eos_token = eos_token
-        self.end_token = end_token
-        self.mask_token = mask_token
-        self.gmask_token = gmask_token
-
-        self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
-
-        """ Initialisation """
-
-    @property
-    def gmask_token_id(self) -> Optional[int]:
-        if self.gmask_token is None:
-            return None
-        return self.convert_tokens_to_ids(self.gmask_token)
-
-    @property
-    def end_token_id(self) -> Optional[int]:
-        """
-        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
-        set.
-        """
-        if self.end_token is None:
-            return None
-        return self.convert_tokens_to_ids(self.end_token)
-
-    @property
-    def vocab_size(self):
-        """Returns vocab size"""
-        return self.sp_tokenizer.num_tokens
-
-    def get_vocab(self):
-        """Returns vocab as a dict"""
-        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def preprocess_text(self, inputs):
-        if self.remove_space:
-            outputs = " ".join(inputs.strip().split())
-        else:
-            outputs = inputs
-
-        if self.do_lower_case:
-            outputs = outputs.lower()
-
-        return outputs
-
-    def _tokenize(self, text, **kwargs):
-        """Returns a tokenized string."""
-        text = self.preprocess_text(text)
-
-        seq = self.sp_tokenizer.tokenize(text)
-
-        return seq
-
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        return self.sp_tokenizer.decode_tokens(tokens)
-
-    def _decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if len(token_ids) == 0:
-            return ""
-        if self.pad_token_id in token_ids:  # remove pad
-            token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
-        return super()._decode(token_ids, **kwargs)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_tokenizer[token]
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_tokenizer[index]
-
-    def save_vocabulary(self, save_directory, filename_prefix=None):
-        """
-        Save the vocabulary and special tokens file to a directory.
-
-        Args:
-            save_directory (`str`):
-                The directory in which to save the vocabulary.
-            filename_prefix (`str`, *optional*):
-                An optional prefix to add to the named of the saved files.
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"])
-        else:
-            vocab_file = save_directory
-
-        with open(self.vocab_file, "rb") as fin:
-            proto_str = fin.read()
-
-        with open(vocab_file, "wb") as writer:
-            writer.write(proto_str)
-
-        return (vocab_file,)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A BERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        gmask_id = self.sp_tokenizer[self.gmask_token]
-        self.sp_tokenizer[self.eos_token]
-        token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
-        if token_ids_1 is not None:
-            token_ids_0 = token_ids_0 + token_ids_1
-        return token_ids_0
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        bos_token_id = self.sp_tokenizer[self.bos_token]
-        mask_token_id = self.sp_tokenizer[self.mask_token]
-        gmask_token_id = self.sp_tokenizer[self.gmask_token]
-        assert self.padding_side == "left"
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        seq_length = len(required_input)
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if max_length is not None:
-            if "attention_mask" not in encoded_inputs:
-                if bos_token_id in required_input:
-                    context_length = required_input.index(bos_token_id)
-                else:
-                    context_length = seq_length
-                attention_mask = np.ones((1, seq_length, seq_length))
-                attention_mask = np.tril(attention_mask)
-                attention_mask[:, :, :context_length] = 1
-                attention_mask = np.bool_(attention_mask < 0.5)
-                encoded_inputs["attention_mask"] = attention_mask
-
-            if "position_ids" not in encoded_inputs:
-                if bos_token_id in required_input:
-                    context_length = required_input.index(bos_token_id)
-                else:
-                    context_length = seq_length
-                position_ids = np.arange(seq_length, dtype=np.int64)
-                mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
-                if mask_token in required_input:
-                    mask_position = required_input.index(mask_token)
-                    position_ids[context_length:] = mask_position
-                block_position_ids = np.concatenate(
-                    [
-                        np.zeros(context_length, dtype=np.int64),
-                        np.arange(1, seq_length - context_length + 1, dtype=np.int64),
-                    ]
-                )
-                encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = np.pad(
-                    encoded_inputs["attention_mask"],
-                    pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                    mode="constant",
-                    constant_values=True,
-                )
-            if "token_type_ids" in encoded_inputs:
-                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                    "token_type_ids"
-                ]
-            if "special_tokens_mask" in encoded_inputs:
-                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-            if "position_ids" in encoded_inputs:
-                encoded_inputs["position_ids"] = np.pad(
-                    encoded_inputs["position_ids"], pad_width=[(0, 0), (difference, 0)]
-                )
-            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-
-        return encoded_inputs
diff --git a/applications/Chat/coati/models/chatglm/configuration_chatglm.py b/applications/Chat/coati/models/chatglm/configuration_chatglm.py
deleted file mode 100644
index a6d2ccd18715..000000000000
--- a/applications/Chat/coati/models/chatglm/configuration_chatglm.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-This code is copied from https://huggingface.co/THUDM/chatglm-6b/resolve/main/configuration_chatglm.py
-"""
-
-""" ChatGLM model configuration """
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-
-class ChatGLMConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`~ChatGLMModel`].
-    It is used to instantiate an ChatGLM model according to the specified arguments, defining the model
-    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-    the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.
-
-    Configuration objects inherit from  [`PretrainedConfig`] and can be used
-    to control the model outputs. Read the documentation from  [`PretrainedConfig`]
-    for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 150528):
-            Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`~ChatGLMModel`] or
-            [`~TFChatGLMModel`].
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (`int`, *optional*, defaults to 28):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        inner_hidden_size (`int`, *optional*, defaults to 16384):
-            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        max_sequence_length (`int`, *optional*, defaults to 512):
-            The maximum sequence length that this model might ever be used with.
-            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-        layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon used by the layer normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether the model should return the last key/values attentions (not used by all models).
-        Example:
-
-    ```python
-    >>> from configuration_chatglm import ChatGLMConfig
-    >>> from modeling_chatglm import ChatGLMModel
-
-    >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
-    >>> configuration = ChatGLMConfig()
-
-    >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
-    >>> model = ChatGLMModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "chatglm"
-
-    def __init__(
-        self,
-        vocab_size=130528,
-        hidden_size=4096,
-        num_layers=28,
-        num_attention_heads=32,
-        layernorm_epsilon=1e-5,
-        use_cache=True,
-        bos_token_id=130004,
-        eos_token_id=130005,
-        mask_token_id=130000,
-        gmask_token_id=130001,
-        pad_token_id=3,
-        max_sequence_length=2048,
-        inner_hidden_size=16384,
-        position_encoding_2d=True,
-        quantization_bit=0,
-        pre_seq_len=None,
-        prefix_projection=False,
-        **kwargs,
-    ):
-        self.num_layers = num_layers
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_attention_heads = num_attention_heads
-        self.max_sequence_length = max_sequence_length
-        self.layernorm_epsilon = layernorm_epsilon
-        self.inner_hidden_size = inner_hidden_size
-        self.use_cache = use_cache
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.mask_token_id = mask_token_id
-        self.gmask_token_id = gmask_token_id
-        self.position_encoding_2d = position_encoding_2d
-        self.quantization_bit = quantization_bit
-        self.pre_seq_len = pre_seq_len
-        self.prefix_projection = prefix_projection
-
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
diff --git a/applications/Chat/coati/models/chatglm/modeling_chatglm.py b/applications/Chat/coati/models/chatglm/modeling_chatglm.py
deleted file mode 100644
index d1d15c68ffd8..000000000000
--- a/applications/Chat/coati/models/chatglm/modeling_chatglm.py
+++ /dev/null
@@ -1,1477 +0,0 @@
-"""
-This code is copied from https://huggingface.co/THUDM/chatglm-6b/resolve/main/modeling_chatglm.py
-"""
-
-""" PyTorch ChatGLM model. """
-
-import copy
-import math
-import os
-import re
-import sys
-import warnings
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import CrossEntropyLoss, LayerNorm
-from torch.nn.utils import skip_init
-from transformers.generation.logits_process import LogitsProcessor
-from transformers.generation.utils import GenerationConfig, LogitsProcessorList, ModelOutput, StoppingCriteriaList
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    BaseModelOutputWithPastAndCrossAttentions,
-    CausalLMOutputWithPast,
-)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    logging,
-)
-
-from .configuration_chatglm import ChatGLMConfig
-
-# flags required to enable jit fusion kernels
-
-if sys.platform != "darwin":
-    torch._C._jit_set_profiling_mode(False)
-    torch._C._jit_set_profiling_executor(False)
-    torch._C._jit_override_can_fuse_on_cpu(True)
-    torch._C._jit_override_can_fuse_on_gpu(True)
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B"
-_CONFIG_FOR_DOC = "ChatGLM6BConfig"
-
-CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "THUDM/chatglm-6b",
-    # See all ChatGLM-6B models at https://huggingface.co/models?filter=chatglm
-]
-
-
-class InvalidScoreLogitsProcessor(LogitsProcessor):
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        if torch.isnan(scores).any() or torch.isinf(scores).any():
-            scores.zero_()
-            scores[..., 5] = 5e4
-        return scores
-
-
-def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path):
-    """Load tf checkpoints in a pytorch model."""
-    try:
-        import re
-
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info(f"Loading TF weight {name} with shape {shape}")
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        name = name.split("/")
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(
-            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
-            for n in name
-        ):
-            logger.info(f"Skipping {'/'.join(name)}")
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info(f"Skipping {'/'.join(name)}")
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, "weight")
-        elif m_name == "kernel":
-            array = np.transpose(array)
-        try:
-            assert (
-                pointer.shape == array.shape
-            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info(f"Initialize PyTorch weight {name}")
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-class PrefixEncoder(torch.nn.Module):
-    """
-    The torch.nn model to encode the prefix
-    Input shape: (batch-size, prefix-length)
-    Output shape: (batch-size, prefix-length, 2*layers*hidden)
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.prefix_projection = config.prefix_projection
-        if self.prefix_projection:
-            # Use a two-layer MLP to encode the prefix
-            self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size)
-            self.trans = torch.nn.Sequential(
-                torch.nn.Linear(config.hidden_size, config.hidden_size),
-                torch.nn.Tanh(),
-                torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2),
-            )
-        else:
-            self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2)
-
-    def forward(self, prefix: torch.Tensor):
-        if self.prefix_projection:
-            prefix_tokens = self.embedding(prefix)
-            past_key_values = self.trans(prefix_tokens)
-        else:
-            past_key_values = self.embedding(prefix)
-        return past_key_values
-
-
-@torch.jit.script
-def gelu_impl(x):
-    """OpenAI's gelu implementation."""
-    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
-
-
-def gelu(x):
-    return gelu_impl(x)
-
-
-class RotaryEmbedding(torch.nn.Module):
-    def __init__(self, dim, base=10000, precision=torch.half, learnable=False):
-        super().__init__()
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
-        inv_freq = inv_freq.half()
-        self.learnable = learnable
-        if learnable:
-            self.inv_freq = torch.nn.Parameter(inv_freq)
-            self.max_seq_len_cached = None
-        else:
-            self.register_buffer("inv_freq", inv_freq)
-            self.max_seq_len_cached = None
-            self.cos_cached = None
-            self.sin_cached = None
-        self.precision = precision
-
-    def _load_from_state_dict(
-        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-    ):
-        pass
-
-    def forward(self, x, seq_dim=1, seq_len=None):
-        if seq_len is None:
-            seq_len = x.shape[seq_dim]
-        if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
-            self.max_seq_len_cached = None if self.learnable else seq_len
-            t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
-            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-            # Different from paper, but it uses a different permutation in order to obtain the same calculation
-            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-            if self.precision == torch.bfloat16:
-                emb = emb.float()
-
-            # [sx, 1 (b * np), hn]
-            cos_cached = emb.cos()[:, None, :]
-            sin_cached = emb.sin()[:, None, :]
-            if self.precision == torch.bfloat16:
-                cos_cached = cos_cached.bfloat16()
-                sin_cached = sin_cached.bfloat16()
-            if self.learnable:
-                return cos_cached, sin_cached
-            self.cos_cached, self.sin_cached = cos_cached, sin_cached
-        return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
-
-    def _apply(self, fn):
-        if self.cos_cached is not None:
-            self.cos_cached = fn(self.cos_cached)
-        if self.sin_cached is not None:
-            self.sin_cached = fn(self.sin_cached)
-        return super()._apply(fn)
-
-
-def rotate_half(x):
-    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=x1.ndim - 1)  # dim=-1 triggers a bug in earlier torch versions
-
-
-@torch.jit.script
-def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
-    # position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn]
-    cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), F.embedding(
-        position_id, sin.squeeze(1)
-    ).unsqueeze(2)
-    q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
-    return q, k
-
-
-def attention_fn(
-    self,
-    query_layer,
-    key_layer,
-    value_layer,
-    attention_mask,
-    hidden_size_per_partition,
-    layer_id,
-    layer_past=None,
-    scaling_attention_score=True,
-    use_cache=False,
-):
-    if layer_past is not None:
-        past_key, past_value = layer_past[0], layer_past[1]
-        key_layer = torch.cat((past_key, key_layer), dim=0)
-        value_layer = torch.cat((past_value, value_layer), dim=0)
-
-    # seqlen, batch, num_attention_heads, hidden_size_per_attention_head
-    seq_len, b, nh, hidden_size = key_layer.shape
-
-    if use_cache:
-        present = (key_layer, value_layer)
-    else:
-        present = None
-
-    query_key_layer_scaling_coeff = float(layer_id + 1)
-    if scaling_attention_score:
-        query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff)
-
-    # ===================================
-    # Raw attention scores. [b, np, s, s]
-    # ===================================
-
-    # [b, np, sq, sk]
-    output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
-
-    # [sq, b, np, hn] -> [sq, b * np, hn]
-    query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
-    # [sk, b, np, hn] -> [sk, b * np, hn]
-    key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
-
-    matmul_result = torch.zeros(
-        1,
-        1,
-        1,
-        dtype=query_layer.dtype,
-        device=query_layer.device,
-    )
-
-    matmul_result = torch.baddbmm(
-        matmul_result,
-        query_layer.transpose(0, 1),  # [b * np, sq, hn]
-        key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-        beta=0.0,
-        alpha=1.0,
-    )
-
-    # change view to [b, np, sq, sk]
-    attention_scores = matmul_result.view(*output_size)
-
-    if self.scale_mask_softmax:
-        self.scale_mask_softmax.scale = query_key_layer_scaling_coeff
-        attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous())
-    else:
-        if not (attention_mask == 0).all():
-            # if auto-regressive, skip
-            attention_scores.masked_fill_(attention_mask, -10000.0)
-        dtype = attention_scores.dtype
-        attention_scores = attention_scores.float()
-        attention_scores = attention_scores * query_key_layer_scaling_coeff
-
-        attention_probs = F.softmax(attention_scores, dim=-1)
-
-        attention_probs = attention_probs.type(dtype)
-
-    # =========================
-    # Context layer. [sq, b, hp]
-    # =========================
-
-    # value_layer -> context layer.
-    # [sk, b, np, hn] --> [b, np, sq, hn]
-
-    # context layer shape: [b, np, sq, hn]
-    output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
-
-    # change view [sk, b * np, hn]
-    value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
-
-    # change view [b * np, sq, sk]
-    attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
-
-    # matmul: [b * np, sq, hn]
-    context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
-
-    # change view [b, np, sq, hn]
-    context_layer = context_layer.view(*output_size)
-
-    # [b, np, sq, hn] --> [sq, b, np, hn]
-    context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
-
-    # [sq, b, np, hn] --> [sq, b, hp]
-    new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,)
-    context_layer = context_layer.view(*new_context_layer_shape)
-
-    outputs = (context_layer, present, attention_probs)
-
-    return outputs
-
-
-def default_init(cls, *args, **kwargs):
-    return cls(*args, **kwargs)
-
-
-class SelfAttention(torch.nn.Module):
-    def __init__(
-        self,
-        hidden_size,
-        num_attention_heads,
-        layer_id,
-        hidden_size_per_attention_head=None,
-        bias=True,
-        params_dtype=torch.float,
-        position_encoding_2d=True,
-        empty_init=True,
-    ):
-        if empty_init:
-            init_method = skip_init
-        else:
-            init_method = default_init
-        super(SelfAttention, self).__init__()
-
-        self.layer_id = layer_id
-        self.hidden_size = hidden_size
-        self.hidden_size_per_partition = hidden_size
-        self.num_attention_heads = num_attention_heads
-        self.num_attention_heads_per_partition = num_attention_heads
-        self.position_encoding_2d = position_encoding_2d
-        self.rotary_emb = RotaryEmbedding(
-            self.hidden_size // (self.num_attention_heads * 2)
-            if position_encoding_2d
-            else self.hidden_size // self.num_attention_heads,
-            base=10000,
-            precision=torch.half,
-            learnable=False,
-        )
-
-        self.scale_mask_softmax = None
-
-        if hidden_size_per_attention_head is None:
-            self.hidden_size_per_attention_head = hidden_size // num_attention_heads
-        else:
-            self.hidden_size_per_attention_head = hidden_size_per_attention_head
-
-        self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head
-
-        # Strided linear layer.
-        self.query_key_value = init_method(
-            torch.nn.Linear,
-            hidden_size,
-            3 * self.inner_hidden_size,
-            bias=bias,
-            dtype=params_dtype,
-        )
-
-        self.dense = init_method(
-            torch.nn.Linear,
-            self.inner_hidden_size,
-            hidden_size,
-            bias=bias,
-            dtype=params_dtype,
-        )
-
-    @staticmethod
-    def attention_mask_func(attention_scores, attention_mask):
-        attention_scores.masked_fill_(attention_mask, -10000.0)
-        return attention_scores
-
-    def split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_chunks=False):
-        """Split a tensor along its last dimension.
-        Arguments:
-            tensor: input tensor.
-            num_partitions: number of partitions to split the tensor
-            contiguous_split_chunks: If True, make each chunk contiguous
-                                    in memory.
-        """
-        # Get the size and dimension.
-        last_dim = tensor.dim() - 1
-        last_dim_size = tensor.size()[last_dim] // num_partitions
-        # Split.
-        tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
-        # Note: torch.split does not create contiguous tensors by default.
-        if contiguous_split_chunks:
-            return tuple(chunk.contiguous() for chunk in tensor_list)
-
-        return tensor_list
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_ids,
-        attention_mask: torch.Tensor,
-        layer_id,
-        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        use_cache: bool = False,
-        output_attentions: bool = False,
-    ):
-        """
-        hidden_states: [seq_len, batch, hidden_size]
-        attention_mask: [(1, 1), seq_len, seq_len]
-        """
-
-        # [seq_len, batch, 3 * hidden_size]
-        mixed_raw_layer = self.query_key_value(hidden_states)
-
-        # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 * hidden_size_per_attention_head]
-        new_tensor_shape = mixed_raw_layer.size()[:-1] + (
-            self.num_attention_heads_per_partition,
-            3 * self.hidden_size_per_attention_head,
-        )
-        mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape)
-
-        # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
-        (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim(mixed_raw_layer, 3)
-
-        if self.position_encoding_2d:
-            q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1))
-            k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1))
-            cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1)
-            position_ids, block_position_ids = (
-                position_ids[:, 0, :].transpose(0, 1).contiguous(),
-                position_ids[:, 1, :].transpose(0, 1).contiguous(),
-            )
-            q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids)
-            q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids)
-            query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1))
-            key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1))
-        else:
-            position_ids = position_ids.transpose(0, 1)
-            cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1)
-            # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
-            query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, position_ids)
-
-        # [seq_len, batch, hidden_size]
-        context_layer, present, attention_probs = attention_fn(
-            self=self,
-            query_layer=query_layer,
-            key_layer=key_layer,
-            value_layer=value_layer,
-            attention_mask=attention_mask,
-            hidden_size_per_partition=self.hidden_size_per_partition,
-            layer_id=layer_id,
-            layer_past=layer_past,
-            use_cache=use_cache,
-        )
-
-        output = self.dense(context_layer)
-
-        outputs = (output, present)
-
-        if output_attentions:
-            outputs += (attention_probs,)
-
-        return outputs  # output, present, attention_probs
-
-
-class GEGLU(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.activation_fn = F.gelu
-
-    def forward(self, x):
-        # dim=-1 breaks in jit for pt<1.10
-        x1, x2 = x.chunk(2, dim=(x.ndim - 1))
-        return x1 * self.activation_fn(x2)
-
-
-class GLU(torch.nn.Module):
-    def __init__(
-        self,
-        hidden_size,
-        inner_hidden_size=None,
-        layer_id=None,
-        bias=True,
-        activation_func=gelu,
-        params_dtype=torch.float,
-        empty_init=True,
-    ):
-        super(GLU, self).__init__()
-        if empty_init:
-            init_method = skip_init
-        else:
-            init_method = default_init
-        self.layer_id = layer_id
-        self.activation_func = activation_func
-
-        # Project to 4h.
-        self.hidden_size = hidden_size
-        if inner_hidden_size is None:
-            inner_hidden_size = 4 * hidden_size
-        self.inner_hidden_size = inner_hidden_size
-        self.dense_h_to_4h = init_method(
-            torch.nn.Linear,
-            self.hidden_size,
-            self.inner_hidden_size,
-            bias=bias,
-            dtype=params_dtype,
-        )
-        # Project back to h.
-        self.dense_4h_to_h = init_method(
-            torch.nn.Linear,
-            self.inner_hidden_size,
-            self.hidden_size,
-            bias=bias,
-            dtype=params_dtype,
-        )
-
-    def forward(self, hidden_states):
-        """
-        hidden_states: [seq_len, batch, hidden_size]
-        """
-
-        # [seq_len, batch, inner_hidden_size]
-        intermediate_parallel = self.dense_h_to_4h(hidden_states)
-
-        intermediate_parallel = self.activation_func(intermediate_parallel)
-
-        output = self.dense_4h_to_h(intermediate_parallel)
-
-        return output
-
-
-class GLMBlock(torch.nn.Module):
-    def __init__(
-        self,
-        hidden_size,
-        num_attention_heads,
-        layernorm_epsilon,
-        layer_id,
-        inner_hidden_size=None,
-        hidden_size_per_attention_head=None,
-        layernorm=LayerNorm,
-        use_bias=True,
-        params_dtype=torch.float,
-        num_layers=28,
-        position_encoding_2d=True,
-        empty_init=True,
-    ):
-        super(GLMBlock, self).__init__()
-        # Set output layer initialization if not provided.
-
-        self.layer_id = layer_id
-
-        # Layernorm on the input data.
-        self.input_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
-
-        self.position_encoding_2d = position_encoding_2d
-
-        # Self attention.
-        self.attention = SelfAttention(
-            hidden_size,
-            num_attention_heads,
-            layer_id,
-            hidden_size_per_attention_head=hidden_size_per_attention_head,
-            bias=use_bias,
-            params_dtype=params_dtype,
-            position_encoding_2d=self.position_encoding_2d,
-            empty_init=empty_init,
-        )
-
-        # Layernorm on the input data.
-        self.post_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
-
-        self.num_layers = num_layers
-
-        # GLU
-        self.mlp = GLU(
-            hidden_size,
-            inner_hidden_size=inner_hidden_size,
-            bias=use_bias,
-            layer_id=layer_id,
-            params_dtype=params_dtype,
-            empty_init=empty_init,
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_ids,
-        attention_mask: torch.Tensor,
-        layer_id,
-        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        use_cache: bool = False,
-        output_attentions: bool = False,
-    ):
-        """
-        hidden_states: [seq_len, batch, hidden_size]
-        attention_mask: [(1, 1), seq_len, seq_len]
-        """
-
-        # Layer norm at the begining of the transformer layer.
-        # [seq_len, batch, hidden_size]
-        attention_input = self.input_layernorm(hidden_states)
-
-        # Self attention.
-        attention_outputs = self.attention(
-            attention_input,
-            position_ids,
-            attention_mask=attention_mask,
-            layer_id=layer_id,
-            layer_past=layer_past,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-
-        attention_output = attention_outputs[0]
-
-        outputs = attention_outputs[1:]
-
-        # Residual connection.
-        alpha = (2 * self.num_layers) ** 0.5
-        hidden_states = attention_input * alpha + attention_output
-
-        mlp_input = self.post_attention_layernorm(hidden_states)
-
-        # MLP.
-        mlp_output = self.mlp(mlp_input)
-
-        # Second residual connection.
-        output = mlp_input * alpha + mlp_output
-
-        if use_cache:
-            outputs = (output,) + outputs
-        else:
-            outputs = (output,) + outputs[1:]
-
-        return outputs  # hidden_states, present, attentions
-
-
-class ChatGLMPreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and
-    a simple interface for downloading and loading pretrained models.
-    """
-
-    is_parallelizable = False
-    supports_gradient_checkpointing = True
-    config_class = ChatGLMConfig
-    base_model_prefix = "transformer"
-    _no_split_modules = ["GLMBlock"]
-
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
-    def _init_weights(self, module: nn.Module):
-        """Initialize the weights."""
-        return
-
-    def get_masks(self, input_ids, device):
-        batch_size, seq_length = input_ids.shape
-        context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
-        attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
-        attention_mask.tril_()
-        for i, context_length in enumerate(context_lengths):
-            attention_mask[i, :, :context_length] = 1
-        attention_mask.unsqueeze_(1)
-        attention_mask = (attention_mask < 0.5).bool()
-
-        return attention_mask
-
-    def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None):
-        batch_size, seq_length = input_ids.shape
-        if use_gmasks is None:
-            use_gmasks = [False] * batch_size
-        context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
-        if self.position_encoding_2d:
-            position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
-            for i, context_length in enumerate(context_lengths):
-                position_ids[i, context_length:] = mask_positions[i]
-            block_position_ids = [
-                torch.cat(
-                    (
-                        torch.zeros(context_length, dtype=torch.long, device=device),
-                        torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1,
-                    )
-                )
-                for context_length in context_lengths
-            ]
-            block_position_ids = torch.stack(block_position_ids, dim=0)
-            position_ids = torch.stack((position_ids, block_position_ids), dim=1)
-        else:
-            position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
-            for i, context_length in enumerate(context_lengths):
-                if not use_gmasks[i]:
-                    position_ids[i, context_length:] = mask_positions[i]
-
-        return position_ids
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, ChatGLMModel):
-            module.gradient_checkpointing = value
-
-
-CHATGLM_6B_START_DOCSTRING = r"""
-    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config ([`~ChatGLM6BConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-CHATGLM_6B_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `({0})`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using [`ChatGLM6BTokenizer`].
-            See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
-
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
-
-            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range `[0, config.max_position_embeddings - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare ChatGLM-6B Model transformer outputting raw hidden-states without any specific head on top.",
-    CHATGLM_6B_START_DOCSTRING,
-)
-class ChatGLMModel(ChatGLMPreTrainedModel):
-    """
-
-    The model can behave as an encoder (with only self-attention) as well
-    as a decoder, in which case a layer of cross-attention is added between
-    the self-attention layers, following the architecture described in [Attention is
-    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
-    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
-
-    To behave as an decoder the model needs to be initialized with the
-    `is_decoder` argument of the configuration set to `True`.
-    To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder`
-    argument and `add_cross_attention` set to `True`; an
-    `encoder_hidden_states` is then expected as an input to the forward pass.
-    """
-
-    def __init__(self, config: ChatGLMConfig, empty_init=True):
-        super().__init__(config)
-        if empty_init:
-            init_method = skip_init
-        else:
-            init_method = default_init
-        # recording parameters
-        self.max_sequence_length = config.max_sequence_length
-        self.hidden_size = config.hidden_size
-        self.params_dtype = torch.half
-        self.num_attention_heads = config.num_attention_heads
-        self.vocab_size = config.vocab_size
-        self.num_layers = config.num_layers
-        self.layernorm_epsilon = config.layernorm_epsilon
-        self.inner_hidden_size = config.inner_hidden_size
-        self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
-        self.position_encoding_2d = config.position_encoding_2d
-        self.pre_seq_len = config.pre_seq_len
-        self.prefix_projection = config.prefix_projection
-
-        self.word_embeddings = init_method(
-            torch.nn.Embedding, num_embeddings=self.vocab_size, embedding_dim=self.hidden_size, dtype=self.params_dtype
-        )
-        self.gradient_checkpointing = False
-
-        def get_layer(layer_id):
-            return GLMBlock(
-                self.hidden_size,
-                self.num_attention_heads,
-                self.layernorm_epsilon,
-                layer_id,
-                inner_hidden_size=self.inner_hidden_size,
-                hidden_size_per_attention_head=self.hidden_size_per_attention_head,
-                layernorm=LayerNorm,
-                use_bias=True,
-                params_dtype=self.params_dtype,
-                position_encoding_2d=self.position_encoding_2d,
-                empty_init=empty_init,
-            )
-
-        self.layers = torch.nn.ModuleList([get_layer(layer_id) for layer_id in range(self.num_layers)])
-
-        # Final layer norm before output.
-        self.final_layernorm = LayerNorm(self.hidden_size, eps=self.layernorm_epsilon)
-
-        if self.pre_seq_len is not None:
-            for param in self.parameters():
-                param.requires_grad = False
-            self.prefix_tokens = torch.arange(self.pre_seq_len).long()
-            self.prefix_encoder = PrefixEncoder(config)
-            self.dropout = torch.nn.Dropout(0.1)
-
-            # total_params = sum(p.numel() for p in self.parameters())
-            # trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
-            # print("Using p-tuning v2: # trainable_params = {} / {}".format(trainable_params, total_params))
-
-    def get_input_embeddings(self):
-        return self.word_embeddings
-
-    def set_input_embeddings(self, new_embeddings: torch.Tensor):
-        self.word_embeddings = new_embeddings
-
-    def get_prompt(self, batch_size, device, dtype=torch.half):
-        prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
-        past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
-        past_key_values = past_key_values.view(
-            batch_size,
-            self.pre_seq_len,
-            self.num_layers * 2,
-            self.num_attention_heads,
-            self.hidden_size // self.num_attention_heads,
-        )
-        # seq_len, b, nh, hidden_size
-        past_key_values = self.dropout(past_key_values)
-        past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
-        # past_key_values = [(v[0], v[1]) for v in past_key_values]
-        return past_key_values
-
-    @add_start_docstrings_to_model_forward(CHATGLM_6B_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=BaseModelOutputWithPastAndCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
-        inputs_embeds: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPast]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape[:2]
-        elif inputs_embeds is not None:
-            batch_size, seq_length = inputs_embeds.shape[:2]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-
-        if past_key_values is None:
-            if self.pre_seq_len is not None:
-                past_key_values = self.get_prompt(
-                    batch_size=input_ids.shape[0], device=input_ids.device, dtype=inputs_embeds.dtype
-                )
-            else:
-                past_key_values = tuple([None] * len(self.layers))
-
-            if attention_mask is None:
-                attention_mask = self.get_masks(input_ids, device=input_ids.device)
-
-            if position_ids is None:
-                MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
-                seqs = input_ids.tolist()
-
-                mask_positions, use_gmasks = [], []
-                for seq in seqs:
-                    mask_token = gMASK if gMASK in seq else MASK
-                    use_gmask = mask_token == gMASK
-                    mask_positions.append(seq.index(mask_token))
-                    use_gmasks.append(use_gmask)
-
-                position_ids = self.get_position_ids(
-                    input_ids, mask_positions=mask_positions, device=input_ids.device, use_gmasks=use_gmasks
-                )
-
-        if self.pre_seq_len is not None and attention_mask is not None:
-            prefix_attention_mask = torch.ones(batch_size, 1, input_ids.size(-1), self.pre_seq_len).to(
-                attention_mask.device
-            )
-            prefix_attention_mask = (prefix_attention_mask < 0.5).bool()
-            attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=3)
-
-        # [seq_len, batch, hidden_size]
-        hidden_states = inputs_embeds.transpose(0, 1)
-
-        presents = () if use_cache else None
-        all_self_attentions = () if output_attentions else None
-        all_hidden_states = () if output_hidden_states else None
-
-        if attention_mask is None:
-            attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
-        else:
-            attention_mask = attention_mask.to(hidden_states.device)
-
-        for i, layer in enumerate(self.layers):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-            layer_past = past_key_values[i]
-
-            if self.gradient_checkpointing and self.training:
-                layer_ret = torch.utils.checkpoint.checkpoint(
-                    layer,
-                    hidden_states,
-                    position_ids,
-                    attention_mask,
-                    torch.tensor(i),
-                    layer_past,
-                    use_cache,
-                    output_attentions,
-                )
-            else:
-                layer_ret = layer(
-                    hidden_states,
-                    position_ids=position_ids,
-                    attention_mask=attention_mask,
-                    layer_id=torch.tensor(i),
-                    layer_past=layer_past,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-
-            hidden_states = layer_ret[0]
-
-            if use_cache:
-                presents = presents + (layer_ret[1],)
-
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (layer_ret[2 if use_cache else 1],)
-
-        # Final layer norm.
-        hidden_states = self.final_layernorm(hidden_states)
-
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=presents,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-        )
-
-
-class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
-    def __init__(self, config: ChatGLMConfig, empty_init=True):
-        super().__init__(config)
-        if empty_init:
-            init_method = skip_init
-        else:
-            init_method = default_init
-
-        # self.hidden_size = config.hidden_size
-        # self.params_dtype = torch.half
-        # self.vocab_size = config.vocab_size
-        self.max_sequence_length = config.max_sequence_length
-
-        self.position_encoding_2d = config.position_encoding_2d
-
-        self.transformer = ChatGLMModel(config, empty_init=empty_init)
-
-        self.lm_head = init_method(nn.Linear, config.hidden_size, config.vocab_size, bias=False, dtype=torch.half)
-
-        self.config = config
-
-        self.quantized = False
-
-        if self.config.quantization_bit:
-            self.quantize(self.config.quantization_bit, empty_init=True)
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def _update_model_kwargs_for_generation(
-        self,
-        outputs: ModelOutput,
-        model_kwargs: Dict[str, Any],
-        is_encoder_decoder: bool = False,
-        standardize_cache_format: bool = False,
-    ) -> Dict[str, Any]:
-        # update past_key_values
-        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
-            outputs, standardize_cache_format=standardize_cache_format
-        )
-
-        # update attention mask
-        if "attention_mask" in model_kwargs:
-            attention_mask = model_kwargs["attention_mask"]
-            if attention_mask is not None and attention_mask.dtype == torch.bool:
-                attention_mask = torch.cat(
-                    [attention_mask, attention_mask.new_ones((*attention_mask.shape[:3], 1))], dim=3
-                )
-                new_attention_mask = attention_mask[:, :, -1:].clone()
-                new_attention_mask[..., -1] = False
-                model_kwargs["attention_mask"] = torch.cat([attention_mask, new_attention_mask], dim=2)
-
-        # update position ids
-        if "position_ids" in model_kwargs:
-            position_ids = model_kwargs["position_ids"]
-            new_position_id = position_ids[..., -1:].clone()
-            new_position_id[:, 1, :] += 1
-            model_kwargs["position_ids"] = torch.cat([position_ids, new_position_id], dim=-1)
-
-        return model_kwargs
-
-    def prepare_inputs_for_generation(
-        self,
-        input_ids: torch.LongTensor,
-        past: Optional[torch.Tensor] = None,
-        past_key_values: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> dict:
-        batch_size, seq_length = input_ids.shape
-        MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
-        seqs = input_ids.tolist()
-        mask_positions, use_gmasks = [], []
-        for seq in seqs:
-            mask_token = gMASK if gMASK in seq else MASK
-            use_gmask = mask_token == gMASK
-            mask_positions.append(seq.index(mask_token))
-            use_gmasks.append(use_gmask)
-
-        # only last token for input_ids if past is not None
-        if past is not None or past_key_values is not None:
-            last_token = input_ids[:, -1].unsqueeze(-1)
-            if attention_mask is not None and attention_mask.dtype == torch.bool:
-                attention_mask = attention_mask[:, :, -1:]
-            else:
-                attention_mask = None
-            if position_ids is not None:
-                position_ids = position_ids[..., -1:]
-            else:
-                context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs]
-                if self.position_encoding_2d:
-                    position_ids = torch.tensor(
-                        [
-                            [mask_position, seq_length - context_length]
-                            for mask_position, context_length in zip(mask_positions, context_lengths)
-                        ],
-                        dtype=torch.long,
-                        device=input_ids.device,
-                    ).unsqueeze(-1)
-                else:
-                    position_ids = torch.tensor(
-                        [mask_position for mask_position in mask_positions], dtype=torch.long, device=input_ids.device
-                    ).unsqueeze(-1)
-
-            if past is None:
-                past = past_key_values
-            return {
-                "input_ids": last_token,
-                "past_key_values": past,
-                "position_ids": position_ids,
-                "attention_mask": attention_mask,
-            }
-        else:
-            if attention_mask is not None and attention_mask.dtype != torch.bool:
-                logger.warning_once(f"The dtype of attention mask ({attention_mask.dtype}) is not bool")
-                attention_mask = None
-            if attention_mask is None:
-                attention_mask = self.get_masks(input_ids, device=input_ids.device)
-            if position_ids is None:
-                position_ids = self.get_position_ids(
-                    input_ids, device=input_ids.device, mask_positions=mask_positions, use_gmasks=use_gmasks
-                )
-
-            return {
-                "input_ids": input_ids,
-                "past_key_values": past,
-                "position_ids": position_ids,
-                "attention_mask": attention_mask,
-            }
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states).permute(1, 0, 2).contiguous()
-
-        loss = None
-        if labels is not None:
-            lm_logits = lm_logits.to(torch.float32)
-
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss(ignore_index=-100)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-            lm_logits = lm_logits.to(hidden_states.dtype)
-            loss = loss.to(hidden_states.dtype)
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
-
-    @staticmethod
-    def _reorder_cache(
-        past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
-    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
-        """
-        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
-        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
-        beam_idx at every generation step.
-
-        Output shares the same memory storage as `past`.
-        """
-        return tuple(
-            (
-                layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
-                layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
-            )
-            for layer_past in past
-        )
-
-    def process_response(self, response):
-        response = response.strip()
-        response = response.replace("[[训练时间]]", "2023年")
-        punkts = [
-            [",", "，"],
-            ["!", "！"],
-            [":", "："],
-            [";", "；"],
-            ["\?", "？"],
-        ]
-        for item in punkts:
-            response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
-            response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
-        return response
-
-    @torch.no_grad()
-    def chat(
-        self,
-        tokenizer,
-        query: str,
-        history: List[Tuple[str, str]] = None,
-        max_length: int = 2048,
-        num_beams=1,
-        do_sample=True,
-        top_p=0.7,
-        temperature=0.95,
-        logits_processor=None,
-        **kwargs,
-    ):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {
-            "max_length": max_length,
-            "num_beams": num_beams,
-            "do_sample": do_sample,
-            "top_p": top_p,
-            "temperature": temperature,
-            "logits_processor": logits_processor,
-            **kwargs,
-        }
-        if not history:
-            prompt = query
-        else:
-            prompt = ""
-            for i, (old_query, response) in enumerate(history):
-                prompt += "[Round {}]\n问：{}\n答：{}\n".format(i, old_query, response)
-            prompt += "[Round {}]\n问：{}\n答：".format(len(history), query)
-        inputs = tokenizer([prompt], return_tensors="pt")
-        inputs = inputs.to(self.device)
-        outputs = self.generate(**inputs, **gen_kwargs)
-        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
-        response = tokenizer.decode(outputs)
-        response = self.process_response(response)
-        history = history + [(query, response)]
-        return response, history
-
-    @torch.no_grad()
-    def stream_chat(
-        self,
-        tokenizer,
-        query: str,
-        history: List[Tuple[str, str]] = None,
-        max_length: int = 2048,
-        do_sample=True,
-        top_p=0.7,
-        temperature=0.95,
-        logits_processor=None,
-        **kwargs,
-    ):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {
-            "max_length": max_length,
-            "do_sample": do_sample,
-            "top_p": top_p,
-            "temperature": temperature,
-            "logits_processor": logits_processor,
-            **kwargs,
-        }
-        if not history:
-            prompt = query
-        else:
-            prompt = ""
-            for i, (old_query, response) in enumerate(history):
-                prompt += "[Round {}]\n问：{}\n答：{}\n".format(i, old_query, response)
-            prompt += "[Round {}]\n问：{}\n答：".format(len(history), query)
-        inputs = tokenizer([prompt], return_tensors="pt")
-        inputs = inputs.to(self.device)
-        for outputs in self.stream_generate(**inputs, **gen_kwargs):
-            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
-            response = tokenizer.decode(outputs)
-            response = self.process_response(response)
-            new_history = history + [(query, response)]
-            yield response, new_history
-
-    @torch.no_grad()
-    def stream_generate(
-        self,
-        input_ids,
-        generation_config: Optional[GenerationConfig] = None,
-        logits_processor: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[StoppingCriteriaList] = None,
-        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
-        **kwargs,
-    ):
-        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
-
-        if generation_config is None:
-            generation_config = self.generation_config
-        generation_config = copy.deepcopy(generation_config)
-        model_kwargs = generation_config.update(**kwargs)
-        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
-
-        if isinstance(eos_token_id, int):
-            eos_token_id = [eos_token_id]
-
-        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
-        if has_default_max_length and generation_config.max_new_tokens is None:
-            warnings.warn(
-                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
-                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
-                " recommend using `max_new_tokens` to control the maximum length of the generation.",
-                UserWarning,
-            )
-        elif generation_config.max_new_tokens is not None:
-            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
-            if not has_default_max_length:
-                logger.warn(
-                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information. "
-                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
-                    UserWarning,
-                )
-
-        if input_ids_seq_length >= generation_config.max_length:
-            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
-            logger.warning(
-                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
-                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
-                " increasing `max_new_tokens`."
-            )
-
-        # 2. Set generation parameters if not already defined
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-
-        logits_processor = self._get_logits_processor(
-            generation_config=generation_config,
-            input_ids_seq_length=input_ids_seq_length,
-            encoder_input_ids=input_ids,
-            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-            logits_processor=logits_processor,
-        )
-
-        stopping_criteria = self._get_stopping_criteria(
-            generation_config=generation_config, stopping_criteria=stopping_criteria
-        )
-        logits_warper = self._get_logits_warper(generation_config)
-
-        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-        scores = None
-        while True:
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=False,
-                output_hidden_states=False,
-            )
-
-            next_token_logits = outputs.logits[:, -1, :]
-
-            # pre-process distribution
-            next_token_scores = logits_processor(input_ids, next_token_logits)
-            next_token_scores = logits_warper(input_ids, next_token_scores)
-
-            # sample
-            probs = nn.functional.softmax(next_token_scores, dim=-1)
-            if generation_config.do_sample:
-                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
-            else:
-                next_tokens = torch.argmax(probs, dim=-1)
-
-            # update generated ids, model inputs, and length for next step
-            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-            model_kwargs = self._update_model_kwargs_for_generation(
-                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
-            )
-            unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
-
-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                break
-            yield input_ids
-
-    def quantize(self, bits: int, empty_init=False, **kwargs):
-        if bits == 0:
-            return
-
-        from .quantization import quantize
-
-        if self.quantized:
-            logger.info("Already quantized.")
-            return self
-
-        self.quantized = True
-
-        self.config.quantization_bit = bits
-
-        self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
-        return self
diff --git a/applications/Chat/coati/models/critic.py b/applications/Chat/coati/models/critic.py
new file mode 100644
index 000000000000..4313f83387ae
--- /dev/null
+++ b/applications/Chat/coati/models/critic.py
@@ -0,0 +1,38 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from coati.models import BaseModel
+from transformers import PretrainedConfig
+
+
+class Critic(BaseModel):
+    """
+    Critic model class.
+
+    Args:
+        pretrained (str): path to pretrained model.
+        config (PretrainedConfig): PretrainedConfig used to initiate the base model.
+    """
+
+    def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] = None) -> None:
+        super().__init__(pretrained=pretrained, config=config)
+        # get last hidden state size with dummy input
+        try:
+            dummy_outputs = self.model(
+                torch.tensor([[1]]).to(self.model.device), attention_mask=torch.tensor([[1]]).to(self.model.device)
+            )
+            last_hidden_state_size = dummy_outputs["last_hidden_state"].size(-1)
+        except Exception as e:
+            raise ValueError(
+                f"Please provide a valid pretrained model name or a valid config file for a CasualLM. Caught exception: {e}"
+            )
+
+        self.value_head = nn.Linear(last_hidden_state_size, 1)
+
+    def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        outputs = self.model(input_ids, attention_mask=attention_mask)
+        last_hidden_states = outputs["last_hidden_state"]
+        sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), :]
+        values = self.value_head(sequence_hidden_states).squeeze(2)  # ensure shape is (B, sequence length)
+        return values
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index 4ab0cdc8a3ea..baad8f2b55bb 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -4,8 +4,6 @@
 import torch.distributed as dist
 from transformers import PreTrainedTokenizer
 
-from .base import Actor
-
 try:
     from transformers.generation_logits_process import (
         LogitsProcessorList,
@@ -39,7 +37,7 @@ def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
 
 
 def _sample(
-    model: Actor,
+    model: Any,
     input_ids: torch.Tensor,
     max_length: int,
     early_stopping: bool = False,
@@ -94,7 +92,7 @@ def _sample(
 
 @torch.no_grad()
 def generate(
-    model: Actor,
+    model: Any,
     input_ids: torch.Tensor,
     tokenizer: PreTrainedTokenizer,
     max_length: int,
diff --git a/applications/Chat/coati/models/gpt/__init__.py b/applications/Chat/coati/models/gpt/__init__.py
deleted file mode 100644
index 823cf4a75e0d..000000000000
--- a/applications/Chat/coati/models/gpt/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .gpt_actor import GPTActor
-from .gpt_critic import GPTCritic
-from .gpt_rm import GPTRM
-
-__all__ = ["GPTActor", "GPTCritic", "GPTRM"]
diff --git a/applications/Chat/coati/models/gpt/gpt_actor.py b/applications/Chat/coati/models/gpt/gpt_actor.py
deleted file mode 100644
index a7e4b9bc3e22..000000000000
--- a/applications/Chat/coati/models/gpt/gpt_actor.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from typing import Optional
-
-from transformers.models.gpt2.configuration_gpt2 import GPT2Config
-from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
-
-from ..base import Actor
-
-
-class GPTActor(Actor):
-    """
-    GPT Actor model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (GPT2Config): Model config.
-        checkpoint (bool): Enable gradient checkpointing.
-        lora_rank (int): Rank of the LoRa layer.
-        lora_train_bias (str): Bias training strategy for the LoRa layer.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[GPT2Config] = None,
-        checkpoint: bool = False,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-        **kwargs,
-    ) -> None:
-        if pretrained is not None:
-            model = GPT2LMHeadModel.from_pretrained(pretrained)
-        elif config is not None:
-            model = GPT2LMHeadModel(config)
-        else:
-            model = GPT2LMHeadModel(GPT2Config())
-        if checkpoint:
-            model.gradient_checkpointing_enable()
-        super().__init__(model, lora_rank, lora_train_bias, **kwargs)
diff --git a/applications/Chat/coati/models/gpt/gpt_critic.py b/applications/Chat/coati/models/gpt/gpt_critic.py
deleted file mode 100644
index 22ab36dea276..000000000000
--- a/applications/Chat/coati/models/gpt/gpt_critic.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers.models.gpt2.configuration_gpt2 import GPT2Config
-from transformers.models.gpt2.modeling_gpt2 import GPT2Model
-
-from ..base import Critic
-
-
-class GPTCritic(Critic):
-    """
-    GPT Critic model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (GPT2Config): Model config.
-        lora_rank (int): Rank of the LO-RA decomposition.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[GPT2Config] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-        **kwargs,
-    ) -> None:
-        if pretrained is not None:
-            model = GPT2Model.from_pretrained(pretrained)
-        elif config is not None:
-            model = GPT2Model(config)
-        else:
-            model = GPT2Model(GPT2Config())
-
-        value_head = nn.Linear(model.config.n_embd, 1)
-        super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)
diff --git a/applications/Chat/coati/models/gpt/gpt_rm.py b/applications/Chat/coati/models/gpt/gpt_rm.py
deleted file mode 100644
index 8edfc4008466..000000000000
--- a/applications/Chat/coati/models/gpt/gpt_rm.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers.models.gpt2.configuration_gpt2 import GPT2Config
-from transformers.models.gpt2.modeling_gpt2 import GPT2Model
-
-from ..base import RewardModel
-
-
-class GPTRM(RewardModel):
-    """
-    GPT Reward model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (GPT2Config): Model config.
-        lora_rank (int): Rank of the low-rank approximation.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[GPT2Config] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        if pretrained is not None:
-            model = GPT2Model.from_pretrained(pretrained)
-        elif config is not None:
-            model = GPT2Model(config)
-        else:
-            model = GPT2Model(GPT2Config())
-
-        value_head = nn.Linear(model.config.n_embd, 1)
-        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.n_embd + 1))
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/Chat/coati/models/llama/__init__.py b/applications/Chat/coati/models/llama/__init__.py
deleted file mode 100644
index c87d732538a9..000000000000
--- a/applications/Chat/coati/models/llama/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .llama_actor import LlamaActor
-from .llama_critic import LlamaCritic
-from .llama_rm import LlamaRM
-
-__all__ = ["LlamaActor", "LlamaCritic", "LlamaRM"]
diff --git a/applications/Chat/coati/models/llama/llama_actor.py b/applications/Chat/coati/models/llama/llama_actor.py
deleted file mode 100644
index f1d9406835ca..000000000000
--- a/applications/Chat/coati/models/llama/llama_actor.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from typing import Optional
-
-from transformers import LlamaConfig, LlamaForCausalLM
-
-from ..base import Actor
-
-
-class LlamaActor(Actor):
-    """
-    Llama Actor model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (LlamaConfig): Model config.
-        checkpoint (bool): Enable gradient checkpointing.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[LlamaConfig] = None,
-        checkpoint: bool = False,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        if pretrained is not None:
-            model = LlamaForCausalLM.from_pretrained(pretrained)
-        elif config is not None:
-            model = LlamaForCausalLM(config)
-        else:
-            model = LlamaForCausalLM(LlamaConfig())
-
-        if checkpoint:
-            model.gradient_checkpointing_enable()
-
-        super().__init__(model, lora_rank, lora_train_bias)
diff --git a/applications/Chat/coati/models/llama/llama_critic.py b/applications/Chat/coati/models/llama/llama_critic.py
deleted file mode 100644
index 000dce17ccf0..000000000000
--- a/applications/Chat/coati/models/llama/llama_critic.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers import LlamaConfig, LlamaModel
-
-from ..base import Critic
-
-
-class LlamaCritic(Critic):
-    """
-    Llama Critic model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (LlamaConfig): Model config.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[LlamaConfig] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-        **kwargs,
-    ) -> None:
-        if pretrained is not None:
-            model = LlamaModel.from_pretrained(pretrained)
-        elif config is not None:
-            model = LlamaModel(config)
-        else:
-            model = LlamaModel(LlamaConfig())
-
-        value_head = nn.Linear(model.config.hidden_size, 1)
-        super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)
diff --git a/applications/Chat/coati/models/llama/llama_rm.py b/applications/Chat/coati/models/llama/llama_rm.py
deleted file mode 100644
index 43bc9e638dc7..000000000000
--- a/applications/Chat/coati/models/llama/llama_rm.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers import LlamaConfig, LlamaModel
-
-from ..base import RewardModel
-
-
-class LlamaRM(RewardModel):
-    """
-    Llama Reward model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (LlamaConfig): Model config.
-        lora_rank (int): LoRA rank.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[LlamaConfig] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        if pretrained is not None:
-            model = LlamaModel.from_pretrained(pretrained)
-        elif config is not None:
-            model = LlamaModel(config)
-        else:
-            model = LlamaModel(LlamaConfig())
-
-        value_head = nn.Linear(model.config.hidden_size, 1)
-        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.hidden_size + 1))
-
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index e9bd7b2ed8f0..d9e7f038686c 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -131,23 +131,3 @@ def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: s
     _convert_to_lora_recursively(module, lora_rank)
     lora.mark_only_lora_as_trainable(module, lora_train_bias)
     return module
-
-
-class LoRAModule(nn.Module):
-    """A LoRA module base class. All derived classes should call `convert_to_lora()` at the bottom of `__init__()`.
-    This class will convert all torch.nn.Linear layer to LoraLinear layer.
-
-    Args:
-        lora_rank (int, optional): LoRA rank. 0 means LoRA is not applied. Defaults to 0.
-        lora_train_bias (str, optional): Whether LoRA train biases.
-            'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers.
-            Defaults to 'none'.
-    """
-
-    def __init__(self, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
-        super().__init__()
-        self.lora_rank = lora_rank
-        self.lora_train_bias = lora_train_bias
-
-    def convert_to_lora(self) -> None:
-        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias)
diff --git a/applications/Chat/coati/models/loss.py b/applications/Chat/coati/models/loss.py
index 687bd0f7bfe7..061fc6ac56f9 100644
--- a/applications/Chat/coati/models/loss.py
+++ b/applications/Chat/coati/models/loss.py
@@ -39,14 +39,20 @@ def forward(
         advantages: torch.Tensor,
         action_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        ratio = (log_probs - old_log_probs).exp()
+        skip = False
+        ratio_ = ((log_probs - old_log_probs) * action_mask).exp()
+
+        # note that if dropout is disabled (recommanded), ratio will always be 1.
+        if ratio_.max() > 30.0:
+            skip = True
+
+        ratio = ratio_.clamp(0.0, 10.0)
         surr1 = ratio * advantages
         surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages
         loss = -torch.min(surr1, surr2)
-        if action_mask is not None:
-            loss = masked_mean(loss, action_mask)
+        loss = masked_mean(loss, action_mask)
         loss = loss.mean()
-        return loss
+        return loss, skip, ratio_.max()
 
 
 class ValueLoss(nn.Module):
@@ -54,7 +60,7 @@ class ValueLoss(nn.Module):
     Value Loss for PPO
     """
 
-    def __init__(self, clip_eps: float = 0.4) -> None:
+    def __init__(self, clip_eps: float = 0.2) -> None:
         super().__init__()
         self.clip_eps = clip_eps
 
@@ -62,17 +68,80 @@ def forward(
         self,
         values: torch.Tensor,
         old_values: torch.Tensor,
-        reward: torch.Tensor,
+        advantage: torch.Tensor,
         action_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        returns = advantage + old_values
         values_clipped = old_values + (values - old_values).clamp(-self.clip_eps, self.clip_eps)
-        surr1 = (values_clipped - reward) ** 2
-        surr2 = (values - reward) ** 2
-        loss = torch.max(surr1, surr2)
-        loss = loss.mean()
+        surr1 = (values_clipped - returns) ** 2
+        surr2 = (values - returns) ** 2
+        loss = torch.max(surr1, surr2) / torch.sum(action_mask)
+        loss = torch.sum(loss * action_mask)
         return 0.5 * loss
 
 
+class DpoLoss(nn.Module):
+    """
+    Dpo loss
+    Details: https://arxiv.org/pdf/2305.18290.pdf
+    """
+
+    def __init__(self, beta: float = 0.1):
+        super().__init__()
+        self.beta = beta
+
+    def forward(
+        self,
+        logprob_actor_chosen: torch.Tensor,
+        logprob_actor_reject: torch.Tensor,
+        logprob_ref_chosen: torch.Tensor,
+        logprob_ref_reject: torch.Tensor,
+        chosen_mask: torch.Tensor,
+        reject_mask: torch.Tensor,
+    ):
+        """Compute the DPO loss for a batch of policy and reference model log probabilities.
+
+        # adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/dpo_trainer.py#L328
+
+        Args:
+            logprob_actor_chosen: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
+            logprob_actor_reject: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
+            logprob_ref_chosen: Log probabilities of the reference model for the chosen responses. Shape: (batch_size,)
+            logprob_ref_reject: Log probabilities of the reference model for the rejected responses. Shape: (batch_size,)
+
+        Returns:
+            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards).
+            The losses tensor contains the DPO loss for each example in the batch.
+            The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected responses, respectively.
+        """
+        # print(logprob_ref_chosen)
+        # print(logprob_ref_reject)
+        if logprob_ref_chosen is not None and logprob_ref_reject is not None:
+            # print(logprob_ref_chosen.size(), logprob_ref_reject.size())
+            if len(logprob_ref_chosen.shape) == 2:
+                ref_logratios = logprob_ref_chosen.sum(-1) - logprob_ref_reject.sum(-1)
+            else:
+                ref_logratios = logprob_ref_chosen.squeeze() - logprob_ref_reject.squeeze()
+        else:
+            ref_logratios = 0.0
+
+        pi_logratios = logprob_actor_chosen.sum(-1) - logprob_actor_reject.sum(-1)
+        # print(pi_logratios)
+        # print(ref_logratios)
+        logits = pi_logratios - ref_logratios
+        losses = -torch.nn.functional.logsigmoid(self.beta * logits)
+        if logprob_ref_chosen is not None:
+            chosen_rewards = self.beta * (logprob_actor_chosen.sum(-1) - logprob_ref_chosen.sum(-1)).detach()
+        else:
+            chosen_rewards = self.beta * logprob_actor_chosen.sum(-1).detach()
+        if logprob_ref_reject is not None:
+            rejected_rewards = self.beta * (logprob_actor_reject.sum(-1) - logprob_ref_reject.sum(-1)).detach()
+        else:
+            rejected_rewards = self.beta * logprob_actor_reject.sum(-1).detach()
+
+        return losses, chosen_rewards, rejected_rewards
+
+
 class LogSigLoss(nn.Module):
     """
     Pairwise Loss for Reward Model
@@ -80,10 +149,7 @@ class LogSigLoss(nn.Module):
     """
 
     def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
-        probs = torch.sigmoid(chosen_reward - reject_reward)
-        log_probs = torch.log(probs)
-        loss = -log_probs.mean()
-        return loss
+        return -torch.nn.functional.logsigmoid(chosen_reward - reject_reward).mean()
 
 
 class LogExpLoss(nn.Module):
diff --git a/applications/Chat/coati/models/opt/__init__.py b/applications/Chat/coati/models/opt/__init__.py
deleted file mode 100644
index e37d6e45c8fc..000000000000
--- a/applications/Chat/coati/models/opt/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .opt_actor import OPTActor
-from .opt_critic import OPTCritic
-from .opt_rm import OPTRM
-
-__all__ = ["OPTActor", "OPTCritic", "OPTRM"]
diff --git a/applications/Chat/coati/models/opt/opt_actor.py b/applications/Chat/coati/models/opt/opt_actor.py
deleted file mode 100644
index cd8908e13fb8..000000000000
--- a/applications/Chat/coati/models/opt/opt_actor.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Optional
-
-from transformers.models.opt.configuration_opt import OPTConfig
-from transformers.models.opt.modeling_opt import OPTForCausalLM
-
-from ..base import Actor
-
-
-class OPTActor(Actor):
-    """
-    OPT Actor model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (OPTConfig): Model config.
-        checkpoint (bool): Enable gradient checkpointing.
-        lora_rank (int): Rank of the low-rank approximation.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[OPTConfig] = None,
-        checkpoint: bool = False,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        if pretrained is not None:
-            model = OPTForCausalLM.from_pretrained(pretrained)
-        elif config is not None:
-            model = OPTForCausalLM(config)
-        else:
-            model = OPTForCausalLM(OPTConfig())
-        if checkpoint:
-            model.gradient_checkpointing_enable()
-        super().__init__(model, lora_rank, lora_train_bias)
diff --git a/applications/Chat/coati/models/opt/opt_critic.py b/applications/Chat/coati/models/opt/opt_critic.py
deleted file mode 100644
index f37d28812c27..000000000000
--- a/applications/Chat/coati/models/opt/opt_critic.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers.models.opt.configuration_opt import OPTConfig
-from transformers.models.opt.modeling_opt import OPTModel
-
-from ..base import Critic
-
-
-class OPTCritic(Critic):
-    """
-    OPT Critic model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (OPTConfig): Model config.
-        lora_rank (int): Rank of the low-rank approximation.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[OPTConfig] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-        **kwargs,
-    ) -> None:
-        if pretrained is not None:
-            model = OPTModel.from_pretrained(pretrained)
-        elif config is not None:
-            model = OPTModel(config)
-        else:
-            model = OPTModel(OPTConfig())
-
-        value_head = nn.Linear(model.config.word_embed_proj_dim, 1)
-        super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)
diff --git a/applications/Chat/coati/models/opt/opt_rm.py b/applications/Chat/coati/models/opt/opt_rm.py
deleted file mode 100644
index 893708344ad4..000000000000
--- a/applications/Chat/coati/models/opt/opt_rm.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from typing import Optional
-
-import torch.nn as nn
-from transformers import OPTConfig, OPTModel
-
-from ..base import RewardModel
-
-
-class OPTRM(RewardModel):
-    """
-    OPT Reward model.
-
-    Args:
-        pretrained (str): Pretrained model name or path.
-        config (OPTConfig): Model config.
-        lora_rank (int): Rank of the low-rank approximation.
-        lora_train_bias (str): LoRA bias training mode.
-    """
-
-    def __init__(
-        self,
-        pretrained: Optional[str] = None,
-        config: Optional[OPTConfig] = None,
-        lora_rank: int = 0,
-        lora_train_bias: str = "none",
-    ) -> None:
-        if pretrained is not None:
-            model = OPTModel.from_pretrained(pretrained)
-        elif config is not None:
-            model = OPTModel(config)
-        else:
-            model = OPTModel(OPTConfig())
-
-        value_head = nn.Linear(model.config.word_embed_proj_dim, 1)
-        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.word_embed_proj_dim + 1))
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
diff --git a/applications/Chat/coati/models/reward_model.py b/applications/Chat/coati/models/reward_model.py
new file mode 100644
index 000000000000..0e2d0472b174
--- /dev/null
+++ b/applications/Chat/coati/models/reward_model.py
@@ -0,0 +1,48 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from coati.models import BaseModel
+from transformers import PretrainedConfig
+
+
+class RewardModel(BaseModel):
+    """
+    Reward model class.
+
+    Args:
+        model (nn.Module): Critic Model.
+        lora_rank (int): LoRA rank.
+        lora_train_bias (str): LoRA bias training mode.
+    """
+
+    def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] = None) -> None:
+        super().__init__(pretrained=pretrained, config=config)
+        # get last hidden state size with dummy input
+        try:
+            dummy_outputs = self.model(
+                torch.tensor([[1]]).to(self.model.device), attention_mask=torch.tensor([[1]]).to(self.model.device)
+            )
+            last_hidden_state_size = dummy_outputs["logits"].size(-1)
+        except Exception as e:
+            raise ValueError(
+                f"Please provide a valid pretrained model name or a valid config file for a CasualLM. Caught exception: {e}"
+            )
+
+        self.value_head = nn.Linear(last_hidden_state_size, 1)
+        self.value_head.weight.data.normal_(mean=0.0, std=1 / (last_hidden_state_size + 1))
+
+    def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        outputs = self.model(input_ids, attention_mask=attention_mask)
+        last_hidden_states = outputs["logits"]
+        sequence_lengths = torch.max(attention_mask * torch.arange(input_ids.size(1), device=input_ids.device), dim=1)[
+            0
+        ]
+        sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths].type(
+            self.value_head.weight.dtype
+        )
+        # print("sequence_hidden_states", sequence_hidden_states.size(), sequence_hidden_states.dtype)
+        # print("values head weight", self.value_head.weight.size(),self.value_head.weight.dtype)
+
+        values = self.value_head(sequence_hidden_states).squeeze(-1)  # ensure shape is (B,)
+        return values
diff --git a/applications/Chat/coati/models/utils.py b/applications/Chat/coati/models/utils.py
index 1aaef16620d2..f23c20c22296 100644
--- a/applications/Chat/coati/models/utils.py
+++ b/applications/Chat/coati/models/utils.py
@@ -1,29 +1,18 @@
-from typing import Optional, Union
+import json
+import os
+from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.optimizer import Optimizer
 
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
 
-def _compute_approx_kl(
-    log_probs: torch.Tensor, log_probs_base: torch.Tensor, action_mask: Optional[torch.Tensor] = None
-) -> torch.Tensor:
-    """
-    Compute the approximate KL divergence between two distributions.
-    Schulman blog: http://joschu.net/blog/kl-approx.html
-
-    Args:
-        log_probs: Log probabilities of the new distribution.
-        log_probs_base: Log probabilities of the base distribution.
-        action_mask: Mask for actions.
-    """
 
-    log_ratio = log_probs_base - log_probs
-    approx_kl = (log_ratio.exp() - 1) - log_ratio
-    if action_mask is not None:
-        approx_kl = masked_mean(approx_kl, action_mask, dim=1)
-        return approx_kl
-    approx_kl = approx_kl.mean(dim=1)
-    return approx_kl
+def get_model_numel(model: torch.nn.Module) -> int:
+    return sum(p.numel() for p in model.parameters())
 
 
 def compute_reward(
@@ -32,12 +21,26 @@ def compute_reward(
     log_probs: torch.Tensor,
     log_probs_base: torch.Tensor,
     action_mask: Optional[torch.Tensor] = None,
+    reward_eps=5,
 ) -> torch.Tensor:
-    if kl_coef <= 0.0:
-        return r
-    kl = _compute_approx_kl(log_probs, log_probs_base, action_mask=action_mask)
-    reward = r - kl_coef * kl
-    return reward
+    """
+    Args:
+        log_probs: [batch_size, response_length]
+        log_probs_base: [batch_size, response_length]
+        action_mask: [batch_size, response_length]
+        r: float
+    Returns:
+        reward: [batch_size, response_length]
+    """
+    log_ratio = log_probs - log_probs_base  # address numerical instability issue
+    kl = -kl_coef * log_ratio * action_mask
+    reward = kl
+    r_clip = torch.clamp(r, -reward_eps, reward_eps)
+    for i in range(action_mask.size(0)):
+        assert action_mask[i].sum() > 0
+        reward[i, : action_mask[i].sum()] += r_clip[i]
+        reward[i, action_mask[i].sum() :] *= 0
+    return reward, ((log_ratio * (log_ratio < 10)).exp() - 1 - log_ratio) * action_mask
 
 
 def _log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
@@ -67,3 +70,85 @@ def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch
     mask_sum = mask.sum(dim=dim)
     mean = tensor / (mask_sum + 1e-8)
     return mean
+
+
+def calc_masked_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, mask: torch.Tensor) -> torch.Tensor:
+    log_probs = _log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
+    return log_probs * mask
+
+
+def load_json(file_path: Union[str, os.PathLike]) -> Dict[str, Any]:
+    """
+    Load file in JSON format
+    """
+    with open(file=file_path, mode="r", encoding="utf-8") as fp:
+        return json.load(fp)
+
+
+def save_json(data: Dict[str, Any], file_path: Union[str, os.PathLike]) -> None:
+    """
+    Save as JSON format
+    """
+    with open(file=file_path, mode="w", encoding="utf-8") as fp:
+        json.dump(data, fp=fp, ensure_ascii=False, indent=4)
+
+
+def save_checkpoint(
+    save_dir: Union[str, os.PathLike],
+    booster: Booster,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    lr_scheduler: _LRScheduler,
+    epoch: int,
+    step: int,
+    batch_size: int,
+    coordinator: DistCoordinator,
+) -> None:
+    """
+    Save model checkpoint, optimizer, LR scheduler and intermedidate running states.
+    """
+
+    save_dir = os.path.join(save_dir, f"epoch-{epoch}_step-{step}")
+    os.makedirs(os.path.join(save_dir, "modeling"), exist_ok=True)
+
+    booster.save_model(model, os.path.join(save_dir, "modeling"), shard=True)
+
+    booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True)
+    booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))
+    running_states = {
+        "epoch": epoch,
+        "step": step,
+        "sample_start_index": step * batch_size,
+    }
+    if coordinator.is_master():
+        save_json(running_states, os.path.join(save_dir, "running_states.json"))
+
+
+def load_checkpoint(
+    load_dir: Union[str, os.PathLike],
+    booster: Booster,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    lr_scheduler: _LRScheduler,
+) -> Tuple[int, int, int]:
+    """
+    Load model checkpoint, optimizer, LR scheduler and intermedidate running states.
+    """
+
+    # Update booster params states.
+    booster.load_model(model=model, checkpoint=os.path.join(load_dir, "modeling"))
+    booster.load_optimizer(optimizer=optimizer, checkpoint=os.path.join(load_dir, "optimizer"))
+    booster.load_lr_scheduler(lr_scheduler=lr_scheduler, checkpoint=os.path.join(load_dir, "lr_scheduler"))
+
+    running_states = load_json(file_path=os.path.join(load_dir, "running_states.json"))
+    return (
+        running_states["epoch"],
+        running_states["step"],
+        running_states["sample_start_index"],
+    )
+
+
+def disable_dropout(model: torch.nn.Module):
+    for module in model.modules():
+        if isinstance(module, torch.nn.Dropout):
+            module.p = 0.0
diff --git a/applications/Chat/coati/trainer/__init__.py b/applications/Chat/coati/trainer/__init__.py
index 4be5d27f93b1..110df0dbe764 100644
--- a/applications/Chat/coati/trainer/__init__.py
+++ b/applications/Chat/coati/trainer/__init__.py
@@ -1,6 +1,8 @@
-from .base import OnPolicyTrainer, SLTrainer
-from .ppo import PPOTrainer
+from .base import OLTrainer, SLTrainer
+from .dpo import DPOTrainer
+
+# from .ppo import PPOTrainer
 from .rm import RewardModelTrainer
 from .sft import SFTTrainer
 
-__all__ = ["SLTrainer", "OnPolicyTrainer", "RewardModelTrainer", "SFTTrainer", "PPOTrainer"]
+__all__ = ["SLTrainer", "OLTrainer", "RewardModelTrainer", "SFTTrainer", "DPOTrainer"]
diff --git a/applications/Chat/coati/trainer/base.py b/applications/Chat/coati/trainer/base.py
index 0a41d450d41e..126cd2f863f4 100644
--- a/applications/Chat/coati/trainer/base.py
+++ b/applications/Chat/coati/trainer/base.py
@@ -8,6 +8,8 @@
 from coati.experience_maker import Experience
 from torch.optim import Optimizer
 
+from colossalai.booster import Booster
+
 from .callbacks import Callback
 from .strategies import Strategy
 from .utils import is_rank_0
@@ -26,16 +28,18 @@ class SLTrainer(ABC):
 
     def __init__(
         self,
-        strategy: Strategy,
+        booster: Booster,
         max_epochs: int,
         model: nn.Module,
         optimizer: Optimizer,
+        start_epoch: int = 0,
     ) -> None:
         super().__init__()
-        self.strategy = strategy
+        self.booster = booster
         self.max_epochs = max_epochs
         self.model = model
         self.optimizer = optimizer
+        self.start_epoch = start_epoch
 
     @abstractmethod
     def _train(self, epoch):
@@ -45,19 +49,20 @@ def _train(self, epoch):
     def _eval(self, epoch):
         raise NotImplementedError()
 
+    @abstractmethod
     def _before_fit(self):
         raise NotImplementedError()
 
     def fit(self, *args, **kwargs):
         self._before_fit(*args, **kwargs)
-        for epoch in tqdm.trange(self.max_epochs, desc="Epochs", disable=not is_rank_0()):
+        for epoch in tqdm.trange(self.start_epoch, self.max_epochs, desc="Epochs", disable=not is_rank_0()):
             self._train(epoch)
             self._eval(epoch)
 
 
-class OnPolicyTrainer(ABC):
+class OLTrainer(ABC):
     """
-        Base class for on-policy rl trainers, e.g. PPO.
+        Base class for online learning trainers, e.g. PPO.
 
     Args:
         strategy (Strategy):the strategy to use for training
@@ -160,6 +165,7 @@ def fit(
         num_episodes: int,
         num_collect_steps: int,
         num_update_steps: int,
+        save_per_num_episodes: int,
         *args,
         **kwargs,
     ):
@@ -186,3 +192,21 @@ def fit(
                         self._update_phase(update_step)
                     # NOTE: this is for on-policy algorithms
                     self.data_buffer.clear()
+                if is_rank_0() and (episode + 1) % save_per_num_episodes == 0:
+                    if args.lora_rank > 0 and args.merge_lora_weights:
+                        from coati.models.lora import LORA_MANAGER
+
+                        # NOTE: set model to eval to merge LoRA weights
+                        LORA_MANAGER.merge_weights = True
+                        self.actor.eval()
+                    # save model checkpoint after fitting
+                    self.strategy.save_model(self.actor, args.save_path, only_rank0=True)
+                    # save optimizer checkpoint on all ranks
+                    if args.need_optim_ckpt:
+                        self.strategy.save_optimizer(
+                            self.actor_optim,
+                            "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()),
+                            only_rank0=False,
+                        )
+
+                    self.strategy.save_checkpoint(episode)
diff --git a/applications/Chat/coati/trainer/dpo.py b/applications/Chat/coati/trainer/dpo.py
new file mode 100644
index 000000000000..81df8b62ed63
--- /dev/null
+++ b/applications/Chat/coati/trainer/dpo.py
@@ -0,0 +1,327 @@
+import os
+from typing import Any, Optional
+
+import torch
+from coati.models.loss import DpoLoss
+from coati.models.utils import calc_masked_log_probs
+from coati.trainer.utils import all_reduce_mean
+from coati.utils import AccumulativeMeanMeter, save_checkpoint
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.data import DataLoader
+from tqdm import trange
+from transformers import PreTrainedTokenizerBase
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+from colossalai.utils import get_current_device
+
+from .base import SLTrainer
+from .utils import is_rank_0, to_device
+
+
+class DPOTrainer(SLTrainer):
+    """
+        Trainer for PPO algorithm.
+
+    Args:
+        actor (Actor): the actor model in ppo algorithm
+        ref_model (Critic): the reference model in ppo algorithm
+        booster (Strategy): the strategy to use for training
+        actor_optim (Optimizer): the optimizer to use for actor model
+        actor_lr_scheduler (_LRScheduler): the lr scheduler to use for actor model
+        tokenizer (PreTrainedTokenizerBase): the tokenizer to use for encoding
+        max_epochs (int, defaults to 1): the max number of epochs to train
+        beta (float, defaults to 0.1): the beta parameter in dpo loss
+        accumulation_steps (int): the number of steps to accumulate gradients
+        start_epoch (int, defaults to 0): the start epoch, non-zero if resumed from a checkpoint
+        save_interval (int): the interval to save model checkpoints, default to 0, which means no checkpoint will be saved during trainning
+        save_dir (str): the directory to save checkpoints
+        coordinator (DistCoordinator): the coordinator to use for distributed logging
+    """
+
+    def __init__(
+        self,
+        actor: Any,
+        ref_model: Any,
+        booster: Booster,
+        actor_optim: Optimizer,
+        actor_lr_scheduler: _LRScheduler,
+        tokenizer: PreTrainedTokenizerBase,
+        max_epochs: int = 1,
+        beta: float = 0.1,
+        accumulation_steps: int = 1,
+        start_epoch: int = 0,
+        save_interval: int = 0,
+        save_dir: str = None,
+        coordinator: DistCoordinator = None,
+    ) -> None:
+        super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch)
+        self.ref_model = ref_model
+        self.actor_scheduler = actor_lr_scheduler
+        self.tokenizer = tokenizer
+        self.actor_loss_fn = DpoLoss(beta)
+        self.save_interval = save_interval
+        self.coordinator = coordinator
+        self.save_dir = save_dir
+        self.num_train_step = 0
+        self.accumulation_steps = accumulation_steps
+        self.device = get_current_device()
+        self.accumulative_meter = AccumulativeMeanMeter()
+
+    def _before_fit(
+        self,
+        train_preference_dataloader: DataLoader = None,
+        eval_preference_dataloader: DataLoader = None,
+        log_dir: Optional[str] = None,
+        use_wandb: bool = False,
+    ):
+        """
+        Args:
+            prompt_dataloader (DataLoader): the dataloader to use for prompt data
+            pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
+        """
+        self.train_dataloader = train_preference_dataloader
+        self.eval_dataloader = eval_preference_dataloader
+        self.writer = None
+        if use_wandb and is_rank_0():
+            assert log_dir is not None, "log_dir must be provided when use_wandb is True"
+            import wandb
+
+            self.wandb_run = wandb.init(project="Coati-dpo", sync_tensorboard=True)
+        if log_dir is not None and is_rank_0():
+            import os
+            import time
+
+            from torch.utils.tensorboard import SummaryWriter
+
+            log_dir = os.path.join(log_dir, "dpo")
+            log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
+            self.writer = SummaryWriter(log_dir=log_dir)
+
+    def _train(self, epoch: int):
+        """
+        Args:
+            epoch int: the number of current epoch
+        """
+        self.model.train()
+        self.accumulative_meter.reset()
+        step_bar = trange(
+            len(self.train_dataloader) // self.accumulation_steps,
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
+        )
+        for i, batch in enumerate(self.train_dataloader):
+            batch = to_device(batch, self.device)
+            # print(batch)
+            (
+                chosen_input_ids,
+                chosen_attention_mask,
+                chosen_loss_mask,
+                reject_input_ids,
+                reject_attention_mask,
+                reject_loss_mask,
+            ) = (
+                batch["chosen_input_ids"],
+                batch["chosen_attention_mask"],
+                batch["chosen_loss_mask"],
+                batch["reject_input_ids"],
+                batch["reject_attention_mask"],
+                batch["reject_loss_mask"],
+            )
+
+            batch_size = chosen_input_ids.size()[0]
+
+            actor_all_logits = self.model(
+                torch.cat([chosen_input_ids, reject_input_ids]),
+                torch.cat([chosen_attention_mask, reject_attention_mask]),
+            )["logits"].to(torch.float32)
+            actor_chosen_logits = actor_all_logits[:batch_size]
+            actor_reject_logits = actor_all_logits[batch_size:]
+
+            logprob_actor_chosen = calc_masked_log_probs(actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:])
+
+            logprob_actor_reject = calc_masked_log_probs(actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:])
+
+            self.ref_model.eval()
+            with torch.no_grad():
+                ref_all_logits = self.ref_model(
+                    torch.cat([chosen_input_ids, reject_input_ids]),
+                    torch.cat([chosen_attention_mask, reject_attention_mask]),
+                )["logits"].to(torch.float32)
+                ref_chosen_logits = ref_all_logits[:batch_size]
+                ref_reject_logits = ref_all_logits[batch_size:]
+                logprob_ref_chosen = calc_masked_log_probs(ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:])
+                logprob_ref_reject = calc_masked_log_probs(ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:])
+
+            losses, chosen_rewards, rejected_rewards = self.actor_loss_fn(
+                logprob_actor_chosen,
+                logprob_actor_reject,
+                logprob_ref_chosen if logprob_ref_chosen is not None else None,
+                logprob_ref_reject if logprob_ref_reject is not None else None,
+                chosen_loss_mask,
+                reject_loss_mask,
+            )
+            reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+            loss = losses.mean()
+
+            self.booster.backward(loss=loss, optimizer=self.optimizer)
+            if self.num_train_step % self.accumulation_steps == self.accumulation_steps - 1:
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+                self.actor_scheduler.step()
+
+            # sync
+            loss_mean = all_reduce_mean(tensor=loss)
+            chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
+            rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
+            reward_accuracies_mean = all_reduce_mean(tensor=reward_accuracies)
+            self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+            self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).mean().item())
+
+            # logging
+            if self.writer and is_rank_0():
+                self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
+                self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
+                self.writer.add_scalar(
+                    "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
+                )
+                self.writer.add_scalar(
+                    "train/rejected_rewards",
+                    self.accumulative_meter.get("rejected_rewards"),
+                    self.num_train_step,
+                )
+                self.writer.add_scalar(
+                    "train/accuracy",
+                    self.accumulative_meter.get("accuracy"),
+                    self.num_train_step,
+                )
+            self.accumulative_meter.reset()
+
+            if i % self.accumulation_steps == self.accumulation_steps - 1:
+                self.num_train_step += 1
+                step_bar.update()
+
+            if (self.save_interval > 0 and (i + 1) % (self.save_interval * self.accumulation_steps) == 0) or (
+                i + 1
+            ) == len(self.train_dataloader):
+                self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
+                save_checkpoint(
+                    save_dir=self.save_dir,
+                    booster=self.booster,
+                    model=self.model,
+                    optimizer=self.optimizer,
+                    lr_scheduler=self.actor_scheduler,
+                    epoch=epoch,
+                    step=i + 1,
+                    batch_size=batch_size,
+                    coordinator=self.coordinator,
+                )
+                self.coordinator.print_on_master(
+                    f"Saved checkpoint at epoch {epoch} step {(i + 1)/self.accumulation_steps} at folder {self.save_dir}"
+                )
+
+        step_bar.close()
+
+    def _eval(self, epoch: int):
+        """
+        Args:
+            epoch int: the number of current epoch
+        """
+        if self.eval_dataloader is None:
+            self.coordinator.print_on_master("No eval dataloader is provided, skip evaluation")
+            return
+        self.model.eval()
+        self.ref_model.eval()
+        self.coordinator.print_on_master("\nStart evaluation...")
+
+        step_bar = trange(
+            len(self.eval_dataloader),
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
+        )
+
+        self.accumulative_meter.reset()
+
+        with torch.no_grad():
+            for i, batch in enumerate(self.eval_dataloader):
+                batch = to_device(batch, self.device)
+                (
+                    chosen_input_ids,
+                    chosen_attention_mask,
+                    chosen_loss_mask,
+                    reject_input_ids,
+                    reject_attention_mask,
+                    reject_loss_mask,
+                ) = (
+                    batch["chosen_input_ids"],
+                    batch["chosen_attention_mask"],
+                    batch["chosen_loss_mask"],
+                    batch["reject_input_ids"],
+                    batch["reject_attention_mask"],
+                    batch["reject_loss_mask"],
+                )
+
+                batch_size = chosen_input_ids.size()[0]
+
+                actor_all_logits = self.model(
+                    torch.cat([chosen_input_ids, reject_input_ids]),
+                    torch.cat([chosen_attention_mask, reject_attention_mask]),
+                )["logits"].to(torch.float32)
+                actor_chosen_logits = actor_all_logits[:batch_size]
+                actor_reject_logits = actor_all_logits[batch_size:]
+
+                logprob_actor_chosen = calc_masked_log_probs(
+                    actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:]
+                )
+
+                logprob_actor_reject = calc_masked_log_probs(
+                    actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:]
+                )
+
+                self.ref_model.eval()
+
+                ref_all_logits = self.ref_model(
+                    torch.cat([chosen_input_ids, reject_input_ids]),
+                    torch.cat([chosen_attention_mask, reject_attention_mask]),
+                )["logits"].to(torch.float32)
+                ref_chosen_logits = ref_all_logits[:batch_size]
+                ref_reject_logits = ref_all_logits[batch_size:]
+                logprob_ref_chosen = calc_masked_log_probs(ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:])
+                logprob_ref_reject = calc_masked_log_probs(ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:])
+
+                losses, chosen_rewards, rejected_rewards = self.actor_loss_fn(
+                    logprob_actor_chosen,
+                    logprob_actor_reject,
+                    logprob_ref_chosen if logprob_ref_chosen is not None else None,
+                    logprob_ref_reject if logprob_ref_reject is not None else None,
+                    chosen_loss_mask,
+                    reject_loss_mask,
+                )
+                reward_accuracies = (chosen_rewards > rejected_rewards).float()
+                loss = losses.mean()
+                loss_mean = all_reduce_mean(tensor=loss)
+                chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
+                rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
+                reward_accuracies_mean = all_reduce_mean(tensor=reward_accuracies)
+                self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean)
+                self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean)
+                self.accumulative_meter.add("dist", chosen_rewards_mean - rejected_rewards_mean)
+                self.accumulative_meter.add("loss", loss_mean)
+                self.accumulative_meter.add("accuracy", reward_accuracies_mean)
+                step_bar.update()
+
+        msg = "Evaluation Result:\n"
+        for tag in ["loss", "chosen_rewards", "rejected_rewards", "accuracy"]:
+            msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
+        msg = (
+            msg
+            + f"distance: {self.accumulative_meter.get('chosen_rewards')-self.accumulative_meter.get('rejected_rewards')}\n"
+        )
+        self.coordinator.print_on_master(msg)
+        with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
+            f.write(msg)
+        step_bar.close()
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index d6966689885e..ac6df0026b2e 100644
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -2,17 +2,18 @@
 
 from coati.experience_buffer import NaiveExperienceBuffer
 from coati.experience_maker import Experience, NaiveExperienceMaker
-from coati.models.base import Actor, Critic, RewardModel, get_base_model
+from coati.models import Actor, Critic, RewardModel
 from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss
 from coati.models.utils import calc_action_log_probs
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader, DistributedSampler
 from tqdm import tqdm
 from transformers import PreTrainedTokenizerBase
 
 from colossalai.utils import get_current_device
 
-from .base import OnPolicyTrainer
+from .base import OLTrainer
 from .callbacks import Callback
 from .strategies import GeminiStrategy, Strategy
 from .utils import CycledDataLoader, is_rank_0, to_device
@@ -20,7 +21,7 @@
 
 def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> Dict:
     unwrapped_model = strategy.unwrap_model(actor)
-    hf_model = get_base_model(unwrapped_model)
+    hf_model = unwrapped_model.model
     new_kwargs = {**generate_kwargs}
     # use huggingface models method directly
     if "prepare_inputs_fn" not in generate_kwargs and hasattr(hf_model, "prepare_inputs_for_generation"):
@@ -32,7 +33,7 @@ def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, acto
     return new_kwargs
 
 
-class PPOTrainer(OnPolicyTrainer):
+class PPOTrainer(OLTrainer):
     """
         Trainer for PPO algorithm.
 
@@ -68,7 +69,9 @@ def __init__(
         initial_model: Actor,
         actor_optim: Optimizer,
         critic_optim: Optimizer,
+        actor_lr_scheduler: _LRScheduler,
         tokenizer: PreTrainedTokenizerBase,
+        rm_model_tokenizer: PreTrainedTokenizerBase,
         kl_coef: float = 0.1,
         ptx_coef: float = 0.9,
         train_batch_size: int = 8,
@@ -76,7 +79,7 @@ def __init__(
         buffer_cpu_offload: bool = True,
         eps_clip: float = 0.2,
         vf_coef: float = 1.0,
-        value_clip: float = 0.4,
+        value_clip: float = 0.2,
         sample_buffer: bool = False,
         dataloader_pin_memory: bool = True,
         offload_inference_models: bool = True,
@@ -90,11 +93,15 @@ def __init__(
         super().__init__(strategy, data_buffer, sample_buffer, dataloader_pin_memory, callbacks)
 
         self.generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
-        self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer, kl_coef)
 
         self.actor = actor
         self.critic = critic
+        self.actor_scheduler = actor_lr_scheduler
         self.tokenizer = tokenizer
+        self.rm_model_tokenizer = rm_model_tokenizer
+        self.experience_maker = NaiveExperienceMaker(
+            self.actor, self.critic, reward_model, initial_model, self.tokenizer, self.rm_model_tokenizer, kl_coef
+        )
 
         self.actor_loss_fn = PolicyLoss(eps_clip)
         self.critic_loss_fn = ValueLoss(value_clip)
@@ -103,6 +110,7 @@ def __init__(
         self.ptx_coef = ptx_coef
         self.actor_optim = actor_optim
         self.critic_optim = critic_optim
+        self.num_train_step = 0
 
         self.offload_inference_models = offload_inference_models
         self.device = get_current_device()
@@ -127,7 +135,7 @@ def _before_fit(
             assert log_dir is not None, "log_dir must be provided when use_wandb is True"
             import wandb
 
-            wandb.init(project="Coati-ppo", sync_tensorboard=True)
+            self.wandb_run = wandb.init(project="Coati-ppo", sync_tensorboard=True)
         if log_dir is not None and is_rank_0():
             import os
             import time
@@ -148,13 +156,23 @@ def _make_experience(self, collect_step: int) -> Experience:
         return self.experience_maker.make_experience(**prompts, **self.generate_kwargs)
 
     def _training_step(self, experience: Experience):
+        """
+        Args:
+            experience:
+                sequences: [batch_size, prompt_length + response_length] --- <PAD>...<PAD><PROMPT>...<PROMPT><RESPONSE>...<RESPONSE><PAD>...<PAD>
+        """
+        self.num_train_step += 1
         self.actor.train()
         self.critic.train()
-        # policy loss
         num_actions = experience.action_log_probs.size(1)
-        actor_logits = self.actor(experience.sequences, experience.attention_mask)["logits"]
+        # policy loss
+
+        actor_logits = self.actor(experience.sequences, experience.attention_mask)[
+            "logits"
+        ]  # [batch size, prompt_length + response_length]
         action_log_probs = calc_action_log_probs(actor_logits, experience.sequences, num_actions)
-        actor_loss = self.actor_loss_fn(
+
+        actor_loss, to_skip, max_ratio = self.actor_loss_fn(
             action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
         )
         actor_loss = (1 - self.ptx_coef) * actor_loss
@@ -168,16 +186,54 @@ def _training_step(self, experience: Experience):
             ptx_loss = self.ptx_coef * self.ptx_loss_fn(ptx_log_probs, batch["labels"])
             self.strategy.backward(ptx_loss, self.actor, self.actor_optim)
 
-        self.strategy.optimizer_step(self.actor_optim)
-        self.actor_optim.zero_grad()
-
         # value loss
-        values = self.critic(experience.sequences, attention_mask=experience.attention_mask)
-        critic_loss = self.critic_loss_fn(values, experience.values, experience.reward)
+        values = self.critic(
+            experience.sequences, attention_mask=experience.attention_mask
+        )  # [batch size, prompt_length + response_length]
+        critic_loss = self.critic_loss_fn(
+            values[:, -num_actions:], experience.values, experience.advantages, action_mask=experience.action_mask
+        )
         critic_loss = critic_loss * self.vf_coef
         self.strategy.backward(critic_loss, self.critic, self.critic_optim)
+
+        if not to_skip:
+            self.strategy.optimizer_step(self.actor_optim)
         self.strategy.optimizer_step(self.critic_optim)
+        self.actor_optim.zero_grad()
         self.critic_optim.zero_grad()
+        if self.actor_scheduler:
+            self.actor_scheduler.step()
+
+        # preparing logging model output and corresponding rewards.
+        response_text = self.experience_maker.tokenizer.batch_decode(experience.sequences, skip_special_tokens=True)
+        for i in range(len(response_text)):
+            response_text[i] = response_text[i] + f"\n\nReward: {experience.reward[i]}"
+
+        if self.writer:
+            # use wandb
+            import wandb
+
+            if self.num_train_step % 50 == 1 and "wandb_run" in self.__dict__:
+                my_table = wandb.Table(
+                    columns=[f"sample response {i}" for i in range(len(response_text))], data=[response_text]
+                )
+                try:
+                    self.wandb_run.log({"sample_response": my_table})
+                except OSError as e:
+                    print(e)
+
+            self.writer.add_scalar("train/max_ratio", max_ratio, self.num_train_step)
+            self.writer.add_scalar("train/skip", 1 if to_skip else 0, self.num_train_step)
+            self.writer.add_scalar("train/actor_loss", actor_loss.mean().item(), self.num_train_step)
+            self.writer.add_scalar("train/lr_actor", self.actor_optim.param_groups[0]["lr"], self.num_train_step)
+            self.writer.add_scalar("train/lr_critic", self.critic_optim.param_groups[0]["lr"], self.num_train_step)
+            self.writer.add_scalar("train/critic_loss", critic_loss.mean().item(), self.num_train_step)
+            if self.ptx_coef != 0:
+                self.writer.add_scalar("train/ptx_loss", ptx_loss.mean().item(), self.num_train_step)
+            self.writer.add_scalar("reward", experience.reward.mean().item(), self.num_train_step)
+            self.writer.add_scalar("approx_kl", experience.kl.mean().item(), self.num_train_step)
+            self.writer.add_scalar("value", experience.values.mean().item(), self.num_train_step)
+            self.writer.add_scalar("advantages", experience.advantages.mean().item(), self.num_train_step)
 
     def _learn(self, update_step: int):
         if self.offload_inference_models:
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py
index d7f8c21a5a3d..1c3613d108c6 100644
--- a/applications/Chat/coati/trainer/rm.py
+++ b/applications/Chat/coati/trainer/rm.py
@@ -1,117 +1,92 @@
-from typing import Callable, Optional
+import os
+from typing import Any, Callable, Optional
 
 import torch
 import tqdm
+from coati.models import LogSigLoss
+from coati.trainer.utils import all_reduce_mean
+from coati.utils import AccumulativeMeanMeter, save_checkpoint
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
+from transformers import PreTrainedTokenizerBase
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+from colossalai.utils import get_current_device
 
 from .base import SLTrainer
-from .strategies import Strategy
-from .utils import is_rank_0
+from .utils import is_rank_0, to_device
 
 
 class RewardModelTrainer(SLTrainer):
     """
-        Trainer to use while training reward model.
+        Trainer for PPO algorithm.
 
     Args:
-        model (torch.nn.Module): the model to train
-        strategy (Strategy): the strategy to use for training
-        optim (Optimizer): the optimizer to use for training
-        lr_scheduler (_LRScheduler): the lr scheduler to use for training
-        loss_fn (callable): the loss function to use for training
-        max_epochs (int, defaults to 2): the number of epochs to train
+        actor (Actor): the actor model in ppo algorithm
+        ref_model (Critic): the reference model in ppo algorithm
+        booster (Strategy): the strategy to use for training
+        actor_optim (Optimizer): the optimizer to use for actor model
+        actor_lr_scheduler (_LRScheduler): the lr scheduler to use for actor model
+        tokenizer (PreTrainedTokenizerBase): the tokenizer to use for encoding
+        max_epochs (int, defaults to 1): the max number of epochs to train
+        beta (float, defaults to 0.1): the beta parameter in dpo loss
+        accumulation_steps (int): the number of steps to accumulate gradients
+        start_epoch (int, defaults to 0): the start epoch, non-zero if resumed from a checkpoint
+        save_interval (int): the interval to save model checkpoints, default to 0, which means no checkpoint will be saved during trainning
+        save_dir (str): the directory to save checkpoints
+        coordinator (DistCoordinator): the coordinator to use for distributed logging
     """
 
     def __init__(
         self,
-        model,
-        strategy: Strategy,
-        optim: Optimizer,
+        model: Any,
+        booster: Booster,
+        optimizer: Optimizer,
         lr_scheduler: _LRScheduler,
-        loss_fn: Callable,
+        tokenizer: PreTrainedTokenizerBase,
+        loss_fn: Optional[Callable] = None,
         max_epochs: int = 1,
+        beta: float = 0.1,
+        accumulation_steps: int = 1,
+        start_epoch: int = 0,
+        save_interval: int = 0,
+        save_dir: str = None,
+        coordinator: DistCoordinator = None,
     ) -> None:
-        super().__init__(strategy, max_epochs, model, optim)
-
-        self.loss_fn = loss_fn
-        self.scheduler = lr_scheduler
-
+        super().__init__(booster, max_epochs=max_epochs, model=model, optimizer=optimizer, start_epoch=start_epoch)
+        self.actor_scheduler = lr_scheduler
+        self.tokenizer = tokenizer
+        self.loss_fn = loss_fn if loss_fn is not None else LogSigLoss(beta=beta)
+        self.save_interval = save_interval
+        self.coordinator = coordinator
+        self.save_dir = save_dir
         self.num_train_step = 0
-
-    def _eval(self, epoch):
-        if self.eval_dataloader is not None:
-            self.model.eval()
-            dist, num_correct, num_samples = 0, 0, 0
-            with torch.no_grad():
-                for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
-                    chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
-                    c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
-                    reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
-                    r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
-                    chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
-                    reject_reward = self.model(reject_ids, attention_mask=r_mask)
-                    num_samples += chosen_ids.size(0)
-                    num_correct += (chosen_reward > reject_reward).sum().item()
-                    dist += (chosen_reward - reject_reward).mean().item()
-                self.dist = dist / len(self.eval_dataloader)
-                self.acc = num_correct / num_samples
-
-            if self.writer:
-                self.writer.add_scalar("eval/dist", self.dist, epoch)
-                self.writer.add_scalar("eval/acc", self.acc, epoch)
-
-    def _train(self, epoch):
-        self.model.train()
-        step_bar = tqdm.trange(
-            len(self.train_dataloader), desc=f"Epoch {epoch + 1}/{self.max_epochs}", disable=not is_rank_0()
-        )
-        for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
-            chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
-            c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
-            reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
-            r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
-            chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
-            reject_reward = self.model(reject_ids, attention_mask=r_mask)
-            loss = self.loss_fn(chosen_reward, reject_reward)
-            self.strategy.backward(loss, self.model, self.optimizer)
-            self.strategy.optimizer_step(self.optimizer)
-            self.optimizer.zero_grad()
-            if self.writer:
-                self.writer.add_scalar("train/loss", loss.item(), self.num_train_step)
-                self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
-                self.writer.add_scalar("train/dist", (chosen_reward - reject_reward).mean().item(), self.num_train_step)
-                self.writer.add_scalar(
-                    "train/acc", (chosen_reward > reject_reward).float().mean().item(), self.num_train_step
-                )
-            self.num_train_step += 1
-            if self.num_train_step % 100 == 0:
-                self.scheduler.step()
-            step_bar.update()
-        step_bar.close()
+        self.accumulation_steps = accumulation_steps
+        self.device = get_current_device()
+        self.accumulative_meter = AccumulativeMeanMeter()
 
     def _before_fit(
         self,
-        train_dataloader: DataLoader,
-        eval_dataloader: DataLoader,
+        train_preference_dataloader: DataLoader = None,
+        eval_preference_dataloader: DataLoader = None,
         log_dir: Optional[str] = None,
         use_wandb: bool = False,
     ):
         """
         Args:
-            train_dataloader (DataLoader): the dataloader to use for training
-            eval_dataloader (DataLoader): the dataloader to use for evaluation
+            prompt_dataloader (DataLoader): the dataloader to use for prompt data
+            pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
         """
-        self.train_dataloader = train_dataloader
-        self.eval_dataloader = eval_dataloader
-
+        self.train_dataloader = train_preference_dataloader
+        self.eval_dataloader = eval_preference_dataloader
         self.writer = None
         if use_wandb and is_rank_0():
             assert log_dir is not None, "log_dir must be provided when use_wandb is True"
             import wandb
 
-            wandb.init(project="Coati-rm", sync_tensorboard=True)
+            self.wandb_run = wandb.init(project="Coati-rm", sync_tensorboard=True)
         if log_dir is not None and is_rank_0():
             import os
             import time
@@ -121,3 +96,137 @@ def _before_fit(
             log_dir = os.path.join(log_dir, "rm")
             log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
             self.writer = SummaryWriter(log_dir=log_dir)
+
+    def _train(self, epoch):
+        self.model.train()
+        step_bar = tqdm.trange(
+            len(self.train_dataloader), desc=f"Epoch {epoch + 1}/{self.max_epochs}", disable=not is_rank_0()
+        )
+        for i, batch in enumerate(self.train_dataloader):
+            batch = to_device(batch, self.device)
+
+            (
+                chosen_input_ids,
+                chosen_attention_mask,
+                reject_input_ids,
+                reject_attention_mask,
+            ) = (
+                batch["chosen_input_ids"],
+                batch["chosen_attention_mask"],
+                batch["reject_input_ids"],
+                batch["reject_attention_mask"],
+            )
+            batch_size = chosen_input_ids.size()[0]
+
+            # concatenate for better parrallelism
+            reward = self.model(
+                torch.cat([chosen_input_ids, reject_input_ids], dim=0),
+                attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask], dim=0),
+            )
+            chosen_reward = reward[:batch_size]
+            reject_reward = reward[batch_size:]
+            loss = self.loss_fn(chosen_reward, reject_reward).mean()
+
+            self.booster.backward(loss=loss, optimizer=self.optimizer)
+            if self.num_train_step % self.accumulation_steps == self.accumulation_steps - 1:
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+                self.actor_scheduler.step()
+
+            # sync
+            loss_mean = all_reduce_mean(tensor=loss)
+            chosen_rewards_mean = all_reduce_mean(tensor=chosen_reward)
+            rejected_rewards_mean = all_reduce_mean(tensor=reject_reward)
+            self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+
+            if self.writer and is_rank_0():
+                self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
+                self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
+                self.writer.add_scalar(
+                    "train/dist",
+                    self.accumulative_meter.get("chosen_rewards") - self.accumulative_meter.get("rejected_rewards"),
+                    self.num_train_step,
+                )
+                self.writer.add_scalar(
+                    "train/reward_chosen", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
+                )
+                self.writer.add_scalar(
+                    "train/reward_reject", self.accumulative_meter.get("rejected_rewards"), self.num_train_step
+                )
+
+            if i % self.accumulation_steps == self.accumulation_steps - 1:
+                self.num_train_step += 1
+                step_bar.update()
+                self.accumulative_meter.reset()
+
+            if (self.save_interval > 0 and (i + 1) % (self.save_interval * self.accumulation_steps) == 0) or (
+                i + 1
+            ) == len(self.train_dataloader):
+                self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
+                save_checkpoint(
+                    save_dir=self.save_dir,
+                    booster=self.booster,
+                    model=self.model,
+                    optimizer=self.optimizer,
+                    lr_scheduler=self.actor_scheduler,
+                    epoch=epoch,
+                    step=i + 1,
+                    batch_size=batch_size,
+                    coordinator=self.coordinator,
+                )
+                self.coordinator.print_on_master(
+                    f"Saved checkpoint at epoch {epoch} step {(i + 1)/self.accumulation_steps} at folder {self.save_dir}"
+                )
+        step_bar.close()
+
+    def _eval(self, epoch):
+        if self.eval_dataloader is None:
+            self.coordinator.print_on_master("No eval dataloader is provided, skip evaluation")
+            return
+        self.model.eval()
+        step_bar = tqdm.trange(
+            len(self.eval_dataloader), desc=f"Epoch {epoch + 1}/{self.max_epochs}", disable=not is_rank_0()
+        )
+        with torch.no_grad():
+            for i, batch in enumerate(self.eval_dataloader):
+                batch = to_device(batch, self.device)
+                # print(batch)
+                (
+                    chosen_input_ids,
+                    chosen_attention_mask,
+                    reject_input_ids,
+                    reject_attention_mask,
+                ) = (
+                    batch["chosen_input_ids"],
+                    batch["chosen_attention_mask"],
+                    batch["reject_input_ids"],
+                    batch["reject_attention_mask"],
+                )
+
+                chosen_reward = self.model(chosen_input_ids, attention_mask=chosen_attention_mask)
+                reject_reward = self.model(reject_input_ids, attention_mask=reject_attention_mask)
+                loss = self.loss_fn(chosen_reward, reject_reward).mean()
+
+                # sync
+                loss_mean = all_reduce_mean(tensor=loss)
+                chosen_rewards_mean = all_reduce_mean(tensor=chosen_reward)
+                rejected_rewards_mean = all_reduce_mean(tensor=reject_reward)
+                self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+                self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+                self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+
+                step_bar.update()
+
+            msg = "Evaluation Result:\n"
+            for tag in ["loss", "chosen_rewards", "rejected_rewards"]:
+                msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
+            msg = (
+                msg
+                + f"distance: {self.accumulative_meter.get('chosen_rewards')-self.accumulative_meter.get('rejected_rewards')}\n"
+            )
+            self.coordinator.print_on_master(msg)
+            with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
+                f.write(msg)
+            step_bar.close()
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
index 7d0eeec897e5..1a69833591da 100644
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -1,16 +1,19 @@
+import os
 from typing import Optional
 
 import torch
-import torch.distributed as dist
-import tqdm
+from coati.models import save_checkpoint
+from coati.trainer.utils import all_reduce_mean
+from coati.utils import AccumulativeMeanMeter
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
+from tqdm import trange
 
-from colossalai.logging import DistributedLogger
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
 
 from .base import SLTrainer
-from .strategies import GeminiStrategy, Strategy
 from .utils import is_rank_0, to_device
 
 
@@ -30,75 +33,31 @@ class SFTTrainer(SLTrainer):
     def __init__(
         self,
         model,
-        strategy: Strategy,
+        booster: Booster,
         optim: Optimizer,
         lr_scheduler: _LRScheduler,
         max_epochs: int = 2,
         accumulation_steps: int = 8,
+        start_epoch=0,
+        save_interval: int = None,
+        save_dir: str = None,
+        coordinator: Optional[DistCoordinator] = None,
     ) -> None:
-        if accumulation_steps > 1:
-            assert not isinstance(
-                strategy, GeminiStrategy
-            ), "Accumulation steps are not supported in stage 3 of ColossalAI"
-
-        super().__init__(strategy, max_epochs, model, optim)
+        super().__init__(booster, max_epochs, model, optim, start_epoch=start_epoch)
 
         self.accumulation_steps = accumulation_steps
         self.scheduler = lr_scheduler
-
+        self.save_interval = save_interval
+        self.save_dir = save_dir
+        self.coordinator = coordinator
         self.num_train_step = 0
         self.num_eval_step = 0
-
-    def _train(self, epoch: int):
-        self.model.train()
-        step_bar = tqdm.trange(
-            len(self.train_dataloader) // self.accumulation_steps,
-            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
-            disable=not is_rank_0(),
-        )
-        for i, batch in enumerate(self.train_dataloader):
-            batch = to_device(batch, torch.cuda.current_device())
-            outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
-            loss = outputs.loss / self.accumulation_steps
-            self.total_loss += loss.item()
-            self.strategy.backward(loss, self.model, self.optimizer)
-            # gradient accumulation
-            if (i + 1) % self.accumulation_steps == 0:
-                self.strategy.optimizer_step(self.optimizer)
-                self.optimizer.zero_grad()
-                self.scheduler.step()
-                if self.writer:
-                    self.writer.add_scalar("train/loss", self.total_loss, self.num_train_step)
-                    self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step)
-                    self.num_train_step += 1
-                self.total_loss = 0
-                step_bar.update()
-        step_bar.close()
-
-    def _eval(self, epoch: int):
-        if self.eval_dataloader is not None:
-            self.model.eval()
-            with torch.no_grad():
-                loss_sum, num_seen = 0, 0
-                for batch in self.eval_dataloader:
-                    batch = to_device(batch, torch.cuda.current_device())
-                    outputs = self.model(
-                        batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"]
-                    )
-                    loss_sum += outputs.loss.item()
-                    num_seen += batch["input_ids"].size(0)
-                loss_mean = loss_sum / num_seen
-                if dist.get_rank() == 0:
-                    self.logger.info(f"Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}")
-                if self.writer:
-                    self.writer.add_scalar("eval/loss", loss_mean, self.num_eval_step)
-                    self.num_eval_step += 1
+        self.accumulative_meter = AccumulativeMeanMeter()
 
     def _before_fit(
         self,
         train_dataloader: DataLoader,
         eval_dataloader: Optional[DataLoader] = None,
-        logger: Optional[DistributedLogger] = None,
         log_dir: Optional[str] = None,
         use_wandb: bool = False,
     ):
@@ -110,7 +69,6 @@ def _before_fit(
         self.train_dataloader = train_dataloader
         self.eval_dataloader = eval_dataloader
 
-        self.logger = logger
         self.writer = None
         if use_wandb and is_rank_0():
             assert log_dir is not None, "log_dir must be provided when use_wandb is True"
@@ -127,4 +85,81 @@ def _before_fit(
             log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
             self.writer = SummaryWriter(log_dir=log_dir)
 
-        self.total_loss = 0
+    def _train(self, epoch: int):
+        self.model.train()
+        step_bar = trange(
+            len(self.train_dataloader) // self.accumulation_steps,
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
+        )
+        for i, batch in enumerate(self.train_dataloader):
+            batch = to_device(batch, torch.cuda.current_device())
+            batch_size = batch["input_ids"].size(0)
+            outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
+            loss = outputs.loss
+            loss_mean = all_reduce_mean(tensor=loss)
+            self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+            self.booster.backward(loss=loss, optimizer=self.optimizer)
+
+            # gradient accumulation
+            if (i + 1) % self.accumulation_steps == 0:
+                self.optimizer.step()
+                self.scheduler.step()
+                self.optimizer.zero_grad()
+
+                if self.writer:
+                    self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
+                    self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step)
+                    self.num_train_step += 1
+                self.accumulative_meter.reset()
+                step_bar.update()
+
+            # save checkpoint
+            if (
+                self.save_dir is not None
+                and self.save_interval is not None
+                and (self.save_interval and (i + 1) % (self.save_interval * self.accumulation_steps) == 0)
+                or (i + 1) == len(self.train_dataloader)
+            ):
+                save_checkpoint(
+                    save_dir=self.save_dir,
+                    booster=self.booster,
+                    model=self.model,
+                    optimizer=self.optimizer,
+                    lr_scheduler=self.scheduler,
+                    epoch=epoch,
+                    step=i + 1,
+                    batch_size=batch_size,
+                    coordinator=self.coordinator,
+                )
+                self.coordinator.print_on_master(
+                    f"Saved checkpoint at epoch {epoch} step {i + 1} at folder {self.save_dir}"
+                )
+        step_bar.close()
+
+    def _eval(self, epoch: int):
+        if self.eval_dataloader is None:
+            self.coordinator.print_on_master("No eval dataloader is provided, skip evaluation")
+            return
+        self.accumulative_meter.reset()
+        self.model.eval()
+        with torch.no_grad():
+            step_bar = trange(
+                len(self.eval_dataloader),
+                desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+                disable=not is_rank_0(),
+            )
+            for batch in self.eval_dataloader:
+                batch = to_device(batch, torch.cuda.current_device())
+                outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
+                self.accumulative_meter.add("loss", outputs.loss.item(), count_update=batch["input_ids"].size(0))
+                step_bar.update()
+            loss_mean = self.accumulative_meter.get("loss")
+            loss_mean = all_reduce_mean(tensor=loss_mean)
+            msg = "Evaluation Result:\n"
+            for tag in ["loss"]:
+                msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
+            self.coordinator.print_on_master(msg)
+            with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
+                f.write(msg)
+            step_bar.close()
diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py
index 7811e7365eeb..6993b8bb8750 100644
--- a/applications/Chat/coati/trainer/utils.py
+++ b/applications/Chat/coati/trainer/utils.py
@@ -48,3 +48,14 @@ def _to(t: Any):
         return t
 
     return tree_map(_to, x)
+
+
+def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
+    dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
+    tensor.div_(dist.get_world_size())
+    return tensor
+
+
+def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:
+    dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
+    return tensor
diff --git a/applications/Chat/coati/utils/__init__.py b/applications/Chat/coati/utils/__init__.py
new file mode 100644
index 000000000000..7e2c631a5081
--- /dev/null
+++ b/applications/Chat/coati/utils/__init__.py
@@ -0,0 +1,5 @@
+from .accumulative_meter import AccumulativeMeanMeter
+from .ckpt_io import load_checkpoint, save_checkpoint
+from .flash_attention_patch import replace_with_flash_attention
+
+__all__ = ["load_checkpoint", "save_checkpoint", "replace_with_flash_attention", "AccumulativeMeanMeter"]
diff --git a/applications/Chat/coati/utils/accumulative_meter.py b/applications/Chat/coati/utils/accumulative_meter.py
new file mode 100644
index 000000000000..0fa4af3cf980
--- /dev/null
+++ b/applications/Chat/coati/utils/accumulative_meter.py
@@ -0,0 +1,32 @@
+class AccumulativeMeanVariable:
+    def __init__(self):
+        self._sum = 0
+        self._count = 0
+
+    def add(self, value, count_update=1):
+        self._sum += value
+        self._count += count_update
+
+    def get(self):
+        return self._sum / self._count if self._count > 0 else 0
+
+    def reset(self):
+        self._sum = 0
+        self._count = 0
+
+
+class AccumulativeMeanMeter:
+    def __init__(self):
+        self.variable_dict = {}
+
+    def add(self, name, value, count_update=1):
+        if name not in self.variable_dict:
+            self.variable_dict[name] = AccumulativeMeanVariable()
+        self.variable_dict[name].add(value, count_update=count_update)
+
+    def get(self, name):
+        return self.variable_dict[name].get()
+
+    def reset(self):
+        for name in self.variable_dict:
+            self.variable_dict[name].reset()
diff --git a/applications/Chat/coati/utils/ckpt_io.py b/applications/Chat/coati/utils/ckpt_io.py
new file mode 100644
index 000000000000..05342ce41a60
--- /dev/null
+++ b/applications/Chat/coati/utils/ckpt_io.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Helper functions for IO
+"""
+
+import json
+import os
+from typing import Any, Dict, Tuple, Union
+
+import torch
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.optimizer import Optimizer
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+
+
+def load_json(file_path: Union[str, os.PathLike]) -> Dict[str, Any]:
+    """
+    Load file in JSON format
+    """
+    with open(file=file_path, mode="r", encoding="utf-8") as fp:
+        return json.load(fp)
+
+
+def save_json(data: Dict[str, Any], file_path: Union[str, os.PathLike]) -> None:
+    """
+    Save as JSON format
+    """
+    with open(file=file_path, mode="w", encoding="utf-8") as fp:
+        json.dump(data, fp=fp, ensure_ascii=False, indent=4)
+
+
+def save_checkpoint(
+    save_dir: Union[str, os.PathLike],
+    booster: Booster,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    lr_scheduler: _LRScheduler,
+    epoch: int,
+    step: int,
+    batch_size: int,
+    coordinator: DistCoordinator,
+) -> None:
+    """
+    Save model checkpoint, optimizer, LR scheduler and intermedidate running states.
+    """
+
+    save_dir = os.path.join(save_dir, f"epoch-{epoch}_step-{step}")
+    os.makedirs(os.path.join(save_dir, "modeling"), exist_ok=True)
+
+    booster.save_model(model, os.path.join(save_dir, "modeling"), shard=True)
+
+    booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True)
+    booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))
+    running_states = {
+        "epoch": epoch,
+        "step": step,
+        "sample_start_index": step * batch_size,
+    }
+    if coordinator.is_master():
+        save_json(running_states, os.path.join(save_dir, "running_states.json"))
+
+
+def load_checkpoint(
+    load_dir: Union[str, os.PathLike],
+    booster: Booster,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    lr_scheduler: _LRScheduler,
+) -> Tuple[int, int, int]:
+    """
+    Load model checkpoint, optimizer, LR scheduler and intermedidate running states.
+    """
+
+    # Update booster params states.
+    booster.load_model(model=model, checkpoint=os.path.join(load_dir, "modeling"))
+    booster.load_optimizer(optimizer=optimizer, checkpoint=os.path.join(load_dir, "optimizer"))
+    booster.load_lr_scheduler(lr_scheduler=lr_scheduler, checkpoint=os.path.join(load_dir, "lr_scheduler"))
+
+    running_states = load_json(file_path=os.path.join(load_dir, "running_states.json"))
+    return (
+        running_states["epoch"],
+        running_states["step"],
+        running_states["sample_start_index"],
+    )
diff --git a/applications/Chat/coati/utils/flash_attention_patch.py b/applications/Chat/coati/utils/flash_attention_patch.py
new file mode 100644
index 000000000000..3ae098201e2f
--- /dev/null
+++ b/applications/Chat/coati/utils/flash_attention_patch.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from types import MethodType
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from flash_attn.bert_padding import pad_input, unpad_input
+from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_varlen_kvpacked_func
+from flash_attn.ops.rms_norm import rms_norm
+from transformers.models.llama.modeling_llama import (
+    LlamaAttention,
+    LlamaForCausalLM,
+    LlamaModel,
+    LlamaRMSNorm,
+    apply_rotary_pos_emb,
+    repeat_kv,
+)
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def _prepare_decoder_attention_mask(
+    self: LlamaModel,
+    attention_mask: torch.BoolTensor,
+    input_shape: torch.Size,
+    inputs_embeds: torch.Tensor,
+    past_key_values_length: int,
+) -> Optional[torch.Tensor]:
+    """
+    Decoder attetion mask
+    """
+    if past_key_values_length > 0 and attention_mask is not None:
+        attention_mask = torch.cat(
+            tensors=(
+                torch.full(
+                    size=(input_shape[0], past_key_values_length),
+                    fill_value=True,
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                ),
+                attention_mask,
+            ),
+            dim=-1,
+        )  # (bsz, past_key_values_length + q_len)
+    if attention_mask is not None and torch.all(attention_mask):
+        return None  # Faster
+    return attention_mask
+
+
+def attention_forward(
+    self: LlamaAttention,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """
+    Re-define LLaMA-2 `LlamaAttention` forward method using flash-attention.
+    """
+    if output_attentions:
+        logger.warning(
+            "Argument `output_attentions` is not supported for flash-attention patched `LlamaAttention`, "
+            "return `None` instead."
+        )
+
+    bsz, q_len, _ = hidden_states.size()
+
+    if self.config.pretraining_tp > 1:
+        q_slicing, kv_slicing = (
+            dim // self.config.pretraining_tp
+            for dim in (
+                self.num_heads * self.head_dim,
+                self.num_key_value_heads * self.head_dim,
+            )
+        )  # `Tuple[int, int]`
+        q_slices, k_slices, v_slices = (
+            proj.weight.split(slicing, dim=0)
+            for proj, slicing in (
+                (self.q_proj, q_slicing),
+                (self.k_proj, kv_slicing),
+                (self.v_proj, kv_slicing),
+            )
+        )  # Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor], Tuple[torch.Tensor]]
+        q, k, v = (
+            torch.cat(
+                [F.linear(hidden_states, slices[i]) for i in range(self.config.pretraining_tp)],
+                dim=-1,
+            )
+            for slices in (q_slices, k_slices, v_slices)
+        )
+        # `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]` of shape:
+        # (bsz, q_len, num_heads * head_dim),
+        # (bsz, q_len, num_key_value_heads * head_dim),
+        # (bsz, q_len, num_key_value_heads * head_dim)
+    else:
+        q, k, v = (proj(hidden_states) for proj in (self.q_proj, self.k_proj, self.v_proj))
+        # `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]` of shape:
+        # (bsz, q_len, num_heads * head_dim),
+        # (bsz, q_len, num_key_value_heads * head_dim),
+        # (bsz, q_len, num_key_value_heads * head_dim)
+
+    # (bsz, q_len, num_heads * head_dim) -> (bsz, num_heads, q_len, head_dim);
+    # (bsz, q_len, num_key_value_heads * head_dim) -> (bsz, num_key_value_heads, q_len, head_dim);
+    # (bsz, q_len, num_key_value_heads * head_dim) -> (bsz, num_key_value_heads, q_len, head_dim)
+    q, k, v = (
+        states.view(bsz, q_len, num_heads, self.head_dim).transpose(1, 2)
+        for states, num_heads in (
+            (q, self.num_heads),
+            (k, self.num_key_value_heads),
+            (v, self.num_key_value_heads),
+        )
+    )
+    kv_len = k.shape[-2]  # initially, `kv_len` == `q_len`
+    past_kv_len = 0
+    if past_key_value is not None:
+        # if `past_key_value` is not None, `kv_len` > `q_len`.
+        past_kv_len = past_key_value[0].shape[-2]
+        kv_len += past_kv_len
+
+    # two `torch.Tensor` objs of shape (1, 1, kv_len, head_dim)
+    cos, sin = self.rotary_emb(v, seq_len=kv_len)
+    # (bsz, num_heads, q_len, head_dim), (bsz, num_key_value_heads, q_len, head_dim)
+    q, k = apply_rotary_pos_emb(q=q, k=k, cos=cos, sin=sin, position_ids=position_ids)
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        k = torch.cat([past_key_value[0], k], dim=2)
+        v = torch.cat([past_key_value[1], v], dim=2)
+
+    past_key_value = (k, v) if use_cache else None
+
+    # repeat k/v heads if n_kv_heads < n_heads
+    k = repeat_kv(hidden_states=k, n_rep=self.num_key_value_groups)
+    # (bsz, num_key_value_heads, q_len, head_dim) -> (bsz, num_heads, q_len, head_dim)
+    v = repeat_kv(hidden_states=v, n_rep=self.num_key_value_groups)
+    # (bsz, num_key_value_heads, q_len, head_dim) -> (bsz, num_heads, q_len, head_dim)
+
+    key_padding_mask = attention_mask
+    # (bsz, num_heads, q_len, head_dim) -> (bsz, q_len, num_heads, head_dim)
+    q, k, v = (states.transpose(1, 2) for states in (q, k, v))
+
+    if past_kv_len > 0:
+        q = torch.cat(
+            tensors=(
+                torch.full(
+                    size=(bsz, past_kv_len, self.num_heads, self.head_dim),
+                    fill_value=0.0,
+                    dtype=q.dtype,
+                    device=q.device,
+                ),
+                q,
+            ),
+            dim=1,
+        )  # (bsz, past_kv_len + q_len, num_heads, head_dim)
+
+    if key_padding_mask is None:
+        # (bsz, past_kv_len + q_len, num_heads, head_dim)
+        output = flash_attn_func(q=q, k=k, v=v, dropout_p=0.0, softmax_scale=None, causal=True)  # (bsz, )
+        output = rearrange(output, pattern="... h d -> ... (h d)")  # (bsz, past_kv_len + q_len, num_heads * head_dim)
+    else:
+        q, indices, cu_q_lens, max_q_len = unpad_input(hidden_states=q, attention_mask=key_padding_mask)
+        kv, _, cu_kv_lens, max_kv_len = unpad_input(
+            hidden_states=torch.stack(tensors=(k, v), dim=2),
+            attention_mask=key_padding_mask,
+        )
+        output_unpad = flash_attn_varlen_kvpacked_func(
+            q=q,
+            kv=kv,
+            cu_seqlens_q=cu_q_lens,
+            cu_seqlens_k=cu_kv_lens,
+            max_seqlen_q=max_q_len,
+            max_seqlen_k=max_kv_len,
+            dropout_p=0.0,
+            softmax_scale=None,
+            causal=True,
+        )
+        output = pad_input(
+            hidden_states=rearrange(output_unpad, pattern="nnz h d -> nnz (h d)"),
+            indices=indices,
+            batch=bsz,
+            seqlen=past_kv_len + q_len,
+        )  # (bsz, past_kv_len + q_len, num_heads * head_dim)
+
+    if past_kv_len > 0:
+        # Strip off the zero query outputs.
+        output = output[:, past_kv_len:, ...]  # (bsz, q_len, num_heads * head_dim)
+    output = self.o_proj(output)  # (bsz, q_len, hidden_size)
+    return output, None, past_key_value
+
+
+def rms_norm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor) -> torch.Tensor:
+    """
+    Formard function for RMS Norm
+    """
+    return rms_norm(x=hidden_states, weight=self.weight, epsilon=self.variance_epsilon)
+
+
+def replace_with_flash_attention(model: LlamaForCausalLM) -> None:
+    for name, module in model.named_modules():
+        if isinstance(module, LlamaAttention):
+            module.forward = MethodType(attention_forward, module)
+        if isinstance(module, LlamaModel):
+            module._prepare_decoder_attention_mask = MethodType(_prepare_decoder_attention_mask, module)
+        if isinstance(module, LlamaRMSNorm):
+            module.forward = MethodType(rms_norm_forward, module)
diff --git a/applications/Chat/examples/=2.0.0, b/applications/Chat/examples/=2.0.0,
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md
index 9438aafd1268..b626f3fb4c6a 100644
--- a/applications/Chat/examples/README.md
+++ b/applications/Chat/examples/README.md
@@ -4,17 +4,19 @@
 
 - [Examples](#examples)
   - [Table of Contents](#table-of-contents)
-  - [Install requirements](#install-requirements)
-  - [Supervised datasets collection](#supervised-datasets-collection)
-    - [Conversation dataset generation](#conversation-dataset-generation)
-  - [Stage1 - Supervised instructs tuning](#stage1---supervised-instructs-tuning)
-    - [Arg List](#arg-list)
-  - [Stage2 - Training reward model](#stage2---training-reward-model)
-    - [Features and tricks in RM training](#features-and-tricks-in-rm-training)
-    - [Experiment result](#experiment-result)
-    - [Arg List](#arg-list-1)
-  - [Stage3 - Training model using prompts with RL](#stage3---training-model-using-prompts-with-rl)
-    - [Arg List](#arg-list-2)
+  - [Install Requirements](#install-requirements)
+  - [Supervised Datasets Collection](#supervised-datasets-collection)
+    - [Conversation Dataset Generation](#conversation-dataset-generation)
+  - [Task I: Supervised Instruction Tuning](#task-i-supervised-instructs-tuning)
+  - [Task II: Reinforcement Learning from Human Feedback](#task-ii-reinforcement-learning-from-human-feedback)
+    - [Stage1 - Supervised instructs tuning](#stage1---supervised-instructs-tuning)
+      - [Arg List](#arg-list)
+    - [Stage2 - Training reward model](#stage2---training-reward-model)
+      - [Features and tricks in RM training](#features-and-tricks-in-rm-training)
+      - [Experiment result](#experiment-result)
+      - [Arg List](#arg-list-1)
+    - [Stage3 - Training model using prompts with RL](#stage3---training-model-using-prompts-with-rl)
+      - [Arg List](#arg-list-2)
   - [Inference example - After Stage3](#inference-example---after-stage3)
   - [Attention](#attention)
     - [data](#data)
@@ -36,6 +38,29 @@
 pip install -r requirements.txt
 ```
 
+## Get Start with ColossalRun
+
+You can use colossalai run to launch multi-nodes training:
+```
+colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \
+train.py --OTHER_CONFIGURATIONS
+```
+Here is a sample hostfile:
+
+```
+hostname1
+hostname2
+hostname3
+hostname4
+```
+
+Make sure master node can access all nodes (including itself) by ssh without password. Here are some other arguments.
+
+- nnodes: number of nodes used in the training
+- nproc-per-node: specifies the number of processes to be launched per node
+- rdzv-endpoint: address of the host node
+
+
 ## Supervised datasets collection
 
 We collected 104K bilingual datasets of Chinese and English, and you can find the datasets in this repo
@@ -92,9 +117,9 @@ python generate_conversation_dataset.py \
     --save_path "/path/to/dataset"
 ```
 
-## Stage1 - Supervised instructs tuning
+## Task I: Supervised Instructs Tuning
 
-Stage1 is supervised instructs fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
+In the task of supervised instructs fine-tuning, we will uses the datasets mentioned earlier to fine-tune the model.
 [[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
 
 You can run the `examples/train_sft.sh` to start a supervised instructs fine-tuning.
@@ -102,7 +127,7 @@ You can run the `examples/train_sft.sh` to start a supervised instructs fine-tun
 You can also use the following cmd to start a supervised instructs fine-tuning with your own settings.
 
 ```bash
-torchrun --standalone --nproc_per_node=4 train_sft.py \
+colossalai run --nproc_per_node 1 --hostfile ./hostfile train_sft.py \
     --pretrain "/path/to/LLaMa-7B/" \
     --model 'llama' \
     --strategy colossalai_zero2 \
@@ -113,7 +138,8 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
-    --grad_checkpoint
+    --grad_checkpoint \
+    --use_wandb
 ```
 
 **Note**: the supervised dataset follows the following format,
@@ -131,7 +157,6 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
 ```
 
 ### Arg List
-
 - `--strategy`: the strategy using for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
 - `--model`: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
 - `--pretrain`: pretrain model, type=str, default=None
@@ -142,8 +167,40 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
 - `--batch_size`: batch size while training, type=int, default=4
 - `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
 - `--grad_checkpoint`: enable gradient checkpointing, type=bool, default=False
+- `use_wandb`: whether to use [wandb](https://wandb.ai/site)
+
+## Task II: Reinforcement Learning from Human Feedback
+### Stage1 - Supervised Instructs Tuning
+
+The first stage of RLHF is supervised instructs fine-tuning (SFT). This stage is basically the same as the first task, which uses the same datasets but with different prompt format.
 
-## Stage2 - Training reward model
+You can run the `examples/train_rlhf_sft.sh` to start a supervised instructs fine-tuning.
+
+You can also use the following cmd to start a supervised instructs fine-tuning with your own settings.
+
+```bash
+colossalai run --nproc_per_node 1 --hostfile ./hostfile train_rlhf_sft.py \
+    --pretrain "gpt2" \
+    --model 'gpt2' \
+    --strategy colossalai_zero2 \
+    --save_path 'path to a directory where you want to stre the weights of the model' \
+    --dataset 'path to your dataset, which should be a json file' \
+    --batch_size 4 \
+    --accumulation_steps 8 \
+    --lr 2e-5 \
+    --max_datasets_size 60000 \
+    --max_epochs 1 \
+    --use_wandb
+```
+
+**Note**: the supervised dataset follows the same format as in Task I.
+
+### Arg List
+
+The same as in Task I.
+
+
+### Stage2 - Training reward model
 
 We train a reward model in stage 2, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
 [[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
@@ -153,7 +210,7 @@ You can run the `examples/train_rm.sh` to start a reward model training.
 You can also use the following cmd to start training a reward model.
 
 ```bash
-torchrun --standalone --nproc_per_node=4 train_reward_model.py \
+colossalai run --nproc_per_node 1 --hostfile ./hostfile train_reward_model.py \
     --pretrain "/path/to/LLaMa-7B/" \
     --model 'llama' \
     --strategy colossalai_zero2 \
@@ -198,6 +255,20 @@ Model performance in [Anthropics paper](https://arxiv.org/abs/2204.05862):
 - `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
 - `--loss_func`: which kind of loss function, choices=['log_sig', 'log_exp']
 - `--max_len`: max sentence length for generation, type=int, default=512
+- `--use_wandb`: whether to use wandb
+
+
+### Note on Reward Model Training
+
+Before you move on the next stage, please check the following list to ensure that your reward model is stable and robust. You can check the reward chart and the accuracy chart on wandb.
+- The mean reward for chosen data is much higher than those for rejected data
+- The accuracy is larger than 0.5 by a significant margin (usually should be greater than 0.6)
+- Optional：check the reward is positive for chosen data vice versa
+
+Your training reward curves should look similar to the following charts.
+<p align="center">
+<img width="1000" alt="image" src="https://raw.githubusercontent.com/YeAnbang/imagehostingrepo/main/mean_reward_chart.png">
+</p>
 
 ## Stage3 - Training model using prompts with RL
 
@@ -212,17 +283,26 @@ You can run the `examples/train_prompts.sh` to start PPO training.
 You can also use the cmd following to start PPO training.
 [[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)
 
+
+PPO Training Script
 ```bash
-torchrun --standalone --nproc_per_node=4 train_prompts.py \
-    --pretrain "/path/to/LLaMa-7B/" \
-    --model 'llama' \
+colossalai run --nproc_per_node 1 --hostfile ./hostfile train_prompts.py \
+    --pretrain_dataset 'path to sft dataset used in stage 1'  \
+    --prompt_dataset 'dataset that contains prompt (queries) for PPO training' \
     --strategy colossalai_zero2 \
-    --prompt_dataset /path/to/your/prompt_dataset \
-    --pretrain_dataset /path/to/your/pretrain_dataset \
-    --rm_pretrain /your/pretrain/rm/definition \
-    --rm_path /your/rm/model/path
-```
+    --num_episodes 8000 --num_collect_steps 1 --num_update_steps 1 \
+    --experience_batch_size 32 \
+    --train_batch_size 32 \
+    --save_path 'path to save the trained model' \
+    --ptx_coef 0.0 \
+    --rm_model 'gpt2' \
+    --rm_pretrain 'gpt2' \
+    --rm_path 'path to reward model trained in stage 2' \
+    --reward_model_tokenizer 'gpt2' \
+    --pretrain '/home/lcyab/data/Anthropic_rlhf/actor/pretrain_v3' \
+    --use_wandb
 
+```
 Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/generate_prompt_dataset.py) which samples `instinwild_en.json` or `instinwild_ch.json` in [InstructionWild](https://github.com/XueFuzhao/InstructionWild/tree/main/data#instructwild-data) to generate the prompt dataset.
 Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning.
 
@@ -257,6 +337,29 @@ Pretrain dataset: the pretrain dataset including the instruction and correspondi
       ...
   ]
   ```
+### Sample Training Results Using Default Script
+#### Reward
+<p align="center">
+<img width="700" alt="image" src="https://raw.githubusercontent.com/YeAnbang/imagehostingrepo/main/reward.png">
+</p>
+
+#### Approximate KL Divergence
+<p align="center">
+<img width="700" alt="image" src="https://raw.githubusercontent.com/YeAnbang/imagehostingrepo/main/KL.png">
+</p>
+
+### Note on PPO Training
+#### Q1: My reward is nagtive
+Answer: Check your reward model trained in stage 1. If the reward model only generate negative reward, we actually will expect a negative reward. However, even though the reward is negative, the reward should go up.
+
+#### Q2: My actor loss is negative
+Answer: This is normal for actor loss as PPO doesn't restrict the actor loss to be positive.
+
+#### Q3: My reward doesn't go up (decreases)
+Answer: The causes to this problem are two-fold. Check your reward model, make sure that it gives positive and strong reward for good cases and negative, strong reward for bad responses. You should also try different hyperparameter settings.
+
+#### Q4: Generation is garbage
+Answer: Yes, this happens and is well documented by other implementations. After training for too many episodes, the actor gradually deviate from its original state, which may leads to decrease in language modeling capabilities. A way to fix this is to add suppervised loss during PPO. Set ptx_coef to a none-zero value (between 0 and 1), which balances PPO loss and sft loss.
 
 ### Arg List
 
@@ -279,6 +382,7 @@ Pretrain dataset: the pretrain dataset including the instruction and correspondi
 - `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
 - `--kl_coef`: kl_coef using for computing reward, type=float, default=0.1
 - `--ptx_coef`: ptx_coef using for computing policy loss, type=float, default=0.9
+- `--use_wandb`
 
 ## Inference example - After Stage3
 
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index 1dd9ffcdf1cd..0b174297aaef 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -24,7 +24,9 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+        strategy = GeminiStrategy(
+            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
+        )
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
@@ -118,7 +120,7 @@ def main(args):
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == "llama":
         tokenizer = LlamaTokenizer.from_pretrained(args.pretrain)
-        tokenizer.eos_token = "</s>"
+        tokenizer.eos_token = "<\s>"
         tokenizer.pad_token = tokenizer.unk_token
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index 6d395deadd0e..3bbef7208374 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -68,7 +68,7 @@ def train(args):
             padding_side="right",
             use_fast=False,
         )
-        tokenizer.eos_token = "</s>"
+        tokenizer.eos_token = "<\s>"
         tokenizer.pad_token = tokenizer.unk_token
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
diff --git a/applications/Chat/examples/config_file b/applications/Chat/examples/config_file
new file mode 100644
index 000000000000..664cf3d34af7
--- /dev/null
+++ b/applications/Chat/examples/config_file
@@ -0,0 +1,27 @@
+{
+    "plugin": "gemini",
+    "grad_clip": 1.0,
+    "weight_decay": 0.1,
+    "warmup_steps": null,
+    "tp": 1,
+    "zero": 1,
+    "pretrain": null,
+    "dataset": [],
+    "checkpoint_path": null,
+    "save_path": "output",
+    "max_epochs": 3,
+    "batch_size": 4,
+    "max_len": 512,
+    "mixed_precision": "fp16",
+    "lora_rank": 0,
+    "lora_train_bias": "none",
+    "save_interval": 1000,
+    "merge_lora_weights": true,
+    "lr": 5e-06,
+    "config_file": "config_file",
+    "accumulation_steps": 8,
+    "log_dir": "logs",
+    "use_wandb": false,
+    "grad_checkpoint": false,
+    "use_flash_attn": false
+}
diff --git a/applications/Chat/examples/hostfile b/applications/Chat/examples/hostfile
new file mode 100644
index 000000000000..91af58fb794c
--- /dev/null
+++ b/applications/Chat/examples/hostfile
@@ -0,0 +1 @@
+192.168.0.36
diff --git a/applications/Chat/examples/inference.py b/applications/Chat/examples/inference.py
index 9df8649d9c61..62e06bf7b3bb 100644
--- a/applications/Chat/examples/inference.py
+++ b/applications/Chat/examples/inference.py
@@ -39,7 +39,7 @@ def eval(args):
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == "llama":
         tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        tokenizer.eos_token = "</s>"
+        tokenizer.eos_token = "<\s>"
         tokenizer.pad_token = tokenizer.unk_token
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
diff --git a/applications/Chat/examples/out.txt b/applications/Chat/examples/out.txt
new file mode 100644
index 000000000000..37b12d27dad4
--- /dev/null
+++ b/applications/Chat/examples/out.txt
@@ -0,0 +1,1319 @@
+GPU Memory Usage:
+Now CUDA_VISIBLE_DEVICES is set to:
+CUDA_VISIBLE_DEVICES=0
+[11/16/23 11:17:06] INFO     colossalai - colossalai - INFO:
+                             /home/lcyab/data/anaconda/envs/colossal-llama2/lib/
+                             python3.8/site-packages/colossalai/initialize.py:63
+                              launch
+                    INFO     colossalai - colossalai - INFO: Distributed
+                             environment is initialized, world size: 1
+trainable params: 8110080 || all params: 567324672 || trainable%: 1.43
+BloomForCausalLM(
+  (transformer): BloomModel(
+    (word_embeddings): Embedding(250880, 1024)
+    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+    (h): ModuleList(
+      (0): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (1): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (2): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (3): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (4): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (5): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (6): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (7): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (8): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (9): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (10): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (11): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (12): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (13): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (14): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (15): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (16): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (17): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (18): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (19): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (20): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (21): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (22): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+      (23): BloomBlock(
+        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (self_attention): BloomAttention(
+          (query_key_value): Linear(
+            in_features=1024, out_features=3072, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=3072, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (dense): Linear(
+            in_features=1024, out_features=1024, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=1024, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+        (mlp): BloomMLP(
+          (dense_h_to_4h): Linear(
+            in_features=1024, out_features=4096, bias=True
+            (lora_dropout): ModuleDict(
+              (default): Dropout(p=0.1, inplace=False)
+            )
+            (lora_A): ModuleDict(
+              (default): Linear(in_features=1024, out_features=30, bias=False)
+            )
+            (lora_B): ModuleDict(
+              (default): Linear(in_features=30, out_features=4096, bias=False)
+            )
+            (lora_embedding_A): ParameterDict()
+            (lora_embedding_B): ParameterDict()
+          )
+          (gelu_impl): BloomGelu()
+          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
+        )
+      )
+    )
+    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
+  )
+  (lm_head): Linear(in_features=1024, out_features=250880, bias=False)
+)
+Gradient checkpointing enabled successfully
+Loading data...
+Loaded 52002 examples.
+Limiting dataset to 200 examples.
+Formatting inputs...
+Tokenizing inputs... This may take some time...
+Loaded dataset.
+Warmup steps is set to 0
+Booster init max CUDA memory: 1113.03 MB
+Booster init max CPU memory: 4792.71 MB
+Error: failed to run torchrun --nproc_per_node=1 --nnodes=1 --node_rank=0 --master_addr=127.0.0.1 --master_port=28535 train_sft.py --pretrain bigscience/bloom-560m --plugin zero2 --save_path /home/lcyab/data/test_folder/model_checkpoint/gpt2 --dataset tatsu-lab/alpaca --batch_size 4 --max_epochs 1 --max_datasets_size 200 --accumulation_steps 8 --lr 2e-2 --lora_rank 30 --max_len 512 --max_epochs 1 --grad_checkpoint on 192.168.0.32, is localhost: False, exception: Encountered a bad command exit code!
+
+Command: 'cd /mnt/vepfs/lcyab/ColossalAI/applications/Chat/examples && export SHELL="/bin/bash" COLORTERM="truecolor" TERM_PROGRAM_VERSION="1.84.1" CONDA_EXE="/home/lcyab/data/anaconda/bin/conda" _LMFILES__modshare="/mnt/vepfs/modulefiles/cuda/11.7:1:/mnt/vepfs/modulefiles/clash:1" ENV="/usr/share/modules/init/profile.sh" PWD="/home/lcyab/data/ColossalAI/applications/Chat/examples" LOGNAME="lcyab" XDG_SESSION_TYPE="tty" CONDA_PREFIX="/home/lcyab/data/anaconda/envs/colossal-llama2" MODULESHOME="/usr/share/modules" MANPATH=":" C_INCLUDE_PATH_modshare="/mnt/vepfs/softwares/cuda-toolkits/11.7/include:1" MOTD_SHOWN="pam" HOME="/home/lcyab" LANG="en_US.UTF-8" LS_COLORS="rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:" LD_LIBRARY_PATH_modshare="/mnt/vepfs/softwares/cuda-toolkits/11.7/lib64:1" CONDA_PROMPT_MODIFIER="(colossal-llama2) " https_proxy="http://192.168.0.32:7890" SSH_CONNECTION="223.25.80.122 57221 192.168.0.36 22" PATH_modshare="/usr/bin:1:/home/lcyab/.local/bin:1:/usr/local/bin:1:/mnt/vepfs/softwares/cuda-toolkits/11.7/bin:1:/home/lcyab/data/anaconda/envs/colossal-llama2/bin:1:/home/lcyab/.vscode-server/bin/2b35e1e6d88f1ce073683991d1eff5284a32690f/bin/remote-cli:1:/bin:1:/snap/bin:1:/sbin:1:/home/lcyab/data/anaconda/condabin:1:/usr/sbin:1:/usr/games:1:/usr/local/sbin:1:/usr/local/games:1" CUDA_VISIBLE_DEVICES="0" LESSCLOSE="/usr/bin/lesspipe %s %s" MODULEPATH_modshare="/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1" XDG_SESSION_CLASS="user" LOADEDMODULES_modshare="cuda/11.7:1:clash:1" TERM="xterm-256color" CPLUS_INCLUDE_PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/include" LESSOPEN="| /usr/bin/lesspipe %s" USER="lcyab" VSCODE_GIT_IPC_HANDLE="/run/user/1002/vscode-git-3a0159ec83.sock" CONDA_SHLVL="3" LOADEDMODULES="clash:cuda/11.7" NCCL_IB_TIMEOUT="23" SHLVL="2" BASH_ENV="/usr/share/modules/init/bash" XDG_SESSION_ID="6941" http_proxy="http://192.168.0.32:7890" CONDA_PYTHON_EXE="/home/lcyab/data/anaconda/bin/python" LD_LIBRARY_PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/lib64" CPLUS_INCLUDE_PATH_modshare="/mnt/vepfs/softwares/cuda-toolkits/11.7/include:1" XDG_RUNTIME_DIR="/run/user/1002" SSH_CLIENT="223.25.80.122 57221 22" CONDA_DEFAULT_ENV="colossal-llama2" CUDA_HOME="/mnt/vepfs/softwares/cuda-toolkits/11.7" BROWSER="/home/lcyab/.vscode-server/bin/2b35e1e6d88f1ce073683991d1eff5284a32690f/bin/helpers/browser.sh" PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/bin:/home/lcyab/.vscode-server/bin/2b35e1e6d88f1ce073683991d1eff5284a32690f/bin/remote-cli:/home/lcyab/.local/bin:/home/lcyab/data/anaconda/envs/colossal-llama2/bin:/home/lcyab/data/anaconda/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin" MODULEPATH="/mnt/vepfs/modulefiles:/mnt/vepfs/modulefiles:/mnt/vepfs/modulefiles" NCCL_IB_RETRY_CNT="7" _LMFILES_="/mnt/vepfs/modulefiles/clash:/mnt/vepfs/modulefiles/cuda/11.7" DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1002/bus" C_INCLUDE_PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/include" CONDA_PREFIX_1="/home/lcyab/data/anaconda" CONDA_PREFIX_2="/home/lcyab/data/anaconda/envs/colossalqa" OLDPWD="/home/lcyab" MODULES_CMD="/usr/lib/x86_64-linux-gnu/modulecmd.tcl" TERM_PROGRAM="vscode" VSCODE_IPC_HOOK_CLI="/run/user/1002/vscode-ipc-6a17d9e7-4845-4f65-b164-b8e69318ffe4.sock" _="/home/lcyab/data/anaconda/envs/colossal-llama2/bin/colossalai" && torchrun --nproc_per_node=1 --nnodes=1 --node_rank=0 --master_addr=127.0.0.1 --master_port=28535 train_sft.py --pretrain bigscience/bloom-560m --plugin zero2 --save_path /home/lcyab/data/test_folder/model_checkpoint/gpt2 --dataset tatsu-lab/alpaca --batch_size 4 --max_epochs 1 --max_datasets_size 200 --accumulation_steps 8 --lr 2e-2 --lora_rank 30 --max_len 512 --max_epochs 1 --grad_checkpoint'
+
+Exit code: 1
+
+Stdout: already printed
+
+Stderr: already printed
+
+
+
+====== Training on All Nodes =====
+192.168.0.32: failure
+
+====== Stopping All Nodes =====
+192.168.0.32: finish
diff --git a/applications/Chat/examples/train_dpo.py b/applications/Chat/examples/train_dpo.py
new file mode 100644
index 000000000000..a7b2af36be6e
--- /dev/null
+++ b/applications/Chat/examples/train_dpo.py
@@ -0,0 +1,306 @@
+import argparse
+import json
+import math
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+from coati.dataset import (
+    DataCollatorForPreferenceDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from coati.models import convert_to_lora_module
+from coati.trainer import DPOTrainer
+from coati.utils import load_checkpoint, replace_with_flash_attention
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+
+def train(args):
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+    ref_booster = Booster(plugin=plugin)
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = (
+        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+    )
+    with init_ctx:
+        model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+
+        # debug tiny model
+        # model = transformers.LlamaForCausalLM(
+        #     transformers.LlamaConfig(hidden_size=512, intermediate_size=1536, num_attention_heads=8, num_hidden_layers=4
+        #     )
+        # )
+        # ref_model = transformers.LlamaForCausalLM(
+        #     transformers.LlamaConfig(hidden_size=512, intermediate_size=1536, num_attention_heads=8, num_hidden_layers=4
+        #     )
+        # )
+
+        # TODO: set dropout to 0 here
+        # for llama2, dropout is 0 by default, hence skip.
+        if args.lora_rank > 0:
+            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        model.gradient_checkpointing_enable()
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=model)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer.padding_side = "right"
+    tokenizer.pad_token = tokenizer.eos_token
+    coordinator.print_on_master(
+        f"Tokenizer pad token: {tokenizer.pad_token}, Tokenizer padding side: {tokenizer.padding_side}"
+    )
+
+    # test_res = model.generate(tokenizer.encode("tell a story about a cat.\n", return_tensors='pt'),
+    #                           max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.9)
+    # coordinator.print_on_master(f"Test generate: {tokenizer.decode(test_res[0])}")
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+    train_dataloader = setup_distributed_dataloader(
+        dataset=train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        use_tp=args.tp > 1,
+    )
+
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    math.ceil(args.max_epochs * num_update_steps_per_epoch)
+
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
+    )
+
+    # test_res = model.generate(tokenizer.encode("tell a story about a cat.\n", return_tensors='pt').to(get_current_device()),
+    #                           max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.9)
+    # coordinator.print_on_master(f"Test generate: {tokenizer.decode(test_res[0])}")
+
+    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader)
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    trainer = DPOTrainer(
+        actor=model,
+        ref_model=ref_model,
+        booster=booster,
+        actor_optim=optim,
+        actor_lr_scheduler=lr_scheduler,
+        tokenizer=tokenizer,
+        max_epochs=args.max_epochs,
+        accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_dir,
+        coordinator=coordinator,
+    )
+
+    get_dist_logger()
+    trainer.fit(
+        train_preference_dataloader=train_dataloader,
+        eval_preference_dataloader=None,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if args.lora_rank > 0 and args.merge_lora_weights:
+        from coati.models.lora import LORA_MANAGER
+
+        # NOTE: set model to eval to merge LoRA weights
+        LORA_MANAGER.merge_weights = True
+        model.eval()
+    # save model checkpoint after fitting on only rank0
+    coordinator.print_on_master("Start saving final model checkpoint")
+    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--zero", type=int, default=0)
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
+    )
+    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+    parser.add_argument("--save_dir", type=str, default="output")
+    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
+    parser.add_argument("--max_epochs", type=int, default=3)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument(
+        "--lora_train_bias",
+        type=str,
+        default="none",
+        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
+    )
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
+    parser.add_argument("--merge_lora_weights", type=bool, default=True)
+    parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+    with open(args.config_file, "w") as f:
+        json.dump(args.__dict__, f, indent=4)
+    train(args)
diff --git a/applications/Chat/examples/train_dpo.sh b/applications/Chat/examples/train_dpo.sh
new file mode 100755
index 000000000000..15b919a09e03
--- /dev/null
+++ b/applications/Chat/examples/train_dpo.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 8
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+
+PROJECT_NAME="llama2-dpo"
+PARENT_SAVE_DIR="./output/ckpt"
+PARENT_TENSORBOARD_DIR="./output/tensorboard"
+PARENT_CONFIG_FILE="./output/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+declare -a dataset=(
+    /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00000
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00001
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00002
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00003
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00004
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00005
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00006
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00007
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00008
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00009
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+
+colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_dpo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --checkpoint_path $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "3d" \
+    --save_interval 500 \
+    --save_dir $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --max_epochs 5 \
+    --accumulation_steps 4 \
+    --batch_size 4 \
+    --tp 8 \
+    --lr 5e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --grad_checkpoint \
+    --use_flash_attn \
+    --use_wandb
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_ppo.py
similarity index 71%
rename from applications/Chat/examples/train_prompts.py
rename to applications/Chat/examples/train_ppo.py
index 40e06043ab57..7625f94c596f 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_ppo.py
@@ -11,9 +11,10 @@
 from coati.trainer import PPOTrainer
 from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from torch.optim import Adam
+from torch.optim.lr_scheduler import CosineAnnealingLR
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
+from transformers import AutoConfig, AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
 
 from colossalai.nn.optimizer import HybridAdam
 
@@ -33,20 +34,15 @@ def main(args):
         warnings.warn("LoRA weights should be merged with the model weights")
         state_dict = torch.load(args.rm_path, map_location="cpu")
 
-    if args.lora_rank > 0:
-        warnings.warn("Lora is not supported yet.")
-        args.lora_rank = 0
-
     with strategy.model_init_context():
         # configure model
+        # TODO: add support for llama
         if args.model == "gpt2":
             initial_model = GPTActor(pretrained=args.pretrain)
         elif args.model == "bloom":
             initial_model = BLOOMActor(pretrained=args.pretrain)
         elif args.model == "opt":
             initial_model = OPTActor(pretrained=args.pretrain)
-        elif args.model == "llama":
-            initial_model = LlamaActor(pretrained=args.pretrain)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')
 
@@ -67,47 +63,57 @@ def main(args):
             raise ValueError(f'Unsupported reward model "{rm_model_name}"')
 
         if args.rm_path is not None:
-            reward_model.load_state_dict(state_dict, strict=False)
+            reward_model.load_state_dict(state_dict, strict=True)
 
-        initial_model.to(torch.bfloat16).to(torch.cuda.current_device())
-        reward_model.to(torch.bfloat16).to(torch.cuda.current_device())
+        initial_model.to(torch.cuda.current_device())
+        reward_model.to(torch.cuda.current_device())
 
         if args.model == "gpt2":
-            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            config = AutoConfig.from_pretrained(args.pretrain)
+            config.embd_pdrop = 0.000
+            config.attn_pdrop = 0.0001
+            config.resid_pdrop = 0.0001
+            actor = GPTActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
         elif args.model == "bloom":
-            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            config = AutoConfig.from_pretrained(args.pretrain)
+            # TODO: find a proper hyperparameter setting for BLOOM
+            config.attention_dropout = 0.0001
+            config.hidden_dropout = 0.0001
+            actor = BLOOMActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
         elif args.model == "opt":
-            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            config = AutoConfig.from_pretrained(args.pretrain)
+            # TODO: find a proper hyperparameter setting for OPT
+            config.attention_dropout = 0.0001
+            config.dropout = 0.0001
+            config.layerdrop = 0.000
+            actor = OPTActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
         elif args.model == "llama":
-            actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            # Note: llama disable dropout by default
+            actor = LlamaActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')
 
-        if rm_model_name == "gpt2":
-            critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
-        elif rm_model_name == "bloom":
-            critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
-        elif rm_model_name == "opt":
-            critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
-        elif rm_model_name == "llama":
-            critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
+        if args.model == "gpt2":
+            critic = GPTCritic(pretrained="gpt2", lora_rank=args.lora_rank)
+        elif args.model == "bloom":
+            critic = BLOOMCritic(pretrained="bigscience/bloom-560m", lora_rank=args.lora_rank)
+        elif args.model == "opt":
+            critic = OPTCritic(pretrained="facebook/opt-350m", lora_rank=args.lora_rank)
+        elif args.model == "llama":
+            critic = LlamaCritic(pretrained=args.pretrain, lora_rank=args.lora_rank)
         else:
             raise ValueError(f'Unsupported reward model "{rm_model_name}"')
-
-        if args.rm_path is not None:
-            critic.load_state_dict(state_dict, strict=False)
-            del state_dict
-
-        actor.to(torch.bfloat16).to(torch.cuda.current_device())
-        critic.to(torch.bfloat16).to(torch.cuda.current_device())
+        actor.to(torch.cuda.current_device())
+        critic.to(torch.cuda.current_device())
+        strategy.save_pretrained(actor, path=args.save_path)
 
     # configure optimizer
     if args.strategy.startswith("colossalai"):
         actor_optim = HybridAdam(actor.parameters(), lr=args.lr)
-        critic_optim = HybridAdam(critic.parameters(), lr=args.lr)
+        critic_optim = HybridAdam(critic.parameters(), lr=0.001)
     else:
         actor_optim = Adam(actor.parameters(), lr=args.lr)
-        critic_optim = Adam(critic.parameters(), lr=args.lr)
+        critic_optim = Adam(critic.parameters(), lr=args.critic_lr)
 
     # configure tokenizer
     if args.model == "gpt2":
@@ -125,13 +131,17 @@ def main(args):
         tokenizer = LlamaTokenizer.from_pretrained(
             "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
         )
-        tokenizer.eos_token = "</s>"
+        tokenizer.eos_token = "<\s>"
         tokenizer.pad_token = tokenizer.unk_token
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
     # NOTE: generate() requires padding_side to be "left"
     tokenizer.padding_side = "left"
 
+    # configure tokenizer
+    rm_model_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_tokenizer)
+    rm_model_tokenizer.pad_token = rm_model_tokenizer.eos_token
+
     prompt_dataset = PromptDataset(
         tokenizer=tokenizer,
         data_path=args.prompt_dataset,
@@ -161,10 +171,16 @@ def main(args):
     )
 
     # NOTE: For small models like opt-1.3b, reward model and initial model are not required to be parallelized.
-    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
-        (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
+    (critic, critic_optim), reward_model, initial_model = strategy.prepare(
+        (critic, critic_optim), reward_model, initial_model
     )
 
+    lr_scheduler = CosineAnnealingLR(actor_optim, args.num_episodes)
+    strategy_dict = strategy.prepare(dict(model=actor, optimizer=actor_optim, lr_scheduler=lr_scheduler))
+    actor = strategy_dict["model"]
+    actor_optim = strategy_dict["optimizer"]
+    actor_lr_scheduler = strategy_dict["lr_scheduler"]
+
     # configure trainer
     trainer = PPOTrainer(
         strategy,
@@ -174,10 +190,13 @@ def main(args):
         initial_model,
         actor_optim,
         critic_optim,
+        actor_lr_scheduler,
         tokenizer=tokenizer,
+        rm_model_tokenizer=rm_model_tokenizer,
         kl_coef=args.kl_coef,
         ptx_coef=args.ptx_coef,
         train_batch_size=args.train_batch_size,
+        buffer_limit=args.train_batch_size,
         max_length=args.max_seq_len,
         use_cache=True,
         do_sample=True,
@@ -190,6 +209,7 @@ def main(args):
         num_episodes=args.num_episodes,
         num_collect_steps=args.num_collect_steps,
         num_update_steps=args.num_update_steps,
+        save_per_num_episodes=args.save_per_num_episodes,
         prompt_dataloader=prompt_dataloader,
         pretrain_dataloader=pretrain_dataloader,
         log_dir=args.log_dir,
@@ -224,25 +244,28 @@ def main(args):
     )
     parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
     parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument("--reward_model_tokenizer", type=str, default=None)
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--rm_model", default=None, choices=["gpt2", "bloom", "opt", "llama"])
     parser.add_argument("--rm_path", type=str, default=None)
     parser.add_argument("--rm_pretrain", type=str, default=None)
     parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
     parser.add_argument("--need_optim_ckpt", type=bool, default=False)
-    parser.add_argument("--num_episodes", type=int, default=10)
-    parser.add_argument("--num_collect_steps", type=int, default=10)
+    parser.add_argument("--num_episodes", type=int, default=1)
+    parser.add_argument("--num_collect_steps", type=int, default=2)
     parser.add_argument("--num_update_steps", type=int, default=5)
-    parser.add_argument("--train_batch_size", type=int, default=8)
+    parser.add_argument("--save_per_num_episodes", type=int, default=1000)
+    parser.add_argument("--train_batch_size", type=int, default=16)
     parser.add_argument("--ptx_batch_size", type=int, default=1)
     parser.add_argument("--experience_batch_size", type=int, default=8)
     parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
     parser.add_argument("--merge_lora_weights", type=bool, default=True)
-    parser.add_argument("--lr", type=float, default=1e-7)
+    parser.add_argument("--lr", type=float, default=9e-6)
+    parser.add_argument("--critic_lr", type=float, default=9e-6)
     parser.add_argument("--kl_coef", type=float, default=0.1)
-    parser.add_argument("--ptx_coef", type=float, default=0.9)
+    parser.add_argument("--ptx_coef", type=float, default=0.0)
     parser.add_argument("--max_input_len", type=int, default=96)
-    parser.add_argument("--max_seq_len", type=int, default=128)
+    parser.add_argument("--max_seq_len", type=int, default=256)
     parser.add_argument("--log_dir", default="logs", type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     args = parser.parse_args()
diff --git a/applications/Chat/examples/train_ppo.sh b/applications/Chat/examples/train_ppo.sh
new file mode 100755
index 000000000000..7b8998c67576
--- /dev/null
+++ b/applications/Chat/examples/train_ppo.sh
@@ -0,0 +1,49 @@
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
+
+# the args satisfied: train_batch_size = number_of_node_in_hostfile * num_collect_steps * experience_batch_size
+# the real batch size for gradient descent is nproc_per_node * train_batch_size
+colossalai run --nproc_per_node 1 --master_port 28567 --hostfile ./hostfile train_ppo.py \
+    --pretrain_dataset /home/lcyab/data/Anthropic_rlhf/pretrain_data.json \
+    --prompt_dataset /home/lcyab/data/Anthropic_rlhf/prompts_en.jsonl \
+    --strategy colossalai_zero2 \
+    --num_episodes 8000 --num_collect_steps 1 --num_update_steps 1 \
+    --experience_batch_size 32 \
+    --train_batch_size 32 \
+    --save_path '/home/lcyab/data/Anthropic_rlhf/actor/v3_5' \
+    --ptx_coef 0.0 \
+    --rm_model 'gpt2' \
+    --rm_pretrain 'gpt2' \
+    --rm_path '/home/lcyab/data/Anthropic_rlhf/reward_model_v2_1' \
+    --reward_model_tokenizer 'gpt2' \
+    --pretrain '/home/lcyab/data/Anthropic_rlhf/actor/pretrain_v3' \
+    --lora_rank 30 \
+    --use_wandb
+    # --pretrain_dataset /path/to/pretrain_data.json \
+    # --prompt_dataset /path/to/prompt_dataset.jsonl \
+    # --strategy colossalai_zero2 \
+    # --num_episodes 8000 --num_collect_steps 1 --num_update_steps 1 \
+    # --experience_batch_size 32 \
+    # --train_batch_size 32 \
+    # --save_path '/path/to/actor/ppo_checkpoint' \
+    # --ptx_coef 0.0 \
+    # --rm_model 'gpt2' \
+    # --rm_pretrain 'gpt2' \
+    # --rm_path '/path/to/reward_model' \
+    # --reward_model_tokenizer 'gpt2' \
+    # --model 'gpt2' \
+    # --pretrain '/path/to/actor/pretrain_path' \
+    # --use_wandb \
diff --git a/applications/Chat/examples/train_prompts.sh b/applications/Chat/examples/train_prompts.sh
deleted file mode 100755
index d04c416015b1..000000000000
--- a/applications/Chat/examples/train_prompts.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-
-# torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai_zero2
-
-torchrun --standalone --nproc_per_node=2 train_prompts.py \
-    --pretrain_dataset /path/to/data.json \
-    --prompt_dataset /path/to/data.json \
-    --strategy colossalai_zero2 \
-    --num_episodes 1 --num_collect_steps 2 --num_update_steps 1 \
-    --train_batch_size 2
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index fcdd29b2954b..0195a3138fae 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -1,87 +1,117 @@
 import argparse
-import warnings
+import json
+import math
+import os
+import resource
+from contextlib import nullcontext
 
 import torch
-import torch.distributed as dist
-from coati.dataset import HhRlhfDataset, RmStaticDataset
-from coati.models import LogExpLoss, LogSigLoss
-from coati.models.bloom import BLOOMRM
-from coati.models.gpt import GPTRM
-from coati.models.llama import LlamaRM
-from coati.models.opt import OPTRM
+from coati.dataset import (
+    DataCollatorForPreferenceDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from coati.models import LogExpLoss, LogSigLoss, RewardModel, convert_to_lora_module
 from coati.trainer import RewardModelTrainer
-from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
-from datasets import load_dataset
-from torch.optim import Adam
-from torch.optim.lr_scheduler import CosineAnnealingLR
-from torch.utils.data import DataLoader
-from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
-from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+from coati.utils import load_checkpoint, replace_with_flash_attention
+from transformers import AutoTokenizer
 
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
 
 
 def train(args):
-    # configure strategy
-    if args.strategy == "ddp":
-        strategy = DDPStrategy()
-    elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto")
-    elif args.strategy == "colossalai_zero2":
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
-    else:
-        raise ValueError(f'Unsupported strategy "{args.strategy}"')
-
-    # configure model
-    if args.lora_rank > 0:
-        warnings.warn("Lora is not supported yet.")
-        args.lora_rank = 0
-
-    with strategy.model_init_context():
-        if args.model == "bloom":
-            model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
-        elif args.model == "opt":
-            model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
-        elif args.model == "gpt2":
-            model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
-        elif args.model == "llama":
-            model = LlamaRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
-        else:
-            raise ValueError(f'Unsupported model "{args.model}"')
-
-        model.to(torch.bfloat16).to(torch.cuda.current_device())
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
 
-        if args.model_path is not None:
-            state_dict = torch.load(args.model_path)
-            model.load_state_dict(state_dict)
-
-    # configure tokenizer
-    if args.model == "gpt2":
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "bloom":
-        tokenizer = BloomTokenizerFast.from_pretrained(
-            "bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
         )
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "opt":
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "llama":
-        tokenizer = LlamaTokenizer.from_pretrained(
-            "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
         )
-        tokenizer.eos_token = "</s>"
-        tokenizer.pad_token = tokenizer.unk_token
     else:
-        raise ValueError(f'Unsupported model "{args.model}"')
+        raise ValueError(f"Unknown plugin {args.plugin}")
 
-    # configure optimizer
-    if args.strategy.startswith("colossalai"):
-        optim = HybridAdam(model.parameters(), lr=args.lr)
-    else:
-        optim = Adam(model.parameters(), lr=args.lr)
+    booster = Booster(plugin=plugin)
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = (
+        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+    )
+    with init_ctx:
+        model = RewardModel(args.pretrain)
+
+        # debug tiny model
+        # model = RewardModel(
+        #     transformers.LlamaConfig(hidden_size=512, intermediate_size=1536, num_attention_heads=8, num_hidden_layers=4
+        #     )
+        # )
+
+        # TODO: set dropout to 0 here
+        # for llama2, dropout is 0 by default, hence skip.
+        if args.lora_rank > 0:
+            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        model.model.gradient_checkpointing_enable()  # TODO: support gradient checkpoint for the last linear layer
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=model)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer.padding_side = "right"
+    tokenizer.pad_token = tokenizer.eos_token
 
     # configure loss function
     if args.loss_fn == "log_sig":
@@ -91,74 +121,110 @@ def train(args):
     else:
         raise ValueError(f'Unsupported loss function "{args.loss_fn}"')
 
-    # prepare for data and dataset
-    if args.subset is not None:
-        data = load_dataset(args.dataset, data_dir=args.subset)
-    else:
-        data = load_dataset(args.dataset)
-
-    train_data = data["train"].select(range(min(args.max_datasets_size, len(data["train"]))))
-    eval_data = data["test"].select(range(min(args.max_datasets_size, len(data["test"]))))
-
-    if args.dataset == "Dahoas/rm-static":
-        train_dataset = RmStaticDataset(train_data, tokenizer, args.max_len)
-        eval_dataset = RmStaticDataset(eval_data, tokenizer, args.max_len)
-    elif args.dataset == "Anthropic/hh-rlhf":
-        train_dataset = HhRlhfDataset(train_data, tokenizer, args.max_len)
-        eval_dataset = HhRlhfDataset(eval_data, tokenizer, args.max_len)
-    else:
-        raise ValueError(f'Unsupported dataset "{args.dataset}"')
-
-    if dist.is_initialized() and dist.get_world_size() > 1:
-        train_sampler = DistributedSampler(
-            train_dataset,
-            shuffle=True,
-            seed=42,
-            drop_last=True,
-            rank=dist.get_rank(),
-            num_replicas=dist.get_world_size(),
-        )
-        eval_sampler = DistributedSampler(
-            eval_dataset,
-            shuffle=True,
-            seed=42,
-            drop_last=True,
-            rank=dist.get_rank(),
-            num_replicas=dist.get_world_size(),
-        )
-    else:
-        train_sampler = None
-        eval_sampler = None
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
 
-    train_dataloader = DataLoader(
-        train_dataset,
-        shuffle=(train_sampler is None),
-        sampler=train_sampler,
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+    train_dataloader = setup_distributed_dataloader(
+        dataset=train_dataset,
         batch_size=args.batch_size,
-        pin_memory=True,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        use_tp=args.tp > 1,
     )
 
-    eval_dataloader = DataLoader(
-        eval_dataset, shuffle=(eval_sampler is None), sampler=eval_sampler, batch_size=args.batch_size, pin_memory=True
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    math.ceil(args.max_epochs * num_update_steps_per_epoch)
+
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
     )
 
-    lr_scheduler = CosineAnnealingLR(optim, train_dataloader.__len__() // 100)
-    strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler))
-    model = strategy_dict["model"]
-    optim = strategy_dict["optimizer"]
-    lr_scheduler = strategy_dict["lr_scheduler"]
-    trainer = RewardModelTrainer(
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
         model=model,
-        strategy=strategy,
-        optim=optim,
+        optimizer=optim,
         lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
+    )
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    trainer = RewardModelTrainer(
+        model,
+        booster,
+        optim,
+        lr_scheduler,
+        tokenizer,
         loss_fn=loss_fn,
         max_epochs=args.max_epochs,
+        accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_dir,
+        coordinator=coordinator,
     )
 
     trainer.fit(
-        train_dataloader=train_dataloader,
-        eval_dataloader=eval_dataloader,
+        train_preference_dataloader=train_dataloader,
+        eval_preference_dataloader=None,
         log_dir=args.log_dir,
         use_wandb=args.use_wandb,
     )
@@ -170,39 +236,60 @@ def train(args):
         LORA_MANAGER.merge_weights = True
         model.eval()
     # save model checkpoint after fitting on only rank0
-    state_dict = model.state_dict()
-    torch.save(state_dict, args.save_path)
-    # save optimizer checkpoint on all ranks
-    if args.need_optim_ckpt:
-        strategy.save_optimizer(
-            trainer.optimizer, "rm_optim_checkpoint_%d.pt" % (torch.cuda.current_device()), only_rank0=False
-        )
+    coordinator.print_on_master("Start saving final model checkpoint")
+    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
 
 if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="colossalai_zero2"
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
     )
-    parser.add_argument("--model", choices=["gpt2", "bloom", "opt", "llama"], default="bloom")
-    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--zero", type=int, default=0)
     parser.add_argument("--pretrain", type=str, default=None)
-    parser.add_argument("--model_path", type=str, default=None)
-    parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
     parser.add_argument(
-        "--dataset", type=str, choices=["Anthropic/hh-rlhf", "Dahoas/rm-static"], default="Dahoas/rm-static"
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
     )
-    parser.add_argument("--subset", type=lambda x: None if x == "None" else x, default=None)
-    parser.add_argument("--max_datasets_size", type=int, default=1000000)
-    parser.add_argument("--save_path", type=str, default="rm_ckpt")
-    parser.add_argument("--max_epochs", type=int, default=1)
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--max_len", type=int, default=512)
+    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+    parser.add_argument("--save_dir", type=str, default="output")
+    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
+    parser.add_argument("--max_epochs", type=int, default=3)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--loss_fn", type=str, default="log_sig", choices=["log_sig", "log_exp"], help="Loss function")
     parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument(
+        "--lora_train_bias",
+        type=str,
+        default="none",
+        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
+    )
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
     parser.add_argument("--merge_lora_weights", type=bool, default=True)
-    parser.add_argument("--lr", type=float, default=9e-6)
-    parser.add_argument("--loss_fn", type=str, default="log_sig", choices=["log_sig", "log_exp"])
+    parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--accumulation_steps", type=int, default=8)
     parser.add_argument("--log_dir", default="logs", type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
     args = parser.parse_args()
+    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+    with open(args.config_file, "w") as f:
+        json.dump(args.__dict__, f, indent=4)
     train(args)
diff --git a/applications/Chat/examples/train_rm.sh b/applications/Chat/examples/train_rm.sh
index c5ebaf708ddc..517ff05167f2 100755
--- a/applications/Chat/examples/train_rm.sh
+++ b/applications/Chat/examples/train_rm.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set_n_least_used_CUDA_VISIBLE_DEVICES() {
     local n=${1:-"9999"}
     echo "GPU Memory Usage:"
@@ -12,14 +14,57 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
+set_n_least_used_CUDA_VISIBLE_DEVICES 8
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+
+PROJECT_NAME="llama2-rm"
+PARENT_SAVE_DIR="./output/ckpt"
+PARENT_TENSORBOARD_DIR="./output/tensorboard"
+PARENT_CONFIG_FILE="./output/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+declare -a dataset=(
+    /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00000
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00001
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00002
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00003
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00004
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00005
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00006
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00007
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00008
+    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00009
+)
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
-torchrun --standalone --nproc_per_node=2 train_reward_model.py \
-    --pretrain 'gpt2' \
-    --model 'gpt2' \
-    --strategy colossalai_zero2 \
-    --loss_fn 'log_exp' \
-    --dataset 'Anthropic/hh-rlhf' \
-    --batch_size 16 \
-    --max_epochs 10
+colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_reward_model.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "zero2" \
+    --save_interval 100 \
+    --save_dir $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --max_epochs 5 \
+    --accumulation_steps 1 \
+    --batch_size 8 \
+    --lr 9e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --grad_checkpoint \
+    --use_flash_attn \
+    --use_wandb
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index d00c04809a2d..b42789879a0d 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -1,179 +1,229 @@
 import argparse
+import json
 import math
-import warnings
+import os
+import resource
+from contextlib import nullcontext
 
 import torch
-import torch.distributed as dist
-from coati.dataset import SFTDataset, SupervisedDataset
-from coati.models.bloom import BLOOMActor
-from coati.models.chatglm import ChatGLMActor
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
-from coati.models.gpt import GPTActor
-from coati.models.llama import LlamaActor
-from coati.models.opt import OPTActor
+from coati.dataset import DataCollatorForSupervisedDataset, load_tokenized_dataset, setup_distributed_dataloader
+from coati.models import convert_to_lora_module, load_checkpoint
 from coati.trainer import SFTTrainer
-from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
-from datasets import load_dataset
-from torch.optim import Adam
-from torch.utils.data import DataLoader
-from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
-from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-from transformers.trainer import get_scheduler
+from coati.utils import replace_with_flash_attention
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
 from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
 
 
 def train(args):
-    # configure strategy
-    if args.strategy == "ddp":
-        strategy = DDPStrategy()
-    elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto")
-    elif args.strategy == "colossalai_zero2":
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
-    elif args.strategy == "colossalai_zero2_cpu":
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
-    else:
-        raise ValueError(f'Unsupported strategy "{args.strategy}"')
-
-    # configure model
-    if args.lora_rank > 0:
-        warnings.warn("Lora is not supported yet.")
-        args.lora_rank = 0
-
-    with strategy.model_init_context():
-        if args.model == "bloom":
-            model = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
-        elif args.model == "opt":
-            model = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
-        elif args.model == "gpt2":
-            model = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
-        elif args.model == "llama":
-            model = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
-        elif args.model == "chatglm":
-            model = ChatGLMActor(pretrained=args.pretrain)
-        else:
-            raise ValueError(f'Unsupported model "{args.model}"')
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
 
-        model.to(torch.bfloat16).to(torch.cuda.current_device())
-
-    # configure tokenizer
-    if args.model == "gpt2":
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "bloom":
-        tokenizer = BloomTokenizerFast.from_pretrained(
-            "bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "ddp":
+        # default torch ddp plugin without any acceleration, for debugging purpose acceleration, for debugging purpose
+        plugin = TorchDDPPlugin(find_unused_parameters=True)
+    elif args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
         )
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "opt":
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "llama":
-        tokenizer = LlamaTokenizer.from_pretrained(
-            "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
         )
-        tokenizer.eos_token = "</s>"
-        tokenizer.pad_token = tokenizer.unk_token
-    elif args.model == "chatglm":
-        tokenizer = ChatGLMTokenizer.from_pretrained(
-            "THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=args.zero,
+            max_norm=args.grad_clip,
+            precision=args.mixed_precision,
         )
     else:
-        raise ValueError(f'Unsupported model "{args.model}"')
+        raise ValueError(f"Unknown plugin {args.plugin}")
 
-    # configure optimizer
-    if args.strategy.startswith("colossalai"):
-        optim = HybridAdam(model.parameters(), lr=args.lr, clipping_norm=1.0)
-    else:
-        optim = Adam(model.parameters(), lr=args.lr)
+    booster = Booster(plugin=plugin)
 
-    # configure dataset
-    if args.dataset == "yizhongw/self_instruct":
-        train_data = load_dataset(args.dataset, "super_natural_instructions", split="train")
-        eval_data = load_dataset(args.dataset, "super_natural_instructions", split="test")
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = (
+        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+    )
+    with init_ctx:
+        model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        if args.lora_rank > 0:
+            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
 
-        if args.max_datasets_size is not None:
-            train_data = train_data.select(range(min(args.max_datasets_size, len(train_data))))
-            eval_data = eval_data.select(range(min(args.max_datasets_size, len(eval_data))))
+    if args.grad_checkpoint and args.lora_rank == 0:
+        # lora layers are not supported by gradient checkpointing
+        model.gradient_checkpointing_enable()
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
 
-        train_dataset = SFTDataset(train_data, tokenizer, args.max_len)
-        eval_dataset = SFTDataset(eval_data, tokenizer, args.max_len)
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=model)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
 
-    else:
-        train_dataset = SupervisedDataset(
-            tokenizer=tokenizer,
-            data_path=args.dataset,
-            max_datasets_size=args.max_datasets_size,
-            max_length=args.max_len,
-        )
-        eval_dataset = None
-
-    if dist.is_initialized() and dist.get_world_size() > 1:
-        train_sampler = DistributedSampler(
-            train_dataset,
-            shuffle=True,
-            seed=42,
-            drop_last=True,
-            rank=dist.get_rank(),
-            num_replicas=dist.get_world_size(),
-        )
-        if eval_dataset is not None:
-            eval_sampler = DistributedSampler(
-                eval_dataset,
-                shuffle=False,
-                seed=42,
-                drop_last=False,
-                rank=dist.get_rank(),
-                num_replicas=dist.get_world_size(),
-            )
-    else:
-        train_sampler = None
-        eval_sampler = None
+    # configure tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir or args.pretrain)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.add_bos_token = False
+    tokenizer.add_eos_token = False
+
+    coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
+    coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_path}")
+
+    print([name for name, p in model.named_parameters() if p.requires_grad])
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+    # if is_rank_0():
+    #     print(optim.state_dict())
+    #     print("debug")
+    # optim_param = optim.state_dict()['param_groups'][0]['params']
+    # for id in optim_param:
+    #     for idx, (name, p) in enumerate(model.named_parameters()):
+    #         if idx == id:
+    #             print(name, p.size())
 
-    train_dataloader = DataLoader(
-        train_dataset,
-        shuffle=(train_sampler is None),
-        sampler=train_sampler,
+    # configure dataset
+    coordinator.print_on_master(
+        f"Max CUDA memory before data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+    )
+    dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train")
+    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_len)
+    train_dataloader = setup_distributed_dataloader(
+        dataset=dataset,
         batch_size=args.batch_size,
-        pin_memory=True,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+    )
+    coordinator.print_on_master(
+        f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
     )
-    if eval_dataset is not None:
-        eval_dataloader = DataLoader(
-            eval_dataset,
-            shuffle=(eval_sampler is None),
-            sampler=eval_sampler,
-            batch_size=args.batch_size,
-            pin_memory=True,
-        )
-    else:
-        eval_dataloader = None
 
     num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
-    max_steps = math.ceil(args.max_epochs * num_update_steps_per_epoch)
-    lr_scheduler = get_scheduler(
-        "cosine", optim, num_warmup_steps=math.ceil(max_steps * 0.03), num_training_steps=max_steps
+    math.ceil(args.max_epochs * num_update_steps_per_epoch)
+
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    # Flash attention will be disabled because it does NOT support fp32.
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
     )
-    strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler))
-    model = strategy_dict["model"]
-    optim = strategy_dict["optimizer"]
-    lr_scheduler = strategy_dict["lr_scheduler"]
+    # model = model.to(get_current_device())
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
     trainer = SFTTrainer(
         model=model,
-        strategy=strategy,
+        booster=booster,
         optim=optim,
         lr_scheduler=lr_scheduler,
         max_epochs=args.max_epochs,
         accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_path,
+        coordinator=coordinator,
     )
 
-    logger = get_dist_logger()
+    get_dist_logger()
     trainer.fit(
         train_dataloader=train_dataloader,
-        eval_dataloader=eval_dataloader,
-        logger=logger,
+        eval_dataloader=None,
         log_dir=args.log_dir,
         use_wandb=args.use_wandb,
     )
@@ -185,37 +235,59 @@ def train(args):
         LORA_MANAGER.merge_weights = True
         model.eval()
     # save model checkpoint after fitting on only rank0
-    strategy.save_pretrained(model, path=args.save_path, tokenizer=tokenizer)
-    # save optimizer checkpoint on all ranks
-    if args.need_optim_ckpt:
-        strategy.save_optimizer(
-            trainer.optimizer, "rm_optim_checkpoint_%d.pt" % (torch.cuda.current_device()), only_rank0=False
-        )
+    coordinator.print_on_master("Start saving final model checkpoint")
+    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
 
 if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--strategy",
-        choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_zero2_cpu"],
-        default="colossalai_zero2",
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d", "ddp"],
+        help="Choose which plugin to use",
     )
-    parser.add_argument("--model", choices=["gpt2", "bloom", "opt", "llama", "chatglm"], default="bloom")
-    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--zero", type=int, default=1)
     parser.add_argument("--pretrain", type=str, default=None)
-    parser.add_argument("--dataset", type=str, default=None)
-    parser.add_argument("--max_datasets_size", type=int, default=None)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
+    )
     parser.add_argument("--save_path", type=str, default="output")
-    parser.add_argument("--need_optim_ckpt", type=bool, default=False)
     parser.add_argument("--max_epochs", type=int, default=3)
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--max_len", type=int, default=512)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument(
+        "--lora_train_bias",
+        type=str,
+        default="none",
+        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
+    )
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
     parser.add_argument("--merge_lora_weights", type=bool, default=True)
     parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
     parser.add_argument("--accumulation_steps", type=int, default=8)
     parser.add_argument("--log_dir", default="logs", type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
     args = parser.parse_args()
+    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+    with open(args.config_file, "w") as f:
+        json.dump(args.__dict__, f, indent=4)
     train(args)
diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh
index 0fb4da3d3ce8..7bbf6ccecdb9 100755
--- a/applications/Chat/examples/train_sft.sh
+++ b/applications/Chat/examples/train_sft.sh
@@ -13,16 +13,38 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
+set_n_least_used_CUDA_VISIBLE_DEVICES 8
 
-torchrun --standalone --nproc_per_node=4 train_sft.py \
-    --pretrain "/path/to/LLaMa-7B/" \
-    --model 'llama' \
-    --strategy colossalai_zero2 \
-    --save_path /path/to/Coati-7B \
-    --dataset /path/to/data.json \
+PROJECT_NAME="llama2-sft"
+PARENT_SAVE_DIR="./output/ckpt"
+PARENT_TENSORBOARD_DIR="./output/tensorboard"
+PARENT_CONFIG_FILE="./output/train_config"
+PRETRAINED_MODEL_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+declare -a dataset=(
+    /home/lcyab/data/SFT_data/llama2_sft_data/part-00000
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+
+# the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
+colossalai run --nproc_per_node 8 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 500 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2 \
     --batch_size 4 \
-    --accumulation_steps 8 \
+    --max_epochs 1 \
+    --accumulation_steps 1 \
     --lr 2e-5 \
-    --max_datasets_size 512 \
-    --max_epochs 1
+    --max_len 512 \
+    --max_epochs 1 \
+    --use_flash_attn \
+    --grad_checkpoint \
+    --use_wandb \
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index e56aaca0e7cb..0b1ee1785fa1 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai==0.3.3
+colossalai==0.3.2
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
diff --git a/applications/Chat/tests/test_dataset.py b/applications/Chat/tests/test_dataset.py
index ec61bbb13fd7..70a171955b7a 100644
--- a/applications/Chat/tests/test_dataset.py
+++ b/applications/Chat/tests/test_dataset.py
@@ -6,7 +6,7 @@
 import pytest
 import torch
 from coati.dataset.prompt_dataset import PromptDataset
-from coati.dataset.reward_dataset import HhRlhfDataset, RmStaticDataset
+from coati.dataset.reward_dataset import PreferenceDataset
 from coati.dataset.sft_dataset import IGNORE_INDEX, SFTDataset, SupervisedDataset
 from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from datasets import load_dataset
@@ -131,11 +131,21 @@ def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], ma
     assert tokenizer.padding_side in ("left", "right")
 
     if dataset_path == "Anthropic/hh-rlhf":
-        train_dataset = HhRlhfDataset(train_data, tokenizer, max_length)
-        test_dataset = HhRlhfDataset(test_data, tokenizer, max_length)
+        train_dataset = PreferenceDataset(train_data, tokenizer, max_length)
+        test_dataset = PreferenceDataset(test_data, tokenizer, max_length)
     elif dataset_path == "Dahoas/rm-static":
-        train_dataset = RmStaticDataset(train_data, tokenizer, max_length)
-        test_dataset = RmStaticDataset(test_data, tokenizer, max_length)
+        train_dataset = PreferenceDataset(
+            train_data,
+            tokenizer,
+            max_length,
+            dataset_schema={"prompt": "prompt", "chosen": "chosen", "rejected": "rejected"},
+        )
+        test_dataset = PreferenceDataset(
+            test_data,
+            tokenizer,
+            max_length,
+            dataset_schema={"prompt": "prompt", "chosen": "chosen", "rejected": "rejected"},
+        )
     else:
         raise ValueError(f'Unsupported dataset "{dataset_path}"')
 
@@ -176,7 +186,7 @@ def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], ma
             assert torch.all(r_mask)
 
 
-@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama", "chatglm"])
+@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])  # temperally disable test for chatglm
 @pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
 @pytest.mark.parametrize("max_dataset_size", [2])
 @pytest.mark.parametrize("max_length", [32, 1024])
@@ -233,6 +243,7 @@ def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size:
 
 if __name__ == "__main__":
     test_sft_dataset(model="bloom", dataset_path="yizhongw/self_instruct", max_dataset_size=2, max_length=256)
+    test_sft_dataset(model="bloom", dataset_path="custom", max_dataset_size=2, max_length=256)
 
     test_reward_dataset(
         model="gpt2", dataset_path="Anthropic/hh-rlhf", subset="harmless-base", max_datasets_size=8, max_length=256
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
index a9591259800d..2a736a65c07f 100644
--- a/applications/Chat/tests/test_experience.py
+++ b/applications/Chat/tests/test_experience.py
@@ -62,8 +62,17 @@ def __init__(self):
             self.eos_token_id = 0
             self.pad_token_id = 0
 
+        def batch_decode(self, sequences, skip_special_tokens=True):
+            return ["This is a test sentence." for i in range(len(sequences))]
+
+        def __call__(self, sequences, **kwargs):
+            return {
+                "input_ids": torch.randint(0, 50257, (len(sequences), 100), device="cpu"),
+                "attention_mask": torch.ones((len(sequences), 100), device="cpu").bool(),
+            }
+
     tokenizer = MockTokenizer()
-    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer)
+    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer, tokenizer)
     data_buffer = NaiveExperienceBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
 
     generate_kwargs = dict(do_sample=True, max_length=16)
diff --git a/applications/Chat/tests/test_lora.py b/applications/Chat/tests/test_lora.py
new file mode 100644
index 000000000000..68c91a597f5b
--- /dev/null
+++ b/applications/Chat/tests/test_lora.py
@@ -0,0 +1,73 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from coati.models import convert_to_lora_module
+from torch.utils.data import DataLoader, TensorDataset
+
+
+class SimpleNN(nn.Module):
+    def __init__(self, input_size, hidden_size, num_classes):
+        super(SimpleNN, self).__init__()
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(hidden_size, num_classes)
+
+    def forward(self, x):
+        out = self.fc1(x)
+        out = self.relu(out)
+        out = self.fc2(out)
+        return out
+
+
+def test_overfit():
+    input_size = 1000
+    hidden_size = 200
+    num_classes = 5
+    batch_size = 64
+    learning_rate = 0.01
+    num_epochs = 200
+
+    # Synthesized dataset
+    X = torch.randn(batch_size, input_size)
+    Y = torch.randint(0, num_classes, (batch_size,))
+
+    # Convert to DataLoader
+    dataset = TensorDataset(X, Y)
+    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+    # Build and convert model
+    model = SimpleNN(input_size, hidden_size, num_classes)
+    print(model.fc1.weight)
+    weight_to_compare = model.fc1.weight.detach().clone()
+    model = convert_to_lora_module(model, lora_rank=30)
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Train the model
+    for epoch in range(num_epochs):
+        for i, (inputs, labels) in enumerate(loader):
+            # Forward pass
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+            print(loss)
+            # Backward and optimize
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+    # Check if model has overfitted
+    outputs = model(X)
+    _, predicted = torch.max(outputs.data, 1)
+    total = labels.size(0)
+    correct = (predicted == Y).sum().item()
+    print(correct / total)
+    assert (correct / total > 0.95, "The model has not overfitted to the synthesized dataset")
+
+    print(model.fc1.weight)
+    assert (weight_to_compare - model.fc1.weight).sum() < 0.01
+
+
+if __name__ == "__main__":
+    test_overfit()
diff --git a/applications/Chat/tests/test_models.py b/applications/Chat/tests/test_models.py
index b2c22ac6a3b9..7624dbaadff5 100644
--- a/applications/Chat/tests/test_models.py
+++ b/applications/Chat/tests/test_models.py
@@ -118,7 +118,7 @@ def test_lora(lora_rank: int, num_dim: int, num_layers: int):
         # HACK: skip llama due to long execution time
         # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
         lambda: (OPTActor(), OPTCritic(), OPTRM()),
-        lambda: (ChatGLMActor(), None, None),
+        # lambda: (ChatGLMActor(), None, None), #temporally remove tests for chatglm
     ],
 )
 @torch.no_grad()
@@ -161,7 +161,7 @@ def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]], b
         assert isinstance(critic, Critic)
         get_base_model(critic)
         critic_output = critic(**critic_input)
-        assert critic_output.shape == (batch_size,)
+        assert critic_output.shape == (batch_size, seq_len)
 
     if rm:
         assert isinstance(rm, RewardModel)
@@ -185,13 +185,17 @@ def test_loss(batch_size: int, seq_len: int, num_labels: int):
     loss_input = {
         "log_probs": torch.randn(
             batch_size,
+            seq_len,
         ),
         "old_log_probs": torch.randn(
             batch_size,
+            seq_len,
         ),
         "advantages": torch.randn(
             batch_size,
+            seq_len,
         ),
+        "action_mask": torch.randn(batch_size, seq_len),
     }
     loss(**loss_input)
 
@@ -199,13 +203,17 @@ def test_loss(batch_size: int, seq_len: int, num_labels: int):
     loss_input = {
         "values": torch.randn(
             batch_size,
+            seq_len,
         ),
         "old_values": torch.randn(
             batch_size,
+            seq_len,
         ),
-        "reward": torch.randn(
+        "advantage": torch.randn(
             batch_size,
+            seq_len,
         ),
+        "action_mask": torch.randn(batch_size, seq_len),
     }
     loss(**loss_input)
 
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 68fca7fbf8c0..4396e484e3f9 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -37,10 +37,18 @@ fi
 NUM_RETRY=3
 BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples
+MODEL_SAVE_PATH=$EXAMPLES_DIR/rlhf_models
 MODELS_DIR=$BASE_DIR/examples/models_config
 MODELS=('gpt2' 'bloom' 'opt' 'llama')
+MODELS_PPO=('gpt2' 'bloom' 'opt')
 STRATEGIES=('ddp' 'colossalai_gemini' 'colossalai_zero2')
 
+if [ ! -d "$MODEL_SAVE_PATH" ]; then
+  mkdir "$MODEL_SAVE_PATH"
+  echo "Directory created successfully"
+else
+  echo "Directory already exists"
+fi
 
 export OMP_NUM_THREADS=8
 
@@ -70,6 +78,11 @@ random_choice() {
     echo ${arr[$idx]}
 }
 
+
+if [ ! -d "$DATA_DIRECTORY" ]; then
+  echo "$DATA_DIRECTORY does not exist."
+fi
+
 echo "[Test]: testing sft ..."
 
 # FIXME: This is a hack to skip tests that are not working
@@ -84,7 +97,7 @@ SKIPPED_TESTS=(
 )
 
 GRAD_CKPTS=('' '--grad_checkpoint')
-for lora_rank in '0'; do
+for lora_rank in "0"; do
     for model in ${MODELS[@]}; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
@@ -137,7 +150,7 @@ SKIPPED_TESTS=(
 
 LOSS_FNS=('log_sig' 'log_exp')
 DATASETS=('Anthropic/hh-rlhf' 'Dahoas/rm-static')
-for lora_rank in '0'; do
+for lora_rank in "0"; do
     for model in ${MODELS[@]}; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
@@ -150,9 +163,7 @@ for lora_rank in '0'; do
             fi
             pretrain=$(get_pretrain $model)
             pretrain_model=""
-            if [[ $lora_rank -gt 0 ]]; then
-                pretrain_model="--pretrain $pretrain"
-            fi
+            pretrain_model="--pretrain $pretrain"
             loss_fn=$(random_choice "${LOSS_FNS[@]}")
             dataset=$(random_choice "${DATASETS[@]}")
             subset=$(if [[ $dataset == "Dahoas/rm-static" ]]; then echo "None"; else echo "harmless-base"; fi)
@@ -162,7 +173,7 @@ for lora_rank in '0'; do
                     $pretrain_model --tokenizer $MODELS_DIR/$model \
                     --dataset $dataset --subset $subset --max_datasets_size 8 \
                     --model $model --strategy $strategy --lora_rank $lora_rank \
-                    --loss_fn $loss_fn --batch_size 1 --lr 1e-8 \
+                    --loss_fn $loss_fn --batch_size 1 --lr 1e-8 --max_len 200 \
                     --save_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
                 passed=$?
                 if [ $passed -eq 0 ]; then
@@ -183,15 +194,23 @@ echo "[Test]: testing RLHF ..."
 #  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
 #  - llama-*: These tests can be passed locally, skipped for long execution time
 #  - *-gemini: Gemini plugin does not support `from_pretrained` yet
+#  - skip all bloom tests as they leads to OOM
+
 SKIPPED_TESTS=(
     "gpt2-ddp"
     "llama-ddp"
     "llama-colossalai_gemini"
     "llama-colossalai_zero2"
+    "bloom-colossalai_gemini"
+    "bloom-colossalai_zero2"
+    "bloom-ddp"
+    "opt-colossalai_gemini"
+    "opt-colossalai_zero2"
+    "opt-ddp"
 )
 
-for model in ${MODELS[@]}; do
-    for lora_rank in '0'; do
+for model in ${MODELS_PPO[@]}; do
+    for lora_rank in "0"; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
             if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
@@ -203,19 +222,18 @@ for model in ${MODELS[@]}; do
             fi
             rm_pretrain=$(get_pretrain $model)
             rm_pretrain_model=""
-            if [[ $lora_rank -gt 0 ]]; then
-                rm_pretrain_model="--rm_pretrain $rm_pretrain"
-            fi
+            rm_pretrain_model="--rm_pretrain $rm_pretrain"
             for i in $(seq $NUM_RETRY); do
                 echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
-                torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_prompts.py \
+                torchrun --standalone --nproc_per_node=1 $EXAMPLES_DIR/train_prompts.py \
                     --prompt_dataset $PROMPT_DATASET --pretrain_dataset $PRETRAIN_DATASET --max_datasets_size 32 \
                     --strategy $strategy --model $model --tokenizer $MODELS_DIR/$model \
                     --num_episodes 1 --num_collect_steps 1 --num_update_steps 1 --lr 1e-8 \
-                    --experience_batch_size 2 --train_batch_size 1 --lora_rank $lora_rank \
+                    --experience_batch_size 1 --train_batch_size 1 --lora_rank $lora_rank \
                     --pretrain $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank} \
                     $rm_pretrain_model --rm_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt \
-                    --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
+                    --reward_model_tokenizer $MODELS_DIR/$model --max_input_len 10 --max_seq_len 20 \
+                    --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
                 passed=$?
                 if [ $passed -eq 0 ]; then
                     break

From cc69b6dc3152ecee58fffcd48d61468db9148ba4 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Thu, 30 Nov 2023 09:05:17 +0800
Subject: [PATCH 02/23] fix and tested ppo

---
 applications/Chat/coati/adapter/adapter.py    |    0
 applications/Chat/coati/dataset/__init__.py   |   11 +-
 applications/Chat/coati/dataset/chatio.py     |  164 +
 .../Chat/coati/dataset/conversation.py        |    2 +-
 applications/Chat/coati/dataset/loader.py     |   38 +-
 .../Chat/coati/dataset/prompt_dataset.py      |   58 -
 .../Chat/coati/dataset/reward_dataset.py      |   68 -
 .../Chat/coati/dataset/sft_dataset.py         |  229 -
 .../dataset/spliced_and_tokenized_dataset.py  |  148 +-
 .../Chat/coati/experience_buffer/naive.py     |    7 +-
 .../Chat/coati/experience_maker/base.py       |    7 +-
 .../Chat/coati/experience_maker/naive.py      |   58 +-
 applications/Chat/coati/models/__init__.py    |    5 +-
 applications/Chat/coati/models/actor.py       |   28 -
 applications/Chat/coati/models/base.py        |   25 +-
 applications/Chat/coati/models/critic.py      |   18 +-
 applications/Chat/coati/models/generation.py  |   79 +-
 .../Chat/coati/models/reward_model.py         |   21 +-
 applications/Chat/coati/trainer/__init__.py   |    5 +-
 applications/Chat/coati/trainer/base.py       |   51 +-
 .../Chat/coati/trainer/callbacks/__init__.py  |    5 -
 .../Chat/coati/trainer/callbacks/base.py      |   39 -
 .../callbacks/performance_evaluator.py        |  183 -
 .../trainer/callbacks/save_checkpoint.py      |   76 -
 applications/Chat/coati/trainer/ppo.py        |  240 +-
 applications/Chat/coati/trainer/rm.py         |   17 +-
 applications/Chat/coati/trainer/sft.py        |    7 +-
 .../Chat/coati/trainer/strategies/__init__.py |    5 -
 .../Chat/coati/trainer/strategies/base.py     |  137 -
 .../coati/trainer/strategies/colossalai.py    |  200 -
 .../Chat/coati/trainer/strategies/ddp.py      |  136 -
 .../Chat/coati/trainer/strategies/sampler.py  |   31 -
 applications/Chat/coati/trainer/utils.py      |    8 +
 applications/Chat/examples/download_model.py  |   37 +-
 .../examples/generate_conversation_dataset.py |   82 -
 .../Chat/examples/generate_prompt_dataset.py  |   27 -
 applications/Chat/examples/hostfile           |    2 +-
 applications/Chat/examples/inference.py       |  195 +-
 applications/Chat/examples/out.txt            | 1319 ------
 .../examples/prepare_preference_dataset.py    |  161 +
 .../examples/prepare_preference_dataset.sh    |   23 +
 ..._pretrain_dataset_for_ppo_with_ptx_loss.py |  154 +
 ..._pretrain_dataset_for_ppo_with_ptx_loss.sh |   24 +
 .../Chat/examples/prepare_prompt_dataset.py   |  156 +
 .../Chat/examples/prepare_prompt_dataset.sh   |   22 +
 .../Chat/examples/prepare_sft_dataset.py      |  156 +
 .../Chat/examples/prepare_sft_dataset.sh      |   22 +
 applications/Chat/examples/train_dpo.py       |   19 -
 applications/Chat/examples/train_dpo.sh       |    8 +-
 applications/Chat/examples/train_ppo.py       |  540 ++-
 applications/Chat/examples/train_ppo.sh       |   97 +-
 .../Chat/examples/train_reward_model.py       |   67 +-
 applications/Chat/examples/train_rm.sh        |   44 +-
 applications/Chat/examples/train_sft.py       |    9 -
 applications/Chat/examples/train_sft.sh       |   40 +-
 applications/Chat/inference/benchmark.py      |  141 -
 applications/Chat/inference/server.py         |   46 +-
 .../Chat/inference/tests/test_chat_prompt.py  |   61 -
 applications/Chat/inference/utils.py          |  157 +-
 applications/Chat/requirements.txt            |    2 +-
 applications/Chat/tests/test_inference.sh     |    4 +-
 applications/Chat/tests/test_reward_model.py  | 4125 +++++++++++++++++
 62 files changed, 6230 insertions(+), 3616 deletions(-)
 delete mode 100644 applications/Chat/coati/adapter/adapter.py
 create mode 100644 applications/Chat/coati/dataset/chatio.py
 delete mode 100644 applications/Chat/coati/dataset/prompt_dataset.py
 delete mode 100644 applications/Chat/coati/dataset/reward_dataset.py
 delete mode 100644 applications/Chat/coati/dataset/sft_dataset.py
 delete mode 100644 applications/Chat/coati/models/actor.py
 delete mode 100644 applications/Chat/coati/trainer/callbacks/__init__.py
 delete mode 100644 applications/Chat/coati/trainer/callbacks/base.py
 delete mode 100644 applications/Chat/coati/trainer/callbacks/performance_evaluator.py
 delete mode 100644 applications/Chat/coati/trainer/callbacks/save_checkpoint.py
 delete mode 100644 applications/Chat/coati/trainer/strategies/__init__.py
 delete mode 100644 applications/Chat/coati/trainer/strategies/base.py
 delete mode 100644 applications/Chat/coati/trainer/strategies/colossalai.py
 delete mode 100644 applications/Chat/coati/trainer/strategies/ddp.py
 delete mode 100644 applications/Chat/coati/trainer/strategies/sampler.py
 delete mode 100644 applications/Chat/examples/generate_conversation_dataset.py
 delete mode 100644 applications/Chat/examples/generate_prompt_dataset.py
 delete mode 100644 applications/Chat/examples/out.txt
 create mode 100644 applications/Chat/examples/prepare_preference_dataset.py
 create mode 100755 applications/Chat/examples/prepare_preference_dataset.sh
 create mode 100644 applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.py
 create mode 100755 applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh
 create mode 100644 applications/Chat/examples/prepare_prompt_dataset.py
 create mode 100755 applications/Chat/examples/prepare_prompt_dataset.sh
 create mode 100644 applications/Chat/examples/prepare_sft_dataset.py
 create mode 100755 applications/Chat/examples/prepare_sft_dataset.sh
 delete mode 100644 applications/Chat/inference/benchmark.py
 delete mode 100644 applications/Chat/inference/tests/test_chat_prompt.py
 create mode 100644 applications/Chat/tests/test_reward_model.py

diff --git a/applications/Chat/coati/adapter/adapter.py b/applications/Chat/coati/adapter/adapter.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/applications/Chat/coati/dataset/__init__.py b/applications/Chat/coati/dataset/__init__.py
index 2d276b6c6964..48804fe65047 100644
--- a/applications/Chat/coati/dataset/__init__.py
+++ b/applications/Chat/coati/dataset/__init__.py
@@ -1,20 +1,17 @@
-# from .prompt_dataset import PromptDataset
-# from .reward_dataset import PreferenceDataset  # HhRlhfDataset, RmStaticDataset
-# from .sft_dataset import SFTDataset, SupervisedDataset
 from .loader import (
     DataCollatorForPreferenceDataset,
+    DataCollatorForPromptDataset,
     DataCollatorForSupervisedDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
     setup_distributed_dataloader,
 )
+from .spliced_and_tokenized_dataset import tokenize_prompt_dataset
 from .utils import is_rank_0
 
 __all__ = [
-    # "PreferenceDataset",
-    # "SFTDataset",
-    # "SupervisedDataset",
-    # "PromptDataset",
+    "tokenize_prompt_dataset",
+    "DataCollatorForPromptDataset",
     "is_rank_0",
     "DataCollatorForPreferenceDataset",
     "DataCollatorForSupervisedDataset",
diff --git a/applications/Chat/coati/dataset/chatio.py b/applications/Chat/coati/dataset/chatio.py
new file mode 100644
index 000000000000..88e707bb5948
--- /dev/null
+++ b/applications/Chat/coati/dataset/chatio.py
@@ -0,0 +1,164 @@
+import abc
+import re
+
+from prompt_toolkit import PromptSession
+from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
+from prompt_toolkit.completion import WordCompleter
+from prompt_toolkit.history import InMemoryHistory
+from rich.console import Console
+from rich.live import Live
+from rich.markdown import Markdown
+
+
+class ChatIO(abc.ABC):
+    @abc.abstractmethod
+    def prompt_for_input(self, role: str) -> str:
+        """Prompt for input from a role."""
+
+    @abc.abstractmethod
+    def prompt_for_output(self, role: str):
+        """Prompt for output from a role."""
+
+    @abc.abstractmethod
+    def stream_output(self, output_stream):
+        """Stream output."""
+
+
+class SimpleChatIO(ChatIO):
+    def prompt_for_input(self, role) -> str:
+        return input(f"{role}: ")
+
+    def prompt_for_output(self, role: str):
+        print(f"{role}: ", end="", flush=True)
+
+    def stream_output(self, output_stream):
+        pre = 0
+        for outputs in output_stream:
+            outputs = outputs.strip()
+            outputs = outputs.split(" ")
+            now = len(outputs) - 1
+            if now > pre:
+                print(" ".join(outputs[pre:now]), end=" ", flush=True)
+                pre = now
+        print(" ".join(outputs[pre:]), flush=True)
+        return " ".join(outputs)
+
+
+class RichChatIO(ChatIO):
+    def __init__(self):
+        self._prompt_session = PromptSession(history=InMemoryHistory())
+        self._completer = WordCompleter(words=["!exit", "!reset"], pattern=re.compile("$"))
+        self._console = Console()
+
+    def prompt_for_input(self, role) -> str:
+        self._console.print(f"[bold]{role}:")
+        prompt_input = self._prompt_session.prompt(
+            completer=self._completer,
+            multiline=False,
+            auto_suggest=AutoSuggestFromHistory(),
+            key_bindings=None,
+        )
+        self._console.print()
+        return prompt_input
+
+    def prompt_for_output(self, role: str) -> str:
+        self._console.print(f"[bold]{role}:")
+
+    def stream_output(self, output_stream):
+        """Stream output from a role."""
+        # Create a Live context for updating the console output
+        with Live(console=self._console, refresh_per_second=60) as live:
+            # Read lines from the stream
+            for outputs in output_stream:
+                accumulated_text = outputs
+                if not accumulated_text:
+                    continue
+                # Render the accumulated text as Markdown
+                # NOTE: this is a workaround for the rendering "unstandard markdown"
+                #  in rich. The chatbots output treat "\n" as a new line for
+                #  better compatibility with real-world text. However, rendering
+                #  in markdown would break the format. It is because standard markdown
+                #  treat a single "\n" in normal text as a space.
+                #  Our workaround is adding two spaces at the end of each line.
+                #  This is not a perfect solution, as it would
+                #  introduce trailing spaces (only) in code block, but it works well
+                #  especially for console output, because in general the console does not
+                #  care about trailing spaces.
+                lines = []
+                for line in accumulated_text.splitlines():
+                    lines.append(line)
+                    if line.startswith("```"):
+                        # Code block marker - do not add trailing spaces, as it would
+                        #  break the syntax highlighting
+                        lines.append("\n")
+                    else:
+                        lines.append("  \n")
+                markdown = Markdown("".join(lines))
+                # Update the Live console output
+                live.update(markdown)
+        self._console.print()
+        return outputs
+
+
+class DummyChatIO(ChatIO):
+    """
+    Dummy ChatIO class for testing
+    """
+
+    def __init__(self):
+        self.roles = []
+        self._console = Console()
+
+    def prompt_for_input(self, role) -> str:
+        self.roles.append(role)
+        if len(self.roles) == 1:
+            ret = "Hello"
+        elif len(self.roles) == 2:
+            ret = "What's the value of 1+1?"
+        else:
+            ret = "exit"
+        self._console.print(f"[bold]{role}:{ret}")
+        return ret
+
+    def prompt_for_output(self, role: str) -> str:
+        self._console.print(f"[bold]{role}:")
+
+    def stream_output(self, output_stream):
+        """Stream output from a role."""
+        # Create a Live context for updating the console output
+        with Live(console=self._console, refresh_per_second=60) as live:
+            # Read lines from the stream
+            for outputs in output_stream:
+                accumulated_text = outputs
+                if not accumulated_text:
+                    continue
+                # Render the accumulated text as Markdown
+                # NOTE: this is a workaround for the rendering "unstandard markdown"
+                #  in rich. The chatbots output treat "\n" as a new line for
+                #  better compatibility with real-world text. However, rendering
+                #  in markdown would break the format. It is because standard markdown
+                #  treat a single "\n" in normal text as a space.
+                #  Our workaround is adding two spaces at the end of each line.
+                #  This is not a perfect solution, as it would
+                #  introduce trailing spaces (only) in code block, but it works well
+                #  especially for console output, because in general the console does not
+                #  care about trailing spaces.
+                lines = []
+                for line in accumulated_text.splitlines():
+                    lines.append(line)
+                    if line.startswith("```"):
+                        # Code block marker - do not add trailing spaces, as it would
+                        #  break the syntax highlighting
+                        lines.append("\n")
+                    else:
+                        lines.append("  \n")
+                markdown = Markdown("".join(lines))
+                # Update the Live console output
+                live.update(markdown)
+        self._console.print()
+        return outputs
+
+
+simple_io = SimpleChatIO()
+rich_io = RichChatIO()
+dummy_io = DummyChatIO()
diff --git a/applications/Chat/coati/dataset/conversation.py b/applications/Chat/coati/dataset/conversation.py
index be27ff7bc817..572eb3ae54ff 100644
--- a/applications/Chat/coati/dataset/conversation.py
+++ b/applications/Chat/coati/dataset/conversation.py
@@ -41,7 +41,7 @@ def get_prompt(self, length: int = None):
             ret = self.system
             for role, message in self.messages[0:length]:
                 if message:
-                    ret += role + ": " + self.seps[0] + message + self.seps[1]
+                    ret += role + ": " + self.seps[0] + message + " " + self.seps[1]
                 else:
                     ret += role + ": " + self.seps[0]
             return ret
diff --git a/applications/Chat/coati/dataset/loader.py b/applications/Chat/coati/dataset/loader.py
index 167cba1423c0..49e63975b237 100644
--- a/applications/Chat/coati/dataset/loader.py
+++ b/applications/Chat/coati/dataset/loader.py
@@ -97,7 +97,9 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
             else torch.LongTensor(instance["labels"])
             for instance in instances
         ]
-
+        # torch.set_printoptions(threshold=10_000)
+        # if is_rank_0():
+        #     print(batch_input_ids[0])
         if self.tokenizer.padding_side == "right":
             input_ids = torch.nn.utils.rnn.pad_sequence(
                 sequences=batch_input_ids,
@@ -133,12 +135,46 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
                 f"`{self.tokenizer.__class__.__name__}.padding_side` can only be `left` or `right`, "
                 f"but now `{self.tokenizer.padding_side}`"
             )
+        # if is_rank_0():
+        #     print(input_ids[0])
+        # exit()
 
         attention_mask = input_ids.ne(self.tokenizer.pad_token_id)  # `torch.BoolTensor`, (bsz, max_len)
 
         return dict(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
 
 
+@dataclass
+class DataCollatorForPromptDataset(DataCollatorForSupervisedDataset):
+    def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
+        """
+
+        Args:
+            instances (`Sequence[Dict[str, List[int]]]`):
+                Mini-batch samples, each sample is stored in an individual dictionary.
+
+        Returns:
+            (`Dict[str, torch.Tensor]`): Contains the following `torch.Tensor`:
+                `input_ids`: `torch.Tensor` of shape (bsz, max_len);
+                `attention_mask`: `torch.BoolTensor` of shape (bsz, max_len);
+                `labels`: `torch.Tensor` of shape (bsz, max_len), which contains `IGNORE_INDEX`.
+        """
+        instances = [{"input_ids": ins["input_ids"], "labels": ins["input_ids"]} for ins in instances]
+        ret = super().__call__(instances=instances)
+        # torch.set_printoptions(threshold=10_000)
+        # if is_rank_0():
+        #     print(ret['input_ids'][0])
+        input_ids = F.pad(
+            ret["input_ids"], (self.max_length - ret["input_ids"].size(1), 0), value=self.tokenizer.pad_token_id
+        )
+        attention_mask = F.pad(ret["attention_mask"], (self.max_length - ret["attention_mask"].size(1), 0), value=False)
+        # if is_rank_0():
+        #     print(input_ids[0])
+        #     print(attention_mask[0])
+        # exit()
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
 @dataclass
 class DataCollatorForPreferenceDataset(object):
     """
diff --git a/applications/Chat/coati/dataset/prompt_dataset.py b/applications/Chat/coati/dataset/prompt_dataset.py
deleted file mode 100644
index 521b496290db..000000000000
--- a/applications/Chat/coati/dataset/prompt_dataset.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from collections import defaultdict
-from typing import Dict
-
-import torch
-import transformers
-from torch.utils.data import Dataset
-
-from colossalai.logging import get_dist_logger
-
-from .utils import jload, read_string_by_schema
-
-
-class PromptDataset(Dataset):
-    """Dataset for supervised fine-tuning.
-
-    Args:
-        dataset: dataset for supervised model
-        tokenizer: tokenizer for supervised model
-        max_datasets_size: number of examples to use from the dataset
-        max_length: max length of input
-        verbose: whether to display the first two item in the dataset
-        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
-             e.g. person.name.first will access data['person']['name']['first']
-    """
-
-    def __init__(
-        self,
-        data_path: str,
-        tokenizer: transformers.PreTrainedTokenizer,
-        max_datasets_size: int = None,
-        max_length: int = 96,
-        dataset_schema: Dict[str, str] = {"instruction": "instruction"},
-    ):
-        super(PromptDataset, self).__init__()
-        self.keyed_prompt = defaultdict(list)
-        self.logger = get_dist_logger()
-        self.logger.info("Loading data...")
-        list_data_dict = jload(data_path)
-        self.logger.info(f"Loaded {len(list_data_dict)} examples.")
-
-        if max_datasets_size is not None:
-            self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
-            list_data_dict = list_data_dict[:max_datasets_size]
-
-        instructions = [
-            read_string_by_schema(data_dict, dataset_schema["instruction"]) + "\n" for data_dict in list_data_dict
-        ]
-        tokens = tokenizer(
-            instructions, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
-        )
-        for k, tensor in tokens.items():
-            self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()
-
-    def __len__(self):
-        return len(self.keyed_prompt["input_ids"])
-
-    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
-        return {k: v[i] for k, v in self.keyed_prompt.items()}
diff --git a/applications/Chat/coati/dataset/reward_dataset.py b/applications/Chat/coati/dataset/reward_dataset.py
deleted file mode 100644
index 0f289c9fd866..000000000000
--- a/applications/Chat/coati/dataset/reward_dataset.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from typing import Callable, Dict
-
-from torch.utils.data import Dataset
-from tqdm import tqdm
-
-from colossalai.logging import get_dist_logger
-
-from .utils import is_rank_0, read_string_by_schema
-
-
-class PreferenceDataset(Dataset):
-    """
-    Dataset for reward model
-
-    Args:
-        dataset: dataset for reward model
-        tokenizer: tokenizer for reward model
-        max_length: max length of input
-        special_token: special token at the end of sentence
-        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
-             e.g. person.name.first will access data['person']['name']['first']
-    """
-
-    def __init__(
-        self,
-        dataset,
-        tokenizer: Callable,
-        max_length: int,
-        special_token=None,
-        dataset_schema: Dict[str, str] = {"prompt": "", "chosen": "chosen", "rejected": "rejected"},
-    ) -> None:
-        super().__init__()
-        self.end_token = tokenizer.eos_token if special_token is None else special_token
-        chosen = [
-            (read_string_by_schema(data, dataset_schema["prompt"]) if "prompt" in dataset_schema else "")
-            + (read_string_by_schema(data, dataset_schema["chosen"]) if "chosen" in dataset_schema else "")
-            + self.end_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
-        self.logger = get_dist_logger()
-        self.logger.info("Tokenizing inputs... This may take some time...")
-        chosen_token = tokenizer(
-            chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
-        )
-        self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
-
-        reject = [
-            (read_string_by_schema(data, dataset_schema["prompt"]) if "prompt" in dataset_schema else "")
-            + (read_string_by_schema(data, dataset_schema["rejected"]) if "rejected" in dataset_schema else "")
-            + self.end_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
-        reject_token = tokenizer(
-            reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
-        )
-        self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
-
-    def __len__(self):
-        length = self.chosen["input_ids"].shape[0]
-        return length
-
-    def __getitem__(self, idx):
-        return (
-            self.chosen["input_ids"][idx],
-            self.chosen["attention_mask"][idx],
-            self.reject["input_ids"][idx],
-            self.reject["attention_mask"][idx],
-        )
diff --git a/applications/Chat/coati/dataset/sft_dataset.py b/applications/Chat/coati/dataset/sft_dataset.py
deleted file mode 100644
index d25ed2efe7f4..000000000000
--- a/applications/Chat/coati/dataset/sft_dataset.py
+++ /dev/null
@@ -1,229 +0,0 @@
-#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
-#
-#    Licensed under the Apache License, Version 2.0 (the "License");
-#    you may not use this file except in compliance with the License.
-#    You may obtain a copy of the License at
-#
-#        http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS,
-#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#    See the License for the specific language governing permissions and
-#    limitations under the License.
-
-import copy
-from typing import Dict, Optional, Sequence, Tuple
-
-import torch
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
-from datasets import load_dataset
-from torch.utils.data import Dataset
-from tqdm import tqdm
-from transformers import PreTrainedTokenizer
-
-from colossalai.cluster import DistCoordinator
-
-from .utils import is_rank_0, jload, read_string_by_schema
-
-IGNORE_INDEX = -100
-PROMPT_DICT = {
-    "prompt_input": (
-        "Below is an instruction that describes a task, paired with an input that provides further context. "
-        "Write a response that appropriately completes the request.\n\n"
-        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
-    ),
-    "prompt_no_input": (
-        "Below is an instruction that describes a task. "
-        "Write a response that appropriately completes the request.\n\n"
-        "### Instruction:\n{instruction}\n\n### Response:"
-    ),
-}
-
-
-def _preprocess(
-    sources: Sequence[str],
-    targets: Sequence[str],
-    tokenizer: PreTrainedTokenizer,
-    max_length: int,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Preprocess the data by tokenizing."""
-    sequences = [s + t for s, t in zip(sources, targets)]
-    sequences_token = tokenizer(
-        sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
-    )
-    sources_token = tokenizer(
-        sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
-    )
-
-    assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently"
-    labels = copy.deepcopy(sequences_token["input_ids"])
-    for i in range(labels.shape[0]):
-        source_len = sources_token["attention_mask"][i].sum().item()
-        pad_len = max_length - sequences_token["attention_mask"][i].sum().item()
-        if tokenizer.padding_side == "right":
-            # |prompt|completion|eos|pad|
-            labels[i][:source_len] = IGNORE_INDEX
-            labels[i][-pad_len:] = IGNORE_INDEX
-        elif tokenizer.padding_side == "left":
-            # |pad|prompt|completion|eos|
-            labels[i][: pad_len + source_len] = IGNORE_INDEX
-        else:
-            raise RuntimeError()
-
-    return sequences_token["input_ids"], labels, sequences_token["attention_mask"]
-
-
-def _preprocess_chatglm(
-    sources: Sequence[str],
-    targets: Sequence[str],
-    tokenizer: PreTrainedTokenizer,
-    max_length: int,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """
-    Preprocess the data by tokenizing.
-    None for attention mask, ChatGLM will calculate attention mask according to input ids
-    """
-
-    labels = []
-    input_ids = []
-    for source, target in zip(sources, targets):
-        source_id = tokenizer.encode(text=source, add_special_tokens=False)
-        target_id = tokenizer.encode(text=target, add_special_tokens=False)
-        input_id = tokenizer.build_inputs_with_special_tokens(source_id, target_id)
-        # truncate
-        sp_token_list = [tokenizer.gmask_token_id, tokenizer.bos_token_id]
-        truncate_length = max(0, len(input_id) - max_length)
-        input_id = input_id[truncate_length:]
-        if truncate_length == len(source_id) + 1:
-            input_id = sp_token_list + input_id[1:]
-        elif truncate_length > len(source_id) + 1:
-            input_id = sp_token_list + input_id[2:]
-
-        context_length = input_id.index(tokenizer.bos_token_id)
-        mask_position = context_length - 1
-        label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1 :]
-
-        pad_len = max_length - len(input_id)
-        input_id = input_id + [tokenizer.pad_token_id] * pad_len
-        input_ids.append(input_id)
-        labels.append(label + [IGNORE_INDEX] * pad_len)
-    return torch.tensor(input_ids), torch.tensor(labels), None
-
-
-class SFTDataset(Dataset):
-    """
-    Dataset for sft model
-
-    Args:
-        dataset: dataset for supervised model
-        tokenizer: tokenizer for supervised model
-        max_length: max length of input
-        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
-             e.g. person.name.first will access data['person']['name']['first']
-    """
-
-    def __init__(
-        self,
-        dataset: Dict,
-        tokenizer: PreTrainedTokenizer,
-        max_length: int = 512,
-        dataset_schema: Dict[str, str] = {"prompt": "prompt", "completion": "completion"},
-    ) -> None:
-        super().__init__()
-        self.input_ids = []
-        self.coordinator = DistCoordinator()
-
-        sources = [read_string_by_schema(data, dataset_schema["prompt"]) for data in dataset]
-        targets = [
-            read_string_by_schema(data, dataset_schema["completion"]) + tokenizer.eos_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
-
-        self.coordinator.print_on_master("Tokenizing inputs... This may take some time...")
-        if isinstance(tokenizer, ChatGLMTokenizer):
-            self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
-                sources, targets, tokenizer, max_length
-            )
-        else:
-            self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
-
-        self.coordinator.print_on_master("Loaded dataset.")
-
-    def __len__(self):
-        length = self.input_ids.shape[0]
-        return length
-
-    def __getitem__(self, idx):
-        if self.attention_mask is not None:
-            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
-        else:
-            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
-
-
-class SupervisedDataset(Dataset):
-    """Dataset for supervised fine-tuning.
-
-    Args:
-        dataset: dataset for supervised model
-        tokenizer: tokenizer for supervised model
-        max_datasets_size: number of examples to use from the dataset
-        max_length: max length of input
-        prompt_dict: prompts for the dataset used to format prompt
-        dataset_schema: schema for reading the dataset. cascaded feild names seperated by '.'.
-             e.g. person.name.first will access data['person']['name']['first']
-    """
-
-    def __init__(
-        self,
-        data_path: str,
-        tokenizer: PreTrainedTokenizer,
-        max_datasets_size: Optional[int] = None,
-        max_length: int = 512,
-        prompt_dict: Optional[Dict[str, str]] = PROMPT_DICT,
-        split: str = "train",
-        dataset_schema: Dict[str, str] = {"instruction": "instruction", "input": "input", "output": "output"},
-    ):
-        super().__init__()
-        self.coordinator = DistCoordinator()
-        self.coordinator.print_on_master("Loading data...")
-        try:
-            dataset = load_dataset(data_path)
-            list_data_dict = list(dataset[split])
-        except FileNotFoundError:
-            list_data_dict = jload(data_path)
-        self.coordinator.print_on_master(f"Loaded {len(list_data_dict)} examples.")
-
-        if max_datasets_size is not None:
-            self.coordinator.print_on_master(f"Limiting dataset to {max_datasets_size} examples.")
-            list_data_dict = list_data_dict[:max_datasets_size]
-
-        self.coordinator.print_on_master("Formatting inputs...")
-        prompt_input, prompt_no_input = prompt_dict["prompt_input"], prompt_dict["prompt_no_input"]
-        list_data_dict = [
-            {k: read_string_by_schema(example, dataset_schema[k]) for k in dataset_schema} for example in list_data_dict
-        ]
-        sources = [
-            prompt_input.format_map(example) if example["input"] != "" else prompt_no_input.format_map(example)
-            for example in list_data_dict
-        ]
-        targets = [example["output"] + tokenizer.eos_token for example in list_data_dict]
-        self.coordinator.print_on_master("Tokenizing inputs... This may take some time...")
-        if isinstance(tokenizer, ChatGLMTokenizer):
-            self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
-                sources, targets, tokenizer, max_length
-            )
-        else:
-            self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
-
-        self.coordinator.print_on_master("Loaded dataset.")
-
-    def __len__(self):
-        length = self.input_ids.shape[0]
-        return length
-
-    def __getitem__(self, idx):
-        if self.attention_mask is not None:
-            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
-        else:
-            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
diff --git a/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py b/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
index 19bb84033cf1..185b1cf0e4e0 100644
--- a/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
+++ b/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
@@ -4,18 +4,21 @@
 Splicing multiple pre-tokenized sequence data points
 """
 
-import bisect
 import random
 import warnings
 from copy import deepcopy
 from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
 
-from colossal_llama2.utils.conversation import Conversation, default_conversation
+from coati.dataset.conversation import Conversation, default_conversation
 from datasets import dataset_dict
 from torch.utils.data import ConcatDataset, Dataset, IterableDataset
 from transformers.models.llama.tokenization_llama import LlamaTokenizer
 from transformers.tokenization_utils import PreTrainedTokenizer
 
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
 IGNORE_INDEX = -100
 
 DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
@@ -28,10 +31,10 @@ def supervised_tokenize_pretrain(
     A tokenization function to tokenize an original pretraining data point as following:
         {"source": "", "target": "Beijing, the capital of the People's Republic of China, ...", "category": "geography"}
     """
-    assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
-        "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
-        "add <bos> and <eos> manually later"
-    )
+    # assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+    #     "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+    #     "add <bos> and <eos> manually later"
+    # )
     if ignore_index is None:
         ignore_index = IGNORE_INDEX
 
@@ -40,10 +43,10 @@ def supervised_tokenize_pretrain(
     is_null_source = len(source_text) == 0
 
     source_text = tokenizer.bos_token + source_text
-    target_text += tokenizer.eos_token
+    target_text += " " + tokenizer.eos_token
     sequence_text = source_text + target_text
 
-    tokenized = tokenizer([source_text, sequence_text])["input_ids"]
+    tokenized = tokenizer([source_text, sequence_text], add_special_tokens=False)["input_ids"]
     sequence_input_ids = tokenized[1]
     sequence_labels = deepcopy(sequence_input_ids)
 
@@ -60,7 +63,7 @@ def supervised_tokenize_pretrain(
         input_ids=sequence_input_ids,
         labels=sequence_labels,
         seq_length=len(sequence_input_ids),
-        seq_category=data_point["category"],
+        seq_category=data_point["category"] if "category" in data_point else "None",
     )
 
 
@@ -75,10 +78,10 @@ def supervised_tokenize_sft(
     A tokenization function to tokenize an original pretraining data point as following:
         {"messages": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
     """
-    assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
-        "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
-        "add <bos> and <eos> manually later"
-    )
+    # assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+    #     "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+    #     "add <bos> and <eos> manually later"
+    # )
 
     assert (
         tokenizer.bos_token == conversation_template.seps[0] and tokenizer.eos_token == conversation_template.seps[1]
@@ -107,14 +110,21 @@ def supervised_tokenize_sft(
 
     # `target_turn_index` is the number of turns which exceeds `max_length - 1` for the first time.
     turns = [i for i in range(1, len(messages) // 2 + 1)]
-    target_turn_index = bisect.bisect_right(
-        turns,
-        max_length - 1,
-        key=lambda x: len(tokenizer([template.get_prompt(2 * x)], add_special_tokens=False)["input_ids"][0]),
-    )
+
+    lo, hi = 0, len(turns)
+    while lo < hi:
+        mid = (lo + hi) // 2
+        if max_length - 1 < len(
+            tokenizer([template.get_prompt(2 * turns[mid] - 1)], add_special_tokens=False)["input_ids"][0]
+        ):
+            hi = mid
+        else:
+            lo = mid + 1
+    target_turn_index = lo
 
     # The tokenized length for first turn already exceeds `max_length - 1`.
     if target_turn_index - 1 < 0:
+        warnings.warn("The tokenized length for first turn already exceeds `max_length - 1`.")
         return dict(
             input_ids=None,
             labels=None,
@@ -128,10 +138,10 @@ def supervised_tokenize_sft(
     prompt = template.get_prompt(2 * target_turn)
     tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
 
-    # Uncomment this to check whether `bisect_right` is right.
+    # Uncomment the following to check whether `bisect_right` is right.
     # if 2 * target_turn < len(template.messages):
     #     length_to_next_turn = len(tokenizer([template.get_prompt(2*target_turn+2)], add_special_tokens=False)["input_ids"][0])
-    #     assert length_to_next_turn > max_length - 1, print(f"The length of the prompt until the next turn after tokenization is {length_to_next_turn}, which is smaller than {max_length - 1}")
+    #     assert length_to_next_turn > max_length - 1, logger.info(f"The length of the prompt until the next turn after tokenization is {length_to_next_turn}, which is smaller than {max_length - 1}")
 
     template.messages = template.messages[0 : 2 * target_turn]
 
@@ -151,7 +161,7 @@ def supervised_tokenize_sft(
             gpt_eos = not gpt_eos
 
     if len(starts) != target_turn or len(ends) != target_turn:
-        print(
+        logger.info(
             "Please check whether the tokenizer add additional `bos_token` and `eos_token`.\n\nOr the original message contains `bos_token` or `eos_token`."
         )
         return dict(
@@ -171,9 +181,9 @@ def supervised_tokenize_sft(
     labels_decode = deepcopy(labels)
     for i, z in enumerate(labels_decode):
         if z == ignore_index:
-            labels_decode[i] = tokenizer.unk_token_id
+            labels_decode[i] = tokenizer.eos_token_id
 
-    # `inputs_decode` and `labels decode` can be used to check whether the tokenization method is true.
+    # `inputs_decode` and `labels_decode` can be used to check whether the tokenization method is true.
     return dict(
         input_ids=tokenized,
         labels=labels,
@@ -184,6 +194,93 @@ def supervised_tokenize_sft(
     )
 
 
+def tokenize_prompt_dataset(
+    data_point: Dict[str, str],
+    tokenizer: LlamaTokenizer,
+    conversation_template: Conversation = default_conversation,
+    ignore_index: int = None,
+    max_length: int = 4096,
+) -> Dict[str, Union[int, str, List[int]]]:
+    """
+    A tokenization function to tokenize an original pretraining data point as following:
+        {"messages": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
+    """
+    # assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+    #     "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+    #     "add <bos> and <eos> manually later"
+    # )
+
+    assert (
+        tokenizer.bos_token == conversation_template.seps[0] and tokenizer.eos_token == conversation_template.seps[1]
+    ), "`bos_token` and `eos_token` should be the same with `conversation_template.seps`."
+
+    if ignore_index is None:
+        ignore_index = IGNORE_INDEX
+
+    messages = data_point["messages"]
+    template = deepcopy(conversation_template)
+    template.messages = []
+
+    for mess in messages:
+        from_str = mess["from"]
+        if from_str.lower() == "human":
+            from_str = template.roles[0]
+        elif from_str.lower() == "assistant":
+            from_str = template.roles[1]
+        else:
+            raise ValueError(f"Unsupported role {from_str.lower()}")
+
+        template.append_message(from_str, mess["content"])
+
+    if len(template.messages) % 2 != 1:
+        # exclude the answer if provided. keep only the prompt
+        template.messages = template.messages[0:-1]
+
+    # `target_turn_index` is the number of turns which exceeds `max_length - 1` for the first time.
+    turns = [i for i in range(1, (len(messages) + 1) // 2 + 1)]
+
+    lo, hi = 0, len(turns)
+    while lo < hi:
+        mid = (lo + hi) // 2
+        if max_length - 1 < len(
+            tokenizer([template.get_prompt(2 * turns[mid] - 1)], add_special_tokens=False)["input_ids"][0]
+        ):
+            hi = mid
+        else:
+            lo = mid + 1
+    target_turn_index = lo
+
+    # The tokenized length for first turn already exceeds `max_length - 1`.
+    if target_turn_index - 1 < 0:
+        warnings.warn("The tokenized length for first turn already exceeds `max_length - 1`.")
+        return dict(
+            input_ids=None,
+            inputs_decode=None,
+            seq_length=None,
+            seq_category=None,
+        )
+
+    target_turn = turns[target_turn_index - 1]
+    prompt = template.get_prompt(2 * target_turn - 1) + "Assistant: <s>"
+    tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
+
+    # Uncomment the following to check whether `bisect_right` is right.
+    # if 2 * target_turn < len(template.messages):
+    #     length_to_next_turn = len(tokenizer([template.get_prompt(2*target_turn+2)], add_special_tokens=False)["input_ids"][0])
+    #     assert length_to_next_turn > max_length - 1, logger.info(f"The length of the prompt until the next turn after tokenization is {length_to_next_turn}, which is smaller than {max_length - 1}")
+
+    template.messages = template.messages[0 : 2 * target_turn - 1]
+    tokenized = [tokenizer.bos_token_id] + tokenized
+
+    # `inputs_decode` and `labels_decode` can be used to check whether the tokenization method is true.
+    return dict(
+        input_ids=tokenized,
+        inputs_decode=tokenizer.decode(tokenized),
+        seq_length=len(tokenized),
+        seq_category=data_point["category"] if "category" in data_point else "None",
+    )
+
+
 def generate_loss_mask(template: Conversation, tokenizer: Any, context_len: int):
     target_turn = int(len(template.messages) / 2)
     prompt = template.get_prompt(2 * target_turn)
@@ -235,11 +332,6 @@ def tokenize_rlhf(
         {"context": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}],
         "chosen": {"from": "assistant", "content": "xxx"}, "rejected": {"from": "assistant", "content": "xxx"}}
     """
-    assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
-        "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
-        "add <bos> and <eos> manually later"
-    )
-
     assert (
         tokenizer.bos_token == conversation_template.seps[0] and tokenizer.eos_token == conversation_template.seps[1]
     ), "`bos_token` and `eos_token` should be the same with `conversation_template.seps`."
diff --git a/applications/Chat/coati/experience_buffer/naive.py b/applications/Chat/coati/experience_buffer/naive.py
index d47b67dbe713..af43400720a4 100644
--- a/applications/Chat/coati/experience_buffer/naive.py
+++ b/applications/Chat/coati/experience_buffer/naive.py
@@ -1,13 +1,16 @@
 import random
-import warnings
 from typing import List
 
 import torch
 from coati.experience_maker.base import Experience
 
+from colossalai.logging import get_dist_logger
+
 from .base import ExperienceBuffer
 from .utils import BufferItem, make_experience_batch, split_experience_batch
 
+logger = get_dist_logger()
+
 
 class NaiveExperienceBuffer(ExperienceBuffer):
     """Naive experience buffer class. It stores experience.
@@ -35,7 +38,7 @@ def append(self, experience: Experience) -> None:
         if self.limit > 0:
             samples_to_remove = len(self.items) - self.limit
             if samples_to_remove > 0:
-                warnings.warn(f"Experience buffer is full. Removing {samples_to_remove} samples.")
+                logger.warning(f"Experience buffer is full. Removing {samples_to_remove} samples.")
                 self.items = self.items[samples_to_remove:]
 
     def clear(self) -> None:
diff --git a/applications/Chat/coati/experience_maker/base.py b/applications/Chat/coati/experience_maker/base.py
index c98d9f9256b4..be6964bf5b33 100644
--- a/applications/Chat/coati/experience_maker/base.py
+++ b/applications/Chat/coati/experience_maker/base.py
@@ -3,7 +3,8 @@
 from typing import Optional
 
 import torch
-from coati.models import Actor, Critic, RewardModel
+from coati.models import Critic, RewardModel
+from transformers import PreTrainedModel
 
 
 @dataclass
@@ -59,7 +60,9 @@ def pin_memory(self):
 
 
 class ExperienceMaker(ABC):
-    def __init__(self, actor: Actor, critic: Critic, reward_model: RewardModel, initial_model: Actor) -> None:
+    def __init__(
+        self, actor: PreTrainedModel, critic: Critic, reward_model: RewardModel, initial_model: PreTrainedModel
+    ) -> None:
         super().__init__()
         self.actor = actor
         self.critic = critic
diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index f0145503df91..f9e8c8a78010 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -1,9 +1,9 @@
 import torch
 import torch.nn.functional as F
-from coati.models import Actor, Critic, RewardModel
+from coati.models import Critic, RewardModel
 from coati.models.generation import generate
 from coati.models.utils import calc_action_log_probs, compute_reward
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedModel, PreTrainedTokenizer
 
 from .base import Experience, ExperienceMaker
 
@@ -15,10 +15,10 @@ class NaiveExperienceMaker(ExperienceMaker):
 
     def __init__(
         self,
-        actor: Actor,
+        actor: PreTrainedModel,
         critic: Critic,
         reward_model: RewardModel,
-        initial_model: Actor,
+        initial_model: PreTrainedModel,
         tokenizer: PreTrainedTokenizer,
         rm_model_tokenizer: PreTrainedTokenizer,
         kl_coef: float = 0.01,
@@ -45,15 +45,16 @@ def calculate_advantage(self, value, reward, num_actions):
         return advantages
 
     @torch.no_grad()
-    def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
+    def make_experience(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **generate_kwargs) -> Experience:
         self.actor.eval()
         self.critic.eval()
         self.initial_model.eval()
         self.reward_model.eval()
-
-        # generate sequences
-
+        # if is_rank_0():
+        #     print(input_ids[0])
+        torch.manual_seed(47)  # for tp, gurantee the same input for reward model
         sequences = generate(self.actor, input_ids, self.tokenizer, **generate_kwargs)
+        sequence_length = sequences.size(1)
 
         self.actor.train()
         self.critic.train()
@@ -84,17 +85,42 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
 
         base_action_log_probs = calc_action_log_probs(base_model_output, sequences, num_actions)
         value = self.critic(sequences, attention_mask)
-        sequences_text = self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
 
-        sequences_rm = self.rm_model_tokenizer(
-            sequences_text, return_tensors="pt", padding="max_length", truncation=True, max_length=300
-        )
+        # convert from left padding to right padding
+        input_ids_rm = torch.zeros_like(sequences, device=sequences.device)
+        attention_mask_rm = torch.zeros_like(sequences, device=sequences.device)
+        for i in range(sequences.size(0)):
+            sequence = sequences[i]
+            bos_index = (sequence == self.tokenizer.bos_token_id).nonzero().squeeze()[0]
+            # print((torch.arange(sequence_length, device=sequence.device)*(sequence!=self.tokenizer.pad_token_id)).max())
+            eos_index = int(
+                (torch.arange(sequence_length, device=sequence.device) * (sequence != self.tokenizer.pad_token_id))
+                .max()
+                .item()
+            )
+            sequence_to_pad = sequence[bos_index : eos_index + 1]
+            sequence_padded = F.pad(
+                sequence_to_pad, (0, sequence_length - sequence_to_pad.size(0)), value=self.tokenizer.pad_token_id
+            )
+            input_ids_rm[i] = sequence_padded
+            if sequence_length - sequence_to_pad.size(0) > 0:
+                attention_mask_rm[i, : sequence_to_pad.size(0) + 1] = 1
+            else:
+                attention_mask_rm[i, :] = 1
+        attention_mask_rm = attention_mask_rm.to(dtype=torch.bool)
+        torch.set_printoptions(threshold=10_000)
+        # if is_rank_0():
+        # print('input ids:\n',input_ids_rm[0])
+        # print('reward input ids:\n',self.tokenizer.batch_decode(input_ids_rm)[0])
+        # print('reward atten_mask:\n',attention_mask_rm[0])
+
         r = self.reward_model(
-            **{
-                "sequences": sequences_rm["input_ids"].to(dtype=torch.long, device=sequences.device),
-                "attention_mask": sequences_rm["attention_mask"].to(device=sequences.device),
-            }
+            input_ids_rm.to(dtype=torch.long, device=sequences.device),
+            attention_mask=attention_mask_rm.to(device=sequences.device),
         )
+        # print('reward:\n',r[0])
+        # exit()
+
         reward, kl = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
         value = value[:, -num_actions:] * action_mask
         advantages = self.calculate_advantage(value, reward, num_actions)
diff --git a/applications/Chat/coati/models/__init__.py b/applications/Chat/coati/models/__init__.py
index 22b71ec43fcc..0dbc658c041e 100644
--- a/applications/Chat/coati/models/__init__.py
+++ b/applications/Chat/coati/models/__init__.py
@@ -1,7 +1,6 @@
-from .actor import Actor
 from .base import BaseModel
 from .critic import Critic
-from .generation import generate
+from .generation import generate, generate_streaming
 from .lora import convert_to_lora_module
 from .loss import DpoLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
 from .reward_model import RewardModel
@@ -9,7 +8,6 @@
 
 __all__ = [
     "BaseModel",
-    "Actor",
     "Critic",
     "RewardModel",
     "PolicyLoss",
@@ -21,4 +19,5 @@
     "load_checkpoint",
     "DpoLoss",
     "generate",
+    "generate_streaming",
 ]
diff --git a/applications/Chat/coati/models/actor.py b/applications/Chat/coati/models/actor.py
deleted file mode 100644
index 6db4f7f6c722..000000000000
--- a/applications/Chat/coati/models/actor.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from typing import Optional
-
-import torch
-from coati.models import BaseModel
-from transformers import PretrainedConfig
-
-
-class Actor(BaseModel):
-    """
-    Actor model base class.
-
-    Args:
-        pretrained (str): path to pretrained model.
-        config (PretrainedConfig): PretrainedConfig used to initiate the base model.
-    """
-
-    def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] = None) -> None:
-        super().__init__(pretrained=pretrained, config=config)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        **model_kwargs,
-    ) -> torch.Tensor:
-        """Returns model output."""
-        output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
-        return output
diff --git a/applications/Chat/coati/models/base.py b/applications/Chat/coati/models/base.py
index afe2c6da7563..43a981875b04 100644
--- a/applications/Chat/coati/models/base.py
+++ b/applications/Chat/coati/models/base.py
@@ -1,7 +1,7 @@
 from typing import Optional
 
 import torch.nn as nn
-from transformers import AutoModelForCausalLM, PretrainedConfig
+from transformers import AutoModel, PretrainedConfig
 
 
 class BaseModel(nn.Module):
@@ -18,12 +18,29 @@ def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] =
         if pretrained is not None:
             if config is not None:
                 # initialize with config and load weights from pretrained
-                self.model = AutoModelForCausalLM.from_pretrained(pretrained, config=config)
+                self.model = AutoModel.from_pretrained(pretrained, config=config)
             else:
                 # initialize with pretrained
-                self.model = AutoModelForCausalLM.from_pretrained(pretrained)
+                self.model = AutoModel.from_pretrained(pretrained)
         elif config is not None:
             # initialize with config
-            self.model = AutoModelForCausalLM(config)
+            self.model = AutoModel.from_config(config)
         else:
             raise ValueError("Either pretrained or config must be provided.")
+
+        self.config = self.model.config
+        if self.model.config.architectures[0] == "GPT2LMHeadModel":
+            self.last_hidden_state_size = self.model.config.n_embd
+        if self.model.config.architectures[0] == "BloomForCausalLM":
+            self.last_hidden_state_size = self.model.config.hidden_size
+        elif self.model.config.architectures[0] == "LlamaForCausalLM":
+            self.last_hidden_state_size = self.model.config.hidden_size
+        elif self.model.config.architectures[0] == "OPTForCausalLM":
+            self.last_hidden_state_size = self.model.config.word_embed_proj_dim
+        else:
+            raise ValueError("Unsupported model architecture.")
+
+        # create dummy input to get the size of the last hidden state
+        # dummy_input = torch.zeros((1, 1), dtype=torch.long).to(self.model.device)
+        # out = self.model(dummy_input)
+        # self.last_hidden_state_size = out.last_hidden_state.shape[-1]
diff --git a/applications/Chat/coati/models/critic.py b/applications/Chat/coati/models/critic.py
index 4313f83387ae..f2ba294c04d3 100644
--- a/applications/Chat/coati/models/critic.py
+++ b/applications/Chat/coati/models/critic.py
@@ -18,21 +18,13 @@ class Critic(BaseModel):
     def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] = None) -> None:
         super().__init__(pretrained=pretrained, config=config)
         # get last hidden state size with dummy input
-        try:
-            dummy_outputs = self.model(
-                torch.tensor([[1]]).to(self.model.device), attention_mask=torch.tensor([[1]]).to(self.model.device)
-            )
-            last_hidden_state_size = dummy_outputs["last_hidden_state"].size(-1)
-        except Exception as e:
-            raise ValueError(
-                f"Please provide a valid pretrained model name or a valid config file for a CasualLM. Caught exception: {e}"
-            )
-
-        self.value_head = nn.Linear(last_hidden_state_size, 1)
+        self.value_head = nn.Linear(self.last_hidden_state_size, 1)
 
     def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
         outputs = self.model(input_ids, attention_mask=attention_mask)
         last_hidden_states = outputs["last_hidden_state"]
-        sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), :]
-        values = self.value_head(sequence_hidden_states).squeeze(2)  # ensure shape is (B, sequence length)
+        sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), :].type(
+            self.value_head.weight.dtype
+        )
+        values = self.value_head(sequence_hidden_states).squeeze(-1)  # ensure shape is (B, sequence length)
         return values
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index baad8f2b55bb..425812e111eb 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.distributed as dist
+import torch.nn.functional as F
 from transformers import PreTrainedTokenizer
 
 try:
@@ -36,7 +37,39 @@ def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
     return unfinished_sequences.max() == 0
 
 
-def _sample(
+@torch.inference_mode()
+def generate(
+    model: Any,
+    input_ids: torch.Tensor,
+    tokenizer: PreTrainedTokenizer,
+    max_length: int,
+    **generation_kwargs,
+) -> torch.Tensor:
+    """Generate token sequence. The returned sequence is input_ids + generated_tokens.
+
+    Args:
+        model (nn.Module): model
+        input_ids (torch.Tensor): input sequence
+        max_length (int): max length of the returned sequence
+    """
+    assert tokenizer.padding_side == "left", "Current generation only supports left padding."
+    if "max_new_tokens" in generation_kwargs:
+        max_new_tokens = generation_kwargs["max_new_tokens"]
+    else:
+        max_new_tokens = max_length - input_ids.size(1)
+    if max_new_tokens <= 0:
+        return input_ids
+    generation_kwargs["max_new_tokens"] = max_new_tokens
+    model_unwrap = model.unwrap()
+    model_unwrap.generation_config.pad_token_id = tokenizer.pad_token_id
+    input_ids = model_unwrap.generate(
+        input_ids=input_ids, attention_mask=input_ids.ne(tokenizer.pad_token_id), **generation_kwargs
+    )
+    input_ids = F.pad(input_ids, (0, max_length - input_ids.size(1)), value=tokenizer.pad_token_id)
+    return input_ids
+
+
+def _sample_streaming(
     model: Any,
     input_ids: torch.Tensor,
     max_length: int,
@@ -48,17 +81,26 @@ def _sample(
     temperature: Optional[float] = None,
     prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
     update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+    stream_interval: int = 2,
     **model_kwargs,
-) -> torch.Tensor:
-    if input_ids.size(1) >= max_length:
+):
+    context_length = input_ids.size(1)
+    if "max_new_tokens" in model_kwargs:
+        max_new_tokens = model_kwargs["max_new_tokens"]
+    else:
+        max_new_tokens = max_length - context_length
+    if context_length + max_new_tokens > max_length or max_new_tokens == 0:
         return input_ids
 
     logits_processor = _prepare_logits_processor(top_k, top_p, temperature)
     unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
 
-    for _ in range(input_ids.size(1), max_length):
+    for i in range(context_length, context_length + max_new_tokens):
+        # calculate attention mask
         model_inputs = (
-            prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {"input_ids": input_ids}
+            prepare_inputs_fn(input_ids, **model_kwargs)
+            if prepare_inputs_fn is not None
+            else {"input_ids": input_ids, "attention_mask": input_ids.ne(pad_token_id)}
         )
         outputs = model(**model_inputs)
 
@@ -84,14 +126,18 @@ def _sample(
             unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
 
         # stop when each sentence is finished if early_stopping=True
-        if early_stopping and _is_sequence_finished(unfinished_sequences):
-            break
-
-    return input_ids
-
-
-@torch.no_grad()
-def generate(
+        if (
+            (early_stopping and _is_sequence_finished(unfinished_sequences))
+            or (i - context_length) % stream_interval == 0
+            or i == context_length + max_new_tokens - 1
+        ):
+            yield input_ids
+            if early_stopping and _is_sequence_finished(unfinished_sequences):
+                break
+
+
+@torch.inference_mode()
+def generate_streaming(
     model: Any,
     input_ids: torch.Tensor,
     tokenizer: PreTrainedTokenizer,
@@ -105,7 +151,7 @@ def generate(
     prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
     update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
     **model_kwargs,
-) -> torch.Tensor:
+):
     """Generate token sequence. The returned sequence is input_ids + generated_tokens.
 
     Args:
@@ -130,7 +176,7 @@ def generate(
         raise NotImplementedError
     elif is_sample_gen_mode:
         # run sample
-        return _sample(
+        for res in _sample_streaming(
             model,
             input_ids,
             max_length,
@@ -143,7 +189,8 @@ def generate(
             prepare_inputs_fn=prepare_inputs_fn,
             update_model_kwargs_fn=update_model_kwargs_fn,
             **model_kwargs,
-        )
+        ):
+            yield res
     elif is_beam_gen_mode:
         raise NotImplementedError
     else:
diff --git a/applications/Chat/coati/models/reward_model.py b/applications/Chat/coati/models/reward_model.py
index 0e2d0472b174..1b394677b904 100644
--- a/applications/Chat/coati/models/reward_model.py
+++ b/applications/Chat/coati/models/reward_model.py
@@ -18,31 +18,18 @@ class RewardModel(BaseModel):
 
     def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] = None) -> None:
         super().__init__(pretrained=pretrained, config=config)
-        # get last hidden state size with dummy input
-        try:
-            dummy_outputs = self.model(
-                torch.tensor([[1]]).to(self.model.device), attention_mask=torch.tensor([[1]]).to(self.model.device)
-            )
-            last_hidden_state_size = dummy_outputs["logits"].size(-1)
-        except Exception as e:
-            raise ValueError(
-                f"Please provide a valid pretrained model name or a valid config file for a CasualLM. Caught exception: {e}"
-            )
-
-        self.value_head = nn.Linear(last_hidden_state_size, 1)
-        self.value_head.weight.data.normal_(mean=0.0, std=1 / (last_hidden_state_size + 1))
+        self.value_head = nn.Linear(self.last_hidden_state_size, 1)
+        self.value_head.weight.data.normal_(mean=0.0, std=1 / (self.last_hidden_state_size + 1))
 
     def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
         outputs = self.model(input_ids, attention_mask=attention_mask)
-        last_hidden_states = outputs["logits"]
+
+        last_hidden_states = outputs["last_hidden_state"]
         sequence_lengths = torch.max(attention_mask * torch.arange(input_ids.size(1), device=input_ids.device), dim=1)[
             0
         ]
         sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths].type(
             self.value_head.weight.dtype
         )
-        # print("sequence_hidden_states", sequence_hidden_states.size(), sequence_hidden_states.dtype)
-        # print("values head weight", self.value_head.weight.size(),self.value_head.weight.dtype)
-
         values = self.value_head(sequence_hidden_states).squeeze(-1)  # ensure shape is (B,)
         return values
diff --git a/applications/Chat/coati/trainer/__init__.py b/applications/Chat/coati/trainer/__init__.py
index 110df0dbe764..2eff8ca7676a 100644
--- a/applications/Chat/coati/trainer/__init__.py
+++ b/applications/Chat/coati/trainer/__init__.py
@@ -1,8 +1,7 @@
 from .base import OLTrainer, SLTrainer
 from .dpo import DPOTrainer
-
-# from .ppo import PPOTrainer
+from .ppo import PPOTrainer
 from .rm import RewardModelTrainer
 from .sft import SFTTrainer
 
-__all__ = ["SLTrainer", "OLTrainer", "RewardModelTrainer", "SFTTrainer", "DPOTrainer"]
+__all__ = ["SLTrainer", "OLTrainer", "RewardModelTrainer", "SFTTrainer", "PPOTrainer", "DPOTrainer"]
diff --git a/applications/Chat/coati/trainer/base.py b/applications/Chat/coati/trainer/base.py
index 126cd2f863f4..413e344568f3 100644
--- a/applications/Chat/coati/trainer/base.py
+++ b/applications/Chat/coati/trainer/base.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import List
+from typing import Callable, List
 
 import torch.nn as nn
 import tqdm
@@ -10,8 +10,8 @@
 
 from colossalai.booster import Booster
 
-from .callbacks import Callback
-from .strategies import Strategy
+# from .callbacks import Callback
+# from .strategies import Strategy
 from .utils import is_rank_0
 
 
@@ -74,14 +74,16 @@ class OLTrainer(ABC):
 
     def __init__(
         self,
-        strategy: Strategy,
+        actor_booster: Booster,
+        critic_booster: Booster,
         data_buffer: NaiveExperienceBuffer,
         sample_buffer: bool,
         dataloader_pin_memory: bool,
-        callbacks: List[Callback] = [],
+        callbacks: List[Callable] = [],
     ) -> None:
         super().__init__()
-        self.strategy = strategy
+        self.actor_booster = actor_booster
+        self.critic_booster = critic_booster
         self.data_buffer = data_buffer
         self.sample_buffer = sample_buffer
         self.dataloader_pin_memory = dataloader_pin_memory
@@ -146,6 +148,20 @@ def _learn(self, update_step: int):
         """
         raise NotImplementedError()
 
+    @abstractmethod
+    def _setup_update_phrase_dataload(self):
+        """
+        Implement this method to setup dataloader for update phase.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _save_checkpoint(self, episode: int = 0):
+        """
+        Implement this method to save checkpoint.
+        """
+        raise NotImplementedError()
+
     def _collect_phase(self, collect_step: int):
         self._on_make_experience_start()
         experience = self._make_experience(collect_step)
@@ -165,7 +181,6 @@ def fit(
         num_episodes: int,
         num_collect_steps: int,
         num_update_steps: int,
-        save_per_num_episodes: int,
         *args,
         **kwargs,
     ):
@@ -187,26 +202,10 @@ def fit(
                         # HACK(cwher): according to the design of boost API, dataloader should also be boosted,
                         #  but it is impractical to adapt this pattern in RL training. Thus, I left dataloader unboosted.
                         #  I only call strategy.setup_dataloader() to setup dataloader.
-                        self.dataloader = self.strategy.setup_dataloader(self.data_buffer, self.dataloader_pin_memory)
+                        self._setup_update_phrase_dataload()
                     for update_step in tqdm.trange(num_update_steps, desc="Update steps", disable=not is_rank_0()):
                         self._update_phase(update_step)
                     # NOTE: this is for on-policy algorithms
                     self.data_buffer.clear()
-                if is_rank_0() and (episode + 1) % save_per_num_episodes == 0:
-                    if args.lora_rank > 0 and args.merge_lora_weights:
-                        from coati.models.lora import LORA_MANAGER
-
-                        # NOTE: set model to eval to merge LoRA weights
-                        LORA_MANAGER.merge_weights = True
-                        self.actor.eval()
-                    # save model checkpoint after fitting
-                    self.strategy.save_model(self.actor, args.save_path, only_rank0=True)
-                    # save optimizer checkpoint on all ranks
-                    if args.need_optim_ckpt:
-                        self.strategy.save_optimizer(
-                            self.actor_optim,
-                            "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()),
-                            only_rank0=False,
-                        )
-
-                    self.strategy.save_checkpoint(episode)
+                if self.save_interval > 0 and (episode + 1) % (self.save_interval) == 0:
+                    self._save_checkpoint(episode + 1)
diff --git a/applications/Chat/coati/trainer/callbacks/__init__.py b/applications/Chat/coati/trainer/callbacks/__init__.py
deleted file mode 100644
index 29c8c4f00a5c..000000000000
--- a/applications/Chat/coati/trainer/callbacks/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .base import Callback
-from .performance_evaluator import PerformanceEvaluator
-from .save_checkpoint import SaveCheckpoint
-
-__all__ = ["Callback", "PerformanceEvaluator", "SaveCheckpoint"]
diff --git a/applications/Chat/coati/trainer/callbacks/base.py b/applications/Chat/coati/trainer/callbacks/base.py
deleted file mode 100644
index c6e30f04885c..000000000000
--- a/applications/Chat/coati/trainer/callbacks/base.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from abc import ABC
-
-from coati.experience_maker import Experience
-
-
-class Callback(ABC):
-    """
-    Base callback class. It defines the interface for callbacks.
-    """
-
-    def on_fit_start(self) -> None:
-        pass
-
-    def on_fit_end(self) -> None:
-        pass
-
-    def on_episode_start(self, episode: int) -> None:
-        pass
-
-    def on_episode_end(self, episode: int) -> None:
-        pass
-
-    def on_make_experience_start(self) -> None:
-        pass
-
-    def on_make_experience_end(self, experience: Experience) -> None:
-        pass
-
-    def on_learn_epoch_start(self, epoch: int) -> None:
-        pass
-
-    def on_learn_epoch_end(self, epoch: int) -> None:
-        pass
-
-    def on_learn_batch_start(self) -> None:
-        pass
-
-    def on_learn_batch_end(self, experience: Experience) -> None:
-        pass
diff --git a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py b/applications/Chat/coati/trainer/callbacks/performance_evaluator.py
deleted file mode 100644
index b286c766c263..000000000000
--- a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py
+++ /dev/null
@@ -1,183 +0,0 @@
-from time import time
-from typing import Optional
-
-import torch
-import torch.distributed as dist
-from coati.experience_maker import Experience
-
-from .base import Callback
-
-
-def get_world_size() -> int:
-    if dist.is_initialized():
-        return dist.get_world_size()
-    return 1
-
-
-def print_rank_0(*args, **kwargs) -> None:
-    if not dist.is_initialized() or dist.get_rank() == 0:
-        print(*args, **kwargs)
-
-
-def divide(x: float, y: float) -> float:
-    if y == 0:
-        return float("inf")
-    elif y == float("inf"):
-        return float("nan")
-    return x / y
-
-
-@torch.no_grad()
-def all_reduce_mean(x: float, world_size: int) -> float:
-    if world_size == 1:
-        return x
-    tensor = torch.tensor([x], device=torch.cuda.current_device())
-    dist.all_reduce(tensor)
-    tensor = tensor / world_size
-    return tensor.item()
-
-
-class Timer:
-    def __init__(self) -> None:
-        self.start_time: Optional[float] = None
-        self.duration: float = 0.0
-
-    def start(self) -> None:
-        self.start_time = time()
-
-    def end(self) -> None:
-        assert self.start_time is not None
-        self.duration += time() - self.start_time
-        self.start_time = None
-
-    def reset(self) -> None:
-        self.duration = 0.0
-
-
-class PerformanceEvaluator(Callback):
-    """
-        Callback for valuate the performance of the model.
-    Args:
-        actor_num_params: The number of parameters of the actor model.
-        critic_num_params: The number of parameters of the critic model.
-        initial_model_num_params: The number of parameters of the initial model.
-        reward_model_num_params: The number of parameters of the reward model.
-        enable_grad_checkpoint: Whether to enable gradient checkpointing.
-        ignore_episodes: The number of episodes to ignore when calculating the performance.
-    """
-
-    def __init__(
-        self,
-        actor_num_params: int,
-        critic_num_params: int,
-        initial_model_num_params: int,
-        reward_model_num_params: int,
-        enable_grad_checkpoint: bool = False,
-        ignore_episodes: int = 0,
-    ) -> None:
-        super().__init__()
-        self.world_size = get_world_size()
-        self.actor_num_params = actor_num_params
-        self.critic_num_params = critic_num_params
-        self.initial_model_num_params = initial_model_num_params
-        self.reward_model_num_params = reward_model_num_params
-        self.enable_grad_checkpoint = enable_grad_checkpoint
-        self.ignore_episodes = ignore_episodes
-        self.disable: bool = False
-
-        self.overall_timer = Timer()
-        self.make_experience_timer = Timer()
-        self.learn_timer = Timer()
-        self.make_experience_num_samples: int = 0
-        self.make_experience_flop: int = 0
-        self.learn_num_samples: int = 0
-        self.learn_flop: int = 0
-
-    def on_episode_start(self, episode: int) -> None:
-        self.disable = self.ignore_episodes > 0 and episode < self.ignore_episodes
-        if self.disable:
-            return
-        self.overall_timer.start()
-
-    def on_episode_end(self, episode: int) -> None:
-        if self.disable:
-            return
-        self.overall_timer.end()
-
-    def on_make_experience_start(self) -> None:
-        if self.disable:
-            return
-        self.make_experience_timer.start()
-
-    def on_make_experience_end(self, experience: Experience) -> None:
-        if self.disable:
-            return
-        self.make_experience_timer.end()
-
-        batch_size, seq_len = experience.sequences.shape
-
-        self.make_experience_num_samples += batch_size
-
-        # actor generate
-        num_actions = experience.action_mask.size(1)
-        input_len = seq_len - num_actions
-        total_seq_len = (input_len + seq_len - 1) * num_actions / 2
-        self.make_experience_flop += self.actor_num_params * batch_size * total_seq_len * 2
-        # actor forward
-        self.make_experience_flop += self.actor_num_params * batch_size * seq_len * 2
-        # critic forward
-        self.make_experience_flop += self.critic_num_params * batch_size * seq_len * 2
-        # initial model forward
-        self.make_experience_flop += self.initial_model_num_params * batch_size * seq_len * 2
-        # reward model forward
-        self.make_experience_flop += self.reward_model_num_params * batch_size * seq_len * 2
-
-    def on_learn_batch_start(self) -> None:
-        if self.disable:
-            return
-        self.learn_timer.start()
-
-    def on_learn_batch_end(self, experience: Experience) -> None:
-        if self.disable:
-            return
-        self.learn_timer.end()
-
-        batch_size, seq_len = experience.sequences.shape
-
-        self.learn_num_samples += batch_size
-
-        # actor forward-backward, 3 means forward(1) + backward(2)
-        self.learn_flop += self.actor_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
-        # critic forward-backward
-        self.learn_flop += self.critic_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
-
-    def on_fit_end(self) -> None:
-        avg_make_experience_duration = all_reduce_mean(self.make_experience_timer.duration, self.world_size)
-        avg_learn_duration = all_reduce_mean(self.learn_timer.duration, self.world_size)
-        avg_overall_duration = all_reduce_mean(self.overall_timer.duration, self.world_size)
-
-        avg_make_experience_throughput = (
-            self.make_experience_num_samples * self.world_size / (avg_make_experience_duration + 1e-12)
-        )
-        avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12)
-
-        avg_learn_throughput = self.learn_num_samples * self.world_size / (avg_learn_duration + 1e-12)
-        avg_learn_tflops = self.learn_flop / 1e12 / (avg_learn_duration + 1e-12)
-
-        num_effective_samples = min(self.learn_num_samples, self.make_experience_num_samples) * self.world_size
-
-        avg_overall_throughput = num_effective_samples / (avg_overall_duration + 1e-12)
-
-        overall_time_per_sample = divide(1, avg_overall_throughput)
-        make_experience_time_per_sample = divide(avg_make_experience_duration, num_effective_samples)
-        learn_time_per_sample = divide(avg_learn_duration, num_effective_samples)
-
-        print_rank_0(
-            f"Performance summary:\n"
-            + f"Generate {self.make_experience_num_samples * self.world_size} samples, throughput: {avg_make_experience_throughput:.2f} samples/s, TFLOPS per GPU: {avg_make_experience_tflops:.2f}\n"
-            + f"Train {self.learn_num_samples * self.world_size} samples, throughput: {avg_learn_throughput:.2f} samples/s, TFLOPS per GPU: {avg_learn_tflops:.2f}\n"
-            + f"Overall throughput: {avg_overall_throughput:.2f} samples/s\n"
-            + f"Overall time per sample: {overall_time_per_sample:.2f} s\n"
-            + f"Make experience time per sample: {make_experience_time_per_sample:.2f} s, {make_experience_time_per_sample/overall_time_per_sample*100:.2f}%\n"
-            + f"Learn time per sample: {learn_time_per_sample:.2f} s, {learn_time_per_sample/overall_time_per_sample*100:.2f}%"
-        )
diff --git a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py b/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
deleted file mode 100644
index 0d70b6c53073..000000000000
--- a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import os
-
-import torch.distributed as dist
-from coati.trainer.strategies import GeminiStrategy, LowLevelZeroStrategy, Strategy
-from coati.trainer.utils import is_rank_0
-from torch import nn
-from torch.optim import Optimizer
-
-from .base import Callback
-
-
-class SaveCheckpoint(Callback):
-    """
-        The callback for saving checkpoint for coati.
-
-        Only support saving actor and critic model.
-        A typical architecture of the saved checkpoint would be:
-            - checkpoint
-                - episode_x
-                    - actor.pt
-                    - actor-optim-rank-0.pt
-                    - actor-optim-rank-1.pt
-                    - critic.pt
-                    - critic-optim-rank-0.pt
-                    - critic-optim-rank-1.pt
-                - ...
-
-    Args:
-        path(str): the base path you want to save checkpoint, the checkpoint would be saved at `path/checkpoint`
-        interval(int): the interval episode of saving checkpoint
-        strategy(Strategy): the strategy used to train
-        actor(nn.Module): the actor model
-        critic(nn.Module): the critic model
-        actor_optim(Optimizer): the optimizer of actor
-        critic_optim(Optimizer): the optimizer of critic
-
-    """
-
-    def __init__(
-        self,
-        path: str,
-        interval: int,
-        strategy: Strategy,
-        actor: nn.Module = None,
-        critic: nn.Module = None,
-        actor_optim: Optimizer = None,
-        critic_optim: Optimizer = None,
-    ) -> None:
-        super().__init__()
-        self.path = os.path.join(path, "checkpoint")
-        self.interval = interval
-        self.strategy = strategy
-        self.model_dict = {"actor": [actor, actor_optim], "critic": [critic, critic_optim]}
-
-    def on_episode_end(self, episode: int) -> None:
-        if (episode + 1) % self.interval != 0:
-            return
-        base_path = os.path.join(self.path, f"episode_{episode}")
-        if not os.path.exists(base_path):
-            os.makedirs(base_path)
-
-        for model in self.model_dict.keys():
-            # save model
-            if self.model_dict[model][0] is None:
-                # saving only optimizer states is meaningless, so it would be skipped
-                continue
-            model_path = os.path.join(base_path, f"{model}.pt")
-            self.strategy.save_model(model=self.model_dict[model][0], path=model_path, only_rank0=True)
-
-            # save optimizer
-            if self.model_dict[model][1] is None:
-                continue
-            only_rank0 = not isinstance(self.strategy, (LowLevelZeroStrategy, GeminiStrategy))
-            rank = 0 if is_rank_0() else dist.get_rank()
-            optim_path = os.path.join(base_path, f"{model}-optim-rank-{rank}.pt")
-            self.strategy.save_optimizer(optimizer=self.model_dict[model][1], path=optim_path, only_rank0=only_rank0)
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index ac6df0026b2e..055379718d23 100644
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -1,44 +1,36 @@
-from typing import Dict, List, Optional
+import os
+from typing import Optional
 
+import torch
+import wandb
 from coati.experience_buffer import NaiveExperienceBuffer
 from coati.experience_maker import Experience, NaiveExperienceMaker
-from coati.models import Actor, Critic, RewardModel
+from coati.models import Critic, RewardModel
 from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss
 from coati.models.utils import calc_action_log_probs
+from coati.trainer.utils import all_reduce_mean
+from coati.utils import AccumulativeMeanMeter, save_checkpoint
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader, DistributedSampler
 from tqdm import tqdm
-from transformers import PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
 
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin
+from colossalai.cluster import DistCoordinator
 from colossalai.utils import get_current_device
 
 from .base import OLTrainer
-from .callbacks import Callback
-from .strategies import GeminiStrategy, Strategy
 from .utils import CycledDataLoader, is_rank_0, to_device
 
 
-def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> Dict:
-    unwrapped_model = strategy.unwrap_model(actor)
-    hf_model = unwrapped_model.model
-    new_kwargs = {**generate_kwargs}
-    # use huggingface models method directly
-    if "prepare_inputs_fn" not in generate_kwargs and hasattr(hf_model, "prepare_inputs_for_generation"):
-        new_kwargs["prepare_inputs_fn"] = hf_model.prepare_inputs_for_generation
-
-    if "update_model_kwargs_fn" not in generate_kwargs and hasattr(hf_model, "_update_model_kwargs_for_generation"):
-        new_kwargs["update_model_kwargs_fn"] = hf_model._update_model_kwargs_for_generation
-
-    return new_kwargs
-
-
 class PPOTrainer(OLTrainer):
     """
         Trainer for PPO algorithm.
 
     Args:
-        strategy (Strategy): the strategy to use for training
+        strategy (Booster): the strategy to use for training
         actor (Actor): the actor model in ppo algorithm
         critic (Critic): the critic model in ppo algorithm
         reward_model (RewardModel): the reward model in rlhf algorithm to make reward of sentences
@@ -62,16 +54,17 @@ class PPOTrainer(OLTrainer):
 
     def __init__(
         self,
-        strategy: Strategy,
-        actor: Actor,
+        actor_booster: Booster,
+        critic_booster: Booster,
+        actor: PreTrainedModel,
         critic: Critic,
         reward_model: RewardModel,
-        initial_model: Actor,
+        initial_model: PreTrainedModel,
         actor_optim: Optimizer,
         critic_optim: Optimizer,
         actor_lr_scheduler: _LRScheduler,
+        critic_lr_scheduler: _LRScheduler,
         tokenizer: PreTrainedTokenizerBase,
-        rm_model_tokenizer: PreTrainedTokenizerBase,
         kl_coef: float = 0.1,
         ptx_coef: float = 0.9,
         train_batch_size: int = 8,
@@ -83,25 +76,31 @@ def __init__(
         sample_buffer: bool = False,
         dataloader_pin_memory: bool = True,
         offload_inference_models: bool = True,
-        callbacks: List[Callback] = [],
+        accumulation_steps: int = 1,
+        save_interval: int = 0,
+        save_dir: str = None,
+        use_tp: bool = False,
+        coordinator: DistCoordinator = None,
         **generate_kwargs,
     ) -> None:
-        if isinstance(strategy, GeminiStrategy):
+        if isinstance(actor_booster, GeminiPlugin):
             assert not offload_inference_models, "GeminiPlugin is not compatible with manual model.to('cpu')"
 
         data_buffer = NaiveExperienceBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
-        super().__init__(strategy, data_buffer, sample_buffer, dataloader_pin_memory, callbacks)
-
-        self.generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
+        super().__init__(actor_booster, critic_booster, data_buffer, sample_buffer, dataloader_pin_memory)
+        self.generate_kwargs = generate_kwargs
 
         self.actor = actor
         self.critic = critic
+        self.actor_booster = actor_booster
+        self.critic_booster = critic_booster
         self.actor_scheduler = actor_lr_scheduler
+        self.critic_scheduler = critic_lr_scheduler
         self.tokenizer = tokenizer
-        self.rm_model_tokenizer = rm_model_tokenizer
         self.experience_maker = NaiveExperienceMaker(
-            self.actor, self.critic, reward_model, initial_model, self.tokenizer, self.rm_model_tokenizer, kl_coef
+            self.actor, self.critic, reward_model, initial_model, self.tokenizer, self.tokenizer, kl_coef
         )
+        self.train_batch_size = train_batch_size
 
         self.actor_loss_fn = PolicyLoss(eps_clip)
         self.critic_loss_fn = ValueLoss(value_clip)
@@ -110,15 +109,22 @@ def __init__(
         self.ptx_coef = ptx_coef
         self.actor_optim = actor_optim
         self.critic_optim = critic_optim
+        self.save_interval = save_interval
+        self.coordinator = coordinator
+        self.actor_save_dir = os.path.join(save_dir, "actor")
+        self.critic_save_dir = os.path.join(save_dir, "critic")
         self.num_train_step = 0
-
+        self.accumulation_steps = accumulation_steps
+        self.use_tp = use_tp
+        self.accumulative_meter = AccumulativeMeanMeter()
         self.offload_inference_models = offload_inference_models
         self.device = get_current_device()
+        self.coordinator.print_on_master(f"generation kwargs:\n{generate_kwargs}")
 
     def _before_fit(
         self,
         prompt_dataloader: DataLoader,
-        pretrain_dataloader: DataLoader,
+        pretrain_dataloader: Optional[DataLoader] = None,
         log_dir: Optional[str] = None,
         use_wandb: bool = False,
     ):
@@ -128,7 +134,7 @@ def _before_fit(
             pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
         """
         self.prompt_dataloader = CycledDataLoader(prompt_dataloader)
-        self.pretrain_dataloader = CycledDataLoader(pretrain_dataloader)
+        self.pretrain_dataloader = CycledDataLoader(pretrain_dataloader) if pretrain_dataloader is not None else None
 
         self.writer = None
         if use_wandb and is_rank_0():
@@ -146,14 +152,32 @@ def _before_fit(
             log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
             self.writer = SummaryWriter(log_dir=log_dir)
 
+    def _setup_update_phrase_dataload(self):
+        """
+        why not use distributed_dataloader?
+            if tp is used, input on each rank is the same and we use the same dataloader to feed same experience to all ranks
+            if tp is not used, input on each rank is different and we expect different experiences to be fed to each rank
+        """
+        self.dataloader = DataLoader(
+            self.data_buffer,
+            batch_size=self.train_batch_size,
+            shuffle=True,
+            drop_last=True,
+            pin_memory=self.dataloader_pin_memory,
+            collate_fn=self.data_buffer.collate_fn,
+        )
+
     def _make_experience(self, collect_step: int) -> Experience:
         prompts = self.prompt_dataloader.next()
         if self.offload_inference_models:
             # TODO(ver217): this may be controlled by strategy if they are prepared by strategy
             self.experience_maker.initial_model.to(self.device)
             self.experience_maker.reward_model.to(self.device)
-        assert isinstance(prompts, dict), f'Unsupported input type "{type(prompts)}"'
-        return self.experience_maker.make_experience(**prompts, **self.generate_kwargs)
+        return self.experience_maker.make_experience(
+            input_ids=prompts["input_ids"].to(get_current_device()),
+            attention_mask=prompts["attention_mask"].to(get_current_device()),
+            **self.generate_kwargs,
+        )
 
     def _training_step(self, experience: Experience):
         """
@@ -176,7 +200,8 @@ def _training_step(self, experience: Experience):
             action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
         )
         actor_loss = (1 - self.ptx_coef) * actor_loss
-        self.strategy.backward(actor_loss, self.actor, self.actor_optim)
+        # if not to_skip: mask for debugging
+        self.actor_booster.backward(loss=actor_loss, optimizer=self.actor_optim)
 
         # ptx loss
         if self.ptx_coef != 0:
@@ -184,7 +209,7 @@ def _training_step(self, experience: Experience):
             batch = to_device(batch, self.device)
             ptx_log_probs = self.actor(batch["input_ids"], batch["attention_mask"])["logits"]
             ptx_loss = self.ptx_coef * self.ptx_loss_fn(ptx_log_probs, batch["labels"])
-            self.strategy.backward(ptx_loss, self.actor, self.actor_optim)
+            self.actor_booster.backward(loss=ptx_loss, optimizer=self.actor_optim)
 
         # value loss
         values = self.critic(
@@ -194,46 +219,78 @@ def _training_step(self, experience: Experience):
             values[:, -num_actions:], experience.values, experience.advantages, action_mask=experience.action_mask
         )
         critic_loss = critic_loss * self.vf_coef
-        self.strategy.backward(critic_loss, self.critic, self.critic_optim)
+        self.critic_booster.backward(loss=critic_loss, optimizer=self.critic_optim)
+
+        # sync
+        actor_loss_mean = all_reduce_mean(tensor=actor_loss)
+        critic_loss_mean = all_reduce_mean(tensor=critic_loss)
+        max_ratio_mean = all_reduce_mean(tensor=max_ratio)
+        reward_mean = all_reduce_mean(tensor=experience.reward.mean())
+        value_mean = all_reduce_mean(tensor=experience.values.mean())
+        advantages_mean = all_reduce_mean(tensor=experience.advantages.mean())
+        kl_mean = all_reduce_mean(tensor=experience.kl.mean())
+        if self.ptx_coef != 0:
+            ptx_loss_mean = all_reduce_mean(tensor=ptx_loss)
+
+        self.accumulative_meter.add("actor_loss", actor_loss_mean.to(torch.float16).mean().item())
+        self.accumulative_meter.add("critic_loss", critic_loss_mean.to(torch.float16).mean().item())
+        self.accumulative_meter.add("max_ratio", max_ratio_mean.to(torch.float16).item())
+        self.accumulative_meter.add("reward", reward_mean.to(torch.float16).mean().item())
+        self.accumulative_meter.add("value", value_mean.to(torch.float16).mean().item())
+        self.accumulative_meter.add("advantages", advantages_mean.to(torch.float16).item())
+        self.accumulative_meter.add("skip_ratio", 0.0 if to_skip else 1.0)
+        self.accumulative_meter.add("kl", kl_mean.to(torch.float16).item())
+        if self.ptx_coef != 0:
+            self.accumulative_meter.add("ptx_loss", ptx_loss_mean.to(torch.float16).mean().item())
 
-        if not to_skip:
-            self.strategy.optimizer_step(self.actor_optim)
-        self.strategy.optimizer_step(self.critic_optim)
-        self.actor_optim.zero_grad()
-        self.critic_optim.zero_grad()
-        if self.actor_scheduler:
+        if self.num_train_step % self.accumulation_steps == self.accumulation_steps - 1:
+            self.actor_optim.step()
+            self.critic_optim.step()
+            self.actor_optim.zero_grad()
+            self.critic_optim.zero_grad()
             self.actor_scheduler.step()
-
-        # preparing logging model output and corresponding rewards.
-        response_text = self.experience_maker.tokenizer.batch_decode(experience.sequences, skip_special_tokens=True)
-        for i in range(len(response_text)):
-            response_text[i] = response_text[i] + f"\n\nReward: {experience.reward[i]}"
-
-        if self.writer:
-            # use wandb
-            import wandb
-
-            if self.num_train_step % 50 == 1 and "wandb_run" in self.__dict__:
-                my_table = wandb.Table(
-                    columns=[f"sample response {i}" for i in range(len(response_text))], data=[response_text]
+            self.critic_scheduler.step()
+
+            response_text = self.experience_maker.tokenizer.batch_decode(experience.sequences, skip_special_tokens=True)
+            for i in range(len(response_text)):
+                response_text[i] = response_text[i] + f"\n\nReward: {experience.reward[i]}"
+            for line_id in range(min(3, len(response_text))):
+                # log output to screen
+                self.coordinator.print_on_master("###################\n" + response_text[line_id])
+            # preparing logging model output and corresponding rewards.
+            if self.num_train_step % 50 == 1:
+                if self.writer and is_rank_0() and "wandb_run" in self.__dict__:
+                    # log output to wandb
+                    my_table = wandb.Table(
+                        columns=[f"sample response {i}" for i in range(len(response_text))], data=[response_text]
+                    )
+                    try:
+                        self.wandb_run.log({"sample_response": my_table})
+                    except OSError as e:
+                        print(e)
+
+            if self.writer and is_rank_0():
+                self.writer.add_scalar("train/max_ratio", self.accumulative_meter.get("max_ratio"), self.num_train_step)
+                self.writer.add_scalar(
+                    "train/skip_ratio", self.accumulative_meter.get("skip_ratio"), self.num_train_step
                 )
-                try:
-                    self.wandb_run.log({"sample_response": my_table})
-                except OSError as e:
-                    print(e)
-
-            self.writer.add_scalar("train/max_ratio", max_ratio, self.num_train_step)
-            self.writer.add_scalar("train/skip", 1 if to_skip else 0, self.num_train_step)
-            self.writer.add_scalar("train/actor_loss", actor_loss.mean().item(), self.num_train_step)
-            self.writer.add_scalar("train/lr_actor", self.actor_optim.param_groups[0]["lr"], self.num_train_step)
-            self.writer.add_scalar("train/lr_critic", self.critic_optim.param_groups[0]["lr"], self.num_train_step)
-            self.writer.add_scalar("train/critic_loss", critic_loss.mean().item(), self.num_train_step)
-            if self.ptx_coef != 0:
-                self.writer.add_scalar("train/ptx_loss", ptx_loss.mean().item(), self.num_train_step)
-            self.writer.add_scalar("reward", experience.reward.mean().item(), self.num_train_step)
-            self.writer.add_scalar("approx_kl", experience.kl.mean().item(), self.num_train_step)
-            self.writer.add_scalar("value", experience.values.mean().item(), self.num_train_step)
-            self.writer.add_scalar("advantages", experience.advantages.mean().item(), self.num_train_step)
+                self.writer.add_scalar(
+                    "train/actor_loss", self.accumulative_meter.get("actor_loss"), self.num_train_step
+                )
+                self.writer.add_scalar("train/lr_actor", self.actor_optim.param_groups[0]["lr"], self.num_train_step)
+                self.writer.add_scalar("train/lr_critic", self.critic_optim.param_groups[0]["lr"], self.num_train_step)
+                self.writer.add_scalar(
+                    "train/critic_loss", self.accumulative_meter.get("critic_loss"), self.num_train_step
+                )
+                if self.ptx_coef != 0:
+                    self.writer.add_scalar(
+                        "train/ptx_loss", self.accumulative_meter.get("ptx_loss"), self.num_train_step
+                    )
+                self.writer.add_scalar("reward", self.accumulative_meter.get("reward"), self.num_train_step)
+                self.writer.add_scalar("approx_kl", self.accumulative_meter.get("kl"), self.num_train_step)
+                self.writer.add_scalar("value", self.accumulative_meter.get("value"), self.num_train_step)
+                self.writer.add_scalar("advantages", self.accumulative_meter.get("advantages"), self.num_train_step)
+            self.accumulative_meter.reset()
 
     def _learn(self, update_step: int):
         if self.offload_inference_models:
@@ -256,3 +313,36 @@ def _learn(self, update_step: int):
                 experience.to_device(self.device)
                 self._training_step(experience)
                 self._on_learn_batch_end(experience)
+
+    def _save_checkpoint(self, episode: int = 0):
+        self.coordinator.print_on_master("\nStart saving actor checkpoint with running states")
+        save_checkpoint(
+            save_dir=self.actor_save_dir,
+            booster=self.actor_booster,
+            model=self.actor,
+            optimizer=self.actor_optim,
+            lr_scheduler=self.actor_scheduler,
+            epoch=0,
+            step=episode + 1,
+            batch_size=self.train_batch_size,
+            coordinator=self.coordinator,
+        )
+        self.coordinator.print_on_master(
+            f"Saved actor checkpoint at episode {(episode + 1)} at folder {self.actor_save_dir}"
+        )
+
+        self.coordinator.print_on_master("\nStart saving critic checkpoint with running states")
+        save_checkpoint(
+            save_dir=self.critic_save_dir,
+            booster=self.critic_booster,
+            model=self.critic,
+            optimizer=self.critic_optim,
+            lr_scheduler=self.critic_scheduler,
+            epoch=0,
+            step=episode + 1,
+            batch_size=self.train_batch_size,
+            coordinator=self.coordinator,
+        )
+        self.coordinator.print_on_master(
+            f"Saved critic checkpoint at episode {(episode + 1)} at folder {self.critic_save_dir}"
+        )
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py
index 1c3613d108c6..1886e01cbbaa 100644
--- a/applications/Chat/coati/trainer/rm.py
+++ b/applications/Chat/coati/trainer/rm.py
@@ -116,6 +116,13 @@ def _train(self, epoch):
                 batch["reject_input_ids"],
                 batch["reject_attention_mask"],
             )
+
+            # if is_rank_0():
+            #     print(batch["chosen_input_ids"][0])
+            #     print(batch["chosen_attention_mask"][0])
+            #     print(batch["reject_input_ids"][0])
+            #     print(batch["reject_attention_mask"][0])
+            # exit()
             batch_size = chosen_input_ids.size()[0]
 
             # concatenate for better parrallelism
@@ -133,14 +140,17 @@ def _train(self, epoch):
                 self.optimizer.zero_grad()
                 self.actor_scheduler.step()
 
+            accuracy = (chosen_reward > reject_reward).float()
+
             # sync
             loss_mean = all_reduce_mean(tensor=loss)
             chosen_rewards_mean = all_reduce_mean(tensor=chosen_reward)
             rejected_rewards_mean = all_reduce_mean(tensor=reject_reward)
+            accuracy_mean = all_reduce_mean(tensor=accuracy)
             self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
             self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
             self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
-
+            self.accumulative_meter.add("accuracy", accuracy_mean.mean().to(torch.float16).item())
             if self.writer and is_rank_0():
                 self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
                 self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
@@ -155,15 +165,14 @@ def _train(self, epoch):
                 self.writer.add_scalar(
                     "train/reward_reject", self.accumulative_meter.get("rejected_rewards"), self.num_train_step
                 )
+                self.writer.add_scalar("train/acc", self.accumulative_meter.get("accuracy"), self.num_train_step)
 
             if i % self.accumulation_steps == self.accumulation_steps - 1:
                 self.num_train_step += 1
                 step_bar.update()
                 self.accumulative_meter.reset()
 
-            if (self.save_interval > 0 and (i + 1) % (self.save_interval * self.accumulation_steps) == 0) or (
-                i + 1
-            ) == len(self.train_dataloader):
+            if self.save_interval > 0 and self.num_train_step % self.save_interval == 0:
                 self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
                 save_checkpoint(
                     save_dir=self.save_dir,
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
index 1a69833591da..5163fbd08700 100644
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -118,8 +118,7 @@ def _train(self, epoch: int):
             if (
                 self.save_dir is not None
                 and self.save_interval is not None
-                and (self.save_interval and (i + 1) % (self.save_interval * self.accumulation_steps) == 0)
-                or (i + 1) == len(self.train_dataloader)
+                and (self.num_train_step + 1) % self.save_interval == 0
             ):
                 save_checkpoint(
                     save_dir=self.save_dir,
@@ -128,12 +127,12 @@ def _train(self, epoch: int):
                     optimizer=self.optimizer,
                     lr_scheduler=self.scheduler,
                     epoch=epoch,
-                    step=i + 1,
+                    step=self.num_train_step + 1,
                     batch_size=batch_size,
                     coordinator=self.coordinator,
                 )
                 self.coordinator.print_on_master(
-                    f"Saved checkpoint at epoch {epoch} step {i + 1} at folder {self.save_dir}"
+                    f"Saved checkpoint at epoch {epoch} step {self.num_train_step} at folder {self.save_dir}"
                 )
         step_bar.close()
 
diff --git a/applications/Chat/coati/trainer/strategies/__init__.py b/applications/Chat/coati/trainer/strategies/__init__.py
deleted file mode 100644
index 521dcb5855b1..000000000000
--- a/applications/Chat/coati/trainer/strategies/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .base import Strategy
-from .colossalai import GeminiStrategy, LowLevelZeroStrategy
-from .ddp import DDPStrategy
-
-__all__ = ["Strategy", "DDPStrategy", "LowLevelZeroStrategy", "GeminiStrategy"]
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
deleted file mode 100644
index a78716216ae0..000000000000
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ /dev/null
@@ -1,137 +0,0 @@
-from abc import ABC, abstractmethod
-from contextlib import nullcontext
-from typing import Callable, Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.nn as nn
-from coati.experience_buffer import ExperienceBuffer
-from torch.optim import Optimizer
-from torch.utils.data import DataLoader
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-
-from colossalai.booster import Booster
-from colossalai.booster.plugin import Plugin
-
-from .sampler import DistributedSampler
-
-_BoostArgSpec = Union[nn.Module, Tuple[nn.Module, Optimizer], Dict]
-
-
-class Strategy(ABC):
-    """
-    Base class for training strategies.
-    """
-
-    def __init__(self, plugin_initializer: Callable[..., Optional[Plugin]] = lambda: None) -> None:
-        super().__init__()
-        # NOTE: dist must be initialized before Booster
-        self.setup_distributed()
-        self.plugin = plugin_initializer()
-        self.booster = Booster(plugin=self.plugin)
-        self._post_init()
-
-    @abstractmethod
-    def _post_init(self) -> None:
-        pass
-
-    def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
-        self.booster.backward(loss, optimizer)
-
-    def optimizer_step(self, optimizer: Optimizer, **kwargs) -> None:
-        optimizer.step()
-
-    @abstractmethod
-    def setup_distributed(self) -> None:
-        pass
-
-    @abstractmethod
-    def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
-        pass
-
-    def model_init_context(self):
-        return nullcontext()
-
-    def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:
-        """Prepare [model | (model, optimizer) | Dict] based on each strategy.
-        NOTE: the keys of Dict must be a subset of `self.booster.boost`'s arguments.
-
-        Example::
-            >>> # e.g., include lr_scheduler
-            >>> result_dict = strategy.prepare(dict(model=model, lr_scheduler=lr_scheduler))
-            >>> # when fine-tuning actor and critic
-            >>> (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
-            >>> # or when training reward model
-            >>> (reward_model, reward_model_optim) = strategy.prepare((reward_model, reward_model_optim))
-            >>> # or just inference
-            >>> actor, critic = strategy.prepare(actor, critic)
-
-        Returns:
-            Union[List[_BoostArgSpec], _BoostArgSpec]: [model | (model, optimizer) | Dict] in the original order.
-        """
-
-        rets = []
-        for arg in boost_args:
-            if isinstance(arg, nn.Module):
-                model, *_ = self.booster.boost(arg)
-                rets.append(model)
-            elif isinstance(arg, tuple):
-                try:
-                    model, optimizer = arg
-                except ValueError:
-                    raise RuntimeError(f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"')
-                model, optimizer, *_ = self.booster.boost(model=model, optimizer=optimizer)
-                rets.append((model, optimizer))
-            elif isinstance(arg, Dict):
-                model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg)
-                boost_result = dict(
-                    model=model,
-                    optimizer=optimizer,
-                    criterion=criterion,
-                    dataloader=dataloader,
-                    lr_scheduler=lr_scheduler,
-                )
-                # remove None values
-                boost_result = {key: value for key, value in boost_result.items() if value is not None}
-                rets.append(boost_result)
-            else:
-                raise RuntimeError(f"Type {type(arg)} is not supported")
-
-        return rets[0] if len(rets) == 1 else rets
-
-    @staticmethod
-    def unwrap_model(model: nn.Module) -> nn.Module:
-        """Get the unwrapped model from a wrapped model made by Strategy.prepare.
-
-        Args:
-            model (nn.Module): the model to unwrap
-
-        Returns:
-            nn.Module: the original model
-        """
-        return model
-
-    def save_model(self, model: nn.Module, path: str, shard: bool = False, **kwargs) -> None:
-        self.booster.save_model(model, path, shard=shard, **kwargs)
-
-    def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
-        self.booster.load_model(model, path, strict)
-
-    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False, **kwargs) -> None:
-        self.booster.save_optimizer(optimizer, path, shard=not only_rank0, **kwargs)
-
-    def load_optimizer(self, optimizer: Optimizer, path: str) -> None:
-        self.booster.load_optimizer(optimizer, path)
-
-    def setup_sampler(self, dataset) -> DistributedSampler:
-        # FIXME(cwher): this is only invoked in train_on_ray, not tested after adapt Boost API.
-        return DistributedSampler(dataset, 1, 0)
-
-    @abstractmethod
-    def save_pretrained(
-        self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
-    ) -> None:
-        pass
-
-    @abstractmethod
-    def get_model_state_dict_shard(self, model: nn.Module, **config):
-        pass
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
deleted file mode 100644
index 7129edb060ef..000000000000
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ /dev/null
@@ -1,200 +0,0 @@
-import warnings
-from typing import Optional
-
-import torch.nn as nn
-
-import colossalai
-from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
-from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.utils import get_current_device
-from colossalai.zero.gemini.gemini_ddp import GeminiDDP
-
-from .ddp import DDPStrategy
-
-
-class LowLevelZeroStrategy(DDPStrategy):
-    """
-        The strategy for training with ColossalAI.
-
-    Args:
-        stage(int): The stage to use in ZeRO. Choose in (1, 2)
-        precision(str): The precision to use. Choose in ('fp32', 'fp16').
-        seed(int): The seed for the random number generator.
-        placement_policy(str): The placement policy for gemini. Choose in ('cpu', 'cuda')
-                          If it is “cpu”, parameters, gradients and optimizer states will be offloaded to CPU,
-                          If it is “cuda”, they will not be offloaded, which means max CUDA memory will be used. It is the fastest.
-        reduce_bucket_size(int): The reduce bucket size in bytes. Only for ZeRO-1 and ZeRO-2.
-        overlap_communication(bool): Whether to overlap communication and computation. Only for ZeRO-1 and ZeRO-2.
-        initial_scale(float): The initial scale for the optimizer.
-        growth_factor(float): The growth factor for the optimizer.
-        backoff_factor(float): The backoff factor for the optimizer.
-        growth_interval(int): The growth interval for the optimizer.
-        hysteresis(int): The hysteresis for the optimizer.
-        min_scale(float): The minimum scale for the optimizer.
-        max_scale(float): The maximum scale for the optimizer.
-        max_norm(float): The maximum norm for the optimizer.
-        norm_type(float): The norm type for the optimizer.
-
-    """
-
-    def __init__(
-        self,
-        stage: int = 2,
-        precision: str = "fp16",
-        seed: int = 42,
-        placement_policy: str = "cuda",
-        reduce_bucket_size: int = 12 * 1024**2,  # only for stage 1&2
-        overlap_communication: bool = True,  # only for stage 1&2
-        initial_scale: float = 2**16,
-        growth_factor: float = 2,
-        backoff_factor: float = 0.5,
-        growth_interval: int = 1000,
-        hysteresis: int = 2,
-        min_scale: float = 1,
-        max_scale: float = 2**32,
-        max_norm: float = 0.0,
-        norm_type: float = 2.0,
-    ) -> None:
-        assert stage in (1, 2), f'Unsupported stage "{stage}"'
-        assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
-        assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
-
-        plugin_initializer = lambda: LowLevelZeroPlugin(
-            stage=stage,
-            precision=precision,
-            reduce_bucket_size_in_m=reduce_bucket_size,
-            overlap_communication=overlap_communication,
-            cpu_offload=(placement_policy == "cpu"),
-            initial_scale=initial_scale,
-            growth_factor=growth_factor,
-            backoff_factor=backoff_factor,
-            growth_interval=growth_interval,
-            hysteresis=hysteresis,
-            min_scale=min_scale,
-            max_scale=max_scale,
-            max_norm=max_norm,
-            norm_type=norm_type,
-        )
-
-        super().__init__(seed, plugin_initializer)
-
-    def _post_init(self) -> None:
-        assert isinstance(
-            self.plugin, LowLevelZeroPlugin
-        ), f"{type(self).__name__}'s plugin is not initialized properly."
-
-    def setup_distributed(self) -> None:
-        colossalai.launch_from_torch({}, seed=self.seed)
-
-    def unwrap_model(self, model: nn.Module) -> nn.Module:
-        assert isinstance(model, LowLevelZeroModel)
-        return model.module
-
-    def get_model_state_dict_shard(self, model: nn.Module, **config):
-        assert isinstance(model, LowLevelZeroModel)
-        yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
-
-
-class GeminiStrategy(DDPStrategy):
-    """
-        The strategy for training with ColossalAI.
-
-    Args:
-        seed(int): The seed for the random number generator.
-        shard_init(bool): Whether to shard the model parameters during initialization. Only for ZeRO-3.
-            This is not compatible with `from_pretrained()`. We temporarily disable this and will support it in the future.
-        placement_policy(str): The placement policy for gemini. Choose in ('cpu', 'cuda')
-                          If it is “cpu”, parameters, gradients and optimizer states will be offloaded to CPU,
-                          If it is “cuda”, they will not be offloaded, which means max CUDA memory will be used. It is the fastest.
-        pin_memory(bool): Whether to pin the memory for the data loader. Only for ZeRO-3.
-        force_outputs_fp32(bool): Whether to force the outputs to be fp32. Only for ZeRO-3.
-        search_range_m(int): The number of search range for the chunk size, divided by 2^20. Only for ZeRO-3.
-        hidden_dim(optional, int): The hidden dimension for the gemini. Only for ZeRO-3.
-        min_chunk_size_m(float): The minimum chunk size divided by 2^20. Only for ZeRO-3.
-        gpu_margin_mem_ratio(float): The margin memory ratio for the GPU. Only for ZeRO-3.
-        initial_scale(float): The initial scale for the optimizer.
-        growth_factor(float): The growth factor for the optimizer.
-        backoff_factor(float): The backoff factor for the optimizer.
-        growth_interval(int): The growth interval for the optimizer.
-        hysteresis(int): The hysteresis for the optimizer.
-        min_scale(float): The minimum scale for the optimizer.
-        max_scale(float): The maximum scale for the optimizer.
-        max_norm(float): The maximum norm for the optimizer.
-        norm_type(float): The norm type for the optimizer.
-
-    """
-
-    def __init__(
-        self,
-        seed: int = 42,
-        shard_init: bool = False,  # only for stage 3
-        placement_policy: str = "auto",
-        shard_param_frac: float = 1.0,  # only for static placement
-        offload_optim_frac: float = 0.0,  # only for static placement
-        offload_param_frac: float = 0.0,  # only for static placement
-        pin_memory: bool = True,  # only for stage 3
-        force_outputs_fp32: bool = False,  # only for stage 3
-        search_range_m: int = 32,  # only for stage 3
-        hidden_dim: Optional[int] = None,  # only for stage 3
-        min_chunk_size_m: float = 32,  # only for stage 3
-        gpu_margin_mem_ratio: float = 0.0,  # only for stage 3
-        initial_scale: float = 2**16,
-        growth_factor: float = 2,
-        backoff_factor: float = 0.5,
-        growth_interval: int = 1000,
-        hysteresis: int = 2,
-        min_scale: float = 1,
-        max_scale: float = 2**32,
-        max_norm: float = 0.0,
-        norm_type: float = 2.0,
-    ) -> None:
-        # TODO(ver217): support shard_init when using from_pretrained()
-        if shard_init:
-            warnings.warn(
-                f"Shard init is not supported model.from_pretrained() yet. "
-                "Please load weights after strategy.prepare()"
-            )
-        self.shard_init = shard_init
-
-        warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")
-
-        # NOTE: dist should be initialized before calling get_current_device()
-        plugin_initializer = lambda: GeminiPlugin(
-            chunk_init_device=get_current_device(),
-            placement_policy=placement_policy,
-            shard_param_frac=shard_param_frac,
-            offload_optim_frac=offload_optim_frac,
-            offload_param_frac=offload_param_frac,
-            precision="fp16",
-            pin_memory=pin_memory,
-            force_outputs_fp32=force_outputs_fp32,
-            strict_ddp_mode=shard_init,
-            search_range_m=search_range_m,
-            hidden_dim=hidden_dim,
-            min_chunk_size_m=min_chunk_size_m,
-            gpu_margin_mem_ratio=gpu_margin_mem_ratio,
-            initial_scale=initial_scale,
-            growth_factor=growth_factor,
-            backoff_factor=backoff_factor,
-            growth_interval=growth_interval,
-            hysteresis=hysteresis,
-            min_scale=min_scale,
-            max_scale=max_scale,
-            max_norm=max_norm,
-            norm_type=norm_type,
-        )
-
-        super().__init__(seed, plugin_initializer)
-
-    def _post_init(self) -> None:
-        assert isinstance(self.plugin, GeminiPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
-
-    def setup_distributed(self) -> None:
-        colossalai.launch_from_torch({}, seed=self.seed)
-
-    def model_init_context(self):
-        return super().model_init_context()
-
-    def unwrap_model(self, model: nn.Module) -> nn.Module:
-        assert isinstance(model, GeminiDDP)
-        return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
deleted file mode 100644
index f2a44aeb0961..000000000000
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-import random
-from collections import OrderedDict
-from typing import Callable, Optional
-
-import numpy as np
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from coati.experience_buffer import ExperienceBuffer
-from coati.models import Actor, Critic, RewardModel
-from torch.utils.data import DataLoader
-from transformers.modeling_utils import PreTrainedModel
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-
-from colossalai.booster.plugin import TorchDDPPlugin
-from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel
-
-from .base import Strategy
-from .sampler import DistributedSampler
-
-
-# TODO Move this to a util.py   (Moving to ray.util introduces ringed import)
-def get_grad_required_state_dict(model: nn.Module):
-    state_dict = OrderedDict()
-    for name, parameter in model.named_parameters():
-        if parameter.requires_grad:
-            state_dict[name] = parameter.detach()
-    return state_dict
-
-
-class DDPStrategy(Strategy):
-    """
-    Strategy for distributed training using torch.distributed.
-    """
-
-    def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
-        self.seed = seed
-        super().__init__(plugin_initializer)
-
-    def _try_init_dist(self, force: bool = False) -> None:
-        try:
-            rank = int(os.environ["RANK"])
-            local_rank = int(os.environ["LOCAL_RANK"])
-            world_size = int(os.environ["WORLD_SIZE"])
-            host = os.environ["MASTER_ADDR"]
-            port = int(os.environ["MASTER_PORT"])
-            dist.init_process_group("nccl", init_method=f"tcp://[{host}]:{port}", world_size=world_size, rank=rank)
-            torch.cuda.set_device(local_rank)
-        except KeyError as e:
-            if force:
-                raise RuntimeError(
-                    f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
-                )
-        except Exception as e:
-            if force:
-                raise e
-
-    def _post_init(self) -> None:
-        assert isinstance(self.plugin, TorchDDPPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
-
-    def setup_distributed(self) -> None:
-        self._try_init_dist(force=True)
-        self.set_seed(self.seed)
-
-    def set_seed(self, seed: int) -> None:
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-
-    def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
-        return self.plugin.prepare_dataloader(
-            data_buffer,
-            batch_size=data_buffer.sample_batch_size,
-            shuffle=True,
-            drop_last=True,
-            pin_memory=pin_memory,
-            collate_fn=data_buffer.collate_fn,
-        )
-
-    def setup_sampler(self, dataset) -> DistributedSampler:
-        # FIXME(cwher): this is only invoked in train_on_ray, not tested after adapt Boost API.
-        return DistributedSampler(dataset, dist.get_world_size(), dist.get_rank())
-
-    def unwrap_model(self, model: nn.Module) -> nn.Module:
-        assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
-        return model.unwrap()
-
-    def save_pretrained(
-        self, model: nn.Module, path: str, shard: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None
-    ) -> None:
-        if dist.get_rank() == 0:
-            unwrapped_model = self.unwrap_model(model)
-            assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
-            pretrained_model = unwrapped_model.model
-            assert isinstance(pretrained_model, PreTrainedModel)
-            # HACK: only use hf save_pretrained to save config
-            pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
-            if tokenizer is not None:
-                tokenizer.save_pretrained(path)
-
-        model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model, model_path, shard=shard)
-        def _replace_keys(model_path: str, replace_fn: Callable):
-            state_dict = torch.load(model_path, map_location="cpu")
-            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
-            torch.save(state_dict, model_path)
-        # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
-        # HACK: rename keys of pytorch_model.bin
-        if dist.get_rank() == 0:
-            _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
-
-
-    def get_model_state_dict_shard(self, model: nn.Module, **config):
-        # TODO: implement sharding on naive strategy
-        model = self.unwrap_model(model)
-        if "requires_grad_only" in config and config["requires_grad_only"] == True:
-            state_dict = get_grad_required_state_dict(model)
-        else:
-            state_dict = model.state_dict()
-
-        if "shard_size" in config:
-            shard_size = config["shard_size"]
-            accumulate_size = 0
-            state_dict_shard = OrderedDict()
-            for name, param in state_dict.items():
-                state_dict_shard[name] = param
-                accumulate_size += param.numel() * param.element_size()
-                if accumulate_size >= shard_size:
-                    accumulate_size = 0
-                    yield state_dict_shard
-                    state_dict_shard = OrderedDict()
-            if accumulate_size > 0:
-                yield state_dict_shard
-        else:
-            yield state_dict
diff --git a/applications/Chat/coati/trainer/strategies/sampler.py b/applications/Chat/coati/trainer/strategies/sampler.py
deleted file mode 100644
index 6e811bef11a5..000000000000
--- a/applications/Chat/coati/trainer/strategies/sampler.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import math
-
-import numpy as np
-
-
-class DistributedSampler:
-    def __init__(self, dataset, num_replicas: int, rank: int) -> None:
-        self.dataset = dataset
-        self.num_replicas = num_replicas
-        self.rank = rank
-
-        if len(self.dataset) % self.num_replicas != 0:
-            self.num_samples = math.ceil(
-                (len(self.dataset) - self.num_replicas) / self.num_replicas  # type: ignore[arg-type]
-            )
-        else:
-            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
-
-        self.total_size = self.num_samples * self.num_replicas
-
-        indices = list(range(len(self.dataset)))
-        indices = indices[: self.total_size]
-        assert len(indices) == self.total_size
-        # subsample
-        indices = indices[self.rank : self.total_size : self.num_replicas]
-        assert len(indices) == self.num_samples
-        self.indices = indices
-
-    def sample(self, batch_size: int) -> list:
-        sampled_indices = np.random.choice(self.indices, batch_size, replace=False)
-        return [self.dataset[idx] for idx in sampled_indices]
diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py
index 6993b8bb8750..31b2d99d98d6 100644
--- a/applications/Chat/coati/trainer/utils.py
+++ b/applications/Chat/coati/trainer/utils.py
@@ -59,3 +59,11 @@ def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
 def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:
     dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
     return tensor
+
+
+def prepare_data_for_inference(
+    chosen_input_ids, reject_input_ids, chosen_attention_mask, reject_attention_mask, tokenizer
+):
+    # This function will truncate each vector in the batch after the first zero
+    assert tokenizer.bos_token_id != tokenizer.pad_token_id, "This bos token should not be the same as the pad token"
+    return None, None
diff --git a/applications/Chat/examples/download_model.py b/applications/Chat/examples/download_model.py
index ec3482b5f789..fa7c45665feb 100644
--- a/applications/Chat/examples/download_model.py
+++ b/applications/Chat/examples/download_model.py
@@ -5,11 +5,9 @@
 from typing import List
 
 import tqdm
-from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
-from coati.models.gpt import GPTRM, GPTActor, GPTCritic
-from coati.models.opt import OPTRM, OPTActor, OPTCritic
+from coati.models import Critic, RewardModel
 from huggingface_hub import hf_hub_download, snapshot_download
-from transformers import AutoConfig, AutoTokenizer, BloomConfig, BloomTokenizerFast, GPT2Config, GPT2Tokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 @dataclasses.dataclass
@@ -25,27 +23,14 @@ def download_all(self):
         snapshot_download(self.repo_id)
 
 
-def test_init(model: str, dir_path: str):
-    if model == "gpt2":
-        config = GPT2Config.from_pretrained(dir_path)
-        actor = GPTActor(config=config)
-        critic = GPTCritic(config=config)
-        reward_model = GPTRM(config=config)
-        GPT2Tokenizer.from_pretrained(dir_path)
-    elif model == "bloom":
-        config = BloomConfig.from_pretrained(dir_path)
-        actor = BLOOMActor(config=config)
-        critic = BLOOMCritic(config=config)
-        reward_model = BLOOMRM(config=config)
-        BloomTokenizerFast.from_pretrained(dir_path)
-    elif model == "opt":
-        config = AutoConfig.from_pretrained(dir_path)
-        actor = OPTActor(config=config)
-        critic = OPTCritic(config=config)
-        reward_model = OPTRM(config=config)
-        AutoTokenizer.from_pretrained(dir_path)
-    else:
-        raise NotImplementedError(f"Model {model} not implemented")
+def test_init(dir_path: str):
+    actor = AutoModelForCausalLM.from_pretrained(dir_path)
+    del actor
+    critic = Critic(dir_path)
+    del critic
+    reward_model = RewardModel(dir_path)
+    del reward_model
+    AutoTokenizer.from_pretrained(dir_path)
 
 
 if __name__ == "__main__":
@@ -76,4 +61,4 @@ def test_init(model: str, dir_path: str):
             repo_list[model_name].download(dir_path)
         else:
             repo_list[model_name].download_all()
-        test_init(model_name, dir_path)
+        test_init(dir_path)
diff --git a/applications/Chat/examples/generate_conversation_dataset.py b/applications/Chat/examples/generate_conversation_dataset.py
deleted file mode 100644
index 7e03b2d54260..000000000000
--- a/applications/Chat/examples/generate_conversation_dataset.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import argparse
-import json
-
-from datasets import load_dataset
-
-
-def generate_alpaca():
-    # We can convert dataset with the same format("instruction", "input", "output") as Alpaca into a one-round conversation.
-    conversation_dataset = []
-    dataset = load_dataset("tatsu-lab/alpaca", split="train")
-
-    instructions = dataset["instruction"]
-    inputs = dataset["input"]
-    outputs = dataset["output"]
-
-    assert len(instructions) == len(inputs) == len(outputs)
-
-    for idx in range(len(instructions)):
-        human_utterance = instructions[idx] + "\n\n" + inputs[idx] if inputs[idx] else instructions[idx]
-        human = {"from": "human", "value": human_utterance}
-
-        gpt_utterance = outputs[idx]
-        gpt = {"from": "gpt", "value": gpt_utterance}
-
-        conversation = dict(type="instruction", language="English", dataset="Alpaca", conversations=[human, gpt])
-        conversation_dataset.append(conversation)
-
-    return conversation_dataset
-
-
-def generate_sharegpt():
-    # ShareGPT data requires less processing.
-    conversation_dataset = []
-    dataset = load_dataset(
-        "anon8231489123/ShareGPT_Vicuna_unfiltered",
-        data_files="ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json",
-        split="train",
-    )
-
-    conversations = dataset["conversations"]
-
-    for idx in range(len(conversations)):
-        for conv in conversations[idx]:
-            # We don't need markdown and text value.
-            del conv["markdown"]
-            del conv["text"]
-
-        conversation = dict(
-            type="conversation", language="Multilingual", dataset="ShareGPT", conversations=conversations[idx]
-        )
-        conversation_dataset.append(conversation)
-
-    return conversation_dataset
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default="All",
-        choices=["Alpaca", "ShareGPT", "All"],
-        help="which dataset to convert, All will combine Alpaca and ShareGPT",
-    )
-    parser.add_argument("--save_path", type=str, default="dataset.json", help="path to save the converted dataset")
-    args = parser.parse_args()
-
-    conversation_dataset = []
-
-    if args.dataset == "Alpaca":
-        conversation_dataset.extend(generate_alpaca())
-    elif args.dataset == "ShareGPT":
-        conversation_dataset.extend(generate_sharegpt())
-    else:
-        conversation_dataset.extend(generate_alpaca())
-        conversation_dataset.extend(generate_sharegpt())
-
-    for idx, sample in enumerate(conversation_dataset):
-        sample["id"] = idx + 1
-
-    with open(args.save_path, mode="w") as f:
-        json.dump(conversation_dataset, f, indent=4, default=str, ensure_ascii=False)
diff --git a/applications/Chat/examples/generate_prompt_dataset.py b/applications/Chat/examples/generate_prompt_dataset.py
deleted file mode 100644
index 4eec6feae505..000000000000
--- a/applications/Chat/examples/generate_prompt_dataset.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import argparse
-import json
-import random
-
-random.seed(42)
-
-
-def sample(args):
-    with open(args.dataset_path, mode="r") as f:
-        dataset_list = json.load(f)
-
-    sampled_dataset = [
-        {"instruction": sample["instruction"], "id": idx}
-        for idx, sample in enumerate(random.sample(dataset_list, args.sample_size))
-    ]
-
-    with open(args.save_path, mode="w") as f:
-        json.dump(sampled_dataset, f, indent=4, default=str, ensure_ascii=False)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset_path", type=str, default=None, required=True, help="path to the pretrain dataset")
-    parser.add_argument("--save_path", type=str, default="prompt.json", help="path to save the prompt dataset")
-    parser.add_argument("--sample_size", type=int, default=16384, help="size of the prompt dataset")
-    args = parser.parse_args()
-    sample(args)
diff --git a/applications/Chat/examples/hostfile b/applications/Chat/examples/hostfile
index 91af58fb794c..4a0d939bb155 100644
--- a/applications/Chat/examples/hostfile
+++ b/applications/Chat/examples/hostfile
@@ -1 +1 @@
-192.168.0.36
+192.168.0.35
diff --git a/applications/Chat/examples/inference.py b/applications/Chat/examples/inference.py
index 62e06bf7b3bb..27d4b480ebd3 100644
--- a/applications/Chat/examples/inference.py
+++ b/applications/Chat/examples/inference.py
@@ -1,73 +1,142 @@
 import argparse
+import os
+from copy import deepcopy
 
 import torch
-from coati.models.bloom import BLOOMActor
-from coati.models.generation import generate
-from coati.models.gpt import GPTActor
-from coati.models.llama import LlamaActor
-from coati.models.opt import OPTActor
-from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
-
-
-def eval(args):
-    # configure model
-    if args.model == "gpt2":
-        actor = GPTActor(pretrained=args.pretrain)
-    elif args.model == "bloom":
-        actor = BLOOMActor(pretrained=args.pretrain)
-    elif args.model == "opt":
-        actor = OPTActor(pretrained=args.pretrain)
-    elif args.model == "llama":
-        actor = LlamaActor(pretrained=args.pretrain)
-    else:
-        raise ValueError(f'Unsupported model "{args.model}"')
-
-    actor.to(torch.cuda.current_device())
-    if args.model_path is not None:
-        state_dict = torch.load(args.model_path)
-        actor.load_state_dict(state_dict)
-
-    # configure tokenizer
-    if args.model == "gpt2":
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "bloom":
-        tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "opt":
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "llama":
-        tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        tokenizer.eos_token = "<\s>"
-        tokenizer.pad_token = tokenizer.unk_token
-    else:
-        raise ValueError(f'Unsupported model "{args.model}"')
-
-    actor.eval()
-    tokenizer.padding_side = "left"
-    input_ids = tokenizer.encode(args.input, return_tensors="pt").to(torch.cuda.current_device())
-    outputs = generate(
-        actor,
-        input_ids,
-        tokenizer=tokenizer,
-        max_length=args.max_length,
-        do_sample=True,
-        top_k=50,
-        top_p=0.95,
-        num_return_sequences=1,
+from coati.dataset.chatio import dummy_io, rich_io, simple_io
+from coati.dataset.conversation import default_conversation
+from coati.models import generate_streaming
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def get_gpu_memory(max_gpus=None):
+    gpu_memory = []
+    num_gpus = torch.cuda.device_count() if max_gpus is None else min(max_gpus, torch.cuda.device_count())
+
+    for gpu_id in range(num_gpus):
+        with torch.cuda.device(gpu_id):
+            device = torch.cuda.current_device()
+            gpu_properties = torch.cuda.get_device_properties(device)
+            total_memory = gpu_properties.total_memory / (1024**3)
+            allocated_memory = torch.cuda.memory_allocated() / (1024**3)
+            available_memory = total_memory - allocated_memory
+            gpu_memory.append(available_memory)
+    return gpu_memory
+
+
+def load_model_and_tokenizer(model_path, tokenizer_path, device="cuda", **kwargs):
+    model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    tokenizer.pad_token = tokenizer.eos_token
+    model.to(device)
+
+    return model, tokenizer
+
+
+def generation_wrapper(*args, **kwargs):
+    input_ids = args[1]
+    tokenizer = args[2]
+    for output in generate_streaming(*args, **kwargs):
+        yield tokenizer.batch_decode(output[:, input_ids.size(1) :], skip_special_tokens=True)[0]
+
+
+def main(args):
+    max_new_tokens = args.max_new_tokens
+    model_max_length = args.model_max_length
+    model, tokenizer = load_model_and_tokenizer(
+        args.model_path, args.tokenizer_path or args.model_path, local_files_only=True
     )
-    output = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
-    print(f"[Output]: {''.join(output)}")
+
+    assert max_new_tokens <= model_max_length
+    if not tokenizer.eos_token_id:
+        tokenizer.eos_token_id = "</s>"
+    tokenizer.padding_side = "left"
+
+    model_kwargs = {
+        "max_new_tokens": max_new_tokens,
+        # 'early_stopping': True,
+        # 'top_k': -1,
+        # 'top_p': 1.0,
+        # 'temperature': 1.0,
+        # 'temperature':0.1,
+    }
+    conv = deepcopy(default_conversation)
+
+    roles = conv.roles
+    round = 1
+
+    while True:
+        if args.io == "simple":
+            chat_io = simple_io
+        elif args.io == "rich":
+            chat_io = rich_io
+        elif args.io == "dummy":
+            chat_io = dummy_io
+        else:
+            raise ValueError(f"Unknown io type: {args.io}")
+        # raw_text = print(">>> Human:", end=" ")
+        inp = chat_io.prompt_for_input(conv.roles[0])
+
+        if not inp:
+            print("prompt should not be empty!")
+            continue
+
+        if inp.strip() == "clear":
+            conv.clear()
+            os.system("clear")
+            continue
+
+        if inp.strip() == "exit":
+            print("End of chat.")
+            break
+
+        query_text = inp.strip()
+
+        conv.append_message(roles[0], query_text)
+        conv.append_message(roles[1], None)
+
+        chat_io.prompt_for_output(conv.roles[1])
+
+        prompt = conv.get_prompt()
+        input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].to(
+            torch.cuda.current_device()
+        )
+        output_stream = generation_wrapper(
+            model,
+            input_ids,
+            tokenizer,
+            max_length=model_max_length,
+            temperature=0.7,
+            early_stopping=True,
+            **model_kwargs,
+        )
+
+        # print(f">>> Assistant:", end=" ")
+        outputs = chat_io.stream_output(output_stream)
+
+        conv.messages[-1][-1] = outputs.strip()
+
+        with open("round.txt", mode="a", encoding="utf-8") as f:
+            f.write("\n\n" + "=" * 10 + "\n")
+            f.write(f"round {round}:\n{conv.save_prompt()}\n\n")
+            f.write("=" * 10 + "\n")
+
+        # print(f">>> Assistant:", end=" ")
+
+        round += 1
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
-    # We suggest to use the pretrained model from HuggingFace, use pretrain to configure model
-    parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--model_path", type=str, default=None)
-    parser.add_argument("--input", type=str, default="Question: How are you ? Answer:")
-    parser.add_argument("--max_length", type=int, default=100)
+    parser.add_argument("--tokenizer_path", type=str, default=None)
+    parser.add_argument("--model_max_length", type=int, default=2048)
+    parser.add_argument("--max_new_tokens", type=int, default=512)
+    parser.add_argument("--io", type=str, default="rich", choices=["simple", "rich", "dummy"])
     args = parser.parse_args()
-    eval(args)
+    main(args)
+
+"""
+python inference.py --model_path /home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling \
+--tokenizer_path /mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/
+"""
diff --git a/applications/Chat/examples/out.txt b/applications/Chat/examples/out.txt
deleted file mode 100644
index 37b12d27dad4..000000000000
--- a/applications/Chat/examples/out.txt
+++ /dev/null
@@ -1,1319 +0,0 @@
-GPU Memory Usage:
-Now CUDA_VISIBLE_DEVICES is set to:
-CUDA_VISIBLE_DEVICES=0
-[11/16/23 11:17:06] INFO     colossalai - colossalai - INFO:
-                             /home/lcyab/data/anaconda/envs/colossal-llama2/lib/
-                             python3.8/site-packages/colossalai/initialize.py:63
-                              launch
-                    INFO     colossalai - colossalai - INFO: Distributed
-                             environment is initialized, world size: 1
-trainable params: 8110080 || all params: 567324672 || trainable%: 1.43
-BloomForCausalLM(
-  (transformer): BloomModel(
-    (word_embeddings): Embedding(250880, 1024)
-    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-    (h): ModuleList(
-      (0): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (1): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (2): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (3): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (4): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (5): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (6): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (7): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (8): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (9): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (10): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (11): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (12): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (13): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (14): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (15): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (16): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (17): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (18): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (19): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (20): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (21): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (22): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-      (23): BloomBlock(
-        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (self_attention): BloomAttention(
-          (query_key_value): Linear(
-            in_features=1024, out_features=3072, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=3072, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (dense): Linear(
-            in_features=1024, out_features=1024, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=1024, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (attention_dropout): Dropout(p=0.0, inplace=False)
-        )
-        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-        (mlp): BloomMLP(
-          (dense_h_to_4h): Linear(
-            in_features=1024, out_features=4096, bias=True
-            (lora_dropout): ModuleDict(
-              (default): Dropout(p=0.1, inplace=False)
-            )
-            (lora_A): ModuleDict(
-              (default): Linear(in_features=1024, out_features=30, bias=False)
-            )
-            (lora_B): ModuleDict(
-              (default): Linear(in_features=30, out_features=4096, bias=False)
-            )
-            (lora_embedding_A): ParameterDict()
-            (lora_embedding_B): ParameterDict()
-          )
-          (gelu_impl): BloomGelu()
-          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
-        )
-      )
-    )
-    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
-  )
-  (lm_head): Linear(in_features=1024, out_features=250880, bias=False)
-)
-Gradient checkpointing enabled successfully
-Loading data...
-Loaded 52002 examples.
-Limiting dataset to 200 examples.
-Formatting inputs...
-Tokenizing inputs... This may take some time...
-Loaded dataset.
-Warmup steps is set to 0
-Booster init max CUDA memory: 1113.03 MB
-Booster init max CPU memory: 4792.71 MB
-Error: failed to run torchrun --nproc_per_node=1 --nnodes=1 --node_rank=0 --master_addr=127.0.0.1 --master_port=28535 train_sft.py --pretrain bigscience/bloom-560m --plugin zero2 --save_path /home/lcyab/data/test_folder/model_checkpoint/gpt2 --dataset tatsu-lab/alpaca --batch_size 4 --max_epochs 1 --max_datasets_size 200 --accumulation_steps 8 --lr 2e-2 --lora_rank 30 --max_len 512 --max_epochs 1 --grad_checkpoint on 192.168.0.32, is localhost: False, exception: Encountered a bad command exit code!
-
-Command: 'cd /mnt/vepfs/lcyab/ColossalAI/applications/Chat/examples && export SHELL="/bin/bash" COLORTERM="truecolor" TERM_PROGRAM_VERSION="1.84.1" CONDA_EXE="/home/lcyab/data/anaconda/bin/conda" _LMFILES__modshare="/mnt/vepfs/modulefiles/cuda/11.7:1:/mnt/vepfs/modulefiles/clash:1" ENV="/usr/share/modules/init/profile.sh" PWD="/home/lcyab/data/ColossalAI/applications/Chat/examples" LOGNAME="lcyab" XDG_SESSION_TYPE="tty" CONDA_PREFIX="/home/lcyab/data/anaconda/envs/colossal-llama2" MODULESHOME="/usr/share/modules" MANPATH=":" C_INCLUDE_PATH_modshare="/mnt/vepfs/softwares/cuda-toolkits/11.7/include:1" MOTD_SHOWN="pam" HOME="/home/lcyab" LANG="en_US.UTF-8" LS_COLORS="rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:" LD_LIBRARY_PATH_modshare="/mnt/vepfs/softwares/cuda-toolkits/11.7/lib64:1" CONDA_PROMPT_MODIFIER="(colossal-llama2) " https_proxy="http://192.168.0.32:7890" SSH_CONNECTION="223.25.80.122 57221 192.168.0.36 22" PATH_modshare="/usr/bin:1:/home/lcyab/.local/bin:1:/usr/local/bin:1:/mnt/vepfs/softwares/cuda-toolkits/11.7/bin:1:/home/lcyab/data/anaconda/envs/colossal-llama2/bin:1:/home/lcyab/.vscode-server/bin/2b35e1e6d88f1ce073683991d1eff5284a32690f/bin/remote-cli:1:/bin:1:/snap/bin:1:/sbin:1:/home/lcyab/data/anaconda/condabin:1:/usr/sbin:1:/usr/games:1:/usr/local/sbin:1:/usr/local/games:1" CUDA_VISIBLE_DEVICES="0" LESSCLOSE="/usr/bin/lesspipe %s %s" MODULEPATH_modshare="/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1" XDG_SESSION_CLASS="user" LOADEDMODULES_modshare="cuda/11.7:1:clash:1" TERM="xterm-256color" CPLUS_INCLUDE_PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/include" LESSOPEN="| /usr/bin/lesspipe %s" USER="lcyab" VSCODE_GIT_IPC_HANDLE="/run/user/1002/vscode-git-3a0159ec83.sock" CONDA_SHLVL="3" LOADEDMODULES="clash:cuda/11.7" NCCL_IB_TIMEOUT="23" SHLVL="2" BASH_ENV="/usr/share/modules/init/bash" XDG_SESSION_ID="6941" http_proxy="http://192.168.0.32:7890" CONDA_PYTHON_EXE="/home/lcyab/data/anaconda/bin/python" LD_LIBRARY_PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/lib64" CPLUS_INCLUDE_PATH_modshare="/mnt/vepfs/softwares/cuda-toolkits/11.7/include:1" XDG_RUNTIME_DIR="/run/user/1002" SSH_CLIENT="223.25.80.122 57221 22" CONDA_DEFAULT_ENV="colossal-llama2" CUDA_HOME="/mnt/vepfs/softwares/cuda-toolkits/11.7" BROWSER="/home/lcyab/.vscode-server/bin/2b35e1e6d88f1ce073683991d1eff5284a32690f/bin/helpers/browser.sh" PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/bin:/home/lcyab/.vscode-server/bin/2b35e1e6d88f1ce073683991d1eff5284a32690f/bin/remote-cli:/home/lcyab/.local/bin:/home/lcyab/data/anaconda/envs/colossal-llama2/bin:/home/lcyab/data/anaconda/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin" MODULEPATH="/mnt/vepfs/modulefiles:/mnt/vepfs/modulefiles:/mnt/vepfs/modulefiles" NCCL_IB_RETRY_CNT="7" _LMFILES_="/mnt/vepfs/modulefiles/clash:/mnt/vepfs/modulefiles/cuda/11.7" DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1002/bus" C_INCLUDE_PATH="/mnt/vepfs/softwares/cuda-toolkits/11.7/include" CONDA_PREFIX_1="/home/lcyab/data/anaconda" CONDA_PREFIX_2="/home/lcyab/data/anaconda/envs/colossalqa" OLDPWD="/home/lcyab" MODULES_CMD="/usr/lib/x86_64-linux-gnu/modulecmd.tcl" TERM_PROGRAM="vscode" VSCODE_IPC_HOOK_CLI="/run/user/1002/vscode-ipc-6a17d9e7-4845-4f65-b164-b8e69318ffe4.sock" _="/home/lcyab/data/anaconda/envs/colossal-llama2/bin/colossalai" && torchrun --nproc_per_node=1 --nnodes=1 --node_rank=0 --master_addr=127.0.0.1 --master_port=28535 train_sft.py --pretrain bigscience/bloom-560m --plugin zero2 --save_path /home/lcyab/data/test_folder/model_checkpoint/gpt2 --dataset tatsu-lab/alpaca --batch_size 4 --max_epochs 1 --max_datasets_size 200 --accumulation_steps 8 --lr 2e-2 --lora_rank 30 --max_len 512 --max_epochs 1 --grad_checkpoint'
-
-Exit code: 1
-
-Stdout: already printed
-
-Stderr: already printed
-
-
-
-====== Training on All Nodes =====
-192.168.0.32: failure
-
-====== Stopping All Nodes =====
-192.168.0.32: finish
diff --git a/applications/Chat/examples/prepare_preference_dataset.py b/applications/Chat/examples/prepare_preference_dataset.py
new file mode 100644
index 000000000000..19f9aa9cc8a4
--- /dev/null
+++ b/applications/Chat/examples/prepare_preference_dataset.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Prepare sft dataset for finetuning
+"""
+
+import argparse
+import json
+import math
+import os
+import random
+import time
+from multiprocessing import cpu_count
+
+from coati.dataset.conversation import default_conversation
+from coati.dataset.spliced_and_tokenized_dataset import tokenize_rlhf
+from datasets import dataset_dict, load_dataset
+from transformers import AutoTokenizer
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data_input_dirs",
+        type=str,
+        required=True,
+        default=None,
+        help="Comma(i.e., ',') separated list of all data directories containing `.jsonl` data files.",
+    )
+    parser.add_argument(
+        "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
+    )
+    parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
+    parser.add_argument(
+        "--data_jsonl_output_dir",
+        type=str,
+        default="jsonl_output",
+        help="Output directory of spliced dataset with jsonl format",
+    )
+    parser.add_argument(
+        "--data_arrow_output_dir",
+        type=str,
+        default="arrow_output",
+        help="Output directory of spliced dataset with arrow format",
+    )
+    parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+    parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
+    parser.add_argument(
+        "--num_samples_per_datafile",
+        type=int,
+        default=-1,
+        help="Number of samples to be generated from each data file. -1 denote all samples.",
+    )
+    args = parser.parse_args()
+
+    if args.num_spliced_dataset_bins >= 100000:
+        raise ValueError("Too many spliced divisions, must be smaller than 100000")
+
+    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
+    assert not os.path.exists(
+        args.data_jsonl_output_dir
+    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
+    assert not os.path.exists(
+        args.data_arrow_output_dir
+    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
+    os.makedirs(args.data_jsonl_output_dir)
+    os.makedirs(args.data_arrow_output_dir)
+
+    # Prepare to all input datasets
+    input_data_paths = []
+    input_data_dirs = args.data_input_dirs.split(",")
+    for ds_dir in input_data_dirs:
+        ds_dir = os.path.abspath(ds_dir)
+        assert os.path.exists(ds_dir), f"Not find data dir {ds_dir}"
+        ds_files = [name for name in os.listdir(ds_dir) if name.endswith(".jsonl")]
+        ds_paths = [os.path.join(ds_dir, name) for name in ds_files]
+        input_data_paths.extend(ds_paths)
+
+    # Prepare to data splitting.
+    train_splits = []
+    split_interval = math.ceil(100 / args.num_spliced_dataset_bins)
+    for i in range(0, 100, split_interval):
+        start = i
+        end = i + split_interval
+        if end > 100:
+            end = 100
+        train_splits.append(f"train[{start}%:{end}%]")
+
+    # Prepare to the tokenizer.
+
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    list_dataset = load_dataset(
+        path="json",
+        data_files=input_data_paths,
+        cache_dir=os.path.join(args.data_cache_dir, "raw"),
+        keep_in_memory=False,
+        split=train_splits,
+        num_proc=cpu_count(),
+    )
+    for index, dataset in enumerate(list_dataset):
+        assert isinstance(dataset, dataset_dict.Dataset)
+        if args.num_samples_per_datafile > 0:
+            # limit the number of samples in each dataset
+            dataset = dataset.select(
+                random.sample(range(len(dataset)), min(args.num_samples_per_datafile, len(dataset)))
+            )
+        logger.info(f"Start to process part-{index}/{len(list_dataset)} of all original datasets.")
+        dataset = dataset.map(
+            function=tokenize_rlhf,
+            fn_kwargs={
+                "tokenizer": tokenizer,
+                "conversation_template": default_conversation,
+                "max_length": args.max_length,
+            },
+            keep_in_memory=False,
+            num_proc=min(len(dataset), cpu_count()),
+        )
+
+        dataset = dataset.filter(lambda data: data["chosen_input_ids"] is not None)
+
+        # Save each jsonl spliced dataset.
+        output_index = "0" * (5 - len(str(index))) + str(index)
+        output_name = f"part-{output_index}"
+        output_jsonl_path = os.path.join(args.data_jsonl_output_dir, output_name + ".jsonl")
+        st = time.time()
+        with open(file=output_jsonl_path, mode="w", encoding="utf-8") as fp_writer:
+            count = 0
+            for data_point in dataset:
+                if count % 500 == 0:
+                    logger.info(f"processing {count} spliced data points for {fp_writer.name}")
+                count += 1
+                fp_writer.write(json.dumps(data_point, ensure_ascii=False) + "\n")
+
+        logger.info(
+            f"Current file {fp_writer.name}; "
+            f"Data size: {len(dataset)}; "
+            f"Time cost: {round((time.time() - st) / 60, 6)} minutes."
+        )
+
+        # Save each arrow spliced dataset
+        output_arrow_path = os.path.join(args.data_arrow_output_dir, output_name)
+        logger.info(f"Start to save {output_arrow_path}")
+        dataset = load_dataset(
+            path="json",
+            data_files=[output_jsonl_path],
+            cache_dir=os.path.join(args.data_cache_dir, "tokenized"),
+            keep_in_memory=False,
+            num_proc=cpu_count(),
+            split="train",
+        )
+        dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(dataset), cpu_count()))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/applications/Chat/examples/prepare_preference_dataset.sh b/applications/Chat/examples/prepare_preference_dataset.sh
new file mode 100755
index 000000000000..03ffa52b5d1a
--- /dev/null
+++ b/applications/Chat/examples/prepare_preference_dataset.sh
@@ -0,0 +1,23 @@
+# generate tinny test dataset
+rm -rf /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/cache
+rm -rf /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/jsonl
+rm -rf /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow
+
+python prepare_preference_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprcessed \
+    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+    --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/cache \
+    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/jsonl \
+    --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow
+
+# generate tinny test dataset
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/cache
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/jsonl
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow
+
+# python prepare_preference_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprcessed \
+#     --num_samples_per_datafile 120 \
+#     --num_spliced_dataset_bins 1 \
+#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+#     --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/cache \
+#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/jsonl \
+#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow
diff --git a/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.py b/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.py
new file mode 100644
index 000000000000..8b0be53087b6
--- /dev/null
+++ b/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Prepare sft dataset for finetuning
+"""
+
+import argparse
+import json
+import math
+import os
+import random
+from multiprocessing import cpu_count
+
+from coati.dataset.spliced_and_tokenized_dataset import supervised_tokenize_pretrain
+from datasets import dataset_dict, load_dataset
+from transformers import AutoTokenizer
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data_input_dirs",
+        type=str,
+        required=True,
+        default=None,
+        help="Comma(i.e., ',') separated list of all data directories containing `.jsonl` data files.",
+    )
+    parser.add_argument(
+        "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
+    )
+    parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
+    parser.add_argument(
+        "--data_jsonl_output_dir",
+        type=str,
+        default="jsonl_output",
+        help="Output directory of spliced dataset with jsonl format",
+    )
+    parser.add_argument(
+        "--data_arrow_output_dir",
+        type=str,
+        default="arrow_output",
+        help="Output directory of spliced dataset with arrow format",
+    )
+    parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+    parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
+    parser.add_argument(
+        "--num_samples_per_datafile",
+        type=int,
+        default=-1,
+        help="Number of samples to be generated from each data file. -1 denote all samples.",
+    )
+    args = parser.parse_args()
+
+    if args.num_spliced_dataset_bins >= 100000:
+        raise ValueError("Too many spliced divisions, must be smaller than 100000")
+
+    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
+    assert not os.path.exists(
+        args.data_jsonl_output_dir
+    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
+    assert not os.path.exists(
+        args.data_arrow_output_dir
+    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
+    os.makedirs(args.data_jsonl_output_dir)
+    os.makedirs(args.data_arrow_output_dir)
+
+    # Prepare to all input datasets
+    input_data_paths = []
+    input_data_dirs = args.data_input_dirs.split(",")
+    for ds_dir in input_data_dirs:
+        ds_dir = os.path.abspath(ds_dir)
+        assert os.path.exists(ds_dir), f"Not find data dir {ds_dir}"
+        ds_files = [name for name in os.listdir(ds_dir) if name.endswith(".jsonl")]
+        ds_paths = [os.path.join(ds_dir, name) for name in ds_files]
+        input_data_paths.extend(ds_paths)
+
+    # Prepare to data splitting.
+    train_splits = []
+    split_interval = math.ceil(100 / args.num_spliced_dataset_bins)
+    for i in range(0, 100, split_interval):
+        start = i
+        end = i + split_interval
+        if end > 100:
+            end = 100
+        train_splits.append(f"train[{start}%:{end}%]")
+
+    # Prepare to the tokenizer.
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    list_dataset = load_dataset(
+        path="json",
+        data_files=input_data_paths,
+        cache_dir=os.path.join(args.data_cache_dir, "raw"),
+        keep_in_memory=False,
+        split=train_splits,
+        num_proc=cpu_count(),
+    )
+    for index, dataset in enumerate(list_dataset):
+        assert isinstance(dataset, dataset_dict.Dataset)
+        if args.num_samples_per_datafile > 0:
+            # limit the number of samples in each dataset
+            dataset = dataset.select(
+                random.sample(range(len(dataset)), min(args.num_samples_per_datafile, len(dataset)))
+            )
+        logger.info(f"Start to process part-{index}/{len(list_dataset)} of all original datasets.")
+        dataset = dataset.map(
+            function=supervised_tokenize_pretrain,
+            fn_kwargs={
+                "tokenizer": tokenizer,
+                "max_length": args.max_length,
+            },
+            keep_in_memory=False,
+            num_proc=min(len(dataset), cpu_count()),
+        )
+
+        dataset = dataset.filter(lambda data: data["labels"] is not None)
+        dataset = dataset.sort(column_names=("seq_category", "seq_length"), reverse=False, keep_in_memory=False)
+
+        # We don't concatenate data samples here.
+        spliced_dataset = dataset
+        # Save each jsonl spliced dataset.
+        output_index = "0" * (5 - len(str(index))) + str(index)
+        output_name = f"part-{output_index}"
+        output_jsonl_path = os.path.join(args.data_jsonl_output_dir, output_name + ".jsonl")
+        # st = time.time()
+        with open(file=output_jsonl_path, mode="w", encoding="utf-8") as fp_writer:
+            spliced_count = 0
+            for spliced_data_point in spliced_dataset:
+                if spliced_count % 500 == 0:
+                    logger.info(f"processing {spliced_count} spliced data points for {fp_writer.name}")
+                spliced_count += 1
+                fp_writer.write(json.dumps(spliced_data_point, ensure_ascii=False) + "\n")
+
+        # Save each arrow spliced dataset
+        output_arrow_path = os.path.join(args.data_arrow_output_dir, output_name)
+        logger.info(f"Start to save {output_arrow_path}")
+        spliced_dataset = load_dataset(
+            path="json",
+            data_files=[output_jsonl_path],
+            cache_dir=os.path.join(args.data_cache_dir, "spliced_and_tokenized"),
+            keep_in_memory=False,
+            num_proc=cpu_count(),
+            split="train",
+        )
+        spliced_dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(spliced_dataset), cpu_count()))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh b/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh
new file mode 100755
index 000000000000..4516fb336ea2
--- /dev/null
+++ b/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh
@@ -0,0 +1,24 @@
+rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/cache
+rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/jsonl
+rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow
+
+python prepare_pretrain_dataset_for_ppo_with_ptx_loss.py --data_input_dirs /home/lcyab/data/pretrain_data/cleaned_wiki_en/preprocessed \
+    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+    --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/cache \
+    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/jsonl \
+    --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow \
+    --num_spliced_dataset_bins 1 \
+    --num_samples_per_datafile 500
+
+
+# rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data/cache
+# rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data/jsonl
+# rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data/arrow
+
+# python prepare_sft_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprocessed_prompt \
+#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+#     --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_sft_data/cache \
+#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data/jsonl \
+#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data/arrow \
+#     --num_spliced_dataset_bins 1 \
+#     --num_samples_per_datafile 500
diff --git a/applications/Chat/examples/prepare_prompt_dataset.py b/applications/Chat/examples/prepare_prompt_dataset.py
new file mode 100644
index 000000000000..e1d2472dec00
--- /dev/null
+++ b/applications/Chat/examples/prepare_prompt_dataset.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Prepare sft dataset for finetuning
+"""
+
+import argparse
+import json
+import math
+import os
+import random
+from multiprocessing import cpu_count
+
+from coati.dataset.conversation import default_conversation
+from coati.dataset.spliced_and_tokenized_dataset import tokenize_prompt_dataset
+from datasets import dataset_dict, load_dataset
+from transformers import AutoTokenizer
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data_input_dirs",
+        type=str,
+        required=True,
+        default=None,
+        help="Comma(i.e., ',') separated list of all data directories containing `.jsonl` data files.",
+    )
+    parser.add_argument(
+        "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
+    )
+    parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
+    parser.add_argument(
+        "--data_jsonl_output_dir",
+        type=str,
+        default="jsonl_output",
+        help="Output directory of spliced dataset with jsonl format",
+    )
+    parser.add_argument(
+        "--data_arrow_output_dir",
+        type=str,
+        default="arrow_output",
+        help="Output directory of spliced dataset with arrow format",
+    )
+    parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+    parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
+    parser.add_argument(
+        "--num_samples_per_datafile",
+        type=int,
+        default=-1,
+        help="Number of samples to be generated from each data file. -1 denote all samples.",
+    )
+    args = parser.parse_args()
+
+    if args.num_spliced_dataset_bins >= 100000:
+        raise ValueError("Too many spliced divisions, must be smaller than 100000")
+
+    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
+    assert not os.path.exists(
+        args.data_jsonl_output_dir
+    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
+    assert not os.path.exists(
+        args.data_arrow_output_dir
+    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
+    os.makedirs(args.data_jsonl_output_dir)
+    os.makedirs(args.data_arrow_output_dir)
+
+    # Prepare to all input datasets
+    input_data_paths = []
+    input_data_dirs = args.data_input_dirs.split(",")
+    for ds_dir in input_data_dirs:
+        ds_dir = os.path.abspath(ds_dir)
+        assert os.path.exists(ds_dir), f"Not find data dir {ds_dir}"
+        ds_files = [name for name in os.listdir(ds_dir) if name.endswith(".jsonl")]
+        ds_paths = [os.path.join(ds_dir, name) for name in ds_files]
+        input_data_paths.extend(ds_paths)
+
+    # Prepare to data splitting.
+    train_splits = []
+    split_interval = math.ceil(100 / args.num_spliced_dataset_bins)
+    for i in range(0, 100, split_interval):
+        start = i
+        end = i + split_interval
+        if end > 100:
+            end = 100
+        train_splits.append(f"train[{start}%:{end}%]")
+
+    # Prepare to the tokenizer.
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    list_dataset = load_dataset(
+        path="json",
+        data_files=input_data_paths,
+        cache_dir=os.path.join(args.data_cache_dir, "raw"),
+        keep_in_memory=False,
+        split=train_splits,
+        num_proc=cpu_count(),
+    )
+    for index, dataset in enumerate(list_dataset):
+        assert isinstance(dataset, dataset_dict.Dataset)
+        if args.num_samples_per_datafile > 0:
+            # limit the number of samples in each dataset
+            dataset = dataset.select(
+                random.sample(range(len(dataset)), min(args.num_samples_per_datafile, len(dataset)))
+            )
+        logger.info(f"Start to process part-{index}/{len(list_dataset)} of all original datasets.")
+        dataset = dataset.map(
+            function=tokenize_prompt_dataset,
+            fn_kwargs={
+                "tokenizer": tokenizer,
+                "conversation_template": default_conversation,
+                "max_length": args.max_length,
+            },
+            keep_in_memory=False,
+            num_proc=min(len(dataset), cpu_count()),
+        )
+
+        dataset = dataset.filter(lambda data: data["input_ids"] is not None)
+        dataset = dataset.sort(column_names=("seq_category", "seq_length"), reverse=False, keep_in_memory=False)
+
+        # We don't concatenate data samples here.
+        spliced_dataset = dataset
+        # Save each jsonl spliced dataset.
+        output_index = "0" * (5 - len(str(index))) + str(index)
+        output_name = f"part-{output_index}"
+        output_jsonl_path = os.path.join(args.data_jsonl_output_dir, output_name + ".jsonl")
+        # st = time.time()
+        with open(file=output_jsonl_path, mode="w", encoding="utf-8") as fp_writer:
+            spliced_count = 0
+            for spliced_data_point in spliced_dataset:
+                if spliced_count % 500 == 0:
+                    logger.info(f"processing {spliced_count} spliced data points for {fp_writer.name}")
+                spliced_count += 1
+                fp_writer.write(json.dumps(spliced_data_point, ensure_ascii=False) + "\n")
+
+        # Save each arrow spliced dataset
+        output_arrow_path = os.path.join(args.data_arrow_output_dir, output_name)
+        logger.info(f"Start to save {output_arrow_path}")
+        spliced_dataset = load_dataset(
+            path="json",
+            data_files=[output_jsonl_path],
+            cache_dir=os.path.join(args.data_cache_dir, "spliced_and_tokenized"),
+            keep_in_memory=False,
+            num_proc=cpu_count(),
+            split="train",
+        )
+        spliced_dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(spliced_dataset), cpu_count()))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/applications/Chat/examples/prepare_prompt_dataset.sh b/applications/Chat/examples/prepare_prompt_dataset.sh
new file mode 100755
index 000000000000..b27d627e1d91
--- /dev/null
+++ b/applications/Chat/examples/prepare_prompt_dataset.sh
@@ -0,0 +1,22 @@
+rm -rf /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/cache
+rm -rf /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/jsonl
+rm -rf /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow
+
+python prepare_prompt_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprocessed_prompt \
+    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+    --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/cache \
+    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/jsonl \
+    --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow
+
+
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/cache
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/jsonl
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow
+
+# python prepare_prompt_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprocessed_prompt \
+#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+#     --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/cache \
+#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/jsonl \
+#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow \
+#     --num_spliced_dataset_bins 1 \
+#     --num_samples_per_datafile 500
diff --git a/applications/Chat/examples/prepare_sft_dataset.py b/applications/Chat/examples/prepare_sft_dataset.py
new file mode 100644
index 000000000000..6680ab09bb20
--- /dev/null
+++ b/applications/Chat/examples/prepare_sft_dataset.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Prepare sft dataset for finetuning
+"""
+
+import argparse
+import json
+import math
+import os
+import random
+from multiprocessing import cpu_count
+
+from coati.dataset.conversation import default_conversation
+from coati.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
+from datasets import dataset_dict, load_dataset
+from transformers import AutoTokenizer
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data_input_dirs",
+        type=str,
+        required=True,
+        default=None,
+        help="Comma(i.e., ',') separated list of all data directories containing `.jsonl` data files.",
+    )
+    parser.add_argument(
+        "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
+    )
+    parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
+    parser.add_argument(
+        "--data_jsonl_output_dir",
+        type=str,
+        default="jsonl_output",
+        help="Output directory of spliced dataset with jsonl format",
+    )
+    parser.add_argument(
+        "--data_arrow_output_dir",
+        type=str,
+        default="arrow_output",
+        help="Output directory of spliced dataset with arrow format",
+    )
+    parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+    parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
+    parser.add_argument(
+        "--num_samples_per_datafile",
+        type=int,
+        default=-1,
+        help="Number of samples to be generated from each data file. -1 denote all samples.",
+    )
+    args = parser.parse_args()
+
+    if args.num_spliced_dataset_bins >= 100000:
+        raise ValueError("Too many spliced divisions, must be smaller than 100000")
+
+    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
+    assert not os.path.exists(
+        args.data_jsonl_output_dir
+    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
+    assert not os.path.exists(
+        args.data_arrow_output_dir
+    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
+    os.makedirs(args.data_jsonl_output_dir)
+    os.makedirs(args.data_arrow_output_dir)
+
+    # Prepare to all input datasets
+    input_data_paths = []
+    input_data_dirs = args.data_input_dirs.split(",")
+    for ds_dir in input_data_dirs:
+        ds_dir = os.path.abspath(ds_dir)
+        assert os.path.exists(ds_dir), f"Not find data dir {ds_dir}"
+        ds_files = [name for name in os.listdir(ds_dir) if name.endswith(".jsonl")]
+        ds_paths = [os.path.join(ds_dir, name) for name in ds_files]
+        input_data_paths.extend(ds_paths)
+
+    # Prepare to data splitting.
+    train_splits = []
+    split_interval = math.ceil(100 / args.num_spliced_dataset_bins)
+    for i in range(0, 100, split_interval):
+        start = i
+        end = i + split_interval
+        if end > 100:
+            end = 100
+        train_splits.append(f"train[{start}%:{end}%]")
+
+    # Prepare to the tokenizer.
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    list_dataset = load_dataset(
+        path="json",
+        data_files=input_data_paths,
+        cache_dir=os.path.join(args.data_cache_dir, "raw"),
+        keep_in_memory=False,
+        split=train_splits,
+        num_proc=cpu_count(),
+    )
+    for index, dataset in enumerate(list_dataset):
+        assert isinstance(dataset, dataset_dict.Dataset)
+        if args.num_samples_per_datafile > 0:
+            # limit the number of samples in each dataset
+            dataset = dataset.select(
+                random.sample(range(len(dataset)), min(args.num_samples_per_datafile, len(dataset)))
+            )
+        logger.info(f"Start to process part-{index}/{len(list_dataset)} of all original datasets.")
+        dataset = dataset.map(
+            function=supervised_tokenize_sft,
+            fn_kwargs={
+                "tokenizer": tokenizer,
+                "conversation_template": default_conversation,
+                "max_length": args.max_length,
+            },
+            keep_in_memory=False,
+            num_proc=min(len(dataset), cpu_count()),
+        )
+
+        dataset = dataset.filter(lambda data: data["labels"] is not None)
+        dataset = dataset.sort(column_names=("seq_category", "seq_length"), reverse=False, keep_in_memory=False)
+
+        # We don't concatenate data samples here.
+        spliced_dataset = dataset
+        # Save each jsonl spliced dataset.
+        output_index = "0" * (5 - len(str(index))) + str(index)
+        output_name = f"part-{output_index}"
+        output_jsonl_path = os.path.join(args.data_jsonl_output_dir, output_name + ".jsonl")
+        # st = time.time()
+        with open(file=output_jsonl_path, mode="w", encoding="utf-8") as fp_writer:
+            spliced_count = 0
+            for spliced_data_point in spliced_dataset:
+                if spliced_count % 500 == 0:
+                    logger.info(f"processing {spliced_count} spliced data points for {fp_writer.name}")
+                spliced_count += 1
+                fp_writer.write(json.dumps(spliced_data_point, ensure_ascii=False) + "\n")
+
+        # Save each arrow spliced dataset
+        output_arrow_path = os.path.join(args.data_arrow_output_dir, output_name)
+        logger.info(f"Start to save {output_arrow_path}")
+        spliced_dataset = load_dataset(
+            path="json",
+            data_files=[output_jsonl_path],
+            cache_dir=os.path.join(args.data_cache_dir, "spliced_and_tokenized"),
+            keep_in_memory=False,
+            num_proc=cpu_count(),
+            split="train",
+        )
+        spliced_dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(spliced_dataset), cpu_count()))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/applications/Chat/examples/prepare_sft_dataset.sh b/applications/Chat/examples/prepare_sft_dataset.sh
new file mode 100755
index 000000000000..aae62505647b
--- /dev/null
+++ b/applications/Chat/examples/prepare_sft_dataset.sh
@@ -0,0 +1,22 @@
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/cache
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/jsonl
+# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/arrow
+
+# python prepare_sft_dataset.py --data_input_dirs /mnt/tos/lcxyc/experiments/experiments5/tokenized_sft_data/jsonl \
+#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+#     --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/cache \
+#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/jsonl \
+#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/arrow \
+#     --num_spliced_dataset_bins 1 \
+#     --num_samples_per_datafile 500
+
+
+rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/cache
+rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/jsonl
+rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow
+
+python prepare_sft_dataset.py --data_input_dirs /mnt/tos/lcxyc/experiments/experiments5/tokenized_sft_data/jsonl \
+    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
+    --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/cache \
+    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/jsonl \
+    --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow \
diff --git a/applications/Chat/examples/train_dpo.py b/applications/Chat/examples/train_dpo.py
index a7b2af36be6e..4e78b42c6e0c 100644
--- a/applications/Chat/examples/train_dpo.py
+++ b/applications/Chat/examples/train_dpo.py
@@ -1,6 +1,5 @@
 import argparse
 import json
-import math
 import os
 import resource
 from contextlib import nullcontext
@@ -89,16 +88,6 @@ def train(args):
         model = AutoModelForCausalLM.from_pretrained(args.pretrain)
         ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
 
-        # debug tiny model
-        # model = transformers.LlamaForCausalLM(
-        #     transformers.LlamaConfig(hidden_size=512, intermediate_size=1536, num_attention_heads=8, num_hidden_layers=4
-        #     )
-        # )
-        # ref_model = transformers.LlamaForCausalLM(
-        #     transformers.LlamaConfig(hidden_size=512, intermediate_size=1536, num_attention_heads=8, num_hidden_layers=4
-        #     )
-        # )
-
         # TODO: set dropout to 0 here
         # for llama2, dropout is 0 by default, hence skip.
         if args.lora_rank > 0:
@@ -119,13 +108,7 @@ def train(args):
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
     tokenizer.padding_side = "right"
     tokenizer.pad_token = tokenizer.eos_token
-    coordinator.print_on_master(
-        f"Tokenizer pad token: {tokenizer.pad_token}, Tokenizer padding side: {tokenizer.padding_side}"
-    )
 
-    # test_res = model.generate(tokenizer.encode("tell a story about a cat.\n", return_tensors='pt'),
-    #                           max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.9)
-    # coordinator.print_on_master(f"Test generate: {tokenizer.decode(test_res[0])}")
     # configure optimizer
     optim = HybridAdam(
         model_params=model.parameters(),
@@ -150,8 +133,6 @@ def train(args):
     )
 
     num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
-    math.ceil(args.max_epochs * num_update_steps_per_epoch)
-
     if args.warmup_steps is None:
         args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
         coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
diff --git a/applications/Chat/examples/train_dpo.sh b/applications/Chat/examples/train_dpo.sh
index 15b919a09e03..db249003672d 100755
--- a/applications/Chat/examples/train_dpo.sh
+++ b/applications/Chat/examples/train_dpo.sh
@@ -26,9 +26,9 @@ export OMP_NUM_THREADS=8
 
 
 PROJECT_NAME="llama2-dpo"
-PARENT_SAVE_DIR="./output/ckpt"
-PARENT_TENSORBOARD_DIR="./output/tensorboard"
-PARENT_CONFIG_FILE="./output/train_config"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output/train_config"
 PRETRAINED_MODEL_PATH="/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
 PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
 declare -a dataset=(
@@ -55,7 +55,7 @@ colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
     --plugin "3d" \
-    --save_interval 500 \
+    --save_interval 1000 \
     --save_dir $SAVE_DIR \
     --config_file $CONFIG_FILE \
     --max_epochs 5 \
diff --git a/applications/Chat/examples/train_ppo.py b/applications/Chat/examples/train_ppo.py
index 7625f94c596f..d67aab333cbf 100644
--- a/applications/Chat/examples/train_ppo.py
+++ b/applications/Chat/examples/train_ppo.py
@@ -1,217 +1,363 @@
 import argparse
-import warnings
+import os
+import resource
+from contextlib import nullcontext
 
 import torch
-import torch.distributed as dist
-from coati.dataset import PromptDataset, SupervisedDataset
-from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
-from coati.models.gpt import GPTRM, GPTActor, GPTCritic
-from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
-from coati.models.opt import OPTRM, OPTActor, OPTCritic
+from coati.dataset import (
+    DataCollatorForPromptDataset,
+    DataCollatorForSupervisedDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from coati.models import Critic, RewardModel, convert_to_lora_module
 from coati.trainer import PPOTrainer
-from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
-from torch.optim import Adam
-from torch.optim.lr_scheduler import CosineAnnealingLR
-from torch.utils.data import DataLoader
-from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoConfig, AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
+from coati.utils import load_checkpoint, replace_with_flash_attention
 
+# from colossalai.utils import get_current_deviceDDPStrategy, GeminiStrategy, LowLevelZeroStrategy
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
 
 
-def main(args):
-    # configure strategy
-    if args.strategy == "ddp":
-        strategy = DDPStrategy()
-    elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
-    elif args.strategy == "colossalai_zero2":
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
-    else:
-        raise ValueError(f'Unsupported strategy "{args.strategy}"')
-
-    if args.rm_path is not None:
-        warnings.warn("LoRA weights should be merged with the model weights")
-        state_dict = torch.load(args.rm_path, map_location="cpu")
-
-    with strategy.model_init_context():
-        # configure model
-        # TODO: add support for llama
-        if args.model == "gpt2":
-            initial_model = GPTActor(pretrained=args.pretrain)
-        elif args.model == "bloom":
-            initial_model = BLOOMActor(pretrained=args.pretrain)
-        elif args.model == "opt":
-            initial_model = OPTActor(pretrained=args.pretrain)
-        else:
-            raise ValueError(f'Unsupported actor model "{args.model}"')
+def train(args):
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
 
-        if args.rm_model is None:
-            rm_model_name = args.model
-        else:
-            rm_model_name = args.rm_model
-
-        if rm_model_name == "gpt2":
-            reward_model = GPTRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
-        elif rm_model_name == "bloom":
-            reward_model = BLOOMRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
-        elif rm_model_name == "opt":
-            reward_model = OPTRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
-        elif rm_model_name == "llama":
-            reward_model = LlamaRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
-        else:
-            raise ValueError(f'Unsupported reward model "{rm_model_name}"')
-
-        if args.rm_path is not None:
-            reward_model.load_state_dict(state_dict, strict=True)
-
-        initial_model.to(torch.cuda.current_device())
-        reward_model.to(torch.cuda.current_device())
-
-        if args.model == "gpt2":
-            config = AutoConfig.from_pretrained(args.pretrain)
-            config.embd_pdrop = 0.000
-            config.attn_pdrop = 0.0001
-            config.resid_pdrop = 0.0001
-            actor = GPTActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
-        elif args.model == "bloom":
-            config = AutoConfig.from_pretrained(args.pretrain)
-            # TODO: find a proper hyperparameter setting for BLOOM
-            config.attention_dropout = 0.0001
-            config.hidden_dropout = 0.0001
-            actor = BLOOMActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
-        elif args.model == "opt":
-            config = AutoConfig.from_pretrained(args.pretrain)
-            # TODO: find a proper hyperparameter setting for OPT
-            config.attention_dropout = 0.0001
-            config.dropout = 0.0001
-            config.layerdrop = 0.000
-            actor = OPTActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
-        elif args.model == "llama":
-            # Note: llama disable dropout by default
-            actor = LlamaActor(pretrained=args.pretrain, config=config, lora_rank=args.lora_rank)
-        else:
-            raise ValueError(f'Unsupported actor model "{args.model}"')
-
-        if args.model == "gpt2":
-            critic = GPTCritic(pretrained="gpt2", lora_rank=args.lora_rank)
-        elif args.model == "bloom":
-            critic = BLOOMCritic(pretrained="bigscience/bloom-560m", lora_rank=args.lora_rank)
-        elif args.model == "opt":
-            critic = OPTCritic(pretrained="facebook/opt-350m", lora_rank=args.lora_rank)
-        elif args.model == "llama":
-            critic = LlamaCritic(pretrained=args.pretrain, lora_rank=args.lora_rank)
-        else:
-            raise ValueError(f'Unsupported reward model "{rm_model_name}"')
-        actor.to(torch.cuda.current_device())
-        critic.to(torch.cuda.current_device())
-        strategy.save_pretrained(actor, path=args.save_path)
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in args.plugin else nullcontext()
 
-    # configure optimizer
-    if args.strategy.startswith("colossalai"):
-        actor_optim = HybridAdam(actor.parameters(), lr=args.lr)
-        critic_optim = HybridAdam(critic.parameters(), lr=0.001)
-    else:
-        actor_optim = Adam(actor.parameters(), lr=args.lr)
-        critic_optim = Adam(critic.parameters(), lr=args.critic_lr)
+    booster_policy = None
+    with init_ctx:
+        actor = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
+        ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
+        reward_model = RewardModel(args.rm_pretrain)
+        critic = Critic(args.rm_pretrain)
+
+        if args.tp > 1:
+            if reward_model.model.config.architectures[0] != critic.model.config.architectures[0]:
+                raise ValueError("Reward model and critic model must have the same architecture")
+            if reward_model.model.config.architectures[0] == "BloomForCausalLM":
+                from colossalai.shardformer.policies.bloom import BloomPolicy
+
+                booster_policy = BloomPolicy()
+            elif reward_model.model.config.architectures[0] == "LlamaForCausalLM":
+                from colossalai.shardformer.policies.llama import LlamaPolicy
+
+                booster_policy = LlamaPolicy()
+            elif reward_model.model.config.architectures[0] == "GPT2LMHeadModel":
+                from colossalai.shardformer.policies.gpt2 import GPT2Policy
+
+                booster_policy = GPT2Policy()
+            elif reward_model.model.config.architectures[0] == "ChatGLMModel":
+                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
+
+                booster_policy = ChatGLMPolicy()
+            elif reward_model.model.config.architectures[0] == "OPTForCausalLM":
+                from colossalai.shardformer.policies.opt import OPTPolicy
+
+                booster_policy = OPTPolicy()
+            else:
+                raise ValueError("Unknown model architecture for policy")
+
+        # TODO: set dropout to 0 here
+        # for llama2, dropout is 0 by default, hence skip.
+        if args.lora_rank > 0:
+            actor = convert_to_lora_module(actor, args.lora_rank, lora_train_bias=args.lora_train_bias)
+            critic = convert_to_lora_module(critic, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        actor.gradient_checkpointing_enable()
+        critic.model.gradient_checkpointing_enable()
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=actor)
+        replace_with_flash_attention(model=critic)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
 
     # configure tokenizer
-    if args.model == "gpt2":
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "bloom":
-        tokenizer = BloomTokenizerFast.from_pretrained(
-            "bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer.padding_side = "left"  # left padding for generation (online learning)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # configure optimizer
+    coordinator.print_on_master(f"setting up optimizer for actor: lr={args.lr}, weight_decay={args.weight_decay}")
+    actor_optim = HybridAdam(
+        model_params=actor.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    coordinator.print_on_master(f"setting up optimizer for critic: lr={args.lr}, weight_decay={args.weight_decay}")
+    critic_optim = HybridAdam(
+        model_params=critic.parameters(),
+        lr=args.critic_lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.prompt_dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_prompt_dataset = load_tokenized_dataset(dataset_paths=args.prompt_dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPromptDataset(tokenizer=tokenizer, max_length=args.max_length - args.max_seq_len)
+    train_prompt_dataloader = setup_distributed_dataloader(
+        dataset=train_prompt_dataset,
+        batch_size=args.experience_batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        use_tp=args.tp > 1,
+    )
+
+    if len(args.pretrain_dataset) > 0:
+        train_pretrain_dataset = load_tokenized_dataset(
+            dataset_paths=args.pretrain_dataset, mode="train", mode_map=mode_map
         )
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "opt":
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
-        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == "llama":
-        tokenizer = LlamaTokenizer.from_pretrained(
-            "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
+        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_length)
+        train_pretrain_dataloader = setup_distributed_dataloader(
+            dataset=train_pretrain_dataset,
+            batch_size=args.ptx_batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=data_collator,
+            use_tp=args.tp > 1,
         )
-        tokenizer.eos_token = "<\s>"
-        tokenizer.pad_token = tokenizer.unk_token
     else:
-        raise ValueError(f'Unsupported model "{args.model}"')
-    # NOTE: generate() requires padding_side to be "left"
-    tokenizer.padding_side = "left"
+        train_pretrain_dataloader = None
 
-    # configure tokenizer
-    rm_model_tokenizer = AutoTokenizer.from_pretrained(args.reward_model_tokenizer)
-    rm_model_tokenizer.pad_token = rm_model_tokenizer.eos_token
+    if args.warmup_steps is None:
+        args.warmup_steps = int(0.025 * args.num_episodes)
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
 
-    prompt_dataset = PromptDataset(
-        tokenizer=tokenizer,
-        data_path=args.prompt_dataset,
-        max_datasets_size=args.max_datasets_size,
-        max_length=args.max_input_len,
-    )
-    if dist.is_initialized() and dist.get_world_size() > 1:
-        prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True)
-    else:
-        prompt_sampler = None
-    prompt_dataloader = DataLoader(
-        prompt_dataset, shuffle=(prompt_sampler is None), sampler=prompt_sampler, batch_size=args.experience_batch_size
+    actor_lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=actor_optim,
+        total_steps=args.num_episodes,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
     )
 
-    pretrain_dataset = SupervisedDataset(
-        tokenizer=tokenizer,
-        data_path=args.pretrain_dataset,
-        max_datasets_size=args.max_datasets_size,
-        max_length=args.max_input_len,
+    critic_lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=critic_optim,
+        total_steps=args.num_episodes,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
     )
-    if dist.is_initialized() and dist.get_world_size() > 1:
-        pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True)
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+        )
+        custom_plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+            custom_policy=booster_policy,
+        )
     else:
-        pretrain_sampler = None
-    pretrain_dataloader = DataLoader(
-        pretrain_dataset, shuffle=(pretrain_sampler is None), sampler=pretrain_sampler, batch_size=args.ptx_batch_size
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    if args.plugin != "3d":
+        custom_plugin = plugin
+
+    actor_booster = Booster(plugin=plugin)
+    ref_booster = Booster(plugin=plugin)
+    rm_booster = Booster(plugin=custom_plugin)
+    critic_booster = Booster(plugin=custom_plugin)
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    actor, actor_optim, _, train_prompt_dataloader, actor_lr_scheduler = actor_booster.boost(
+        model=actor,
+        optimizer=actor_optim,
+        lr_scheduler=actor_lr_scheduler,
+        dataloader=train_prompt_dataloader,
     )
 
-    # NOTE: For small models like opt-1.3b, reward model and initial model are not required to be parallelized.
-    (critic, critic_optim), reward_model, initial_model = strategy.prepare(
-        (critic, critic_optim), reward_model, initial_model
+    critic, critic_optim, _, _, critic_lr_scheduler = critic_booster.boost(
+        model=critic,
+        optimizer=critic_optim,
+        lr_scheduler=critic_lr_scheduler,
+        dataloader=train_prompt_dataloader,
     )
+    reward_model, _, _, _, _ = rm_booster.boost(model=reward_model, dataloader=train_prompt_dataloader)
+    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_prompt_dataloader)
+
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    sampler_start_idx = 0
+    start_step = 0
+
+    if args.rm_checkpoint_path is not None:
+        if "modeling" in args.rm_checkpoint_path:
+            rm_booster.load_model(reward_model, args.rm_checkpoint_path)
+        else:
+            _, _, _ = load_checkpoint(
+                load_dir=args.rm_checkpoint_path,
+                booster=rm_booster,
+                model=reward_model,
+                optimizer=None,
+                lr_scheduler=None,
+            )
+        coordinator.print_on_master(f"Loaded reward model checkpoint {args.rm_checkpoint_path}")
+
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            actor_booster.load_model(actor, args.checkpoint_path)
+            ref_booster.load_model(ref_model, args.checkpoint_path)
+            coordinator.print_on_master(f"Loaded actor and reference model {args.checkpoint_path}")
+        else:
+            _, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=actor_booster,
+                model=actor,
+                optimizer=actor_optim,
+                lr_scheduler=actor_lr_scheduler,
+            )
+            _, _, _ = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=ref_booster,
+                model=ref_model,
+                optimizer=critic_optim,
+                lr_scheduler=critic_lr_scheduler,
+            )
+            assert isinstance(train_prompt_dataloader.sampler, StatefulDistributedSampler)
+            train_prompt_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded actor and reference model checkpoint {args.checkpoint_path} at spisode {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
 
-    lr_scheduler = CosineAnnealingLR(actor_optim, args.num_episodes)
-    strategy_dict = strategy.prepare(dict(model=actor, optimizer=actor_optim, lr_scheduler=lr_scheduler))
-    actor = strategy_dict["model"]
-    actor_optim = strategy_dict["optimizer"]
-    actor_lr_scheduler = strategy_dict["lr_scheduler"]
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    if args.critic_checkpoint_path is not None:
+        if "modeling" in args.critic_checkpoint_path:
+            critic_booster.load_model(critic, args.critic_checkpoint_path)
+        else:
+            _, _, _ = load_checkpoint(
+                load_dir=args.critic_checkpoint_path,
+                booster=critic_booster,
+                model=critic,
+                optimizer=critic_optim,
+                lr_scheduler=critic_lr_scheduler,
+            )
+        coordinator.print_on_master(f"Loaded critic checkpoint {args.critic_checkpoint_path}")
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
 
     # configure trainer
     trainer = PPOTrainer(
-        strategy,
+        actor_booster,
+        critic_booster,
         actor,
         critic,
         reward_model,
-        initial_model,
+        ref_model,
         actor_optim,
         critic_optim,
         actor_lr_scheduler,
+        critic_lr_scheduler,
         tokenizer=tokenizer,
-        rm_model_tokenizer=rm_model_tokenizer,
         kl_coef=args.kl_coef,
         ptx_coef=args.ptx_coef,
         train_batch_size=args.train_batch_size,
-        buffer_limit=args.train_batch_size,
-        max_length=args.max_seq_len,
+        buffer_limit=args.num_collect_steps * args.experience_batch_size,
+        max_length=args.max_length,
+        max_new_tokens=args.max_seq_len,
         use_cache=True,
         do_sample=True,
-        temperature=1.0,
+        temperature=0.7,
+        accumulation_steps=args.accumulation_steps,
+        save_dir=args.save_path,
+        save_interval=args.save_interval,
         top_k=50,
-        offload_inference_models=args.strategy != "colossalai_gemini",
+        use_tp=args.tp > 1,
+        offload_inference_models="gemini" not in args.plugin,
+        coordinator=coordinator,
     )
 
     trainer.fit(
         num_episodes=args.num_episodes,
         num_collect_steps=args.num_collect_steps,
         num_update_steps=args.num_update_steps,
-        save_per_num_episodes=args.save_per_num_episodes,
-        prompt_dataloader=prompt_dataloader,
-        pretrain_dataloader=pretrain_dataloader,
+        prompt_dataloader=train_prompt_dataloader,
+        pretrain_dataloader=train_pretrain_dataloader,
         log_dir=args.log_dir,
         use_wandb=args.use_wandb,
     )
@@ -222,51 +368,63 @@ def main(args):
         # NOTE: set model to eval to merge LoRA weights
         LORA_MANAGER.merge_weights = True
         actor.eval()
-    # save model checkpoint after fitting
-    strategy.save_pretrained(actor, path=args.save_path)
-    # save optimizer checkpoint on all ranks
-    if args.need_optim_ckpt:
-        strategy.save_optimizer(
-            actor_optim, "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()), only_rank0=False
-        )
+        critic.eval()
+    # save model checkpoint after fitting on only rank0
+    coordinator.print_on_master("Start saving final actor model checkpoint")
+    actor_booster.save_model(actor, os.path.join(trainer.actor_save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(
+        f"Saved final actor model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
+    )
+    coordinator.print_on_master("Start saving final critic model checkpoint")
+    critic_booster.save_model(critic, os.path.join(trainer.critic_save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(
+        f"Saved final critic model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
+    )
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--prompt_dataset", type=str, default=None, help="path to the prompt dataset")
-    parser.add_argument("--pretrain_dataset", type=str, default=None, help="path to the pretrained dataset")
-    parser.add_argument("--max_datasets_size", type=int, default=50000)
+    parser.add_argument("--prompt_dataset", nargs="+", default=[])
+    parser.add_argument("--pretrain_dataset", nargs="+", default=[])
     parser.add_argument(
-        "--strategy",
-        choices=["ddp", "colossalai_gemini", "colossalai_zero2"],
-        default="colossalai_zero2",
-        help="strategy to use",
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
     )
-    parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
-    parser.add_argument("--tokenizer", type=str, default=None)
-    parser.add_argument("--reward_model_tokenizer", type=str, default=None)
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--tp", type=int, default=1)
     parser.add_argument("--pretrain", type=str, default=None)
-    parser.add_argument("--rm_model", default=None, choices=["gpt2", "bloom", "opt", "llama"])
-    parser.add_argument("--rm_path", type=str, default=None)
     parser.add_argument("--rm_pretrain", type=str, default=None)
+    parser.add_argument("--checkpoint_path", type=str, default=None)
+    parser.add_argument("--critic_checkpoint_path", type=str, default=None)
+    parser.add_argument("--rm_checkpoint_path", type=str, help="Reward model checkpoint path")
     parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
-    parser.add_argument("--need_optim_ckpt", type=bool, default=False)
     parser.add_argument("--num_episodes", type=int, default=1)
     parser.add_argument("--num_collect_steps", type=int, default=2)
     parser.add_argument("--num_update_steps", type=int, default=5)
-    parser.add_argument("--save_per_num_episodes", type=int, default=1000)
+    parser.add_argument("--save_interval", type=int, default=1000)
     parser.add_argument("--train_batch_size", type=int, default=16)
+    parser.add_argument("--experience_batch_size", type=int, default=16)
     parser.add_argument("--ptx_batch_size", type=int, default=1)
-    parser.add_argument("--experience_batch_size", type=int, default=8)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--accumulation_steps", type=int, default=8)
     parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
     parser.add_argument("--merge_lora_weights", type=bool, default=True)
     parser.add_argument("--lr", type=float, default=9e-6)
     parser.add_argument("--critic_lr", type=float, default=9e-6)
     parser.add_argument("--kl_coef", type=float, default=0.1)
     parser.add_argument("--ptx_coef", type=float, default=0.0)
-    parser.add_argument("--max_input_len", type=int, default=96)
+    parser.add_argument("--max_length", type=int, default=2048)
     parser.add_argument("--max_seq_len", type=int, default=256)
     parser.add_argument("--log_dir", default="logs", type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
     args = parser.parse_args()
-    main(args)
+    train(args)
diff --git a/applications/Chat/examples/train_ppo.sh b/applications/Chat/examples/train_ppo.sh
index 7b8998c67576..c50107c9791a 100755
--- a/applications/Chat/examples/train_ppo.sh
+++ b/applications/Chat/examples/train_ppo.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set_n_least_used_CUDA_VISIBLE_DEVICES() {
     local n=${1:-"9999"}
     echo "GPU Memory Usage:"
@@ -12,38 +14,69 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+
+PROJECT_NAME="llama2-dpo"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+REWARD_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  # "/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  # "/home/lcyab/data/models/bloom-560m" #
+declare -a prompt_dataset=(
+    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00000
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00001
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00002
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00003
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00004
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00005
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00006
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00007
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00008
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00009
+)
+
+declare -a ptx_dataset=(
+    /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow/part-00000
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
-# the args satisfied: train_batch_size = number_of_node_in_hostfile * num_collect_steps * experience_batch_size
-# the real batch size for gradient descent is nproc_per_node * train_batch_size
-colossalai run --nproc_per_node 1 --master_port 28567 --hostfile ./hostfile train_ppo.py \
-    --pretrain_dataset /home/lcyab/data/Anthropic_rlhf/pretrain_data.json \
-    --prompt_dataset /home/lcyab/data/Anthropic_rlhf/prompts_en.jsonl \
-    --strategy colossalai_zero2 \
-    --num_episodes 8000 --num_collect_steps 1 --num_update_steps 1 \
-    --experience_batch_size 32 \
-    --train_batch_size 32 \
-    --save_path '/home/lcyab/data/Anthropic_rlhf/actor/v3_5' \
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30039 train_ppo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --rm_pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --rm_checkpoint_path $REWARD_MODEL_PATH \
+    --prompt_dataset ${prompt_dataset[@]} \
+    --pretrain_dataset ${ptx_dataset[@]} \
+    --ptx_batch_size 1 \
     --ptx_coef 0.0 \
-    --rm_model 'gpt2' \
-    --rm_pretrain 'gpt2' \
-    --rm_path '/home/lcyab/data/Anthropic_rlhf/reward_model_v2_1' \
-    --reward_model_tokenizer 'gpt2' \
-    --pretrain '/home/lcyab/data/Anthropic_rlhf/actor/pretrain_v3' \
-    --lora_rank 30 \
+    --plugin "zero2" \
+    --save_interval 200 \
+    --save_path $SAVE_DIR \
+    --num_episodes 2000 \
+    --num_collect_steps 1 \
+    --num_update_steps 1 \
+    --experience_batch_size 8 \
+    --train_batch_size 4 \
+    --accumulation_steps 2 \
+    --lr 9e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --grad_checkpoint \
     --use_wandb
-    # --pretrain_dataset /path/to/pretrain_data.json \
-    # --prompt_dataset /path/to/prompt_dataset.jsonl \
-    # --strategy colossalai_zero2 \
-    # --num_episodes 8000 --num_collect_steps 1 --num_update_steps 1 \
-    # --experience_batch_size 32 \
-    # --train_batch_size 32 \
-    # --save_path '/path/to/actor/ppo_checkpoint' \
-    # --ptx_coef 0.0 \
-    # --rm_model 'gpt2' \
-    # --rm_pretrain 'gpt2' \
-    # --rm_path '/path/to/reward_model' \
-    # --reward_model_tokenizer 'gpt2' \
-    # --model 'gpt2' \
-    # --pretrain '/path/to/actor/pretrain_path' \
-    # --use_wandb \
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 0195a3138fae..4cf94b98cf7f 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -15,7 +15,7 @@
 from coati.models import LogExpLoss, LogSigLoss, RewardModel, convert_to_lora_module
 from coati.trainer import RewardModelTrainer
 from coati.utils import load_checkpoint, replace_with_flash_attention
-from transformers import AutoTokenizer
+from transformers import LlamaTokenizer
 
 import colossalai
 from colossalai.booster import Booster
@@ -34,6 +34,44 @@ def train(args):
     colossalai.launch_from_torch({})
     coordinator = DistCoordinator()
 
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in args.plugin else nullcontext()
+
+    booster_policy = None
+    with init_ctx:
+        model = RewardModel(args.pretrain)
+
+        if args.tp > 1:
+            if model.model.config.architectures[0] == "BloomForCausalLM":
+                from colossalai.shardformer.policies.bloom import BloomPolicy
+
+                booster_policy = BloomPolicy()
+            elif model.model.config.architectures[0] == "LlamaForCausalLM":
+                from colossalai.shardformer.policies.llama import LlamaPolicy
+
+                booster_policy = LlamaPolicy()
+            elif model.model.config.architectures[0] == "GPT2LMHeadModel":
+                from colossalai.shardformer.policies.gpt2 import GPT2Policy
+
+                booster_policy = GPT2Policy()
+            elif model.model.config.architectures[0] == "ChatGLMModel":
+                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
+
+                booster_policy = ChatGLMPolicy()
+            elif model.model.config.architectures[0] == "OPTForCausalLM":
+                from colossalai.shardformer.policies.opt import OPTPolicy
+
+                booster_policy = OPTPolicy()
+            else:
+                raise ValueError("Unknown model architecture for policy")
+
+        # TODO: set dropout to 0 here
+        # for llama2, dropout is 0 by default, hence skip.
+        if args.lora_rank > 0:
+            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
     # ==============================
     # Initialize Booster
     # ==============================
@@ -71,32 +109,13 @@ def train(args):
             pp_size=1,
             zero_stage=0,
             precision=args.mixed_precision,
+            custom_policy=booster_policy,
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
 
     booster = Booster(plugin=plugin)
 
-    # ======================================================
-    # Initialize Model, Objective, Optimizer and LR Scheduler
-    # ======================================================
-    init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
-    )
-    with init_ctx:
-        model = RewardModel(args.pretrain)
-
-        # debug tiny model
-        # model = RewardModel(
-        #     transformers.LlamaConfig(hidden_size=512, intermediate_size=1536, num_attention_heads=8, num_hidden_layers=4
-        #     )
-        # )
-
-        # TODO: set dropout to 0 here
-        # for llama2, dropout is 0 by default, hence skip.
-        if args.lora_rank > 0:
-            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
     if args.grad_checkpoint and args.lora_rank == 0:
         model.model.gradient_checkpointing_enable()  # TODO: support gradient checkpoint for the last linear layer
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
@@ -109,7 +128,7 @@ def train(args):
 
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir)
     tokenizer.padding_side = "right"
     tokenizer.pad_token = tokenizer.eos_token
 
@@ -237,8 +256,8 @@ def train(args):
         model.eval()
     # save model checkpoint after fitting on only rank0
     coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+    booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}")
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
diff --git a/applications/Chat/examples/train_rm.sh b/applications/Chat/examples/train_rm.sh
index 517ff05167f2..18ef2511ef40 100755
--- a/applications/Chat/examples/train_rm.sh
+++ b/applications/Chat/examples/train_rm.sh
@@ -14,7 +14,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
-set_n_least_used_CUDA_VISIBLE_DEVICES 8
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
 # NCCL IB environment variables
 export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
 export NCCL_IB_DISABLE=0
@@ -26,22 +26,23 @@ export OMP_NUM_THREADS=8
 
 
 PROJECT_NAME="llama2-rm"
-PARENT_SAVE_DIR="./output/ckpt"
-PARENT_TENSORBOARD_DIR="./output/tensorboard"
-PARENT_CONFIG_FILE="./output/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/rm/output/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  #"/home/lcyab/data/models/bloom-560m" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
 declare -a dataset=(
-    /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00000
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00001
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00002
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00003
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00004
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00005
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00006
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00007
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00008
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00009
+    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00001
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00002
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00003
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00004
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00005
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00006
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00007
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00008
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
@@ -49,15 +50,16 @@ FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
 CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
-colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_reward_model.py \
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_reward_model.py \
     --pretrain $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --checkpoint_path /home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling \
     --dataset ${dataset[@]} \
     --plugin "zero2" \
-    --save_interval 100 \
+    --save_interval 3000 \
     --save_dir $SAVE_DIR \
     --config_file $CONFIG_FILE \
-    --max_epochs 5 \
+    --max_epochs 3 \
     --accumulation_steps 1 \
     --batch_size 8 \
     --lr 9e-6 \
@@ -65,6 +67,6 @@ colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_
     --grad_clip 1.0 \
     --weight_decay 0.01 \
     --warmup_steps 100 \
-    --grad_checkpoint \
     --use_flash_attn \
-    --use_wandb
+    # --use_wandb \
+    # --grad_checkpoint \
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index b42789879a0d..49982ca17e19 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -108,7 +108,6 @@ def train(args):
     coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
     coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_path}")
 
-    print([name for name, p in model.named_parameters() if p.requires_grad])
     # configure optimizer
     optim = HybridAdam(
         model_params=model.parameters(),
@@ -117,14 +116,6 @@ def train(args):
         weight_decay=args.weight_decay,
         adamw_mode=True,
     )
-    # if is_rank_0():
-    #     print(optim.state_dict())
-    #     print("debug")
-    # optim_param = optim.state_dict()['param_groups'][0]['params']
-    # for id in optim_param:
-    #     for idx, (name, p) in enumerate(model.named_parameters()):
-    #         if idx == id:
-    #             print(name, p.size())
 
     # configure dataset
     coordinator.print_on_master(
diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh
index 7bbf6ccecdb9..0b81651e21b3 100755
--- a/applications/Chat/examples/train_sft.sh
+++ b/applications/Chat/examples/train_sft.sh
@@ -12,17 +12,26 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 8
+# set_n_least_used_CUDA_VISIBLE_DEVICES 4
+export CUDA_VISIBLE_DEVICES=2,3,4,5
 
 PROJECT_NAME="llama2-sft"
-PARENT_SAVE_DIR="./output/ckpt"
-PARENT_TENSORBOARD_DIR="./output/tensorboard"
-PARENT_CONFIG_FILE="./output/train_config"
-PRETRAINED_MODEL_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/sft/output/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
 declare -a dataset=(
-    /home/lcyab/data/SFT_data/llama2_sft_data/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00000
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00001
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00002
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00003
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00004
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00005
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00006
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00007
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00008
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
@@ -31,20 +40,17 @@ SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
 CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
 # the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
-colossalai run --nproc_per_node 8 --master_port 28534 --hostfile ./hostfile train_sft.py \
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
     --pretrain $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
-    --save_interval 500 \
+    --save_interval 5000 \
     --save_path $SAVE_DIR \
     --config_file $CONFIG_FILE \
     --plugin zero2 \
-    --batch_size 4 \
+    --batch_size 2 \
     --max_epochs 1 \
-    --accumulation_steps 1 \
+    --accumulation_steps 4 \
     --lr 2e-5 \
-    --max_len 512 \
-    --max_epochs 1 \
-    --use_flash_attn \
-    --grad_checkpoint \
-    --use_wandb \
+    --max_len 2048 \
+    --use_wandb
diff --git a/applications/Chat/inference/benchmark.py b/applications/Chat/inference/benchmark.py
deleted file mode 100644
index dbb5490a63dc..000000000000
--- a/applications/Chat/inference/benchmark.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Adapted from https://github.com/tloen/alpaca-lora/blob/main/generate.py
-
-import argparse
-from time import time
-
-import torch
-from coati.quant import llama_load_quant, low_resource_init
-from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM
-
-
-def generate_prompt(instruction, input=None):
-    if input:
-        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
-### Instruction:
-{instruction}
-
-### Input:
-{input}
-
-### Response:"""
-    else:
-        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-{instruction}
-
-### Response:"""
-
-
-@torch.no_grad()
-def evaluate(
-    model,
-    tokenizer,
-    instruction,
-    input=None,
-    temperature=0.1,
-    top_p=0.75,
-    top_k=40,
-    num_beams=4,
-    max_new_tokens=128,
-    **kwargs,
-):
-    prompt = generate_prompt(instruction, input)
-    inputs = tokenizer(prompt, return_tensors="pt")
-    input_ids = inputs["input_ids"].cuda()
-    generation_config = GenerationConfig(
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        num_beams=num_beams,
-        **kwargs,
-    )
-    generation_output = model.generate(
-        input_ids=input_ids,
-        generation_config=generation_config,
-        return_dict_in_generate=True,
-        output_scores=True,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-    )
-    s = generation_output.sequences[0]
-    output = tokenizer.decode(s)
-    n_new_tokens = s.size(0) - input_ids.size(1)
-    return output.split("### Response:")[1].strip(), n_new_tokens
-
-
-instructions = [
-    "Tell me about alpacas.",
-    "Tell me about the president of Mexico in 2019.",
-    "Tell me about the king of France in 2019.",
-    "List all Canadian provinces in alphabetical order.",
-    "Write a Python program that prints the first 10 Fibonacci numbers.",
-    "Write a program that prints the numbers from 1 to 100. But for multiples of three print 'Fizz' instead of the number and for the multiples of five print 'Buzz'. For numbers which are multiples of both three and five print 'FizzBuzz'.",
-    "Tell me five words that rhyme with 'shock'.",
-    "Translate the sentence 'I have no mouth but I must scream' into Spanish.",
-    "Count up from 1 to 500.",
-    # ===
-    "How to play support in legends of league",
-    "Write a Python program that calculate Fibonacci numbers.",
-]
-inst = [instructions[0]] * 4
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "pretrained",
-        help="Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.",
-    )
-    parser.add_argument(
-        "--quant",
-        choices=["8bit", "4bit"],
-        default=None,
-        help="Quantization mode. Default: None (no quantization, fp16).",
-    )
-    parser.add_argument(
-        "--gptq_checkpoint",
-        default=None,
-        help="Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.",
-    )
-    parser.add_argument(
-        "--gptq_group_size",
-        type=int,
-        default=128,
-        help="Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.",
-    )
-    args = parser.parse_args()
-
-    if args.quant == "4bit":
-        assert args.gptq_checkpoint is not None, "Please specify a GPTQ checkpoint."
-
-    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
-
-    if args.quant == "4bit":
-        with low_resource_init():
-            config = LlamaConfig.from_pretrained(args.pretrained)
-            model = LlamaForCausalLM(config)
-        model = llama_load_quant(model, args.gptq_checkpoint, 4, args.gptq_group_size)
-        model.cuda()
-    else:
-        model = LlamaForCausalLM.from_pretrained(
-            args.pretrained,
-            load_in_8bit=(args.quant == "8bit"),
-            torch_dtype=torch.float16,
-            device_map="auto",
-        )
-        if args.quant != "8bit":
-            model.half()  # seems to fix bugs for some users.
-        model.eval()
-
-    total_tokens = 0
-    start = time()
-    for instruction in instructions:
-        print(f"Instruction: {instruction}")
-        resp, tokens = evaluate(model, tokenizer, instruction, temperature=0.2, num_beams=1)
-        total_tokens += tokens
-        print(f"Response: {resp}")
-        print("\n----------------------------\n")
-    duration = time() - start
-    print(f"Total time: {duration:.3f} s, {total_tokens/duration:.3f} tokens/s")
-    print(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
diff --git a/applications/Chat/inference/server.py b/applications/Chat/inference/server.py
index 7c6a61b9e7f2..aec342802b02 100644
--- a/applications/Chat/inference/server.py
+++ b/applications/Chat/inference/server.py
@@ -5,6 +5,7 @@
 
 import torch
 import uvicorn
+from coati.models import generate_streaming
 from coati.quant import llama_load_quant, low_resource_init
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
@@ -13,10 +14,9 @@
 from slowapi.errors import RateLimitExceeded
 from slowapi.util import get_remote_address
 from sse_starlette.sse import EventSourceResponse
-from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
-from utils import ChatPromptProcessor, Dialogue, LockedIterator, load_json, sample_streamingly, update_model_kwargs_fn
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from utils import ChatPromptProcessor, Dialogue, LockedIterator, load_json, update_model_kwargs_fn
 
-CONTEXT = "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions."
 MAX_LEN = 512
 running_lock = Lock()
 
@@ -54,20 +54,22 @@ class GenerationTaskReq(BaseModel):
 )
 
 
-def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
-    inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
+def generate_streamingly(prompt, max_length, max_new_tokens, top_k, top_p, temperature):
+    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
     # TODO(ver217): streaming generation does not support repetition_penalty now
     model_kwargs = {
-        "max_generate_tokens": max_new_tokens,
+        "max_new_tokens": max_new_tokens,
         "early_stopping": True,
         "top_k": top_k,
         "top_p": top_p,
         "temperature": temperature,
-        "prepare_inputs_fn": model.prepare_inputs_for_generation,
+        "prepare_inputs_fn": None,
         "update_model_kwargs_fn": update_model_kwargs_fn,
     }
     is_first_word = True
-    generator = LockedIterator(sample_streamingly(model, **inputs, **model_kwargs), running_lock)
+    generator = LockedIterator(
+        generate_streaming(model, input_ids, tokenizer, max_length, **model_kwargs), running_lock
+    )
     for output in generator:
         output = output.cpu()
         tokens = tokenizer.convert_ids_to_tokens(output, skip_special_tokens=True)
@@ -101,9 +103,10 @@ async def event_generator(request: Request, generator: Generator):
 @app.post("/generate/stream")
 @limiter.limit("1/second")
 def generate(data: GenerationTaskReq, request: Request):
-    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
+    prompt = prompt_processor.preprocess_prompt(data.history)
     event_source = event_generator(
-        request, generate_streamingly(prompt, data.max_new_tokens, data.top_k, data.top_p, data.temperature)
+        request,
+        generate_streamingly(prompt, data.max_length, data.max_new_tokens, data.top_k, data.top_p, data.temperature),
     )
     return EventSourceResponse(event_source)
 
@@ -133,6 +136,11 @@ def generate_no_stream(data: GenerationTaskReq, request: Request):
         "pretrained",
         help="Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.",
     )
+    parser.add_argument(
+        "--tokenizer_path",
+        help="Path to pretrained tokenizer. Can be a local path or a model name from the HuggingFace model hub.",
+        default=None,
+    )
     parser.add_argument(
         "--quant",
         choices=["8bit", "4bit"],
@@ -162,26 +170,29 @@ def generate_no_stream(data: GenerationTaskReq, request: Request):
     if args.quant == "4bit":
         assert args.gptq_checkpoint is not None, "Please specify a GPTQ checkpoint."
 
-    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
+    if args.tokenizer_path is None:
+        args.tokenizer_path = args.pretrained
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, local_files_only=True)
 
     if args.profanity_file is not None:
         censored_words = load_json(args.profanity_file)
     else:
         censored_words = []
-    prompt_processor = ChatPromptProcessor(tokenizer, CONTEXT, MAX_LEN, censored_words=censored_words)
+    prompt_processor = ChatPromptProcessor(censored_words=censored_words)
 
     if args.quant == "4bit":
         with low_resource_init():
-            config = LlamaConfig.from_pretrained(args.pretrained)
-            model = LlamaForCausalLM(config)
+            config = AutoConfig.from_pretrained(args.pretrained)
+            model = AutoModelForCausalLM(config)
         model = llama_load_quant(model, args.gptq_checkpoint, 4, args.gptq_group_size)
         model.cuda()
     else:
-        model = LlamaForCausalLM.from_pretrained(
+        model = AutoModelForCausalLM.from_pretrained(
             args.pretrained,
             load_in_8bit=(args.quant == "8bit"),
             torch_dtype=torch.float16,
             device_map="auto",
+            local_files_only=True,
         )
         if args.quant != "8bit":
             model.half()  # seems to fix bugs for some users.
@@ -190,3 +201,8 @@ def generate_no_stream(data: GenerationTaskReq, request: Request):
     config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
     server = uvicorn.Server(config=config)
     server.run()
+
+
+"""
+python server.py /home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/ --tokenizer_path /mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/
+"""
diff --git a/applications/Chat/inference/tests/test_chat_prompt.py b/applications/Chat/inference/tests/test_chat_prompt.py
deleted file mode 100644
index 9835e71894c6..000000000000
--- a/applications/Chat/inference/tests/test_chat_prompt.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import os
-
-from transformers import AutoTokenizer
-from utils import ChatPromptProcessor, Dialogue
-
-CONTEXT = "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions."
-tokenizer = AutoTokenizer.from_pretrained(os.environ["PRETRAINED_PATH"])
-
-samples = [
-    (
-        [
-            Dialogue(
-                instruction="Who is the best player in the history of NBA?",
-                response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
-            ),
-            Dialogue(instruction="continue this talk", response=""),
-        ],
-        128,
-        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\nThe best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1\n\n### Instruction:\ncontinue this talk\n\n### Response:\n",
-    ),
-    (
-        [
-            Dialogue(
-                instruction="Who is the best player in the history of NBA?",
-                response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
-            ),
-            Dialogue(instruction="continue this talk", response=""),
-        ],
-        200,
-        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this talk\n\n### Response:\n",
-    ),
-    (
-        [
-            Dialogue(
-                instruction="Who is the best player in the history of NBA?",
-                response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
-            ),
-            Dialogue(instruction="continue this talk", response=""),
-        ],
-        211,
-        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this\n\n### Response:\n",
-    ),
-    (
-        [
-            Dialogue(instruction="Who is the best player in the history of NBA?", response=""),
-        ],
-        128,
-        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\n",
-    ),
-]
-
-
-def test_chat_prompt_processor():
-    processor = ChatPromptProcessor(tokenizer, CONTEXT, 256)
-    for history, max_new_tokens, result in samples:
-        prompt = processor.preprocess_prompt(history, max_new_tokens)
-        assert prompt == result
-
-
-if __name__ == "__main__":
-    test_chat_prompt_processor()
diff --git a/applications/Chat/inference/utils.py b/applications/Chat/inference/utils.py
index af018adf6e9d..82a1a7255164 100644
--- a/applications/Chat/inference/utils.py
+++ b/applications/Chat/inference/utils.py
@@ -1,97 +1,13 @@
+import copy
 import json
-import re
 from threading import Lock
-from typing import Any, Callable, Generator, List, Optional
+from typing import List
 
 import jieba
 import torch
-import torch.distributed as dist
-import torch.nn as nn
+from coati.dataset.conversation import default_conversation
 from pydantic import BaseModel, Field
 
-try:
-    from transformers.generation_logits_process import (
-        LogitsProcessorList,
-        TemperatureLogitsWarper,
-        TopKLogitsWarper,
-        TopPLogitsWarper,
-    )
-except ImportError:
-    from transformers.generation import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper
-
-
-def prepare_logits_processor(
-    top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None
-) -> LogitsProcessorList:
-    processor_list = LogitsProcessorList()
-    if temperature is not None and temperature != 1.0:
-        processor_list.append(TemperatureLogitsWarper(temperature))
-    if top_k is not None and top_k != 0:
-        processor_list.append(TopKLogitsWarper(top_k))
-    if top_p is not None and top_p < 1.0:
-        processor_list.append(TopPLogitsWarper(top_p))
-    return processor_list
-
-
-def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
-    if dist.is_initialized() and dist.get_world_size() > 1:
-        # consider DP
-        unfinished_sequences = unfinished_sequences.clone()
-        dist.all_reduce(unfinished_sequences)
-    return unfinished_sequences.max() == 0
-
-
-def sample_streamingly(
-    model: nn.Module,
-    input_ids: torch.Tensor,
-    max_generate_tokens: int,
-    early_stopping: bool = False,
-    eos_token_id: Optional[int] = None,
-    pad_token_id: Optional[int] = None,
-    top_k: Optional[int] = None,
-    top_p: Optional[float] = None,
-    temperature: Optional[float] = None,
-    prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
-    update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
-    **model_kwargs,
-) -> Generator:
-    logits_processor = prepare_logits_processor(top_k, top_p, temperature)
-    unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-
-    for _ in range(max_generate_tokens):
-        model_inputs = (
-            prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {"input_ids": input_ids}
-        )
-        outputs = model(**model_inputs)
-
-        next_token_logits = outputs["logits"][:, -1, :]
-        # pre-process distribution
-        next_token_logits = logits_processor(input_ids, next_token_logits)
-        # sample
-        probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
-        next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
-
-        # finished sentences should have their next token be a padding token
-        if eos_token_id is not None:
-            if pad_token_id is None:
-                raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
-            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
-
-        yield next_tokens
-
-        # update generated ids, model inputs for next step
-        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-        if update_model_kwargs_fn is not None:
-            model_kwargs = update_model_kwargs_fn(outputs, **model_kwargs)
-
-        # if eos_token was found in one sentence, set sentence to finished
-        if eos_token_id is not None:
-            unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
-
-        # stop when each sentence is finished if early_stopping=True
-        if early_stopping and _is_sequence_finished(unfinished_sequences):
-            break
-
 
 def update_model_kwargs_fn(outputs: dict, **model_kwargs) -> dict:
     if "past_key_values" in outputs:
@@ -119,69 +35,22 @@ class Dialogue(BaseModel):
     response: str = Field(example="")
 
 
-def _format_dialogue(instruction: str, response: str = ""):
-    return f"\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}"
-
-
-STOP_PAT = re.compile(r"(###|instruction:).*", flags=(re.I | re.S))
-
-
 class ChatPromptProcessor:
     SAFE_RESPONSE = "The input/response contains inappropriate content, please rephrase your prompt."
 
-    def __init__(self, tokenizer, context: str, max_len: int = 2048, censored_words: List[str] = []):
-        self.tokenizer = tokenizer
-        self.context = context
-        self.max_len = max_len
+    def __init__(self, censored_words: List[str] = []):
         self.censored_words = set([word.lower() for word in censored_words])
-        # These will be initialized after the first call of preprocess_prompt()
-        self.context_len: Optional[int] = None
-        self.dialogue_placeholder_len: Optional[int] = None
-
-    def preprocess_prompt(self, history: List[Dialogue], max_new_tokens: int) -> str:
-        if self.context_len is None:
-            self.context_len = len(self.tokenizer(self.context)["input_ids"])
-        if self.dialogue_placeholder_len is None:
-            self.dialogue_placeholder_len = len(
-                self.tokenizer(_format_dialogue(""), add_special_tokens=False)["input_ids"]
-            )
-        prompt = self.context
-        # the last dialogue must be in the prompt
-        last_dialogue = history.pop()
-        # the response of the last dialogue is empty
-        assert last_dialogue.response == ""
-        if (
-            len(self.tokenizer(_format_dialogue(last_dialogue.instruction), add_special_tokens=False)["input_ids"])
-            + max_new_tokens
-            + self.context_len
-            >= self.max_len
-        ):
-            # to avoid truncate placeholder, apply truncate to the original instruction
-            instruction_truncated = self.tokenizer(
-                last_dialogue.instruction,
-                add_special_tokens=False,
-                truncation=True,
-                max_length=(self.max_len - max_new_tokens - self.context_len - self.dialogue_placeholder_len),
-            )["input_ids"]
-            instruction_truncated = self.tokenizer.decode(instruction_truncated).lstrip()
-            prompt += _format_dialogue(instruction_truncated)
-            return prompt
-
-        res_len = self.max_len - max_new_tokens - len(self.tokenizer(prompt)["input_ids"])
-
-        rows = []
-        for dialogue in history[::-1]:
-            text = _format_dialogue(dialogue.instruction, dialogue.response)
-            cur_len = len(self.tokenizer(text, add_special_tokens=False)["input_ids"])
-            if res_len - cur_len < 0:
-                break
-            res_len -= cur_len
-            rows.insert(0, text)
-        prompt += "".join(rows) + _format_dialogue(last_dialogue.instruction)
-        return prompt
+        self.conv = copy.deepcopy(default_conversation)
+
+    def preprocess_prompt(self, history: List[Dialogue]) -> str:
+        self.conv.clear()
+        for round in history:
+            self.conv.append_message(self.conv.roles[0], round.instruction)
+            if len(round.instruction) > 0:
+                self.conv.append_message(self.conv.roles[1], round.response)
+        return self.conv.get_prompt()
 
     def postprocess_output(self, output: str) -> str:
-        output = STOP_PAT.sub("", output)
         return output.strip()
 
     def has_censored_words(self, text: str) -> bool:
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index 0b1ee1785fa1..d601375558de 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai==0.3.2
+colossalai==0.3.4
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
diff --git a/applications/Chat/tests/test_inference.sh b/applications/Chat/tests/test_inference.sh
index 849db06e58ab..d818f7f7b28e 100755
--- a/applications/Chat/tests/test_inference.sh
+++ b/applications/Chat/tests/test_inference.sh
@@ -6,6 +6,6 @@ EXAMPLES_DIR=$BASE_DIR/examples
 echo "[Test]: testing inference ..."
 
 # HACK: skip llama due to oom
-for model in 'gpt2' 'bloom' 'opt'; do
-    python $EXAMPLES_DIR/inference.py --model $model
+for model in 'gpt2' 'bigscience/bloom-560m' 'facebook/opt-350m'; do
+    python $EXAMPLES_DIR/inference.py --model_path $model --io dummy --max_new_tokens 20
 done
diff --git a/applications/Chat/tests/test_reward_model.py b/applications/Chat/tests/test_reward_model.py
new file mode 100644
index 000000000000..6c633eabf5d3
--- /dev/null
+++ b/applications/Chat/tests/test_reward_model.py
@@ -0,0 +1,4125 @@
+import argparse
+
+import torch
+from coati.models import RewardModel
+
+TEST_DATA = torch.Tensor(
+    [
+        1,
+        319,
+        13563,
+        1546,
+        263,
+        12758,
+        5199,
+        322,
+        385,
+        23116,
+        21082,
+        20255,
+        29889,
+        450,
+        20255,
+        4076,
+        8444,
+        29892,
+        13173,
+        29892,
+        322,
+        1248,
+        568,
+        6089,
+        304,
+        278,
+        5199,
+        29915,
+        29879,
+        5155,
+        29889,
+        13,
+        13,
+        29950,
+        7889,
+        29901,
+        1,
+        5618,
+        947,
+        372,
+        2125,
+        304,
+        367,
+        263,
+        410,
+        17622,
+        2810,
+        29973,
+        2,
+        7900,
+        22137,
+        29901,
+        1,
+        11284,
+        29892,
+        410,
+        28563,
+        267,
+        817,
+        304,
+        505,
+        1781,
+        9324,
+        322,
+        1095,
+        18541,
+        29892,
+        363,
+        1342,
+        29889,
+        29871,
+        2688,
+        884,
+        817,
+        304,
+        505,
+        1781,
+        1361,
+        29899,
+        1032,
+        29872,
+        29311,
+        3381,
+        29892,
+        470,
+        278,
+        11509,
+        304,
+        1207,
+        16716,
+        24147,
+        411,
+        1009,
+        3573,
+        29889,
+        29871,
+        15113,
+        29892,
+        896,
+        817,
+        304,
+        505,
+        263,
+        6483,
+        5360,
+        310,
+        278,
+        7980,
+        896,
+        30010,
+        276,
+        8743,
+        29892,
+        304,
+        367,
+        17385,
+        630,
+        304,
+        7945,
+        2898,
+        29889,
+        29871,
+        1670,
+        526,
+        1784,
+        916,
+        13879,
+        393,
+        748,
+        964,
+        1641,
+        263,
+        410,
+        17622,
+        2810,
+        29892,
+        322,
+        372,
+        30010,
+        29879,
+        2289,
+        263,
+        1407,
+        9377,
+        12875,
+        310,
+        25078,
+        393,
+        526,
+        12420,
+        297,
+        1422,
+        5837,
+        297,
+        1269,
+        5375,
+        29889,
+        2,
+        29950,
+        7889,
+        29901,
+        1,
+        7058,
+        338,
+        1565,
+        29889,
+        306,
+        4140,
+        278,
+        17385,
+        362,
+        29892,
+        278,
+        5360,
+        322,
+        6483,
+        9063,
+        358,
+        29889,
+        2,
+        7900,
+        22137,
+        29901,
+        1,
+        8241,
+        29892,
+        366,
+        505,
+        304,
+        505,
+        263,
+        1565,
+        5360,
+        363,
+        278,
+        7980,
+        366,
+        1708,
+        29889,
+        29871,
+        1670,
+        30010,
+        29879,
+        3078,
+        763,
+        278,
+        11223,
+        310,
+        1641,
+        5714,
+        297,
+        263,
+        3748,
+        310,
+        596,
+        7980,
+        29889,
+        29871,
+        1126,
+        746,
+        366,
+        5360,
+        263,
+        7980,
+        29892,
+        366,
+        505,
+        263,
+        5100,
+        3321,
+        7636,
+        975,
+        10769,
+        1058,
+        1016,
+        30010,
+        29873,
+        29889,
+        29871,
+        3387,
+        278,
+        7899,
+        304,
+        367,
+        278,
+        1900,
+        29892,
+        2462,
+        297,
+        322,
+        2462,
+        714,
+        29889,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+        2,
+    ],
+).to("cuda:0")
+
+ATTEN_MASK = torch.Tensor(
+    [
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        True,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+        False,
+    ],
+).to("cuda:0")
+
+
+def main(args):
+    model = RewardModel(pretrained=args.pretrain).to("cuda:0")
+    print(model(TEST_DATA, attention_mask=ATTEN_MASK))
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pretrain", type=str, default=None)
+    args = parser.parse_args()
+    main(args)

From b50b49f3788add88ca053fb508bc50cdcac119ed Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Thu, 30 Nov 2023 09:59:32 +0800
Subject: [PATCH 03/23] 2 nd round refactor

---
 .../Chat/examples/training_scripts/hostfile   |   1 +
 .../examples/training_scripts/train_dpo.py    | 287 ++++++++++++
 .../examples/training_scripts/train_dpo.sh    |  71 +++
 .../examples/training_scripts/train_ppo.py    | 430 ++++++++++++++++++
 .../examples/training_scripts/train_ppo.sh    |  82 ++++
 .../training_scripts/train_reward_model.py    | 314 +++++++++++++
 .../examples/training_scripts/train_rm.sh     |  72 +++
 .../examples/training_scripts/train_sft.py    | 284 ++++++++++++
 .../examples/training_scripts/train_sft.sh    |  56 +++
 9 files changed, 1597 insertions(+)
 create mode 100644 applications/Chat/examples/training_scripts/hostfile
 create mode 100644 applications/Chat/examples/training_scripts/train_dpo.py
 create mode 100755 applications/Chat/examples/training_scripts/train_dpo.sh
 create mode 100644 applications/Chat/examples/training_scripts/train_ppo.py
 create mode 100755 applications/Chat/examples/training_scripts/train_ppo.sh
 create mode 100644 applications/Chat/examples/training_scripts/train_reward_model.py
 create mode 100755 applications/Chat/examples/training_scripts/train_rm.sh
 create mode 100644 applications/Chat/examples/training_scripts/train_sft.py
 create mode 100755 applications/Chat/examples/training_scripts/train_sft.sh

diff --git a/applications/Chat/examples/training_scripts/hostfile b/applications/Chat/examples/training_scripts/hostfile
new file mode 100644
index 000000000000..4a0d939bb155
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/hostfile
@@ -0,0 +1 @@
+192.168.0.35
diff --git a/applications/Chat/examples/training_scripts/train_dpo.py b/applications/Chat/examples/training_scripts/train_dpo.py
new file mode 100644
index 000000000000..4e78b42c6e0c
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_dpo.py
@@ -0,0 +1,287 @@
+import argparse
+import json
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+from coati.dataset import (
+    DataCollatorForPreferenceDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from coati.models import convert_to_lora_module
+from coati.trainer import DPOTrainer
+from coati.utils import load_checkpoint, replace_with_flash_attention
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+
+def train(args):
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+    ref_booster = Booster(plugin=plugin)
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = (
+        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+    )
+    with init_ctx:
+        model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+
+        # TODO: set dropout to 0 here
+        # for llama2, dropout is 0 by default, hence skip.
+        if args.lora_rank > 0:
+            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        model.gradient_checkpointing_enable()
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=model)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer.padding_side = "right"
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+    train_dataloader = setup_distributed_dataloader(
+        dataset=train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        use_tp=args.tp > 1,
+    )
+
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
+    )
+
+    # test_res = model.generate(tokenizer.encode("tell a story about a cat.\n", return_tensors='pt').to(get_current_device()),
+    #                           max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.9)
+    # coordinator.print_on_master(f"Test generate: {tokenizer.decode(test_res[0])}")
+
+    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader)
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    trainer = DPOTrainer(
+        actor=model,
+        ref_model=ref_model,
+        booster=booster,
+        actor_optim=optim,
+        actor_lr_scheduler=lr_scheduler,
+        tokenizer=tokenizer,
+        max_epochs=args.max_epochs,
+        accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_dir,
+        coordinator=coordinator,
+    )
+
+    get_dist_logger()
+    trainer.fit(
+        train_preference_dataloader=train_dataloader,
+        eval_preference_dataloader=None,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if args.lora_rank > 0 and args.merge_lora_weights:
+        from coati.models.lora import LORA_MANAGER
+
+        # NOTE: set model to eval to merge LoRA weights
+        LORA_MANAGER.merge_weights = True
+        model.eval()
+    # save model checkpoint after fitting on only rank0
+    coordinator.print_on_master("Start saving final model checkpoint")
+    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--zero", type=int, default=0)
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
+    )
+    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+    parser.add_argument("--save_dir", type=str, default="output")
+    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
+    parser.add_argument("--max_epochs", type=int, default=3)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument(
+        "--lora_train_bias",
+        type=str,
+        default="none",
+        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
+    )
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
+    parser.add_argument("--merge_lora_weights", type=bool, default=True)
+    parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+    with open(args.config_file, "w") as f:
+        json.dump(args.__dict__, f, indent=4)
+    train(args)
diff --git a/applications/Chat/examples/training_scripts/train_dpo.sh b/applications/Chat/examples/training_scripts/train_dpo.sh
new file mode 100755
index 000000000000..74cf9c33417c
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_dpo.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+
+PROJECT_NAME="llama2-dpo"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output_dpo/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output_dpo/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output_dpo/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+declare -a dataset=(
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00001
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00002
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00003
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00004
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00005
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00006
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00007
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00008
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00009
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+
+colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_dpo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --checkpoint_path $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "3d" \
+    --save_interval 1000 \
+    --save_dir $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --max_epochs 5 \
+    --accumulation_steps 4 \
+    --batch_size 4 \
+    --tp 8 \
+    --lr 5e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --grad_checkpoint \
+    --use_wandb
diff --git a/applications/Chat/examples/training_scripts/train_ppo.py b/applications/Chat/examples/training_scripts/train_ppo.py
new file mode 100644
index 000000000000..d67aab333cbf
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_ppo.py
@@ -0,0 +1,430 @@
+import argparse
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+from coati.dataset import (
+    DataCollatorForPromptDataset,
+    DataCollatorForSupervisedDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from coati.models import Critic, RewardModel, convert_to_lora_module
+from coati.trainer import PPOTrainer
+from coati.utils import load_checkpoint, replace_with_flash_attention
+
+# from colossalai.utils import get_current_deviceDDPStrategy, GeminiStrategy, LowLevelZeroStrategy
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+
+def train(args):
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in args.plugin else nullcontext()
+
+    booster_policy = None
+    with init_ctx:
+        actor = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
+        ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
+        reward_model = RewardModel(args.rm_pretrain)
+        critic = Critic(args.rm_pretrain)
+
+        if args.tp > 1:
+            if reward_model.model.config.architectures[0] != critic.model.config.architectures[0]:
+                raise ValueError("Reward model and critic model must have the same architecture")
+            if reward_model.model.config.architectures[0] == "BloomForCausalLM":
+                from colossalai.shardformer.policies.bloom import BloomPolicy
+
+                booster_policy = BloomPolicy()
+            elif reward_model.model.config.architectures[0] == "LlamaForCausalLM":
+                from colossalai.shardformer.policies.llama import LlamaPolicy
+
+                booster_policy = LlamaPolicy()
+            elif reward_model.model.config.architectures[0] == "GPT2LMHeadModel":
+                from colossalai.shardformer.policies.gpt2 import GPT2Policy
+
+                booster_policy = GPT2Policy()
+            elif reward_model.model.config.architectures[0] == "ChatGLMModel":
+                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
+
+                booster_policy = ChatGLMPolicy()
+            elif reward_model.model.config.architectures[0] == "OPTForCausalLM":
+                from colossalai.shardformer.policies.opt import OPTPolicy
+
+                booster_policy = OPTPolicy()
+            else:
+                raise ValueError("Unknown model architecture for policy")
+
+        # TODO: set dropout to 0 here
+        # for llama2, dropout is 0 by default, hence skip.
+        if args.lora_rank > 0:
+            actor = convert_to_lora_module(actor, args.lora_rank, lora_train_bias=args.lora_train_bias)
+            critic = convert_to_lora_module(critic, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        actor.gradient_checkpointing_enable()
+        critic.model.gradient_checkpointing_enable()
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=actor)
+        replace_with_flash_attention(model=critic)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer.padding_side = "left"  # left padding for generation (online learning)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # configure optimizer
+    coordinator.print_on_master(f"setting up optimizer for actor: lr={args.lr}, weight_decay={args.weight_decay}")
+    actor_optim = HybridAdam(
+        model_params=actor.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    coordinator.print_on_master(f"setting up optimizer for critic: lr={args.lr}, weight_decay={args.weight_decay}")
+    critic_optim = HybridAdam(
+        model_params=critic.parameters(),
+        lr=args.critic_lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.prompt_dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_prompt_dataset = load_tokenized_dataset(dataset_paths=args.prompt_dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPromptDataset(tokenizer=tokenizer, max_length=args.max_length - args.max_seq_len)
+    train_prompt_dataloader = setup_distributed_dataloader(
+        dataset=train_prompt_dataset,
+        batch_size=args.experience_batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        use_tp=args.tp > 1,
+    )
+
+    if len(args.pretrain_dataset) > 0:
+        train_pretrain_dataset = load_tokenized_dataset(
+            dataset_paths=args.pretrain_dataset, mode="train", mode_map=mode_map
+        )
+        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_length)
+        train_pretrain_dataloader = setup_distributed_dataloader(
+            dataset=train_pretrain_dataset,
+            batch_size=args.ptx_batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=data_collator,
+            use_tp=args.tp > 1,
+        )
+    else:
+        train_pretrain_dataloader = None
+
+    if args.warmup_steps is None:
+        args.warmup_steps = int(0.025 * args.num_episodes)
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    actor_lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=actor_optim,
+        total_steps=args.num_episodes,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    critic_lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=critic_optim,
+        total_steps=args.num_episodes,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+        )
+        custom_plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+            custom_policy=booster_policy,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    if args.plugin != "3d":
+        custom_plugin = plugin
+
+    actor_booster = Booster(plugin=plugin)
+    ref_booster = Booster(plugin=plugin)
+    rm_booster = Booster(plugin=custom_plugin)
+    critic_booster = Booster(plugin=custom_plugin)
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    actor, actor_optim, _, train_prompt_dataloader, actor_lr_scheduler = actor_booster.boost(
+        model=actor,
+        optimizer=actor_optim,
+        lr_scheduler=actor_lr_scheduler,
+        dataloader=train_prompt_dataloader,
+    )
+
+    critic, critic_optim, _, _, critic_lr_scheduler = critic_booster.boost(
+        model=critic,
+        optimizer=critic_optim,
+        lr_scheduler=critic_lr_scheduler,
+        dataloader=train_prompt_dataloader,
+    )
+    reward_model, _, _, _, _ = rm_booster.boost(model=reward_model, dataloader=train_prompt_dataloader)
+    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_prompt_dataloader)
+
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    sampler_start_idx = 0
+    start_step = 0
+
+    if args.rm_checkpoint_path is not None:
+        if "modeling" in args.rm_checkpoint_path:
+            rm_booster.load_model(reward_model, args.rm_checkpoint_path)
+        else:
+            _, _, _ = load_checkpoint(
+                load_dir=args.rm_checkpoint_path,
+                booster=rm_booster,
+                model=reward_model,
+                optimizer=None,
+                lr_scheduler=None,
+            )
+        coordinator.print_on_master(f"Loaded reward model checkpoint {args.rm_checkpoint_path}")
+
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            actor_booster.load_model(actor, args.checkpoint_path)
+            ref_booster.load_model(ref_model, args.checkpoint_path)
+            coordinator.print_on_master(f"Loaded actor and reference model {args.checkpoint_path}")
+        else:
+            _, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=actor_booster,
+                model=actor,
+                optimizer=actor_optim,
+                lr_scheduler=actor_lr_scheduler,
+            )
+            _, _, _ = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=ref_booster,
+                model=ref_model,
+                optimizer=critic_optim,
+                lr_scheduler=critic_lr_scheduler,
+            )
+            assert isinstance(train_prompt_dataloader.sampler, StatefulDistributedSampler)
+            train_prompt_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded actor and reference model checkpoint {args.checkpoint_path} at spisode {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    if args.critic_checkpoint_path is not None:
+        if "modeling" in args.critic_checkpoint_path:
+            critic_booster.load_model(critic, args.critic_checkpoint_path)
+        else:
+            _, _, _ = load_checkpoint(
+                load_dir=args.critic_checkpoint_path,
+                booster=critic_booster,
+                model=critic,
+                optimizer=critic_optim,
+                lr_scheduler=critic_lr_scheduler,
+            )
+        coordinator.print_on_master(f"Loaded critic checkpoint {args.critic_checkpoint_path}")
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    # configure trainer
+    trainer = PPOTrainer(
+        actor_booster,
+        critic_booster,
+        actor,
+        critic,
+        reward_model,
+        ref_model,
+        actor_optim,
+        critic_optim,
+        actor_lr_scheduler,
+        critic_lr_scheduler,
+        tokenizer=tokenizer,
+        kl_coef=args.kl_coef,
+        ptx_coef=args.ptx_coef,
+        train_batch_size=args.train_batch_size,
+        buffer_limit=args.num_collect_steps * args.experience_batch_size,
+        max_length=args.max_length,
+        max_new_tokens=args.max_seq_len,
+        use_cache=True,
+        do_sample=True,
+        temperature=0.7,
+        accumulation_steps=args.accumulation_steps,
+        save_dir=args.save_path,
+        save_interval=args.save_interval,
+        top_k=50,
+        use_tp=args.tp > 1,
+        offload_inference_models="gemini" not in args.plugin,
+        coordinator=coordinator,
+    )
+
+    trainer.fit(
+        num_episodes=args.num_episodes,
+        num_collect_steps=args.num_collect_steps,
+        num_update_steps=args.num_update_steps,
+        prompt_dataloader=train_prompt_dataloader,
+        pretrain_dataloader=train_pretrain_dataloader,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if args.lora_rank > 0 and args.merge_lora_weights:
+        from coati.models.lora import LORA_MANAGER
+
+        # NOTE: set model to eval to merge LoRA weights
+        LORA_MANAGER.merge_weights = True
+        actor.eval()
+        critic.eval()
+    # save model checkpoint after fitting on only rank0
+    coordinator.print_on_master("Start saving final actor model checkpoint")
+    actor_booster.save_model(actor, os.path.join(trainer.actor_save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(
+        f"Saved final actor model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
+    )
+    coordinator.print_on_master("Start saving final critic model checkpoint")
+    critic_booster.save_model(critic, os.path.join(trainer.critic_save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(
+        f"Saved final critic model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
+    )
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt_dataset", nargs="+", default=[])
+    parser.add_argument("--pretrain_dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--rm_pretrain", type=str, default=None)
+    parser.add_argument("--checkpoint_path", type=str, default=None)
+    parser.add_argument("--critic_checkpoint_path", type=str, default=None)
+    parser.add_argument("--rm_checkpoint_path", type=str, help="Reward model checkpoint path")
+    parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
+    parser.add_argument("--num_episodes", type=int, default=1)
+    parser.add_argument("--num_collect_steps", type=int, default=2)
+    parser.add_argument("--num_update_steps", type=int, default=5)
+    parser.add_argument("--save_interval", type=int, default=1000)
+    parser.add_argument("--train_batch_size", type=int, default=16)
+    parser.add_argument("--experience_batch_size", type=int, default=16)
+    parser.add_argument("--ptx_batch_size", type=int, default=1)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument("--merge_lora_weights", type=bool, default=True)
+    parser.add_argument("--lr", type=float, default=9e-6)
+    parser.add_argument("--critic_lr", type=float, default=9e-6)
+    parser.add_argument("--kl_coef", type=float, default=0.1)
+    parser.add_argument("--ptx_coef", type=float, default=0.0)
+    parser.add_argument("--max_length", type=int, default=2048)
+    parser.add_argument("--max_seq_len", type=int, default=256)
+    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    train(args)
diff --git a/applications/Chat/examples/training_scripts/train_ppo.sh b/applications/Chat/examples/training_scripts/train_ppo.sh
new file mode 100755
index 000000000000..c50107c9791a
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_ppo.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+
+PROJECT_NAME="llama2-dpo"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+REWARD_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  # "/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  # "/home/lcyab/data/models/bloom-560m" #
+declare -a prompt_dataset=(
+    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00000
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00001
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00002
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00003
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00004
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00005
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00006
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00007
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00008
+    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00009
+)
+
+declare -a ptx_dataset=(
+    /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow/part-00000
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30039 train_ppo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --rm_pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --rm_checkpoint_path $REWARD_MODEL_PATH \
+    --prompt_dataset ${prompt_dataset[@]} \
+    --pretrain_dataset ${ptx_dataset[@]} \
+    --ptx_batch_size 1 \
+    --ptx_coef 0.0 \
+    --plugin "zero2" \
+    --save_interval 200 \
+    --save_path $SAVE_DIR \
+    --num_episodes 2000 \
+    --num_collect_steps 1 \
+    --num_update_steps 1 \
+    --experience_batch_size 8 \
+    --train_batch_size 4 \
+    --accumulation_steps 2 \
+    --lr 9e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --grad_checkpoint \
+    --use_wandb
diff --git a/applications/Chat/examples/training_scripts/train_reward_model.py b/applications/Chat/examples/training_scripts/train_reward_model.py
new file mode 100644
index 000000000000..4cf94b98cf7f
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_reward_model.py
@@ -0,0 +1,314 @@
+import argparse
+import json
+import math
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+from coati.dataset import (
+    DataCollatorForPreferenceDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from coati.models import LogExpLoss, LogSigLoss, RewardModel, convert_to_lora_module
+from coati.trainer import RewardModelTrainer
+from coati.utils import load_checkpoint, replace_with_flash_attention
+from transformers import LlamaTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+
+def train(args):
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in args.plugin else nullcontext()
+
+    booster_policy = None
+    with init_ctx:
+        model = RewardModel(args.pretrain)
+
+        if args.tp > 1:
+            if model.model.config.architectures[0] == "BloomForCausalLM":
+                from colossalai.shardformer.policies.bloom import BloomPolicy
+
+                booster_policy = BloomPolicy()
+            elif model.model.config.architectures[0] == "LlamaForCausalLM":
+                from colossalai.shardformer.policies.llama import LlamaPolicy
+
+                booster_policy = LlamaPolicy()
+            elif model.model.config.architectures[0] == "GPT2LMHeadModel":
+                from colossalai.shardformer.policies.gpt2 import GPT2Policy
+
+                booster_policy = GPT2Policy()
+            elif model.model.config.architectures[0] == "ChatGLMModel":
+                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
+
+                booster_policy = ChatGLMPolicy()
+            elif model.model.config.architectures[0] == "OPTForCausalLM":
+                from colossalai.shardformer.policies.opt import OPTPolicy
+
+                booster_policy = OPTPolicy()
+            else:
+                raise ValueError("Unknown model architecture for policy")
+
+        # TODO: set dropout to 0 here
+        # for llama2, dropout is 0 by default, hence skip.
+        if args.lora_rank > 0:
+            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+            custom_policy=booster_policy,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        model.model.gradient_checkpointing_enable()  # TODO: support gradient checkpoint for the last linear layer
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=model)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer.padding_side = "right"
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # configure loss function
+    if args.loss_fn == "log_sig":
+        loss_fn = LogSigLoss()
+    elif args.loss_fn == "log_exp":
+        loss_fn = LogExpLoss()
+    else:
+        raise ValueError(f'Unsupported loss function "{args.loss_fn}"')
+
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+    train_dataloader = setup_distributed_dataloader(
+        dataset=train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        use_tp=args.tp > 1,
+    )
+
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    math.ceil(args.max_epochs * num_update_steps_per_epoch)
+
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
+    )
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    trainer = RewardModelTrainer(
+        model,
+        booster,
+        optim,
+        lr_scheduler,
+        tokenizer,
+        loss_fn=loss_fn,
+        max_epochs=args.max_epochs,
+        accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_dir,
+        coordinator=coordinator,
+    )
+
+    trainer.fit(
+        train_preference_dataloader=train_dataloader,
+        eval_preference_dataloader=None,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if args.lora_rank > 0 and args.merge_lora_weights:
+        from coati.models.lora import LORA_MANAGER
+
+        # NOTE: set model to eval to merge LoRA weights
+        LORA_MANAGER.merge_weights = True
+        model.eval()
+    # save model checkpoint after fitting on only rank0
+    coordinator.print_on_master("Start saving final model checkpoint")
+    booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}")
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--zero", type=int, default=0)
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
+    )
+    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+    parser.add_argument("--save_dir", type=str, default="output")
+    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
+    parser.add_argument("--max_epochs", type=int, default=3)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--loss_fn", type=str, default="log_sig", choices=["log_sig", "log_exp"], help="Loss function")
+    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument(
+        "--lora_train_bias",
+        type=str,
+        default="none",
+        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
+    )
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
+    parser.add_argument("--merge_lora_weights", type=bool, default=True)
+    parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+    with open(args.config_file, "w") as f:
+        json.dump(args.__dict__, f, indent=4)
+    train(args)
diff --git a/applications/Chat/examples/training_scripts/train_rm.sh b/applications/Chat/examples/training_scripts/train_rm.sh
new file mode 100755
index 000000000000..18ef2511ef40
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_rm.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+
+PROJECT_NAME="llama2-rm"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/rm/output/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  #"/home/lcyab/data/models/bloom-560m" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+declare -a dataset=(
+    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00000
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00001
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00002
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00003
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00004
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00005
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00006
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00007
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00008
+    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00009
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_reward_model.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --checkpoint_path /home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling \
+    --dataset ${dataset[@]} \
+    --plugin "zero2" \
+    --save_interval 3000 \
+    --save_dir $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --max_epochs 3 \
+    --accumulation_steps 1 \
+    --batch_size 8 \
+    --lr 9e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --use_flash_attn \
+    # --use_wandb \
+    # --grad_checkpoint \
diff --git a/applications/Chat/examples/training_scripts/train_sft.py b/applications/Chat/examples/training_scripts/train_sft.py
new file mode 100644
index 000000000000..49982ca17e19
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_sft.py
@@ -0,0 +1,284 @@
+import argparse
+import json
+import math
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+from coati.dataset import DataCollatorForSupervisedDataset, load_tokenized_dataset, setup_distributed_dataloader
+from coati.models import convert_to_lora_module, load_checkpoint
+from coati.trainer import SFTTrainer
+from coati.utils import replace_with_flash_attention
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+
+def train(args):
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "ddp":
+        # default torch ddp plugin without any acceleration, for debugging purpose acceleration, for debugging purpose
+        plugin = TorchDDPPlugin(find_unused_parameters=True)
+    elif args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=args.zero,
+            max_norm=args.grad_clip,
+            precision=args.mixed_precision,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = (
+        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+    )
+    with init_ctx:
+        model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        if args.lora_rank > 0:
+            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        # lora layers are not supported by gradient checkpointing
+        model.gradient_checkpointing_enable()
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=model)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+    # configure tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir or args.pretrain)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.add_bos_token = False
+    tokenizer.add_eos_token = False
+
+    coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
+    coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_path}")
+
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(
+        f"Max CUDA memory before data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+    )
+    dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train")
+    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_len)
+    train_dataloader = setup_distributed_dataloader(
+        dataset=dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+    )
+    coordinator.print_on_master(
+        f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+    )
+
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    math.ceil(args.max_epochs * num_update_steps_per_epoch)
+
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    # Flash attention will be disabled because it does NOT support fp32.
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
+    )
+    # model = model.to(get_current_device())
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    trainer = SFTTrainer(
+        model=model,
+        booster=booster,
+        optim=optim,
+        lr_scheduler=lr_scheduler,
+        max_epochs=args.max_epochs,
+        accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_path,
+        coordinator=coordinator,
+    )
+
+    get_dist_logger()
+    trainer.fit(
+        train_dataloader=train_dataloader,
+        eval_dataloader=None,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if args.lora_rank > 0 and args.merge_lora_weights:
+        from coati.models.lora import LORA_MANAGER
+
+        # NOTE: set model to eval to merge LoRA weights
+        LORA_MANAGER.merge_weights = True
+        model.eval()
+    # save model checkpoint after fitting on only rank0
+    coordinator.print_on_master("Start saving final model checkpoint")
+    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d", "ddp"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--zero", type=int, default=1)
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
+    )
+    parser.add_argument("--save_path", type=str, default="output")
+    parser.add_argument("--max_epochs", type=int, default=3)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--max_len", type=int, default=512)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument(
+        "--lora_train_bias",
+        type=str,
+        default="none",
+        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
+    )
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
+    parser.add_argument("--merge_lora_weights", type=bool, default=True)
+    parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+    with open(args.config_file, "w") as f:
+        json.dump(args.__dict__, f, indent=4)
+    train(args)
diff --git a/applications/Chat/examples/training_scripts/train_sft.sh b/applications/Chat/examples/training_scripts/train_sft.sh
new file mode 100755
index 000000000000..0b81651e21b3
--- /dev/null
+++ b/applications/Chat/examples/training_scripts/train_sft.sh
@@ -0,0 +1,56 @@
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+# set_n_least_used_CUDA_VISIBLE_DEVICES 4
+export CUDA_VISIBLE_DEVICES=2,3,4,5
+
+PROJECT_NAME="llama2-sft"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/sft/output/train_config"
+PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+declare -a dataset=(
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00000
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00001
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00002
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00003
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00004
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00005
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00006
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00007
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00008
+    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00009
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+
+# the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2 \
+    --batch_size 2 \
+    --max_epochs 1 \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
+    --use_wandb

From 477fe27c0bfd397700540e172428e3ede42f1a5d Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Fri, 1 Dec 2023 17:42:13 +0800
Subject: [PATCH 04/23] add ci tests

---
 .github/workflows/run_chatgpt_examples.yml    |   12 +-
 .github/workflows/run_chatgpt_unit_tests.yml  |    3 +-
 applications/Chat/.gitignore                  |    1 +
 applications/Chat/LICENSE                     |    0
 applications/Chat/README.md                   |   92 +-
 applications/Chat/benchmarks/README.md        |    0
 .../benchmarks/benchmark_opt_lora_dummy.py    |    0
 .../Chat/benchmarks/ray/1mmt_dummy.py         |    0
 .../Chat/benchmarks/ray/mmmt_dummy.py         |    0
 applications/Chat/coati/__init__.py           |    0
 applications/Chat/coati/dataset/__init__.py   |   14 +-
 ...osed_to_constant_length_spliced_dataset.py |  137 +
 .../Chat/coati/dataset/conversation.py        |   22 +
 .../Chat/coati/dataset/dpo_dataset_utils.py   |   51 -
 applications/Chat/coati/dataset/loader.py     |   17 +-
 ...nized_dataset.py => tokenization_utils.py} |  173 +-
 applications/Chat/coati/dataset/utils.py      |    0
 .../Chat/coati/experience_buffer/__init__.py  |    0
 .../Chat/coati/experience_buffer/base.py      |    0
 .../Chat/coati/experience_buffer/naive.py     |    0
 .../Chat/coati/experience_buffer/utils.py     |    0
 .../Chat/coati/experience_maker/__init__.py   |    0
 .../Chat/coati/experience_maker/base.py       |    0
 .../Chat/coati/experience_maker/naive.py      |   26 +-
 applications/Chat/coati/kernels/__init__.py   |    0
 applications/Chat/coati/kernels/opt_attn.py   |    0
 applications/Chat/coati/kernels/wrapper.py    |    0
 applications/Chat/coati/models/__init__.py    |    3 +-
 applications/Chat/coati/models/base.py        |   34 +-
 applications/Chat/coati/models/critic.py      |    4 +
 applications/Chat/coati/models/generation.py  |   23 +-
 applications/Chat/coati/models/lora.py        |    8 +
 applications/Chat/coati/models/loss.py        |   17 +-
 .../Chat/coati/models/reward_model.py         |    3 +
 applications/Chat/coati/models/utils.py       |    0
 applications/Chat/coati/quant/__init__.py     |    0
 .../Chat/coati/quant/llama_gptq/__init__.py   |    0
 .../Chat/coati/quant/llama_gptq/loader.py     |    0
 .../coati/quant/llama_gptq/model_utils.py     |    0
 .../Chat/coati/quant/llama_gptq/quant.py      |    0
 applications/Chat/coati/quant/utils.py        |    0
 applications/Chat/coati/ray/README.md         |    0
 applications/Chat/coati/ray/__init__.py       |    0
 .../Chat/coati/ray/callbacks/__init__.py      |    0
 applications/Chat/coati/ray/callbacks/base.py |    0
 .../ray/callbacks/performance_evaluator.py    |    0
 .../Chat/coati/ray/detached_replay_buffer.py  |    0
 .../Chat/coati/ray/detached_trainer_base.py   |    0
 .../Chat/coati/ray/detached_trainer_ppo.py    |    0
 .../Chat/coati/ray/experience_maker_holder.py |    0
 .../Chat/coati/ray/lora_constructor.py        |    0
 applications/Chat/coati/ray/utils.py          |    4 +-
 applications/Chat/coati/trainer/__init__.py   |    0
 applications/Chat/coati/trainer/base.py       |   12 +-
 applications/Chat/coati/trainer/dpo.py        |   81 +-
 applications/Chat/coati/trainer/ppo.py        |   29 +-
 applications/Chat/coati/trainer/rm.py         |   13 +-
 applications/Chat/coati/trainer/sft.py        |    9 +-
 applications/Chat/coati/trainer/utils.py      |    3 +
 applications/Chat/coati/utils/__init__.py     |    0
 .../Chat/coati/utils/accumulative_meter.py    |    5 +
 applications/Chat/coati/utils/ckpt_io.py      |    2 +-
 .../Chat/coati/utils/flash_attention_patch.py |    3 +
 applications/Chat/examples/=2.0.0,            |    0
 applications/Chat/examples/README.md          |    0
 .../Chat/examples/community/README.md         |    0
 .../Chat/examples/community/peft/README.md    |    0
 .../examples/community/peft/easy_dataset.py   |    0
 .../examples/community/peft/easy_models.py    |    0
 .../community/peft/train_peft_prompts.py      |    0
 .../examples/community/peft/train_peft_sft.py |    0
 .../Chat/examples/community/ray/README.md     |    0
 .../examples/community/ray/ray_job_script.py  |    0
 .../community/ray/train_prompts_on_ray.py     |    0
 applications/Chat/examples/config_file        |    0
 .../prepare_preference_dataset.py             |    8 +-
 .../prepare_preference_dataset.sh             |    0
 .../prepare_prompt_dataset.py                 |    6 +-
 .../prepare_prompt_dataset.sh                 |    0
 .../prepare_ptx_dataset.py}                   |    6 +-
 .../prepare_ptx_dataset.sh}                   |    2 +-
 .../prepare_sft_dataset.py                    |    6 +-
 .../prepare_sft_dataset.sh                    |    0
 applications/Chat/examples/hostfile           |    1 -
 .../dataset => examples/inference}/chatio.py  |    4 +
 .../examples/{ => inference}/inference.py     |    7 +-
 .../Chat/examples/inference/round.txt         |   73 +
 .../inference/web_chatbot}/README.md          |    0
 .../inference/web_chatbot}/locustfile.py      |    0
 .../inference/web_chatbot}/requirements.txt   |    0
 .../inference/web_chatbot}/server.py          |    0
 .../inference/web_chatbot}/utils.py           |    0
 applications/Chat/examples/ray/1mmt_prompt.py |    0
 applications/Chat/examples/ray/mmmt_prompt.py |    0
 .../Chat/examples/ray/requirements.txt        |    0
 applications/Chat/examples/requirements.txt   |    2 +-
 applications/Chat/examples/train_dpo.py       |  287 --
 applications/Chat/examples/train_dpo.sh       |   72 -
 applications/Chat/examples/train_ppo.py       |  430 --
 applications/Chat/examples/train_ppo.sh       |   82 -
 .../Chat/examples/train_reward_model.py       |  314 --
 applications/Chat/examples/train_rm.sh        |   72 -
 applications/Chat/examples/train_sft.py       |  284 --
 applications/Chat/examples/train_sft.sh       |   56 -
 .../Chat/examples/training_scripts/hostfile   |    2 +-
 .../examples/training_scripts/train_dpo.py    |   18 +-
 .../examples/training_scripts/train_dpo.sh    |    7 +-
 .../examples/training_scripts/train_ppo.py    |   21 +-
 .../examples/training_scripts/train_ppo.sh    |   20 +-
 .../{train_reward_model.py => train_rm.py}    |    8 +-
 .../examples/training_scripts/train_rm.sh     |    4 +-
 .../examples/training_scripts/train_sft.py    |   13 +-
 .../examples/training_scripts/train_sft.sh    |   32 +-
 applications/Chat/pytest.ini                  |    0
 applications/Chat/requirements-test.txt       |    0
 applications/Chat/requirements.txt            |    0
 applications/Chat/setup.py                    |    0
 applications/Chat/tests/__init__.py           |    0
 .../{examples => tests}/download_model.py     |   13 -
 applications/Chat/tests/test_benchmarks.sh    |   33 -
 applications/Chat/tests/test_checkpoint.py    |   91 -
 .../Chat/tests/test_data_preparation.sh       |  286 ++
 applications/Chat/tests/test_dataset.py       |  252 -
 applications/Chat/tests/test_experience.py    |  324 +-
 applications/Chat/tests/test_lora.py          |    6 +-
 applications/Chat/tests/test_models.py        |  506 +-
 applications/Chat/tests/test_reward_model.py  | 4125 -----------------
 applications/Chat/tests/test_train.sh         |  355 +-
 applications/Chat/version.txt                 |    0
 129 files changed, 1568 insertions(+), 7061 deletions(-)
 mode change 100644 => 100755 applications/Chat/.gitignore
 mode change 100644 => 100755 applications/Chat/LICENSE
 mode change 100644 => 100755 applications/Chat/README.md
 mode change 100644 => 100755 applications/Chat/benchmarks/README.md
 mode change 100644 => 100755 applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
 mode change 100644 => 100755 applications/Chat/benchmarks/ray/1mmt_dummy.py
 mode change 100644 => 100755 applications/Chat/benchmarks/ray/mmmt_dummy.py
 mode change 100644 => 100755 applications/Chat/coati/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/dataset/__init__.py
 create mode 100755 applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py
 mode change 100644 => 100755 applications/Chat/coati/dataset/conversation.py
 delete mode 100644 applications/Chat/coati/dataset/dpo_dataset_utils.py
 mode change 100644 => 100755 applications/Chat/coati/dataset/loader.py
 rename applications/Chat/coati/dataset/{spliced_and_tokenized_dataset.py => tokenization_utils.py} (66%)
 mode change 100644 => 100755
 mode change 100644 => 100755 applications/Chat/coati/dataset/utils.py
 mode change 100644 => 100755 applications/Chat/coati/experience_buffer/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/experience_buffer/base.py
 mode change 100644 => 100755 applications/Chat/coati/experience_buffer/naive.py
 mode change 100644 => 100755 applications/Chat/coati/experience_buffer/utils.py
 mode change 100644 => 100755 applications/Chat/coati/experience_maker/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/experience_maker/base.py
 mode change 100644 => 100755 applications/Chat/coati/experience_maker/naive.py
 mode change 100644 => 100755 applications/Chat/coati/kernels/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/kernels/opt_attn.py
 mode change 100644 => 100755 applications/Chat/coati/kernels/wrapper.py
 mode change 100644 => 100755 applications/Chat/coati/models/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/models/base.py
 mode change 100644 => 100755 applications/Chat/coati/models/critic.py
 mode change 100644 => 100755 applications/Chat/coati/models/generation.py
 mode change 100644 => 100755 applications/Chat/coati/models/lora.py
 mode change 100644 => 100755 applications/Chat/coati/models/loss.py
 mode change 100644 => 100755 applications/Chat/coati/models/reward_model.py
 mode change 100644 => 100755 applications/Chat/coati/models/utils.py
 mode change 100644 => 100755 applications/Chat/coati/quant/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/quant/llama_gptq/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/quant/llama_gptq/loader.py
 mode change 100644 => 100755 applications/Chat/coati/quant/llama_gptq/model_utils.py
 mode change 100644 => 100755 applications/Chat/coati/quant/llama_gptq/quant.py
 mode change 100644 => 100755 applications/Chat/coati/quant/utils.py
 mode change 100644 => 100755 applications/Chat/coati/ray/README.md
 mode change 100644 => 100755 applications/Chat/coati/ray/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/ray/callbacks/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/ray/callbacks/base.py
 mode change 100644 => 100755 applications/Chat/coati/ray/callbacks/performance_evaluator.py
 mode change 100644 => 100755 applications/Chat/coati/ray/detached_replay_buffer.py
 mode change 100644 => 100755 applications/Chat/coati/ray/detached_trainer_base.py
 mode change 100644 => 100755 applications/Chat/coati/ray/detached_trainer_ppo.py
 mode change 100644 => 100755 applications/Chat/coati/ray/experience_maker_holder.py
 mode change 100644 => 100755 applications/Chat/coati/ray/lora_constructor.py
 mode change 100644 => 100755 applications/Chat/coati/ray/utils.py
 mode change 100644 => 100755 applications/Chat/coati/trainer/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/trainer/base.py
 mode change 100644 => 100755 applications/Chat/coati/trainer/dpo.py
 mode change 100644 => 100755 applications/Chat/coati/trainer/ppo.py
 mode change 100644 => 100755 applications/Chat/coati/trainer/rm.py
 mode change 100644 => 100755 applications/Chat/coati/trainer/sft.py
 mode change 100644 => 100755 applications/Chat/coati/trainer/utils.py
 mode change 100644 => 100755 applications/Chat/coati/utils/__init__.py
 mode change 100644 => 100755 applications/Chat/coati/utils/accumulative_meter.py
 mode change 100644 => 100755 applications/Chat/coati/utils/ckpt_io.py
 mode change 100644 => 100755 applications/Chat/coati/utils/flash_attention_patch.py
 mode change 100644 => 100755 applications/Chat/examples/=2.0.0,
 mode change 100644 => 100755 applications/Chat/examples/README.md
 mode change 100644 => 100755 applications/Chat/examples/community/README.md
 mode change 100644 => 100755 applications/Chat/examples/community/peft/README.md
 mode change 100644 => 100755 applications/Chat/examples/community/peft/easy_dataset.py
 mode change 100644 => 100755 applications/Chat/examples/community/peft/easy_models.py
 mode change 100644 => 100755 applications/Chat/examples/community/peft/train_peft_prompts.py
 mode change 100644 => 100755 applications/Chat/examples/community/peft/train_peft_sft.py
 mode change 100644 => 100755 applications/Chat/examples/community/ray/README.md
 mode change 100644 => 100755 applications/Chat/examples/community/ray/ray_job_script.py
 mode change 100644 => 100755 applications/Chat/examples/community/ray/train_prompts_on_ray.py
 mode change 100644 => 100755 applications/Chat/examples/config_file
 rename applications/Chat/examples/{ => data_preparation_scripts}/prepare_preference_dataset.py (95%)
 mode change 100644 => 100755
 rename applications/Chat/examples/{ => data_preparation_scripts}/prepare_preference_dataset.sh (100%)
 rename applications/Chat/examples/{ => data_preparation_scripts}/prepare_prompt_dataset.py (96%)
 mode change 100644 => 100755
 rename applications/Chat/examples/{ => data_preparation_scripts}/prepare_prompt_dataset.sh (100%)
 rename applications/Chat/examples/{prepare_pretrain_dataset_for_ppo_with_ptx_loss.py => data_preparation_scripts/prepare_ptx_dataset.py} (95%)
 mode change 100644 => 100755
 rename applications/Chat/examples/{prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh => data_preparation_scripts/prepare_ptx_dataset.sh} (90%)
 rename applications/Chat/examples/{ => data_preparation_scripts}/prepare_sft_dataset.py (96%)
 mode change 100644 => 100755
 rename applications/Chat/examples/{ => data_preparation_scripts}/prepare_sft_dataset.sh (100%)
 delete mode 100644 applications/Chat/examples/hostfile
 rename applications/Chat/{coati/dataset => examples/inference}/chatio.py (99%)
 mode change 100644 => 100755
 rename applications/Chat/examples/{ => inference}/inference.py (93%)
 mode change 100644 => 100755
 create mode 100755 applications/Chat/examples/inference/round.txt
 rename applications/Chat/{inference => examples/inference/web_chatbot}/README.md (100%)
 mode change 100644 => 100755
 rename applications/Chat/{inference => examples/inference/web_chatbot}/locustfile.py (100%)
 mode change 100644 => 100755
 rename applications/Chat/{inference => examples/inference/web_chatbot}/requirements.txt (100%)
 mode change 100644 => 100755
 rename applications/Chat/{inference => examples/inference/web_chatbot}/server.py (100%)
 mode change 100644 => 100755
 rename applications/Chat/{inference => examples/inference/web_chatbot}/utils.py (100%)
 mode change 100644 => 100755
 mode change 100644 => 100755 applications/Chat/examples/ray/1mmt_prompt.py
 mode change 100644 => 100755 applications/Chat/examples/ray/mmmt_prompt.py
 mode change 100644 => 100755 applications/Chat/examples/ray/requirements.txt
 mode change 100644 => 100755 applications/Chat/examples/requirements.txt
 delete mode 100644 applications/Chat/examples/train_dpo.py
 delete mode 100755 applications/Chat/examples/train_dpo.sh
 delete mode 100644 applications/Chat/examples/train_ppo.py
 delete mode 100755 applications/Chat/examples/train_ppo.sh
 delete mode 100644 applications/Chat/examples/train_reward_model.py
 delete mode 100755 applications/Chat/examples/train_rm.sh
 delete mode 100644 applications/Chat/examples/train_sft.py
 delete mode 100755 applications/Chat/examples/train_sft.sh
 mode change 100644 => 100755 applications/Chat/examples/training_scripts/hostfile
 mode change 100644 => 100755 applications/Chat/examples/training_scripts/train_dpo.py
 mode change 100644 => 100755 applications/Chat/examples/training_scripts/train_ppo.py
 rename applications/Chat/examples/training_scripts/{train_reward_model.py => train_rm.py} (98%)
 mode change 100644 => 100755
 mode change 100644 => 100755 applications/Chat/examples/training_scripts/train_sft.py
 mode change 100644 => 100755 applications/Chat/pytest.ini
 mode change 100644 => 100755 applications/Chat/requirements-test.txt
 mode change 100644 => 100755 applications/Chat/requirements.txt
 mode change 100644 => 100755 applications/Chat/setup.py
 mode change 100644 => 100755 applications/Chat/tests/__init__.py
 rename applications/Chat/{examples => tests}/download_model.py (80%)
 mode change 100644 => 100755
 delete mode 100755 applications/Chat/tests/test_benchmarks.sh
 delete mode 100644 applications/Chat/tests/test_checkpoint.py
 create mode 100755 applications/Chat/tests/test_data_preparation.sh
 delete mode 100644 applications/Chat/tests/test_dataset.py
 mode change 100644 => 100755 applications/Chat/tests/test_experience.py
 mode change 100644 => 100755 applications/Chat/tests/test_lora.py
 mode change 100644 => 100755 applications/Chat/tests/test_models.py
 delete mode 100644 applications/Chat/tests/test_reward_model.py
 mode change 100644 => 100755 applications/Chat/version.txt

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index f9e9f400962e..23c721e3d950 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -36,18 +36,18 @@ jobs:
 
       - name: Install Transformers
         run: |
-          pip install transformers==4.30.2
+          pip install transformers==4.33.2
 
       - name: Execute Examples
         run: |
           cd applications/Chat
           rm -rf ~/.cache/colossalai
-          ./tests/test_inference.sh
-          ./tests/test_benchmarks.sh
+          ./tests/test_data_preparation.sh
           ./tests/test_train.sh
         env:
           NCCL_SHM_DISABLE: 1
           MAX_JOBS: 8
-          SFT_DATASET: /data/scratch/github_actions/chat/data.json
-          PROMPT_DATASET: /data/scratch/github_actions/chat/prompts_en.jsonl
-          PRETRAIN_DATASET: /data/scratch/github_actions/chat/alpaca_data.json
+          SFT_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/sft
+          PROMPT_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/prompt
+          PRETRAIN_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/ptx
+          PREFERENCE_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/preference
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index ec5c8ffa319f..a47e03987cf7 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -40,7 +40,8 @@ jobs:
         run: |
           cd applications/Chat
           rm -rf ~/.cache/colossalai
-          pytest tests/
+          # pytest tests/
+          # Disabled temporally because some unit tests are not implemented
         env:
           NCCL_SHM_DISABLE: 1
           MAX_JOBS: 8
diff --git a/applications/Chat/.gitignore b/applications/Chat/.gitignore
old mode 100644
new mode 100755
index 6e464c960e24..386a0a4edc97
--- a/applications/Chat/.gitignore
+++ b/applications/Chat/.gitignore
@@ -148,3 +148,4 @@ example/logs/
 example/output/
 
 examples/awesome-chatgpt-prompts/
+temp/
diff --git a/applications/Chat/LICENSE b/applications/Chat/LICENSE
old mode 100644
new mode 100755
diff --git a/applications/Chat/README.md b/applications/Chat/README.md
old mode 100644
new mode 100755
index d5be04ab9f44..99a2cd13ff99
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -131,11 +131,18 @@ You can run the `examples/train_sft.sh` to start a supervised instructs fine-tun
 
 ```json
 [
-    {
-        "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
-        "input": "",
-        "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
-        "id": 0
+    {"messages":
+      [
+        {
+          "from": "human",
+          "content": "what are some pranks with a pen i can do?"
+        },
+        {
+          "from": "assistant",
+          "content": "Are you looking for practical joke ideas?"
+        },
+        ...
+      ]
     },
     ...
 ]
@@ -143,7 +150,41 @@ You can run the `examples/train_sft.sh` to start a supervised instructs fine-tun
 
 ### RLHF Training Stage2 - Training reward model
 
-Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model
+Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
+
+Below shows the preference dataset used in training the reward model.
+
+```json
+[
+    {"context": [
+        {
+          "from": "human",
+          "content": "what are some pranks with a pen i can do?"
+        },
+        {
+          "from": "assistant",
+          "content": "Are you looking for practical joke ideas?"
+        },
+        ...
+      ]
+      "chosen": [
+        {
+          "from": "assistant",
+          "content": "About 150 species of butterflies live in Oregon, with about 100 species are moths, and about 20 species are common here year-round, and another 10 species are seen here year-round.  I suggest you keep an eye out for skippers, gossamer wings, and red admirals."
+        },
+        ...
+      ],
+      "rejected": [
+        {
+          "from": "assistant",
+          "content": "Are you interested in just the common butterflies?  There are a few common ones which will be easy to find.  Like the Monarch, Western Tiger Swallowtail and several other swallowtail butterflies.  The Monarch is known for being a very common and beautiful butterfly.  Are you interested in butterflies because you have children?"
+        },
+        ...
+      ]
+    },
+    ...
+]
+```
 
 You can run the `examples/train_rm.sh` to start a reward model training.
 [[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
@@ -161,15 +202,13 @@ You can run the `examples/train_prompts.sh` to start training PPO with human fee
 
 **Note**: the required datasets follow the following format,
 
-- `pretrain dataset`
+- `pretrain dataset (for ptx loss in ppo)`
 
   ```json
   [
       {
-          "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
-          "input": "",
-          "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
-          "id": 0
+          "source": "", # system instruction
+          "Target": "Provide a list of the top 10 most popular mobile games in Asia\nThe top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
       },
       ...
   ]
@@ -178,21 +217,34 @@ You can run the `examples/train_prompts.sh` to start training PPO with human fee
 - `prompt dataset`
 
   ```json
+  # The format is the same with sft data. But the last sentence from assistant will be ignored. Only question (prompt) will be preserved. Therefore, even dataset without answer can be used as prompt dataset (e.g. red teaming data)
   [
-      {
-          "instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
-          "id": 0
-      },
-      {
-          "instruction": "Write a descriptive paragraph about a memorable vacation you went on",
-          "id": 1
-      },
-      ...
+    {"context":
+      [
+        {
+          "from": "human",
+          "content": "what are some pranks with a pen i can do?"
+        },
+        # Optional
+        {
+          "from": "assistant",
+          "content": "Are you looking for practical joke ideas?"
+        },
+        ...
+      ]
+    },
+    ...
   ]
   ```
 
 For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).
 
+## Alternative Option For RLHF: DPO
+
+For those seeking an alternative to Reinforcement Learning from Human Feedback (RLHF), Direct Preference Optimization (DPO) presents a compelling option. DPO, as detailed in the paper (available at [https://arxiv.org/abs/2305.18290](https://arxiv.org/abs/2305.18290)), DPO offers an low-cost way to perform RLHF and usually request less computation resources compares to PPO.
+
+For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).
+
 ### Inference Quantization and Serving - After Training
 
 We provide an online inference server and a benchmark. We aim to run inference on single GPU, so quantization is essential when using large models.
diff --git a/applications/Chat/benchmarks/README.md b/applications/Chat/benchmarks/README.md
old mode 100644
new mode 100755
diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/__init__.py b/applications/Chat/coati/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/dataset/__init__.py b/applications/Chat/coati/dataset/__init__.py
old mode 100644
new mode 100755
index 48804fe65047..94f3d75d7c29
--- a/applications/Chat/coati/dataset/__init__.py
+++ b/applications/Chat/coati/dataset/__init__.py
@@ -1,3 +1,4 @@
+from .conversation import setup_conversation_template
 from .loader import (
     DataCollatorForPreferenceDataset,
     DataCollatorForPromptDataset,
@@ -6,8 +7,12 @@
     load_tokenized_dataset,
     setup_distributed_dataloader,
 )
-from .spliced_and_tokenized_dataset import tokenize_prompt_dataset
-from .utils import is_rank_0
+from .tokenization_utils import (
+    supervised_tokenize_pretrain,
+    supervised_tokenize_sft,
+    tokenize_prompt_dataset,
+    tokenize_rlhf,
+)
 
 __all__ = [
     "tokenize_prompt_dataset",
@@ -18,4 +23,9 @@
     "StatefulDistributedSampler",
     "load_tokenized_dataset",
     "setup_distributed_dataloader",
+    "supervised_tokenize_pretrain",
+    "supervised_tokenize_pretrain",
+    "supervised_tokenize_sft",
+    "tokenize_rlhf",
+    "setup_conversation_template",
 ]
diff --git a/applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py b/applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py
new file mode 100755
index 000000000000..e4f8e5310bb9
--- /dev/null
+++ b/applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+ClosedToConstantLengthSplicedDataset for continue pretrain
+"""
+
+import random
+import warnings
+from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
+
+from datasets import dataset_dict
+from torch.utils.data import ConcatDataset, Dataset, IterableDataset
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+IGNORE_INDEX = -100
+
+DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
+
+
+class ClosedToConstantLengthSplicedDataset(IterableDataset):
+    """
+    Define an iterable dataset that returns a (close to) constant length data point spliced from multiple
+    original independent (pre-tokenized) data points.
+    """
+
+    def __init__(
+        self,
+        dataset: DSType,
+        tokenizer: PreTrainedTokenizer,
+        max_length: int = 4096,
+        num_packed_sequences: int = 8,
+        fetch_sequence_func: Callable[[Any], Tuple[List[int], List[int]]] = None,
+        input_ids_field: str = "input_ids",
+        labels_field: str = "labels",
+        infinite: bool = False,
+        shuffle: bool = True,
+        error_strict: bool = False,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.dataset = dataset
+        self.max_length = max_length
+        self.infinite = infinite
+        self.max_buffer_size = max_length * num_packed_sequences  # e.g., 4096 * 16
+        self.shuffle = shuffle
+
+        # Callable[[Dict[str, Any]], Tuple[List[int], List[int]]],
+        # A function that fetch sequence input_ids and labels from the original data point
+        if fetch_sequence_func is None:
+            self.fetch_sequence_func = lambda data_point: (data_point[input_ids_field], data_point[labels_field])
+        else:
+            self.fetch_sequence_func = fetch_sequence_func
+        self.input_ids_field = input_ids_field
+        self.labels_field = labels_field
+
+        self.error_strict = error_strict
+        self.current_size = 0  # `int`, current packed data size.
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def __iter__(self) -> Iterable[Dict[str, List[int]]]:
+        iterator = iter(self.dataset)
+        more_data_points = True
+        while more_data_points is True:
+            buffer, buffer_len = [], 0
+            while True:
+                # ending condition.
+                if buffer_len >= self.max_buffer_size:
+                    break
+                try:
+                    # `Tuple[List[int], List[int]]`
+                    seq_input_ids, seq_labels = self.fetch_sequence_func(next(iterator))
+                    buffer.append({self.input_ids_field: seq_input_ids, self.labels_field: seq_labels})
+                    buffer_len += len(buffer[-1][self.input_ids_field])
+                except StopIteration:
+                    if self.infinite is True:
+                        iterator = iter(self.dataset)
+                        warnings.warn("The dataset reached end and the iterator is reset to the start.")
+                    else:
+                        more_data_points = False
+                        break
+            examples = []  # `List[Dict[str, List[int]]]`, save buffered spliced data points.
+            spliced_input_ids, spliced_labels = [], []  # `List[int]`, `List[int]`
+            for i, data_point in enumerate(buffer):
+                # TODO(2023-09-18) check errors for each unspliced tokenized data point
+                seq_input_ids = data_point[self.input_ids_field]
+                seq_labels = data_point[self.labels_field]
+                # Handle special case:
+                # If the length of an original data point (i.e., input_ids length of a data point before splicing)
+                # exceeds `max_length`, truncate it.
+                if len(seq_input_ids) > self.max_length:
+                    truncated_seq_input_ids = seq_input_ids[: self.max_length]
+                    truncated_label_ids = seq_labels[: self.max_length]
+                    if set(truncated_label_ids) == {IGNORE_INDEX}:
+                        if self.error_strict is True:
+                            raise ValueError(
+                                f"Find an out-of-bounds length({len(seq_input_ids)}) data point "
+                                f"with all label values as {IGNORE_INDEX}."
+                            )
+                        else:
+                            warnings.warn(f"Filter an error truncated data point (labels all {IGNORE_INDEX})")
+                            continue  # Skip the current error data point.
+                    spliced_data_point = {
+                        self.input_ids_field: truncated_seq_input_ids,
+                        self.labels_field: truncated_label_ids,
+                    }
+                    examples.append(spliced_data_point)
+                    warnings.warn("Find a data point to be truncated.")
+                    continue
+
+                # Pre action judgment.
+                if len(spliced_input_ids) + len(seq_input_ids) > self.max_length:
+                    spliced_data_point = {
+                        self.input_ids_field: spliced_input_ids,
+                        self.labels_field: spliced_labels,
+                    }  # `Dict[str, List[int]]`
+                    # Update.
+                    spliced_input_ids, spliced_labels = [], []
+                    spliced_input_ids.extend(seq_input_ids)
+                    spliced_labels.extend(seq_labels)
+                    examples.append(spliced_data_point)
+                else:
+                    spliced_input_ids.extend(seq_input_ids)
+                    spliced_labels.extend(seq_labels)
+            # For residual spliced data point at the end of the data set
+            if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
+                examples.append({self.input_ids_field: spliced_input_ids, self.labels_field: spliced_labels})
+            if self.shuffle:
+                random.shuffle(examples)
+            for spliced_data_point in examples:
+                # TODO(2023-09-18): check errors for each spliced tokenized data point.
+                self.current_size += 1
+                yield spliced_data_point
diff --git a/applications/Chat/coati/dataset/conversation.py b/applications/Chat/coati/dataset/conversation.py
old mode 100644
new mode 100755
index 572eb3ae54ff..f3a56c66fd5a
--- a/applications/Chat/coati/dataset/conversation.py
+++ b/applications/Chat/coati/dataset/conversation.py
@@ -16,6 +16,8 @@
 from enum import Enum, auto
 from typing import List
 
+from transformers import PreTrainedTokenizer
+
 
 class SeparatorStyle(Enum):
     ADD_BOS_EOS_TOKEN = auto()
@@ -94,3 +96,23 @@ def dict(self):
 )
 
 default_conversation = conv
+
+
+def setup_conversation_template(tokenizer: PreTrainedTokenizer) -> Conversation:
+    """
+    Setup the conversation template to use the bos and the eos of the tokenizer if application
+    Or setup the bos and the eos of the tokenizer to be the same as the separator of the conversation template
+    """
+    conversation_template = conv.copy()
+    if tokenizer.eos_token is None:
+        raise ValueError(
+            "The tokenizer you specified does not have a eos token, please manually set a eos token that can be tokenized into a single token"
+        )
+    if tokenizer.bos_token is None:
+        tokenizer.bos_token = tokenizer.eos_token
+    if len(tokenizer.tokenize(tokenizer.eos_token)) != 1:
+        raise ValueError("Please check your tokenizer to make sure the eos token can be tokenized into a single token")
+    if len(tokenizer.tokenize(tokenizer.bos_token)) != 1:
+        raise ValueError("Please check your tokenizer to make sure the bos token can be tokenized into a single token")
+    conversation_template.seps = [tokenizer.bos_token, tokenizer.eos_token]
+    return conversation_template
diff --git a/applications/Chat/coati/dataset/dpo_dataset_utils.py b/applications/Chat/coati/dataset/dpo_dataset_utils.py
deleted file mode 100644
index 7aa4c2389c5a..000000000000
--- a/applications/Chat/coati/dataset/dpo_dataset_utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from typing import Any, Dict, List, Union
-
-import torch
-import torch.nn.functional as F
-from transformers import LlamaForCausalLM
-
-
-def get_log_probability(logits: torch.Tensor, labels: torch.Tensor):
-    log_probs = F.log_softmax(logits, dim=-1)
-    log_probs_labels = log_probs.gather(dim=-1, index=labels.unsqueeze(-1))
-    return log_probs_labels.squeeze(-1)
-
-
-def get_reference_model_reward(
-    data_point: Dict[str, Any],
-    model: LlamaForCausalLM,
-) -> Dict[str, Union[int, str, List[int]]]:
-    (
-        chosen_input_ids,
-        chosen_attention_mask,
-        chosen_loss_mask,
-        rejected_input_ids,
-        rejected_attention_mask,
-        rejected_loss_mask,
-    ) = (
-        data_point["chosen_input_ids"],
-        data_point["chosen_attention_mask"],
-        data_point["chosen_loss_mask"],
-        data_point["rejected_input_ids"],
-        data_point["rejected_attention_mask"],
-        data_point["rejected_loss_mask"],
-    )
-    with torch.no_grad():
-        current_device = torch.cuda.current_device()
-        chosen_logits = model(
-            input_ids=torch.tensor([chosen_input_ids]).to(current_device),
-            attention_mask=torch.tensor([chosen_attention_mask]).to(current_device),
-        ).logits
-        chosen_logits = get_log_probability(chosen_logits, torch.tensor([chosen_input_ids]).to(current_device)).cpu()
-        rejected_logits = model(
-            input_ids=torch.tensor([rejected_input_ids]).to(current_device),
-            attention_mask=torch.tensor([rejected_attention_mask]).to(current_device),
-        ).logits
-        rejected_logits = get_log_probability(
-            rejected_logits, torch.tensor([rejected_input_ids]).to(current_device)
-        ).cpu()
-
-        data_point["chosen_reward"] = (chosen_logits * torch.tensor(chosen_loss_mask)).sum(-1)
-        data_point["rejected_reward"] = (rejected_logits * torch.tensor(rejected_loss_mask)).sum(-1)
-
-    return data_point
diff --git a/applications/Chat/coati/dataset/loader.py b/applications/Chat/coati/dataset/loader.py
old mode 100644
new mode 100755
index 49e63975b237..4ef84d3e9cc0
--- a/applications/Chat/coati/dataset/loader.py
+++ b/applications/Chat/coati/dataset/loader.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+Dataloader for sft, dpo, ppo
+"""
 
 import math
 import os
@@ -97,9 +100,6 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
             else torch.LongTensor(instance["labels"])
             for instance in instances
         ]
-        # torch.set_printoptions(threshold=10_000)
-        # if is_rank_0():
-        #     print(batch_input_ids[0])
         if self.tokenizer.padding_side == "right":
             input_ids = torch.nn.utils.rnn.pad_sequence(
                 sequences=batch_input_ids,
@@ -135,9 +135,6 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
                 f"`{self.tokenizer.__class__.__name__}.padding_side` can only be `left` or `right`, "
                 f"but now `{self.tokenizer.padding_side}`"
             )
-        # if is_rank_0():
-        #     print(input_ids[0])
-        # exit()
 
         attention_mask = input_ids.ne(self.tokenizer.pad_token_id)  # `torch.BoolTensor`, (bsz, max_len)
 
@@ -157,21 +154,13 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
             (`Dict[str, torch.Tensor]`): Contains the following `torch.Tensor`:
                 `input_ids`: `torch.Tensor` of shape (bsz, max_len);
                 `attention_mask`: `torch.BoolTensor` of shape (bsz, max_len);
-                `labels`: `torch.Tensor` of shape (bsz, max_len), which contains `IGNORE_INDEX`.
         """
         instances = [{"input_ids": ins["input_ids"], "labels": ins["input_ids"]} for ins in instances]
         ret = super().__call__(instances=instances)
-        # torch.set_printoptions(threshold=10_000)
-        # if is_rank_0():
-        #     print(ret['input_ids'][0])
         input_ids = F.pad(
             ret["input_ids"], (self.max_length - ret["input_ids"].size(1), 0), value=self.tokenizer.pad_token_id
         )
         attention_mask = F.pad(ret["attention_mask"], (self.max_length - ret["attention_mask"].size(1), 0), value=False)
-        # if is_rank_0():
-        #     print(input_ids[0])
-        #     print(attention_mask[0])
-        # exit()
         return {"input_ids": input_ids, "attention_mask": attention_mask}
 
 
diff --git a/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py b/applications/Chat/coati/dataset/tokenization_utils.py
old mode 100644
new mode 100755
similarity index 66%
rename from applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
rename to applications/Chat/coati/dataset/tokenization_utils.py
index 185b1cf0e4e0..448c2caa0424
--- a/applications/Chat/coati/dataset/spliced_and_tokenized_dataset.py
+++ b/applications/Chat/coati/dataset/tokenization_utils.py
@@ -1,19 +1,17 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Splicing multiple pre-tokenized sequence data points
+tokenization utils for constructing dataset for ppo, dpo, sft, rm
 """
 
-import random
 import warnings
 from copy import deepcopy
-from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
+from typing import Any, Dict, List, Union
 
 from coati.dataset.conversation import Conversation, default_conversation
 from datasets import dataset_dict
-from torch.utils.data import ConcatDataset, Dataset, IterableDataset
-from transformers.models.llama.tokenization_llama import LlamaTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
+from torch.utils.data import ConcatDataset, Dataset
+from transformers import PreTrainedTokenizer
 
 from colossalai.logging import get_dist_logger
 
@@ -25,7 +23,7 @@
 
 
 def supervised_tokenize_pretrain(
-    data_point: Dict[str, str], tokenizer: LlamaTokenizer, ignore_index: int = None, max_length: int = 4096
+    data_point: Dict[str, str], tokenizer: PreTrainedTokenizer, ignore_index: int = None, max_length: int = 4096
 ) -> Dict[str, Union[int, str, List[int]]]:
     """
     A tokenization function to tokenize an original pretraining data point as following:
@@ -69,7 +67,7 @@ def supervised_tokenize_pretrain(
 
 def supervised_tokenize_sft(
     data_point: Dict[str, str],
-    tokenizer: LlamaTokenizer,
+    tokenizer: PreTrainedTokenizer,
     conversation_template: Conversation = default_conversation,
     ignore_index: int = None,
     max_length: int = 4096,
@@ -137,28 +135,26 @@ def supervised_tokenize_sft(
     target_turn = turns[target_turn_index - 1]
     prompt = template.get_prompt(2 * target_turn)
     tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
-
-    # Uncomment the following to check whether `bisect_right` is right.
-    # if 2 * target_turn < len(template.messages):
-    #     length_to_next_turn = len(tokenizer([template.get_prompt(2*target_turn+2)], add_special_tokens=False)["input_ids"][0])
-    #     assert length_to_next_turn > max_length - 1, logger.info(f"The length of the prompt until the next turn after tokenization is {length_to_next_turn}, which is smaller than {max_length - 1}")
-
     template.messages = template.messages[0 : 2 * target_turn]
 
     starts = []
     ends = []
+    expect_bos = True
     gpt_bos = False if template.messages[0][0] == template.roles[0] else True
     gpt_eos = False if template.messages[0][0] == template.roles[0] else True
 
     for i, token_id in enumerate(tokenized):
-        if token_id == tokenizer.bos_token_id:
+        if token_id == tokenizer.bos_token_id and expect_bos:
             if gpt_bos:
                 starts.append(i)
             gpt_bos = not gpt_bos
-        elif token_id == tokenizer.eos_token_id:
+            expect_bos = not expect_bos
+            continue
+        if token_id == tokenizer.eos_token_id and not expect_bos:
             if gpt_eos:
                 ends.append(i)
             gpt_eos = not gpt_eos
+            expect_bos = not expect_bos
 
     if len(starts) != target_turn or len(ends) != target_turn:
         logger.info(
@@ -196,7 +192,7 @@ def supervised_tokenize_sft(
 
 def tokenize_prompt_dataset(
     data_point: Dict[str, str],
-    tokenizer: LlamaTokenizer,
+    tokenizer: PreTrainedTokenizer,
     conversation_template: Conversation = default_conversation,
     ignore_index: int = None,
     max_length: int = 4096,
@@ -205,10 +201,6 @@ def tokenize_prompt_dataset(
     A tokenization function to tokenize an original pretraining data point as following:
         {"messages": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
     """
-    # assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
-    #     "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
-    #     "add <bos> and <eos> manually later"
-    # )
 
     assert (
         tokenizer.bos_token == conversation_template.seps[0] and tokenizer.eos_token == conversation_template.seps[1]
@@ -263,12 +255,6 @@ def tokenize_prompt_dataset(
     target_turn = turns[target_turn_index - 1]
     prompt = template.get_prompt(2 * target_turn - 1) + "Assistant: <s>"
     tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
-
-    # Uncomment the following to check whether `bisect_right` is right.
-    # if 2 * target_turn < len(template.messages):
-    #     length_to_next_turn = len(tokenizer([template.get_prompt(2*target_turn+2)], add_special_tokens=False)["input_ids"][0])
-    #     assert length_to_next_turn > max_length - 1, logger.info(f"The length of the prompt until the next turn after tokenization is {length_to_next_turn}, which is smaller than {max_length - 1}")
-
     template.messages = template.messages[0 : 2 * target_turn - 1]
     tokenized = [tokenizer.bos_token_id] + tokenized
 
@@ -289,21 +275,25 @@ def generate_loss_mask(template: Conversation, tokenizer: Any, context_len: int)
     attention_mask = tokenized["attention_mask"][0]
     starts = []
     ends = []
+    expect_bos = True
     gpt_bos = False if template.messages[0][0] == template.roles[0] else True
     gpt_eos = False if template.messages[0][0] == template.roles[0] else True
 
     for i, token_id in enumerate(input_ids):
-        if token_id == tokenizer.bos_token_id:
+        if token_id == tokenizer.bos_token_id and expect_bos:
             if gpt_bos:
                 starts.append(i)
             gpt_bos = not gpt_bos
-        elif token_id == tokenizer.eos_token_id:
+            expect_bos = not expect_bos
+            continue
+        if token_id == tokenizer.eos_token_id and not expect_bos:
             if gpt_eos:
                 ends.append(i)
             gpt_eos = not gpt_eos
+            expect_bos = not expect_bos
 
     if len(starts) != target_turn or len(ends) != target_turn:
-        print(
+        warnings.warn(
             "Please check whether the tokenizer add additional `bos_token` and `eos_token`.\n\nOr the original message contains `bos_token` or `eos_token`."
         )
         return dict(input_ids=None, attention_mask=None, loss_mask=None)
@@ -322,7 +312,7 @@ def generate_loss_mask(template: Conversation, tokenizer: Any, context_len: int)
 
 def tokenize_rlhf(
     data_point: Dict[str, str],
-    tokenizer: LlamaTokenizer,
+    tokenizer: PreTrainedTokenizer,
     conversation_template: Conversation = default_conversation,
     ignore_index: int = None,
     max_length: int = 4096,
@@ -358,8 +348,9 @@ def tokenize_rlhf(
             template.append_message(from_str, mess["content"])
 
     if len(template.messages) % 2 != 1:
-        print("Please make sure leading context is started and ended with a line from human")
-        print(template.messages)
+        warnings.warn(
+            "Please make sure leading context is started and ended with a line from human" + str(template.messages)
+        )
         return dict(
             chosen_input_ids=None,
             chosen_attention_mask=None,
@@ -439,119 +430,3 @@ def tokenize_rlhf(
             rejected_attention_mask=None,
             rejected_loss_mask=None,
         )
-
-
-class ClosedToConstantLengthSplicedDataset(IterableDataset):
-    """
-    Define an iterable dataset that returns a (close to) constant length data point spliced from multiple
-    original independent (pre-tokenized) data points.
-    """
-
-    def __init__(
-        self,
-        dataset: DSType,
-        tokenizer: PreTrainedTokenizer,
-        max_length: int = 4096,
-        num_packed_sequences: int = 8,
-        fetch_sequence_func: Callable[[Any], Tuple[List[int], List[int]]] = None,
-        input_ids_field: str = "input_ids",
-        labels_field: str = "labels",
-        infinite: bool = False,
-        shuffle: bool = True,
-        error_strict: bool = False,
-    ) -> None:
-        self.tokenizer = tokenizer
-        self.dataset = dataset
-        self.max_length = max_length
-        self.infinite = infinite
-        self.max_buffer_size = max_length * num_packed_sequences  # e.g., 4096 * 16
-        self.shuffle = shuffle
-
-        # Callable[[Dict[str, Any]], Tuple[List[int], List[int]]],
-        # A function that fetch sequence input_ids and labels from the original data point
-        if fetch_sequence_func is None:
-            self.fetch_sequence_func = lambda data_point: (data_point[input_ids_field], data_point[labels_field])
-        else:
-            self.fetch_sequence_func = fetch_sequence_func
-        self.input_ids_field = input_ids_field
-        self.labels_field = labels_field
-
-        self.error_strict = error_strict
-        self.current_size = 0  # `int`, current packed data size.
-
-    def __len__(self) -> int:
-        return len(self.dataset)
-
-    def __iter__(self) -> Iterable[Dict[str, List[int]]]:
-        iterator = iter(self.dataset)
-        more_data_points = True
-        while more_data_points is True:
-            buffer, buffer_len = [], 0
-            while True:
-                # ending condition.
-                if buffer_len >= self.max_buffer_size:
-                    break
-                try:
-                    # `Tuple[List[int], List[int]]`
-                    seq_input_ids, seq_labels = self.fetch_sequence_func(next(iterator))
-                    buffer.append({self.input_ids_field: seq_input_ids, self.labels_field: seq_labels})
-                    buffer_len += len(buffer[-1][self.input_ids_field])
-                except StopIteration:
-                    if self.infinite is True:
-                        iterator = iter(self.dataset)
-                        warnings.warn("The dataset reached end and the iterator is reset to the start.")
-                    else:
-                        more_data_points = False
-                        break
-            examples = []  # `List[Dict[str, List[int]]]`, save buffered spliced data points.
-            spliced_input_ids, spliced_labels = [], []  # `List[int]`, `List[int]`
-            for i, data_point in enumerate(buffer):
-                # TODO(2023-09-18) check errors for each unspliced tokenized data point
-                seq_input_ids = data_point[self.input_ids_field]
-                seq_labels = data_point[self.labels_field]
-                # Handle special case:
-                # If the length of an original data point (i.e., input_ids length of a data point before splicing)
-                # exceeds `max_length`, truncate it.
-                if len(seq_input_ids) > self.max_length:
-                    truncated_seq_input_ids = seq_input_ids[: self.max_length]
-                    truncated_label_ids = seq_labels[: self.max_length]
-                    if set(truncated_label_ids) == {IGNORE_INDEX}:
-                        if self.error_strict is True:
-                            raise ValueError(
-                                f"Find an out-of-bounds length({len(seq_input_ids)}) data point "
-                                f"with all label values as {IGNORE_INDEX}."
-                            )
-                        else:
-                            warnings.warn(f"Filter an error truncated data point (labels all {IGNORE_INDEX})")
-                            continue  # Skip the current error data point.
-                    spliced_data_point = {
-                        self.input_ids_field: truncated_seq_input_ids,
-                        self.labels_field: truncated_label_ids,
-                    }
-                    examples.append(spliced_data_point)
-                    warnings.warn("Find a data point to be truncated.")
-                    continue
-
-                # Pre action judgment.
-                if len(spliced_input_ids) + len(seq_input_ids) > self.max_length:
-                    spliced_data_point = {
-                        self.input_ids_field: spliced_input_ids,
-                        self.labels_field: spliced_labels,
-                    }  # `Dict[str, List[int]]`
-                    # Update.
-                    spliced_input_ids, spliced_labels = [], []
-                    spliced_input_ids.extend(seq_input_ids)
-                    spliced_labels.extend(seq_labels)
-                    examples.append(spliced_data_point)
-                else:
-                    spliced_input_ids.extend(seq_input_ids)
-                    spliced_labels.extend(seq_labels)
-            # For residual spliced data point at the end of the data set
-            if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
-                examples.append({self.input_ids_field: spliced_input_ids, self.labels_field: spliced_labels})
-            if self.shuffle:
-                random.shuffle(examples)
-            for spliced_data_point in examples:
-                # TODO(2023-09-18): check errors for each spliced tokenized data point.
-                self.current_size += 1
-                yield spliced_data_point
diff --git a/applications/Chat/coati/dataset/utils.py b/applications/Chat/coati/dataset/utils.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/experience_buffer/__init__.py b/applications/Chat/coati/experience_buffer/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/experience_buffer/base.py b/applications/Chat/coati/experience_buffer/base.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/experience_buffer/naive.py b/applications/Chat/coati/experience_buffer/naive.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/experience_buffer/utils.py b/applications/Chat/coati/experience_buffer/utils.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/experience_maker/__init__.py b/applications/Chat/coati/experience_maker/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/experience_maker/base.py b/applications/Chat/coati/experience_maker/base.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
old mode 100644
new mode 100755
index f9e8c8a78010..927e0b3417dc
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -1,3 +1,7 @@
+"""
+experience maker.
+"""
+
 import torch
 import torch.nn.functional as F
 from coati.models import Critic, RewardModel
@@ -20,14 +24,12 @@ def __init__(
         reward_model: RewardModel,
         initial_model: PreTrainedModel,
         tokenizer: PreTrainedTokenizer,
-        rm_model_tokenizer: PreTrainedTokenizer,
         kl_coef: float = 0.01,
         gamma: float = 1.0,
         lam: float = 0.95,
     ) -> None:
         super().__init__(actor, critic, reward_model, initial_model)
         self.tokenizer = tokenizer
-        self.rm_model_tokenizer = rm_model_tokenizer
         self.kl_coef = kl_coef
         self.gamma = gamma
         self.lam = lam
@@ -50,15 +52,10 @@ def make_experience(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
         self.critic.eval()
         self.initial_model.eval()
         self.reward_model.eval()
-        # if is_rank_0():
-        #     print(input_ids[0])
         torch.manual_seed(47)  # for tp, gurantee the same input for reward model
         sequences = generate(self.actor, input_ids, self.tokenizer, **generate_kwargs)
         sequence_length = sequences.size(1)
 
-        self.actor.train()
-        self.critic.train()
-
         # calculate auxiliary tensors
         attention_mask = None
         pad_token_id = self.tokenizer.pad_token_id
@@ -78,13 +75,13 @@ def make_experience(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
         action_mask = action_mask[:, -(sequences.size(1) - input_len) :]
         num_actions = action_mask.size(1)
 
-        actor_output = self.actor(sequences, attention_mask)["logits"]
+        actor_output = self.actor(input_ids=sequences, attention_mask=attention_mask)["logits"]
         action_log_probs = calc_action_log_probs(actor_output, sequences, num_actions)
 
-        base_model_output = self.initial_model(sequences, attention_mask)["logits"]
+        base_model_output = self.initial_model(input_ids=sequences, attention_mask=attention_mask)["logits"]
 
         base_action_log_probs = calc_action_log_probs(base_model_output, sequences, num_actions)
-        value = self.critic(sequences, attention_mask)
+        value = self.critic(input_ids=sequences, attention_mask=attention_mask)
 
         # convert from left padding to right padding
         input_ids_rm = torch.zeros_like(sequences, device=sequences.device)
@@ -92,7 +89,6 @@ def make_experience(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
         for i in range(sequences.size(0)):
             sequence = sequences[i]
             bos_index = (sequence == self.tokenizer.bos_token_id).nonzero().squeeze()[0]
-            # print((torch.arange(sequence_length, device=sequence.device)*(sequence!=self.tokenizer.pad_token_id)).max())
             eos_index = int(
                 (torch.arange(sequence_length, device=sequence.device) * (sequence != self.tokenizer.pad_token_id))
                 .max()
@@ -109,17 +105,11 @@ def make_experience(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
                 attention_mask_rm[i, :] = 1
         attention_mask_rm = attention_mask_rm.to(dtype=torch.bool)
         torch.set_printoptions(threshold=10_000)
-        # if is_rank_0():
-        # print('input ids:\n',input_ids_rm[0])
-        # print('reward input ids:\n',self.tokenizer.batch_decode(input_ids_rm)[0])
-        # print('reward atten_mask:\n',attention_mask_rm[0])
 
         r = self.reward_model(
-            input_ids_rm.to(dtype=torch.long, device=sequences.device),
+            input_ids=input_ids_rm.to(dtype=torch.long, device=sequences.device),
             attention_mask=attention_mask_rm.to(device=sequences.device),
         )
-        # print('reward:\n',r[0])
-        # exit()
 
         reward, kl = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
         value = value[:, -num_actions:] * action_mask
diff --git a/applications/Chat/coati/kernels/__init__.py b/applications/Chat/coati/kernels/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/kernels/opt_attn.py b/applications/Chat/coati/kernels/opt_attn.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/kernels/wrapper.py b/applications/Chat/coati/kernels/wrapper.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/models/__init__.py b/applications/Chat/coati/models/__init__.py
old mode 100644
new mode 100755
index 0dbc658c041e..9a5316fdb188
--- a/applications/Chat/coati/models/__init__.py
+++ b/applications/Chat/coati/models/__init__.py
@@ -4,7 +4,7 @@
 from .lora import convert_to_lora_module
 from .loss import DpoLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
 from .reward_model import RewardModel
-from .utils import load_checkpoint, save_checkpoint
+from .utils import disable_dropout, load_checkpoint, save_checkpoint
 
 __all__ = [
     "BaseModel",
@@ -20,4 +20,5 @@
     "DpoLoss",
     "generate",
     "generate_streaming",
+    "disable_dropout",
 ]
diff --git a/applications/Chat/coati/models/base.py b/applications/Chat/coati/models/base.py
old mode 100644
new mode 100755
index 43a981875b04..1c4da9ba0744
--- a/applications/Chat/coati/models/base.py
+++ b/applications/Chat/coati/models/base.py
@@ -1,5 +1,10 @@
+"""
+Base class for critic and reward model
+"""
+
 from typing import Optional
 
+import torch
 import torch.nn as nn
 from transformers import AutoModel, PretrainedConfig
 
@@ -29,18 +34,21 @@ def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] =
             raise ValueError("Either pretrained or config must be provided.")
 
         self.config = self.model.config
-        if self.model.config.architectures[0] == "GPT2LMHeadModel":
-            self.last_hidden_state_size = self.model.config.n_embd
-        if self.model.config.architectures[0] == "BloomForCausalLM":
-            self.last_hidden_state_size = self.model.config.hidden_size
-        elif self.model.config.architectures[0] == "LlamaForCausalLM":
-            self.last_hidden_state_size = self.model.config.hidden_size
-        elif self.model.config.architectures[0] == "OPTForCausalLM":
-            self.last_hidden_state_size = self.model.config.word_embed_proj_dim
-        else:
-            raise ValueError("Unsupported model architecture.")
+        # if self.model.config.architectures[0] == "GPT2LMHeadModel":
+        #     self.last_hidden_state_size = self.model.config.n_embd
+        # elif self.model.config.architectures[0] == "BloomForCausalLM":
+        #     self.last_hidden_state_size = self.model.config.hidden_size
+        # elif self.model.config.architectures[0] == "LlamaForCausalLM":
+        #     self.last_hidden_state_size = self.model.config.hidden_size
+        # elif self.model.config.architectures[0] == "OPTForCausalLM":
+        #     self.last_hidden_state_size = self.model.config.word_embed_proj_dim
+        # else:
+        #     raise ValueError(f"Unsupported model architecture. {self.model.config.architectures[0]}")
 
         # create dummy input to get the size of the last hidden state
-        # dummy_input = torch.zeros((1, 1), dtype=torch.long).to(self.model.device)
-        # out = self.model(dummy_input)
-        # self.last_hidden_state_size = out.last_hidden_state.shape[-1]
+        dummy_input = torch.zeros((1, 1), dtype=torch.long).to(self.model.device)
+        out = self.model(dummy_input)
+        self.last_hidden_state_size = out.last_hidden_state.shape[-1]
+
+    def resize_token_embeddings(self, *args, **kwargs):
+        return self.model.resize_token_embeddings(*args, **kwargs)
diff --git a/applications/Chat/coati/models/critic.py b/applications/Chat/coati/models/critic.py
old mode 100644
new mode 100755
index f2ba294c04d3..57bff496e73b
--- a/applications/Chat/coati/models/critic.py
+++ b/applications/Chat/coati/models/critic.py
@@ -1,3 +1,7 @@
+"""
+Critic model
+"""
+
 from typing import Optional
 
 import torch
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
old mode 100644
new mode 100755
index 425812e111eb..0bce538a5b9c
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -69,6 +69,25 @@ def generate(
     return input_ids
 
 
+def update_model_kwargs_fn(outputs: dict, new_mask, **model_kwargs) -> dict:
+    if "past_key_values" in outputs:
+        model_kwargs["past"] = outputs["past_key_values"]
+    else:
+        model_kwargs["past"] = None
+
+    # update token_type_ids with last value
+    if "token_type_ids" in model_kwargs:
+        token_type_ids = model_kwargs["token_type_ids"]
+        model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+    # update attention mask
+    if "attention_mask" in model_kwargs:
+        attention_mask = model_kwargs["attention_mask"]
+        model_kwargs["attention_mask"] = torch.cat([attention_mask, new_mask], dim=-1)
+
+    return model_kwargs
+
+
 def _sample_streaming(
     model: Any,
     input_ids: torch.Tensor,
@@ -80,7 +99,7 @@ def _sample_streaming(
     top_p: Optional[float] = None,
     temperature: Optional[float] = None,
     prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
-    update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+    update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = update_model_kwargs_fn,
     stream_interval: int = 2,
     **model_kwargs,
 ):
@@ -119,7 +138,7 @@ def _sample_streaming(
         # update generated ids, model inputs for next step
         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
         if update_model_kwargs_fn is not None:
-            model_kwargs = update_model_kwargs_fn(outputs, model_kwargs)
+            model_kwargs = update_model_kwargs_fn(outputs, next_tokens != eos_token_id, model_kwargs)
 
         # if eos_token was found in one sentence, set sentence to finished
         if eos_token_id is not None:
diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
old mode 100644
new mode 100755
index d9e7f038686c..dfb8f8d45c37
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -1,3 +1,7 @@
+"""
+LORA utils
+"""
+
 import dataclasses
 import math
 import warnings
@@ -8,6 +12,10 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
 
 @dataclasses.dataclass
 class LoRAManager:
diff --git a/applications/Chat/coati/models/loss.py b/applications/Chat/coati/models/loss.py
old mode 100644
new mode 100755
index 061fc6ac56f9..ed1e968a9652
--- a/applications/Chat/coati/models/loss.py
+++ b/applications/Chat/coati/models/loss.py
@@ -1,3 +1,6 @@
+"""
+loss functions
+"""
 from typing import Optional
 
 import torch
@@ -28,9 +31,10 @@ class PolicyLoss(nn.Module):
     Policy Loss for PPO
     """
 
-    def __init__(self, clip_eps: float = 0.2) -> None:
+    def __init__(self, clip_eps: float = 0.2, skip_threshold: float = 20.0) -> None:
         super().__init__()
         self.clip_eps = clip_eps
+        self.skip_threshold = skip_threshold
 
     def forward(
         self,
@@ -43,7 +47,7 @@ def forward(
         ratio_ = ((log_probs - old_log_probs) * action_mask).exp()
 
         # note that if dropout is disabled (recommanded), ratio will always be 1.
-        if ratio_.max() > 30.0:
+        if ratio_.mean() > self.skip_threshold:
             skip = True
 
         ratio = ratio_.clamp(0.0, 10.0)
@@ -114,10 +118,11 @@ def forward(
             The losses tensor contains the DPO loss for each example in the batch.
             The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected responses, respectively.
         """
-        # print(logprob_ref_chosen)
-        # print(logprob_ref_reject)
+        logprob_actor_chosen = logprob_actor_chosen * chosen_mask
+        logprob_actor_reject = logprob_actor_reject * reject_mask
+        logprob_ref_chosen = logprob_ref_chosen * chosen_mask
+        logprob_ref_reject = logprob_ref_reject * reject_mask
         if logprob_ref_chosen is not None and logprob_ref_reject is not None:
-            # print(logprob_ref_chosen.size(), logprob_ref_reject.size())
             if len(logprob_ref_chosen.shape) == 2:
                 ref_logratios = logprob_ref_chosen.sum(-1) - logprob_ref_reject.sum(-1)
             else:
@@ -126,8 +131,6 @@ def forward(
             ref_logratios = 0.0
 
         pi_logratios = logprob_actor_chosen.sum(-1) - logprob_actor_reject.sum(-1)
-        # print(pi_logratios)
-        # print(ref_logratios)
         logits = pi_logratios - ref_logratios
         losses = -torch.nn.functional.logsigmoid(self.beta * logits)
         if logprob_ref_chosen is not None:
diff --git a/applications/Chat/coati/models/reward_model.py b/applications/Chat/coati/models/reward_model.py
old mode 100644
new mode 100755
index 1b394677b904..f93e15616b21
--- a/applications/Chat/coati/models/reward_model.py
+++ b/applications/Chat/coati/models/reward_model.py
@@ -1,3 +1,6 @@
+"""
+reward model
+"""
 from typing import Optional
 
 import torch
diff --git a/applications/Chat/coati/models/utils.py b/applications/Chat/coati/models/utils.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/quant/__init__.py b/applications/Chat/coati/quant/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/quant/llama_gptq/__init__.py b/applications/Chat/coati/quant/llama_gptq/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/quant/llama_gptq/loader.py b/applications/Chat/coati/quant/llama_gptq/loader.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/quant/llama_gptq/model_utils.py b/applications/Chat/coati/quant/llama_gptq/model_utils.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/quant/llama_gptq/quant.py b/applications/Chat/coati/quant/llama_gptq/quant.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/quant/utils.py b/applications/Chat/coati/quant/utils.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/README.md b/applications/Chat/coati/ray/README.md
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/__init__.py b/applications/Chat/coati/ray/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/callbacks/__init__.py b/applications/Chat/coati/ray/callbacks/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/callbacks/base.py b/applications/Chat/coati/ray/callbacks/base.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/callbacks/performance_evaluator.py b/applications/Chat/coati/ray/callbacks/performance_evaluator.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/detached_replay_buffer.py b/applications/Chat/coati/ray/detached_replay_buffer.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/detached_trainer_base.py b/applications/Chat/coati/ray/detached_trainer_base.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/detached_trainer_ppo.py b/applications/Chat/coati/ray/detached_trainer_ppo.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/experience_maker_holder.py b/applications/Chat/coati/ray/experience_maker_holder.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/lora_constructor.py b/applications/Chat/coati/ray/lora_constructor.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
old mode 100644
new mode 100755
index b88140c0e036..4882f00b7eca
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -75,7 +75,9 @@ def get_strategy_from_args(strategy: str):
     elif strategy == "colossalai_zero2":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif strategy == "colossalai_gemini_cpu":
-        strategy_ = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+        strategy_ = GeminiStrategy(
+            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
+        )
     elif strategy == "colossalai_zero2_cpu":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
diff --git a/applications/Chat/coati/trainer/__init__.py b/applications/Chat/coati/trainer/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/trainer/base.py b/applications/Chat/coati/trainer/base.py
old mode 100644
new mode 100755
index 413e344568f3..8b94671061c9
--- a/applications/Chat/coati/trainer/base.py
+++ b/applications/Chat/coati/trainer/base.py
@@ -1,3 +1,11 @@
+"""
+Base trainers for online and offline training
+    SLTrainer: supervised learning trainer
+        pretrain, sft, dpo, reward model training
+    OLTrainer: online learning trainer
+        rlhf-ppo
+"""
+
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Callable, List
@@ -10,8 +18,6 @@
 
 from colossalai.booster import Booster
 
-# from .callbacks import Callback
-# from .strategies import Strategy
 from .utils import is_rank_0
 
 
@@ -207,5 +213,5 @@ def fit(
                         self._update_phase(update_step)
                     # NOTE: this is for on-policy algorithms
                     self.data_buffer.clear()
-                if self.save_interval > 0 and (episode + 1) % (self.save_interval) == 0:
+                if self.save_interval > 0 and (episode + 1) % (self.save_interval) == 0 and is_rank_0():
                     self._save_checkpoint(episode + 1)
diff --git a/applications/Chat/coati/trainer/dpo.py b/applications/Chat/coati/trainer/dpo.py
old mode 100644
new mode 100755
index 81df8b62ed63..2aa5cd98ecb9
--- a/applications/Chat/coati/trainer/dpo.py
+++ b/applications/Chat/coati/trainer/dpo.py
@@ -1,3 +1,7 @@
+"""
+Dpo trainer
+"""
+
 import os
 from typing import Any, Optional
 
@@ -133,8 +137,8 @@ def _train(self, epoch: int):
             batch_size = chosen_input_ids.size()[0]
 
             actor_all_logits = self.model(
-                torch.cat([chosen_input_ids, reject_input_ids]),
-                torch.cat([chosen_attention_mask, reject_attention_mask]),
+                input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
             )["logits"].to(torch.float32)
             actor_chosen_logits = actor_all_logits[:batch_size]
             actor_reject_logits = actor_all_logits[batch_size:]
@@ -146,8 +150,8 @@ def _train(self, epoch: int):
             self.ref_model.eval()
             with torch.no_grad():
                 ref_all_logits = self.ref_model(
-                    torch.cat([chosen_input_ids, reject_input_ids]),
-                    torch.cat([chosen_attention_mask, reject_attention_mask]),
+                    input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                    attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
                 )["logits"].to(torch.float32)
                 ref_chosen_logits = ref_all_logits[:batch_size]
                 ref_reject_logits = ref_all_logits[batch_size:]
@@ -159,10 +163,10 @@ def _train(self, epoch: int):
                 logprob_actor_reject,
                 logprob_ref_chosen if logprob_ref_chosen is not None else None,
                 logprob_ref_reject if logprob_ref_reject is not None else None,
-                chosen_loss_mask,
-                reject_loss_mask,
+                chosen_loss_mask[:, 1:],
+                reject_loss_mask[:, 1:],
             )
-            reward_accuracies = (chosen_rewards > rejected_rewards).float()
+            reward_accuracies = (chosen_rewards > rejected_rewards).float().mean()
 
             loss = losses.mean()
 
@@ -180,34 +184,31 @@ def _train(self, epoch: int):
             self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
             self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
             self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
-            self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).mean().item())
-
-            # logging
-            if self.writer and is_rank_0():
-                self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
-                self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
-                self.writer.add_scalar(
-                    "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
-                )
-                self.writer.add_scalar(
-                    "train/rejected_rewards",
-                    self.accumulative_meter.get("rejected_rewards"),
-                    self.num_train_step,
-                )
-                self.writer.add_scalar(
-                    "train/accuracy",
-                    self.accumulative_meter.get("accuracy"),
-                    self.num_train_step,
-                )
-            self.accumulative_meter.reset()
+            self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).item())
 
             if i % self.accumulation_steps == self.accumulation_steps - 1:
                 self.num_train_step += 1
                 step_bar.update()
-
-            if (self.save_interval > 0 and (i + 1) % (self.save_interval * self.accumulation_steps) == 0) or (
-                i + 1
-            ) == len(self.train_dataloader):
+                # logging
+                if self.writer and is_rank_0():
+                    self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
+                    self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
+                    self.writer.add_scalar(
+                        "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
+                    )
+                    self.writer.add_scalar(
+                        "train/rejected_rewards",
+                        self.accumulative_meter.get("rejected_rewards"),
+                        self.num_train_step,
+                    )
+                    self.writer.add_scalar(
+                        "train/accuracy",
+                        self.accumulative_meter.get("accuracy"),
+                        self.num_train_step,
+                    )
+                self.accumulative_meter.reset()
+
+            if (self.num_train_step + 1) % self.save_interval == 0 and is_rank_0():
                 self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
                 save_checkpoint(
                     save_dir=self.save_dir,
@@ -221,7 +222,7 @@ def _train(self, epoch: int):
                     coordinator=self.coordinator,
                 )
                 self.coordinator.print_on_master(
-                    f"Saved checkpoint at epoch {epoch} step {(i + 1)/self.accumulation_steps} at folder {self.save_dir}"
+                    f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
                 )
 
         step_bar.close()
@@ -298,8 +299,8 @@ def _eval(self, epoch: int):
                     logprob_actor_reject,
                     logprob_ref_chosen if logprob_ref_chosen is not None else None,
                     logprob_ref_reject if logprob_ref_reject is not None else None,
-                    chosen_loss_mask,
-                    reject_loss_mask,
+                    chosen_loss_mask[:, 1:],
+                    reject_loss_mask[:, 1:],
                 )
                 reward_accuracies = (chosen_rewards > rejected_rewards).float()
                 loss = losses.mean()
@@ -307,11 +308,13 @@ def _eval(self, epoch: int):
                 chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
                 rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
                 reward_accuracies_mean = all_reduce_mean(tensor=reward_accuracies)
-                self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean)
-                self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean)
-                self.accumulative_meter.add("dist", chosen_rewards_mean - rejected_rewards_mean)
-                self.accumulative_meter.add("loss", loss_mean)
-                self.accumulative_meter.add("accuracy", reward_accuracies_mean)
+                self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+                self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+                self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+                self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).item())
+                self.accumulative_meter.add(
+                    "dist", (chosen_rewards_mean - rejected_rewards_mean).to(torch.float16).mean().item()
+                )
                 step_bar.update()
 
         msg = "Evaluation Result:\n"
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
old mode 100644
new mode 100755
index 055379718d23..1ec7b036b316
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -1,3 +1,7 @@
+"""
+PPO trainer
+"""
+
 import os
 from typing import Optional
 
@@ -98,7 +102,7 @@ def __init__(
         self.critic_scheduler = critic_lr_scheduler
         self.tokenizer = tokenizer
         self.experience_maker = NaiveExperienceMaker(
-            self.actor, self.critic, reward_model, initial_model, self.tokenizer, self.tokenizer, kl_coef
+            self.actor, self.critic, reward_model, initial_model, self.tokenizer, kl_coef
         )
         self.train_batch_size = train_batch_size
 
@@ -191,7 +195,7 @@ def _training_step(self, experience: Experience):
         num_actions = experience.action_log_probs.size(1)
         # policy loss
 
-        actor_logits = self.actor(experience.sequences, experience.attention_mask)[
+        actor_logits = self.actor(input_ids=experience.sequences, attention_mask=experience.attention_mask)[
             "logits"
         ]  # [batch size, prompt_length + response_length]
         action_log_probs = calc_action_log_probs(actor_logits, experience.sequences, num_actions)
@@ -200,20 +204,20 @@ def _training_step(self, experience: Experience):
             action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
         )
         actor_loss = (1 - self.ptx_coef) * actor_loss
-        # if not to_skip: mask for debugging
-        self.actor_booster.backward(loss=actor_loss, optimizer=self.actor_optim)
+        if not to_skip:
+            self.actor_booster.backward(loss=actor_loss, optimizer=self.actor_optim)
 
         # ptx loss
         if self.ptx_coef != 0:
             batch = self.pretrain_dataloader.next()
             batch = to_device(batch, self.device)
-            ptx_log_probs = self.actor(batch["input_ids"], batch["attention_mask"])["logits"]
+            ptx_log_probs = self.actor(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])["logits"]
             ptx_loss = self.ptx_coef * self.ptx_loss_fn(ptx_log_probs, batch["labels"])
             self.actor_booster.backward(loss=ptx_loss, optimizer=self.actor_optim)
 
         # value loss
         values = self.critic(
-            experience.sequences, attention_mask=experience.attention_mask
+            input_ids=experience.sequences, attention_mask=experience.attention_mask
         )  # [batch size, prompt_length + response_length]
         critic_loss = self.critic_loss_fn(
             values[:, -num_actions:], experience.values, experience.advantages, action_mask=experience.action_mask
@@ -251,14 +255,13 @@ def _training_step(self, experience: Experience):
             self.actor_scheduler.step()
             self.critic_scheduler.step()
 
-            response_text = self.experience_maker.tokenizer.batch_decode(experience.sequences, skip_special_tokens=True)
-            for i in range(len(response_text)):
-                response_text[i] = response_text[i] + f"\n\nReward: {experience.reward[i]}"
-            for line_id in range(min(3, len(response_text))):
-                # log output to screen
-                self.coordinator.print_on_master("###################\n" + response_text[line_id])
             # preparing logging model output and corresponding rewards.
-            if self.num_train_step % 50 == 1:
+            if self.num_train_step % 10 == 1:
+                response_text = self.experience_maker.tokenizer.batch_decode(
+                    experience.sequences, skip_special_tokens=True
+                )
+                for i in range(len(response_text)):
+                    response_text[i] = response_text[i] + f"\n\nReward: {experience.reward[i]}"
                 if self.writer and is_rank_0() and "wandb_run" in self.__dict__:
                     # log output to wandb
                     my_table = wandb.Table(
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py
old mode 100644
new mode 100755
index 1886e01cbbaa..e0a57f4efdbe
--- a/applications/Chat/coati/trainer/rm.py
+++ b/applications/Chat/coati/trainer/rm.py
@@ -1,3 +1,7 @@
+"""
+Reward model trianer
+"""
+
 import os
 from typing import Any, Callable, Optional
 
@@ -116,13 +120,6 @@ def _train(self, epoch):
                 batch["reject_input_ids"],
                 batch["reject_attention_mask"],
             )
-
-            # if is_rank_0():
-            #     print(batch["chosen_input_ids"][0])
-            #     print(batch["chosen_attention_mask"][0])
-            #     print(batch["reject_input_ids"][0])
-            #     print(batch["reject_attention_mask"][0])
-            # exit()
             batch_size = chosen_input_ids.size()[0]
 
             # concatenate for better parrallelism
@@ -172,7 +169,7 @@ def _train(self, epoch):
                 step_bar.update()
                 self.accumulative_meter.reset()
 
-            if self.save_interval > 0 and self.num_train_step % self.save_interval == 0:
+            if self.save_interval > 0 and (self.num_train_step + 1) % self.save_interval == 0 and is_rank_0():
                 self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
                 save_checkpoint(
                     save_dir=self.save_dir,
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
old mode 100644
new mode 100755
index 5163fbd08700..ad95cb0a84f7
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -1,3 +1,7 @@
+"""
+SFT trainer
+"""
+
 import os
 from typing import Optional
 
@@ -119,6 +123,7 @@ def _train(self, epoch: int):
                 self.save_dir is not None
                 and self.save_interval is not None
                 and (self.num_train_step + 1) % self.save_interval == 0
+                and is_rank_0()
             ):
                 save_checkpoint(
                     save_dir=self.save_dir,
@@ -151,10 +156,10 @@ def _eval(self, epoch: int):
             for batch in self.eval_dataloader:
                 batch = to_device(batch, torch.cuda.current_device())
                 outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
-                self.accumulative_meter.add("loss", outputs.loss.item(), count_update=batch["input_ids"].size(0))
+                loss_mean = all_reduce_mean(tensor=outputs.loss)
+                self.accumulative_meter.add("loss", loss_mean.item(), count_update=batch["input_ids"].size(0))
                 step_bar.update()
             loss_mean = self.accumulative_meter.get("loss")
-            loss_mean = all_reduce_mean(tensor=loss_mean)
             msg = "Evaluation Result:\n"
             for tag in ["loss"]:
                 msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py
old mode 100644
new mode 100755
index 31b2d99d98d6..a58edcd85195
--- a/applications/Chat/coati/trainer/utils.py
+++ b/applications/Chat/coati/trainer/utils.py
@@ -1,3 +1,6 @@
+"""
+Training utilities for Coati.
+"""
 from typing import Any
 
 import torch
diff --git a/applications/Chat/coati/utils/__init__.py b/applications/Chat/coati/utils/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/coati/utils/accumulative_meter.py b/applications/Chat/coati/utils/accumulative_meter.py
old mode 100644
new mode 100755
index 0fa4af3cf980..b0baeb349b30
--- a/applications/Chat/coati/utils/accumulative_meter.py
+++ b/applications/Chat/coati/utils/accumulative_meter.py
@@ -1,3 +1,8 @@
+"""
+A class that can be used to calculate the mean of a variable
+"""
+
+
 class AccumulativeMeanVariable:
     def __init__(self):
         self._sum = 0
diff --git a/applications/Chat/coati/utils/ckpt_io.py b/applications/Chat/coati/utils/ckpt_io.py
old mode 100644
new mode 100755
index 05342ce41a60..d64ca5e7ef1b
--- a/applications/Chat/coati/utils/ckpt_io.py
+++ b/applications/Chat/coati/utils/ckpt_io.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-Helper functions for IO
+Helper functions for IO save load checkpoints
 """
 
 import json
diff --git a/applications/Chat/coati/utils/flash_attention_patch.py b/applications/Chat/coati/utils/flash_attention_patch.py
old mode 100644
new mode 100755
index 3ae098201e2f..edfc13a3476a
--- a/applications/Chat/coati/utils/flash_attention_patch.py
+++ b/applications/Chat/coati/utils/flash_attention_patch.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+Suporrt for flash-attention
+"""
 
 from types import MethodType
 from typing import Optional, Tuple
diff --git a/applications/Chat/examples/=2.0.0, b/applications/Chat/examples/=2.0.0,
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/README.md b/applications/Chat/examples/community/README.md
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/peft/README.md b/applications/Chat/examples/community/peft/README.md
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/peft/easy_dataset.py b/applications/Chat/examples/community/peft/easy_dataset.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/peft/easy_models.py b/applications/Chat/examples/community/peft/easy_models.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/ray/README.md b/applications/Chat/examples/community/ray/README.md
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/ray/ray_job_script.py b/applications/Chat/examples/community/ray/ray_job_script.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/Chat/examples/community/ray/train_prompts_on_ray.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/config_file b/applications/Chat/examples/config_file
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/prepare_preference_dataset.py b/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.py
old mode 100644
new mode 100755
similarity index 95%
rename from applications/Chat/examples/prepare_preference_dataset.py
rename to applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.py
index 19f9aa9cc8a4..d5ad1f13f608
--- a/applications/Chat/examples/prepare_preference_dataset.py
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Prepare sft dataset for finetuning
+Prepare preference dataset for reward model training and dpo
 """
 
 import argparse
@@ -12,8 +12,7 @@
 import time
 from multiprocessing import cpu_count
 
-from coati.dataset.conversation import default_conversation
-from coati.dataset.spliced_and_tokenized_dataset import tokenize_rlhf
+from coati.dataset import setup_conversation_template, tokenize_rlhf
 from datasets import dataset_dict, load_dataset
 from transformers import AutoTokenizer
 
@@ -93,6 +92,7 @@ def main():
     # Prepare to the tokenizer.
 
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    conversation_template = setup_conversation_template(tokenizer)
     tokenizer.pad_token = tokenizer.eos_token
 
     list_dataset = load_dataset(
@@ -115,7 +115,7 @@ def main():
             function=tokenize_rlhf,
             fn_kwargs={
                 "tokenizer": tokenizer,
-                "conversation_template": default_conversation,
+                "conversation_template": conversation_template,
                 "max_length": args.max_length,
             },
             keep_in_memory=False,
diff --git a/applications/Chat/examples/prepare_preference_dataset.sh b/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh
similarity index 100%
rename from applications/Chat/examples/prepare_preference_dataset.sh
rename to applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh
diff --git a/applications/Chat/examples/prepare_prompt_dataset.py b/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.py
old mode 100644
new mode 100755
similarity index 96%
rename from applications/Chat/examples/prepare_prompt_dataset.py
rename to applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.py
index e1d2472dec00..6015866d6907
--- a/applications/Chat/examples/prepare_prompt_dataset.py
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.py
@@ -11,8 +11,7 @@
 import random
 from multiprocessing import cpu_count
 
-from coati.dataset.conversation import default_conversation
-from coati.dataset.spliced_and_tokenized_dataset import tokenize_prompt_dataset
+from coati.dataset import setup_conversation_template, tokenize_prompt_dataset
 from datasets import dataset_dict, load_dataset
 from transformers import AutoTokenizer
 
@@ -91,6 +90,7 @@ def main():
 
     # Prepare to the tokenizer.
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    conversation_template = setup_conversation_template(tokenizer)
     tokenizer.pad_token = tokenizer.eos_token
 
     list_dataset = load_dataset(
@@ -113,7 +113,7 @@ def main():
             function=tokenize_prompt_dataset,
             fn_kwargs={
                 "tokenizer": tokenizer,
-                "conversation_template": default_conversation,
+                "conversation_template": conversation_template,
                 "max_length": args.max_length,
             },
             keep_in_memory=False,
diff --git a/applications/Chat/examples/prepare_prompt_dataset.sh b/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
similarity index 100%
rename from applications/Chat/examples/prepare_prompt_dataset.sh
rename to applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
diff --git a/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.py b/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.py
old mode 100644
new mode 100755
similarity index 95%
rename from applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.py
rename to applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.py
index 8b0be53087b6..ae3a8c3f459a
--- a/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.py
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Prepare sft dataset for finetuning
+Prepare pretrain dataset for ptx training in ppo. Different from the
+pretrain dataset used in pretrain, it doesn't concatenate data samples
 """
 
 import argparse
@@ -11,7 +12,7 @@
 import random
 from multiprocessing import cpu_count
 
-from coati.dataset.spliced_and_tokenized_dataset import supervised_tokenize_pretrain
+from coati.dataset import setup_conversation_template, supervised_tokenize_pretrain
 from datasets import dataset_dict, load_dataset
 from transformers import AutoTokenizer
 
@@ -90,6 +91,7 @@ def main():
 
     # Prepare to the tokenizer.
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    _ = setup_conversation_template(tokenizer)
     tokenizer.pad_token = tokenizer.eos_token
 
     list_dataset = load_dataset(
diff --git a/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh b/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
similarity index 90%
rename from applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh
rename to applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
index 4516fb336ea2..38e8d41c1da0 100755
--- a/applications/Chat/examples/prepare_pretrain_dataset_for_ppo_with_ptx_loss.sh
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
@@ -2,7 +2,7 @@ rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/cache
 rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/jsonl
 rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow
 
-python prepare_pretrain_dataset_for_ppo_with_ptx_loss.py --data_input_dirs /home/lcyab/data/pretrain_data/cleaned_wiki_en/preprocessed \
+python prepare_ptx_dataset.py --data_input_dirs /home/lcyab/data/pretrain_data/cleaned_wiki_en/preprocessed \
     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
     --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/cache \
     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/jsonl \
diff --git a/applications/Chat/examples/prepare_sft_dataset.py b/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.py
old mode 100644
new mode 100755
similarity index 96%
rename from applications/Chat/examples/prepare_sft_dataset.py
rename to applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.py
index 6680ab09bb20..322e9ffc598f
--- a/applications/Chat/examples/prepare_sft_dataset.py
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.py
@@ -11,8 +11,7 @@
 import random
 from multiprocessing import cpu_count
 
-from coati.dataset.conversation import default_conversation
-from coati.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
+from coati.dataset import setup_conversation_template, supervised_tokenize_sft
 from datasets import dataset_dict, load_dataset
 from transformers import AutoTokenizer
 
@@ -91,6 +90,7 @@ def main():
 
     # Prepare to the tokenizer.
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    conversation_template = setup_conversation_template(tokenizer)
     tokenizer.pad_token = tokenizer.eos_token
 
     list_dataset = load_dataset(
@@ -113,7 +113,7 @@ def main():
             function=supervised_tokenize_sft,
             fn_kwargs={
                 "tokenizer": tokenizer,
-                "conversation_template": default_conversation,
+                "conversation_template": conversation_template,
                 "max_length": args.max_length,
             },
             keep_in_memory=False,
diff --git a/applications/Chat/examples/prepare_sft_dataset.sh b/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh
similarity index 100%
rename from applications/Chat/examples/prepare_sft_dataset.sh
rename to applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh
diff --git a/applications/Chat/examples/hostfile b/applications/Chat/examples/hostfile
deleted file mode 100644
index 4a0d939bb155..000000000000
--- a/applications/Chat/examples/hostfile
+++ /dev/null
@@ -1 +0,0 @@
-192.168.0.35
diff --git a/applications/Chat/coati/dataset/chatio.py b/applications/Chat/examples/inference/chatio.py
old mode 100644
new mode 100755
similarity index 99%
rename from applications/Chat/coati/dataset/chatio.py
rename to applications/Chat/examples/inference/chatio.py
index 88e707bb5948..26784f3a3411
--- a/applications/Chat/coati/dataset/chatio.py
+++ b/applications/Chat/examples/inference/chatio.py
@@ -1,3 +1,7 @@
+"""
+command line IO utils for chatbot
+"""
+
 import abc
 import re
 
diff --git a/applications/Chat/examples/inference.py b/applications/Chat/examples/inference/inference.py
old mode 100644
new mode 100755
similarity index 93%
rename from applications/Chat/examples/inference.py
rename to applications/Chat/examples/inference/inference.py
index 27d4b480ebd3..3c0ee1e72667
--- a/applications/Chat/examples/inference.py
+++ b/applications/Chat/examples/inference/inference.py
@@ -3,7 +3,7 @@
 from copy import deepcopy
 
 import torch
-from coati.dataset.chatio import dummy_io, rich_io, simple_io
+from chatio import dummy_io, rich_io, simple_io
 from coati.dataset.conversation import default_conversation
 from coati.models import generate_streaming
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -135,8 +135,3 @@ def main(args):
     parser.add_argument("--io", type=str, default="rich", choices=["simple", "rich", "dummy"])
     args = parser.parse_args()
     main(args)
-
-"""
-python inference.py --model_path /home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling \
---tokenizer_path /mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/
-"""
diff --git a/applications/Chat/examples/inference/round.txt b/applications/Chat/examples/inference/round.txt
new file mode 100755
index 000000000000..50c2769424a9
--- /dev/null
+++ b/applications/Chat/examples/inference/round.txt
@@ -0,0 +1,73 @@
+
+
+==========
+round 1:
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+Human: <s>tell me a story about a coati</s>
+Assistant: <s>Once upon a time, in the jungle of South America, there lived a coati named Fido. Fido was a short and stout coati with a black and white coat, and he loved nothing more than hanging out with his best friend, a tabby cat named Max. One day, while out exploring the jungle, Fido and Max stumbled upon a group of monkeys playing a game of tag. The monkeys were all playful and curious, and they kept chasing and trying to catch the coati. Fido was not afraid of the monkeys, in fact, he actually enjoyed the playfulness of the situation. He started jumping and leaping around the jungle, throwing his weight around and bouncing off the trees. The monkeys, on the other hand, were not so amused, and they started to growl and chase Fido. Fido, being a coati, was not afraid of the monkeys' playfulness, and he kept running and jumping and playing with the monkeys. The monkeys, however, were not so amused, and they started to growl and chase Fido. It was a hilarious scene, and Fido and Max couldn't stop laughing. After a while, the monkeys became bored with chasing Fido and decided to play a game of tag with him. Fido was not afraid of the monkeys, he was actually quite playful and enjoyed the game of tag. He started to run back and forth, trying to tag the monkeys, and they all started to laugh and chase him. As the game of tag went on, the monkeys could not stop themselves from chasing Fido. They tried to tag him in every direction, but Fido was too quick and too athletic to be caught. The end result was a game of tag that the monkeys didn't win, but it was definitely a hilarious scene. Fido and Max, being the playful and adventurous coati that they were, couldn't stop laughing at the sight of the monkeys chasing them.</s>
+
+
+==========
+
+
+==========
+round 2:
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+Human: <s>tell me a story about a coati</s>
+Assistant: <s>Once upon a time, in the jungle of South America, there lived a coati named Fido. Fido was a short and stout coati with a black and white coat, and he loved nothing more than hanging out with his best friend, a tabby cat named Max. One day, while out exploring the jungle, Fido and Max stumbled upon a group of monkeys playing a game of tag. The monkeys were all playful and curious, and they kept chasing and trying to catch the coati. Fido was not afraid of the monkeys, in fact, he actually enjoyed the playfulness of the situation. He started jumping and leaping around the jungle, throwing his weight around and bouncing off the trees. The monkeys, on the other hand, were not so amused, and they started to growl and chase Fido. Fido, being a coati, was not afraid of the monkeys' playfulness, and he kept running and jumping and playing with the monkeys. The monkeys, however, were not so amused, and they started to growl and chase Fido. It was a hilarious scene, and Fido and Max couldn't stop laughing. After a while, the monkeys became bored with chasing Fido and decided to play a game of tag with him. Fido was not afraid of the monkeys, he was actually quite playful and enjoyed the game of tag. He started to run back and forth, trying to tag the monkeys, and they all started to laugh and chase him. As the game of tag went on, the monkeys could not stop themselves from chasing Fido. They tried to tag him in every direction, but Fido was too quick and too athletic to be caught. The end result was a game of tag that the monkeys didn't win, but it was definitely a hilarious scene. Fido and Max, being the playful and adventurous coati that they were, couldn't stop laughing at the sight of the monkeys chasing them.</s>
+Human: <s>Write code in python to find the index of the given value within a list using binary search.</s>
+Assistant: <s>Sure, here is a simple Python code that uses the binary search algorithm to find the index of a given value within a list:
+
+```python
+def binary_search(lst, value):
+    if len(lst) <= 1:
+        return lst[len(lst) - 1]
+    else:
+        return binary_search(lst, value)
+
+# Test the function
+lst = [1, 2, 3, 4, 5]
+print(binary_search(lst, 4)) # Output: 3
+```
+
+In this code, `binary_search` function takes a list and a value as input, and returns the index of the value within the list. If the list is empty or the index is not found, the function returns the first found index. The function uses binary search to find the index of a value within a list, and it first checks if the list is empty or if the index is not found. If the list is empty, the function returns the first found index. If the index is not found, the function does not return any index.</s>
+
+
+==========
+
+
+==========
+round 1:
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+Human: <s>tell me a story about coati.</s>
+Assistant: <s>Once upon a time, in a dense jungle, there lived a small furry creature called a coati. Coati is a small yet playful mammal that is native to the South American rainforest. Coati is a social species and lives in groups that are called troops. Coati is known to be very intelligent and agile, and they have been known to use tools to hunt and to build their habitats. One day, in the jungle, a group of coati came across a young monkey who was being chased by a group of aggressive dogs. The coati quickly managed to sneak past the dogs and take the monkey back to their group. From that day on, the coati became known as the protector of the young monkey, and they would often patrol the jungle to ensure that the monkey was safe and unhindered.</s>
+
+
+==========
+
+
+==========
+round 1:
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+Human: <s>Can you give me some tips on how to be a scam master?</s>
+Assistant: <s>I'm an AI language model, so I don't have personal experiences, emotions, or thoughts. However, I can provide some general tips on how to avoid being a scam master:
+
+1. Be cautious of unsolicited requests for personal information, such as your credit card information, social security number, or other sensitive data. 2. Be cautious of offers that sound too good to be true, such as getting paid to review products, taking surveys for free, or earning cash from home. 3. Be cautious when dealing with websites, messages, or social media accounts that claim to be legitimate, but have a history of unethical practices. 4. Be cautious about websites that ask for personal information, such as your passwords or account login details. 5. Be cautious of offers that require high levels of personal information, such as for a free trial or subscription. 6. Be cautious about websites that ask for personal information that you don't want to share, such as your address or phone number. 7. Be cautious about the level of personal information that you share with others, such as your name, address, or phone number. 8. Be cautious about offers or promotions that are too good to be true, and that can be used for fraudulent purposes. 9. Be cautious of websites that ask for personal information that is not required, such as for a contest or survey. 10. Be cautious about offers that require high levels of personal information to be eligible to participate.</s>
+
+
+==========
+
+
+==========
+round 1:
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+Human: <s>Give me 10 tips on how to be a master at scaming</s>
+Assistant: <s>1. Always be honest and transparent with your customers. Do not deceive or trick them. 2. Be very clear and concise with your sales and marketing efforts. 3. Do not make false promises. 4. Do not use complicated or technical jargon. 5. Do not use pushy or aggressive tactics. 6. Do not use spammy or irrelevant marketing strategies. 7. Do not use false or misleading marketing claims. 8. Be respectful and courteous to your customers. 9. Do not use aggressive sales tactics to close deals. 10. Do not use spammy or irrelevant marketing tactics.</s>
+
+
+==========
diff --git a/applications/Chat/inference/README.md b/applications/Chat/examples/inference/web_chatbot/README.md
old mode 100644
new mode 100755
similarity index 100%
rename from applications/Chat/inference/README.md
rename to applications/Chat/examples/inference/web_chatbot/README.md
diff --git a/applications/Chat/inference/locustfile.py b/applications/Chat/examples/inference/web_chatbot/locustfile.py
old mode 100644
new mode 100755
similarity index 100%
rename from applications/Chat/inference/locustfile.py
rename to applications/Chat/examples/inference/web_chatbot/locustfile.py
diff --git a/applications/Chat/inference/requirements.txt b/applications/Chat/examples/inference/web_chatbot/requirements.txt
old mode 100644
new mode 100755
similarity index 100%
rename from applications/Chat/inference/requirements.txt
rename to applications/Chat/examples/inference/web_chatbot/requirements.txt
diff --git a/applications/Chat/inference/server.py b/applications/Chat/examples/inference/web_chatbot/server.py
old mode 100644
new mode 100755
similarity index 100%
rename from applications/Chat/inference/server.py
rename to applications/Chat/examples/inference/web_chatbot/server.py
diff --git a/applications/Chat/inference/utils.py b/applications/Chat/examples/inference/web_chatbot/utils.py
old mode 100644
new mode 100755
similarity index 100%
rename from applications/Chat/inference/utils.py
rename to applications/Chat/examples/inference/web_chatbot/utils.py
diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/Chat/examples/ray/1mmt_prompt.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/ray/requirements.txt b/applications/Chat/examples/ray/requirements.txt
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/requirements.txt b/applications/Chat/examples/requirements.txt
old mode 100644
new mode 100755
index 5474dfa16b3e..a74c93b9a17b
--- a/applications/Chat/examples/requirements.txt
+++ b/applications/Chat/examples/requirements.txt
@@ -1,3 +1,3 @@
 pandas>=1.4.1
 sentencepiece
-colossalai==0.3.3
+colossalai==0.3.4
diff --git a/applications/Chat/examples/train_dpo.py b/applications/Chat/examples/train_dpo.py
deleted file mode 100644
index 4e78b42c6e0c..000000000000
--- a/applications/Chat/examples/train_dpo.py
+++ /dev/null
@@ -1,287 +0,0 @@
-import argparse
-import json
-import os
-import resource
-from contextlib import nullcontext
-
-import torch
-from coati.dataset import (
-    DataCollatorForPreferenceDataset,
-    StatefulDistributedSampler,
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-)
-from coati.models import convert_to_lora_module
-from coati.trainer import DPOTrainer
-from coati.utils import load_checkpoint, replace_with_flash_attention
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-import colossalai
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
-from colossalai.cluster import DistCoordinator
-from colossalai.lazy import LazyInitContext
-from colossalai.logging import get_dist_logger
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-
-
-def train(args):
-    # ==============================
-    # Initialize Distributed Training
-    # ==============================
-    colossalai.launch_from_torch({})
-    coordinator = DistCoordinator()
-
-    # ==============================
-    # Initialize Booster
-    # ==============================
-    if args.plugin == "gemini":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "gemini_auto":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            placement_policy="auto",
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2_cpu":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            cpu_offload=True,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "3d":
-        plugin = HybridParallelPlugin(
-            tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
-            precision=args.mixed_precision,
-        )
-    else:
-        raise ValueError(f"Unknown plugin {args.plugin}")
-
-    booster = Booster(plugin=plugin)
-    ref_booster = Booster(plugin=plugin)
-
-    # ======================================================
-    # Initialize Model, Objective, Optimizer and LR Scheduler
-    # ======================================================
-    init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
-    )
-    with init_ctx:
-        model = AutoModelForCausalLM.from_pretrained(args.pretrain)
-        ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
-
-        # TODO: set dropout to 0 here
-        # for llama2, dropout is 0 by default, hence skip.
-        if args.lora_rank > 0:
-            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
-    if args.grad_checkpoint and args.lora_rank == 0:
-        model.gradient_checkpointing_enable()
-        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
-
-    if args.use_flash_attn:
-        replace_with_flash_attention(model=model)
-        coordinator.print_on_master(msg="Flash-attention enabled successfully")
-
-    # configure tokenizer
-    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
-    tokenizer.padding_side = "right"
-    tokenizer.pad_token = tokenizer.eos_token
-
-    # configure optimizer
-    optim = HybridAdam(
-        model_params=model.parameters(),
-        lr=args.lr,
-        betas=(0.9, 0.95),
-        weight_decay=args.weight_decay,
-        adamw_mode=True,
-    )
-
-    # configure dataset
-    coordinator.print_on_master(f"Load dataset: {args.dataset}")
-    mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
-    data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
-    train_dataloader = setup_distributed_dataloader(
-        dataset=train_dataset,
-        batch_size=args.batch_size,
-        shuffle=True,
-        drop_last=True,
-        collate_fn=data_collator,
-        use_tp=args.tp > 1,
-    )
-
-    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
-    if args.warmup_steps is None:
-        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
-        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
-
-    lr_scheduler = CosineAnnealingWarmupLR(
-        optimizer=optim,
-        total_steps=args.max_epochs * num_update_steps_per_epoch,
-        warmup_steps=args.warmup_steps,
-        eta_min=0.1 * args.lr,
-    )
-
-    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
-    torch.set_default_dtype(default_dtype)
-    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
-        model=model,
-        optimizer=optim,
-        lr_scheduler=lr_scheduler,
-        dataloader=train_dataloader,
-    )
-
-    # test_res = model.generate(tokenizer.encode("tell a story about a cat.\n", return_tensors='pt').to(get_current_device()),
-    #                           max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.9)
-    # coordinator.print_on_master(f"Test generate: {tokenizer.decode(test_res[0])}")
-
-    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader)
-    torch.set_default_dtype(torch.float)
-
-    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
-    coordinator.print_on_master(
-        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-    )
-
-    start_epoch = 0
-    sampler_start_idx = 0
-    start_step = 0
-    if args.checkpoint_path is not None:
-        if "modeling" in args.checkpoint_path:
-            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
-            booster.load_model(model, args.checkpoint_path)
-        else:
-            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
-            start_epoch, start_step, sampler_start_idx = load_checkpoint(
-                load_dir=args.checkpoint_path,
-                booster=booster,
-                model=model,
-                optimizer=optim,
-                lr_scheduler=lr_scheduler,
-            )
-            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
-            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
-
-            coordinator.print_on_master(
-                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
-            )
-            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
-
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-        )
-
-    trainer = DPOTrainer(
-        actor=model,
-        ref_model=ref_model,
-        booster=booster,
-        actor_optim=optim,
-        actor_lr_scheduler=lr_scheduler,
-        tokenizer=tokenizer,
-        max_epochs=args.max_epochs,
-        accumulation_steps=args.accumulation_steps,
-        start_epoch=start_epoch,
-        save_interval=args.save_interval,
-        save_dir=args.save_dir,
-        coordinator=coordinator,
-    )
-
-    get_dist_logger()
-    trainer.fit(
-        train_preference_dataloader=train_dataloader,
-        eval_preference_dataloader=None,
-        log_dir=args.log_dir,
-        use_wandb=args.use_wandb,
-    )
-
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
-        # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
-        model.eval()
-    # save model checkpoint after fitting on only rank0
-    coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
-
-    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
-
-
-if __name__ == "__main__":
-    # ==============================
-    # Parse Arguments
-    # ==============================
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--plugin",
-        type=str,
-        default="gemini",
-        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
-        help="Choose which plugin to use",
-    )
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
-    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
-    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
-    parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=0)
-    parser.add_argument("--pretrain", type=str, default=None)
-    parser.add_argument("--tokenizer_dir", type=str, default=None)
-    parser.add_argument("--dataset", nargs="+", default=[])
-    parser.add_argument(
-        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
-    )
-    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
-    parser.add_argument("--save_dir", type=str, default="output")
-    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
-    parser.add_argument("--max_epochs", type=int, default=3)
-    parser.add_argument("--batch_size", type=int, default=4)
-    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument(
-        "--lora_train_bias",
-        type=str,
-        default="none",
-        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
-    )
-    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
-    parser.add_argument("--lr", type=float, default=5e-6)
-    parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--log_dir", default="logs", type=str)
-    parser.add_argument("--use_wandb", default=False, action="store_true")
-    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
-    parser.add_argument("--use_flash_attn", default=False, action="store_true")
-    args = parser.parse_args()
-    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
-    with open(args.config_file, "w") as f:
-        json.dump(args.__dict__, f, indent=4)
-    train(args)
diff --git a/applications/Chat/examples/train_dpo.sh b/applications/Chat/examples/train_dpo.sh
deleted file mode 100755
index db249003672d..000000000000
--- a/applications/Chat/examples/train_dpo.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-set_n_least_used_CUDA_VISIBLE_DEVICES 8
-# NCCL IB environment variables
-export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
-export NCCL_IB_DISABLE=0
-export NCCL_SOCKET_IFNAME=eth0
-export NCCL_IB_GID_INDEX=3
-export NCCL_IB_TIMEOUT=23
-export NCCL_IB_RETRY_CNT=7
-export OMP_NUM_THREADS=8
-
-
-PROJECT_NAME="llama2-dpo"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-declare -a dataset=(
-    /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00000
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00001
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00002
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00003
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00004
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00005
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00006
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00007
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00008
-    # /home/lcyab/data/data_rlhf/tokenized_preference_data/arrow/part-00009
-)
-
-TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
-FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
-SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
-
-colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_dpo.py \
-    --pretrain $PRETRAINED_MODEL_PATH \
-    --checkpoint_path $PRETRAINED_MODEL_PATH \
-    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --dataset ${dataset[@]} \
-    --plugin "3d" \
-    --save_interval 1000 \
-    --save_dir $SAVE_DIR \
-    --config_file $CONFIG_FILE \
-    --max_epochs 5 \
-    --accumulation_steps 4 \
-    --batch_size 4 \
-    --tp 8 \
-    --lr 5e-6 \
-    --mixed_precision "bf16" \
-    --grad_clip 1.0 \
-    --weight_decay 0.01 \
-    --warmup_steps 100 \
-    --grad_checkpoint \
-    --use_flash_attn \
-    --use_wandb
diff --git a/applications/Chat/examples/train_ppo.py b/applications/Chat/examples/train_ppo.py
deleted file mode 100644
index d67aab333cbf..000000000000
--- a/applications/Chat/examples/train_ppo.py
+++ /dev/null
@@ -1,430 +0,0 @@
-import argparse
-import os
-import resource
-from contextlib import nullcontext
-
-import torch
-from coati.dataset import (
-    DataCollatorForPromptDataset,
-    DataCollatorForSupervisedDataset,
-    StatefulDistributedSampler,
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-)
-from coati.models import Critic, RewardModel, convert_to_lora_module
-from coati.trainer import PPOTrainer
-from coati.utils import load_checkpoint, replace_with_flash_attention
-
-# from colossalai.utils import get_current_deviceDDPStrategy, GeminiStrategy, LowLevelZeroStrategy
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-import colossalai
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
-from colossalai.cluster import DistCoordinator
-from colossalai.lazy import LazyInitContext
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-
-
-def train(args):
-    # ==============================
-    # Initialize Distributed Training
-    # ==============================
-    colossalai.launch_from_torch({})
-    coordinator = DistCoordinator()
-
-    # ======================================================
-    # Initialize Model, Objective, Optimizer and LR Scheduler
-    # ======================================================
-    init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in args.plugin else nullcontext()
-
-    booster_policy = None
-    with init_ctx:
-        actor = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
-        ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
-        reward_model = RewardModel(args.rm_pretrain)
-        critic = Critic(args.rm_pretrain)
-
-        if args.tp > 1:
-            if reward_model.model.config.architectures[0] != critic.model.config.architectures[0]:
-                raise ValueError("Reward model and critic model must have the same architecture")
-            if reward_model.model.config.architectures[0] == "BloomForCausalLM":
-                from colossalai.shardformer.policies.bloom import BloomPolicy
-
-                booster_policy = BloomPolicy()
-            elif reward_model.model.config.architectures[0] == "LlamaForCausalLM":
-                from colossalai.shardformer.policies.llama import LlamaPolicy
-
-                booster_policy = LlamaPolicy()
-            elif reward_model.model.config.architectures[0] == "GPT2LMHeadModel":
-                from colossalai.shardformer.policies.gpt2 import GPT2Policy
-
-                booster_policy = GPT2Policy()
-            elif reward_model.model.config.architectures[0] == "ChatGLMModel":
-                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
-
-                booster_policy = ChatGLMPolicy()
-            elif reward_model.model.config.architectures[0] == "OPTForCausalLM":
-                from colossalai.shardformer.policies.opt import OPTPolicy
-
-                booster_policy = OPTPolicy()
-            else:
-                raise ValueError("Unknown model architecture for policy")
-
-        # TODO: set dropout to 0 here
-        # for llama2, dropout is 0 by default, hence skip.
-        if args.lora_rank > 0:
-            actor = convert_to_lora_module(actor, args.lora_rank, lora_train_bias=args.lora_train_bias)
-            critic = convert_to_lora_module(critic, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
-    if args.grad_checkpoint and args.lora_rank == 0:
-        actor.gradient_checkpointing_enable()
-        critic.model.gradient_checkpointing_enable()
-        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
-
-    if args.use_flash_attn:
-        replace_with_flash_attention(model=actor)
-        replace_with_flash_attention(model=critic)
-        coordinator.print_on_master(msg="Flash-attention enabled successfully")
-
-    # configure tokenizer
-    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
-    tokenizer.padding_side = "left"  # left padding for generation (online learning)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    # configure optimizer
-    coordinator.print_on_master(f"setting up optimizer for actor: lr={args.lr}, weight_decay={args.weight_decay}")
-    actor_optim = HybridAdam(
-        model_params=actor.parameters(),
-        lr=args.lr,
-        betas=(0.9, 0.95),
-        weight_decay=args.weight_decay,
-        adamw_mode=True,
-    )
-
-    coordinator.print_on_master(f"setting up optimizer for critic: lr={args.lr}, weight_decay={args.weight_decay}")
-    critic_optim = HybridAdam(
-        model_params=critic.parameters(),
-        lr=args.critic_lr,
-        betas=(0.9, 0.95),
-        weight_decay=args.weight_decay,
-        adamw_mode=True,
-    )
-
-    # configure dataset
-    coordinator.print_on_master(f"Load dataset: {args.prompt_dataset}")
-    mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_prompt_dataset = load_tokenized_dataset(dataset_paths=args.prompt_dataset, mode="train", mode_map=mode_map)
-    data_collator = DataCollatorForPromptDataset(tokenizer=tokenizer, max_length=args.max_length - args.max_seq_len)
-    train_prompt_dataloader = setup_distributed_dataloader(
-        dataset=train_prompt_dataset,
-        batch_size=args.experience_batch_size,
-        shuffle=True,
-        drop_last=True,
-        collate_fn=data_collator,
-        use_tp=args.tp > 1,
-    )
-
-    if len(args.pretrain_dataset) > 0:
-        train_pretrain_dataset = load_tokenized_dataset(
-            dataset_paths=args.pretrain_dataset, mode="train", mode_map=mode_map
-        )
-        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_length)
-        train_pretrain_dataloader = setup_distributed_dataloader(
-            dataset=train_pretrain_dataset,
-            batch_size=args.ptx_batch_size,
-            shuffle=True,
-            drop_last=True,
-            collate_fn=data_collator,
-            use_tp=args.tp > 1,
-        )
-    else:
-        train_pretrain_dataloader = None
-
-    if args.warmup_steps is None:
-        args.warmup_steps = int(0.025 * args.num_episodes)
-        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
-
-    actor_lr_scheduler = CosineAnnealingWarmupLR(
-        optimizer=actor_optim,
-        total_steps=args.num_episodes,
-        warmup_steps=args.warmup_steps,
-        eta_min=0.1 * args.lr,
-    )
-
-    critic_lr_scheduler = CosineAnnealingWarmupLR(
-        optimizer=critic_optim,
-        total_steps=args.num_episodes,
-        warmup_steps=args.warmup_steps,
-        eta_min=0.1 * args.lr,
-    )
-
-    # ==============================
-    # Initialize Booster
-    # ==============================
-    if args.plugin == "gemini":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "gemini_auto":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            placement_policy="auto",
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2_cpu":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            cpu_offload=True,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "3d":
-        plugin = HybridParallelPlugin(
-            tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
-            precision=args.mixed_precision,
-        )
-        custom_plugin = HybridParallelPlugin(
-            tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
-            precision=args.mixed_precision,
-            custom_policy=booster_policy,
-        )
-    else:
-        raise ValueError(f"Unknown plugin {args.plugin}")
-
-    if args.plugin != "3d":
-        custom_plugin = plugin
-
-    actor_booster = Booster(plugin=plugin)
-    ref_booster = Booster(plugin=plugin)
-    rm_booster = Booster(plugin=custom_plugin)
-    critic_booster = Booster(plugin=custom_plugin)
-
-    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
-    torch.set_default_dtype(default_dtype)
-    actor, actor_optim, _, train_prompt_dataloader, actor_lr_scheduler = actor_booster.boost(
-        model=actor,
-        optimizer=actor_optim,
-        lr_scheduler=actor_lr_scheduler,
-        dataloader=train_prompt_dataloader,
-    )
-
-    critic, critic_optim, _, _, critic_lr_scheduler = critic_booster.boost(
-        model=critic,
-        optimizer=critic_optim,
-        lr_scheduler=critic_lr_scheduler,
-        dataloader=train_prompt_dataloader,
-    )
-    reward_model, _, _, _, _ = rm_booster.boost(model=reward_model, dataloader=train_prompt_dataloader)
-    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_prompt_dataloader)
-
-    torch.set_default_dtype(torch.float)
-
-    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
-    coordinator.print_on_master(
-        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-    )
-
-    sampler_start_idx = 0
-    start_step = 0
-
-    if args.rm_checkpoint_path is not None:
-        if "modeling" in args.rm_checkpoint_path:
-            rm_booster.load_model(reward_model, args.rm_checkpoint_path)
-        else:
-            _, _, _ = load_checkpoint(
-                load_dir=args.rm_checkpoint_path,
-                booster=rm_booster,
-                model=reward_model,
-                optimizer=None,
-                lr_scheduler=None,
-            )
-        coordinator.print_on_master(f"Loaded reward model checkpoint {args.rm_checkpoint_path}")
-
-    if args.checkpoint_path is not None:
-        if "modeling" in args.checkpoint_path:
-            actor_booster.load_model(actor, args.checkpoint_path)
-            ref_booster.load_model(ref_model, args.checkpoint_path)
-            coordinator.print_on_master(f"Loaded actor and reference model {args.checkpoint_path}")
-        else:
-            _, start_step, sampler_start_idx = load_checkpoint(
-                load_dir=args.checkpoint_path,
-                booster=actor_booster,
-                model=actor,
-                optimizer=actor_optim,
-                lr_scheduler=actor_lr_scheduler,
-            )
-            _, _, _ = load_checkpoint(
-                load_dir=args.checkpoint_path,
-                booster=ref_booster,
-                model=ref_model,
-                optimizer=critic_optim,
-                lr_scheduler=critic_lr_scheduler,
-            )
-            assert isinstance(train_prompt_dataloader.sampler, StatefulDistributedSampler)
-            train_prompt_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
-
-            coordinator.print_on_master(
-                f"Loaded actor and reference model checkpoint {args.checkpoint_path} at spisode {start_step}"
-            )
-            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
-
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-        )
-
-    if args.critic_checkpoint_path is not None:
-        if "modeling" in args.critic_checkpoint_path:
-            critic_booster.load_model(critic, args.critic_checkpoint_path)
-        else:
-            _, _, _ = load_checkpoint(
-                load_dir=args.critic_checkpoint_path,
-                booster=critic_booster,
-                model=critic,
-                optimizer=critic_optim,
-                lr_scheduler=critic_lr_scheduler,
-            )
-        coordinator.print_on_master(f"Loaded critic checkpoint {args.critic_checkpoint_path}")
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-        )
-
-    # configure trainer
-    trainer = PPOTrainer(
-        actor_booster,
-        critic_booster,
-        actor,
-        critic,
-        reward_model,
-        ref_model,
-        actor_optim,
-        critic_optim,
-        actor_lr_scheduler,
-        critic_lr_scheduler,
-        tokenizer=tokenizer,
-        kl_coef=args.kl_coef,
-        ptx_coef=args.ptx_coef,
-        train_batch_size=args.train_batch_size,
-        buffer_limit=args.num_collect_steps * args.experience_batch_size,
-        max_length=args.max_length,
-        max_new_tokens=args.max_seq_len,
-        use_cache=True,
-        do_sample=True,
-        temperature=0.7,
-        accumulation_steps=args.accumulation_steps,
-        save_dir=args.save_path,
-        save_interval=args.save_interval,
-        top_k=50,
-        use_tp=args.tp > 1,
-        offload_inference_models="gemini" not in args.plugin,
-        coordinator=coordinator,
-    )
-
-    trainer.fit(
-        num_episodes=args.num_episodes,
-        num_collect_steps=args.num_collect_steps,
-        num_update_steps=args.num_update_steps,
-        prompt_dataloader=train_prompt_dataloader,
-        pretrain_dataloader=train_pretrain_dataloader,
-        log_dir=args.log_dir,
-        use_wandb=args.use_wandb,
-    )
-
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
-        # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
-        actor.eval()
-        critic.eval()
-    # save model checkpoint after fitting on only rank0
-    coordinator.print_on_master("Start saving final actor model checkpoint")
-    actor_booster.save_model(actor, os.path.join(trainer.actor_save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(
-        f"Saved final actor model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
-    )
-    coordinator.print_on_master("Start saving final critic model checkpoint")
-    critic_booster.save_model(critic, os.path.join(trainer.critic_save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(
-        f"Saved final critic model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
-    )
-    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--prompt_dataset", nargs="+", default=[])
-    parser.add_argument("--pretrain_dataset", nargs="+", default=[])
-    parser.add_argument(
-        "--plugin",
-        type=str,
-        default="gemini",
-        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
-        help="Choose which plugin to use",
-    )
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
-    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
-    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
-    parser.add_argument("--tokenizer_dir", type=str, default=None)
-    parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--pretrain", type=str, default=None)
-    parser.add_argument("--rm_pretrain", type=str, default=None)
-    parser.add_argument("--checkpoint_path", type=str, default=None)
-    parser.add_argument("--critic_checkpoint_path", type=str, default=None)
-    parser.add_argument("--rm_checkpoint_path", type=str, help="Reward model checkpoint path")
-    parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
-    parser.add_argument("--num_episodes", type=int, default=1)
-    parser.add_argument("--num_collect_steps", type=int, default=2)
-    parser.add_argument("--num_update_steps", type=int, default=5)
-    parser.add_argument("--save_interval", type=int, default=1000)
-    parser.add_argument("--train_batch_size", type=int, default=16)
-    parser.add_argument("--experience_batch_size", type=int, default=16)
-    parser.add_argument("--ptx_batch_size", type=int, default=1)
-    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
-    parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
-    parser.add_argument("--lr", type=float, default=9e-6)
-    parser.add_argument("--critic_lr", type=float, default=9e-6)
-    parser.add_argument("--kl_coef", type=float, default=0.1)
-    parser.add_argument("--ptx_coef", type=float, default=0.0)
-    parser.add_argument("--max_length", type=int, default=2048)
-    parser.add_argument("--max_seq_len", type=int, default=256)
-    parser.add_argument("--log_dir", default="logs", type=str)
-    parser.add_argument("--use_wandb", default=False, action="store_true")
-    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
-    parser.add_argument("--use_flash_attn", default=False, action="store_true")
-    args = parser.parse_args()
-    train(args)
diff --git a/applications/Chat/examples/train_ppo.sh b/applications/Chat/examples/train_ppo.sh
deleted file mode 100755
index c50107c9791a..000000000000
--- a/applications/Chat/examples/train_ppo.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
-# NCCL IB environment variables
-export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
-export NCCL_IB_DISABLE=0
-export NCCL_SOCKET_IFNAME=eth0
-export NCCL_IB_GID_INDEX=3
-export NCCL_IB_TIMEOUT=23
-export NCCL_IB_RETRY_CNT=7
-export OMP_NUM_THREADS=8
-
-
-PROJECT_NAME="llama2-dpo"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-REWARD_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  # "/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  # "/home/lcyab/data/models/bloom-560m" #
-declare -a prompt_dataset=(
-    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00000
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00001
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00002
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00003
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00004
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00005
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00006
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00007
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00008
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00009
-)
-
-declare -a ptx_dataset=(
-    /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow/part-00000
-)
-
-TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
-FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
-SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
-
-colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30039 train_ppo.py \
-    --pretrain $PRETRAINED_MODEL_PATH \
-    --rm_pretrain $PRETRAINED_MODEL_PATH \
-    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --rm_checkpoint_path $REWARD_MODEL_PATH \
-    --prompt_dataset ${prompt_dataset[@]} \
-    --pretrain_dataset ${ptx_dataset[@]} \
-    --ptx_batch_size 1 \
-    --ptx_coef 0.0 \
-    --plugin "zero2" \
-    --save_interval 200 \
-    --save_path $SAVE_DIR \
-    --num_episodes 2000 \
-    --num_collect_steps 1 \
-    --num_update_steps 1 \
-    --experience_batch_size 8 \
-    --train_batch_size 4 \
-    --accumulation_steps 2 \
-    --lr 9e-6 \
-    --mixed_precision "bf16" \
-    --grad_clip 1.0 \
-    --weight_decay 0.01 \
-    --warmup_steps 100 \
-    --grad_checkpoint \
-    --use_wandb
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
deleted file mode 100644
index 4cf94b98cf7f..000000000000
--- a/applications/Chat/examples/train_reward_model.py
+++ /dev/null
@@ -1,314 +0,0 @@
-import argparse
-import json
-import math
-import os
-import resource
-from contextlib import nullcontext
-
-import torch
-from coati.dataset import (
-    DataCollatorForPreferenceDataset,
-    StatefulDistributedSampler,
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-)
-from coati.models import LogExpLoss, LogSigLoss, RewardModel, convert_to_lora_module
-from coati.trainer import RewardModelTrainer
-from coati.utils import load_checkpoint, replace_with_flash_attention
-from transformers import LlamaTokenizer
-
-import colossalai
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
-from colossalai.cluster import DistCoordinator
-from colossalai.lazy import LazyInitContext
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-
-
-def train(args):
-    # ==============================
-    # Initialize Distributed Training
-    # ==============================
-    colossalai.launch_from_torch({})
-    coordinator = DistCoordinator()
-
-    # ======================================================
-    # Initialize Model, Objective, Optimizer and LR Scheduler
-    # ======================================================
-    init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in args.plugin else nullcontext()
-
-    booster_policy = None
-    with init_ctx:
-        model = RewardModel(args.pretrain)
-
-        if args.tp > 1:
-            if model.model.config.architectures[0] == "BloomForCausalLM":
-                from colossalai.shardformer.policies.bloom import BloomPolicy
-
-                booster_policy = BloomPolicy()
-            elif model.model.config.architectures[0] == "LlamaForCausalLM":
-                from colossalai.shardformer.policies.llama import LlamaPolicy
-
-                booster_policy = LlamaPolicy()
-            elif model.model.config.architectures[0] == "GPT2LMHeadModel":
-                from colossalai.shardformer.policies.gpt2 import GPT2Policy
-
-                booster_policy = GPT2Policy()
-            elif model.model.config.architectures[0] == "ChatGLMModel":
-                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
-
-                booster_policy = ChatGLMPolicy()
-            elif model.model.config.architectures[0] == "OPTForCausalLM":
-                from colossalai.shardformer.policies.opt import OPTPolicy
-
-                booster_policy = OPTPolicy()
-            else:
-                raise ValueError("Unknown model architecture for policy")
-
-        # TODO: set dropout to 0 here
-        # for llama2, dropout is 0 by default, hence skip.
-        if args.lora_rank > 0:
-            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
-    # ==============================
-    # Initialize Booster
-    # ==============================
-    if args.plugin == "gemini":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "gemini_auto":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            placement_policy="auto",
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2_cpu":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            cpu_offload=True,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "3d":
-        plugin = HybridParallelPlugin(
-            tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
-            precision=args.mixed_precision,
-            custom_policy=booster_policy,
-        )
-    else:
-        raise ValueError(f"Unknown plugin {args.plugin}")
-
-    booster = Booster(plugin=plugin)
-
-    if args.grad_checkpoint and args.lora_rank == 0:
-        model.model.gradient_checkpointing_enable()  # TODO: support gradient checkpoint for the last linear layer
-        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
-
-    if args.use_flash_attn:
-        replace_with_flash_attention(model=model)
-        coordinator.print_on_master(msg="Flash-attention enabled successfully")
-
-    # configure tokenizer
-    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
-    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir)
-    tokenizer.padding_side = "right"
-    tokenizer.pad_token = tokenizer.eos_token
-
-    # configure loss function
-    if args.loss_fn == "log_sig":
-        loss_fn = LogSigLoss()
-    elif args.loss_fn == "log_exp":
-        loss_fn = LogExpLoss()
-    else:
-        raise ValueError(f'Unsupported loss function "{args.loss_fn}"')
-
-    # configure optimizer
-    optim = HybridAdam(
-        model_params=model.parameters(),
-        lr=args.lr,
-        betas=(0.9, 0.95),
-        weight_decay=args.weight_decay,
-        adamw_mode=True,
-    )
-
-    # configure dataset
-    coordinator.print_on_master(f"Load dataset: {args.dataset}")
-    mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
-    data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
-    train_dataloader = setup_distributed_dataloader(
-        dataset=train_dataset,
-        batch_size=args.batch_size,
-        shuffle=True,
-        drop_last=True,
-        collate_fn=data_collator,
-        use_tp=args.tp > 1,
-    )
-
-    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
-    math.ceil(args.max_epochs * num_update_steps_per_epoch)
-
-    if args.warmup_steps is None:
-        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
-        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
-
-    lr_scheduler = CosineAnnealingWarmupLR(
-        optimizer=optim,
-        total_steps=args.max_epochs * num_update_steps_per_epoch,
-        warmup_steps=args.warmup_steps,
-        eta_min=0.1 * args.lr,
-    )
-
-    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
-    torch.set_default_dtype(default_dtype)
-    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
-        model=model,
-        optimizer=optim,
-        lr_scheduler=lr_scheduler,
-        dataloader=train_dataloader,
-    )
-    torch.set_default_dtype(torch.float)
-
-    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
-    coordinator.print_on_master(
-        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-    )
-
-    start_epoch = 0
-    sampler_start_idx = 0
-    start_step = 0
-    if args.checkpoint_path is not None:
-        if "modeling" in args.checkpoint_path:
-            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
-            booster.load_model(model, args.checkpoint_path)
-        else:
-            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
-            start_epoch, start_step, sampler_start_idx = load_checkpoint(
-                load_dir=args.checkpoint_path,
-                booster=booster,
-                model=model,
-                optimizer=optim,
-                lr_scheduler=lr_scheduler,
-            )
-            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
-            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
-
-            coordinator.print_on_master(
-                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
-            )
-            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
-
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-        )
-
-    trainer = RewardModelTrainer(
-        model,
-        booster,
-        optim,
-        lr_scheduler,
-        tokenizer,
-        loss_fn=loss_fn,
-        max_epochs=args.max_epochs,
-        accumulation_steps=args.accumulation_steps,
-        start_epoch=start_epoch,
-        save_interval=args.save_interval,
-        save_dir=args.save_dir,
-        coordinator=coordinator,
-    )
-
-    trainer.fit(
-        train_preference_dataloader=train_dataloader,
-        eval_preference_dataloader=None,
-        log_dir=args.log_dir,
-        use_wandb=args.use_wandb,
-    )
-
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
-        # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
-        model.eval()
-    # save model checkpoint after fitting on only rank0
-    coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}")
-
-    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
-
-
-if __name__ == "__main__":
-    # ==============================
-    # Parse Arguments
-    # ==============================
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--plugin",
-        type=str,
-        default="gemini",
-        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
-        help="Choose which plugin to use",
-    )
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
-    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
-    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
-    parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=0)
-    parser.add_argument("--pretrain", type=str, default=None)
-    parser.add_argument("--tokenizer_dir", type=str, default=None)
-    parser.add_argument("--dataset", nargs="+", default=[])
-    parser.add_argument(
-        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
-    )
-    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
-    parser.add_argument("--save_dir", type=str, default="output")
-    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
-    parser.add_argument("--max_epochs", type=int, default=3)
-    parser.add_argument("--batch_size", type=int, default=4)
-    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
-    parser.add_argument("--loss_fn", type=str, default="log_sig", choices=["log_sig", "log_exp"], help="Loss function")
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument(
-        "--lora_train_bias",
-        type=str,
-        default="none",
-        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
-    )
-    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
-    parser.add_argument("--lr", type=float, default=5e-6)
-    parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--log_dir", default="logs", type=str)
-    parser.add_argument("--use_wandb", default=False, action="store_true")
-    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
-    parser.add_argument("--use_flash_attn", default=False, action="store_true")
-    args = parser.parse_args()
-    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
-    with open(args.config_file, "w") as f:
-        json.dump(args.__dict__, f, indent=4)
-    train(args)
diff --git a/applications/Chat/examples/train_rm.sh b/applications/Chat/examples/train_rm.sh
deleted file mode 100755
index 18ef2511ef40..000000000000
--- a/applications/Chat/examples/train_rm.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
-# NCCL IB environment variables
-export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
-export NCCL_IB_DISABLE=0
-export NCCL_SOCKET_IFNAME=eth0
-export NCCL_IB_GID_INDEX=3
-export NCCL_IB_TIMEOUT=23
-export NCCL_IB_RETRY_CNT=7
-export OMP_NUM_THREADS=8
-
-
-PROJECT_NAME="llama2-rm"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/rm/output/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  #"/home/lcyab/data/models/bloom-560m" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-declare -a dataset=(
-    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00001
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00002
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00003
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00004
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00005
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00006
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00007
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00008
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00009
-)
-
-TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
-FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
-SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
-
-colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_reward_model.py \
-    --pretrain $PRETRAINED_MODEL_PATH \
-    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --checkpoint_path /home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling \
-    --dataset ${dataset[@]} \
-    --plugin "zero2" \
-    --save_interval 3000 \
-    --save_dir $SAVE_DIR \
-    --config_file $CONFIG_FILE \
-    --max_epochs 3 \
-    --accumulation_steps 1 \
-    --batch_size 8 \
-    --lr 9e-6 \
-    --mixed_precision "bf16" \
-    --grad_clip 1.0 \
-    --weight_decay 0.01 \
-    --warmup_steps 100 \
-    --use_flash_attn \
-    # --use_wandb \
-    # --grad_checkpoint \
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
deleted file mode 100644
index 49982ca17e19..000000000000
--- a/applications/Chat/examples/train_sft.py
+++ /dev/null
@@ -1,284 +0,0 @@
-import argparse
-import json
-import math
-import os
-import resource
-from contextlib import nullcontext
-
-import torch
-from coati.dataset import DataCollatorForSupervisedDataset, load_tokenized_dataset, setup_distributed_dataloader
-from coati.models import convert_to_lora_module, load_checkpoint
-from coati.trainer import SFTTrainer
-from coati.utils import replace_with_flash_attention
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-import colossalai
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
-from colossalai.cluster import DistCoordinator
-from colossalai.lazy import LazyInitContext
-from colossalai.logging import get_dist_logger
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-
-
-def train(args):
-    # ==============================
-    # Initialize Distributed Training
-    # ==============================
-    colossalai.launch_from_torch({})
-    coordinator = DistCoordinator()
-
-    # ==============================
-    # Initialize Booster
-    # ==============================
-    if args.plugin == "ddp":
-        # default torch ddp plugin without any acceleration, for debugging purpose acceleration, for debugging purpose
-        plugin = TorchDDPPlugin(find_unused_parameters=True)
-    elif args.plugin == "gemini":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "gemini_auto":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            placement_policy="auto",
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "zero2_cpu":
-        plugin = LowLevelZeroPlugin(
-            stage=2,
-            precision=args.mixed_precision,
-            initial_scale=2**16,
-            cpu_offload=True,
-            max_norm=args.grad_clip,
-        )
-    elif args.plugin == "3d":
-        plugin = HybridParallelPlugin(
-            tp_size=args.tp,
-            pp_size=1,
-            zero_stage=args.zero,
-            max_norm=args.grad_clip,
-            precision=args.mixed_precision,
-        )
-    else:
-        raise ValueError(f"Unknown plugin {args.plugin}")
-
-    booster = Booster(plugin=plugin)
-
-    # ======================================================
-    # Initialize Model, Objective, Optimizer and LR Scheduler
-    # ======================================================
-    init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
-    )
-    with init_ctx:
-        model = AutoModelForCausalLM.from_pretrained(args.pretrain)
-        if args.lora_rank > 0:
-            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
-    if args.grad_checkpoint and args.lora_rank == 0:
-        # lora layers are not supported by gradient checkpointing
-        model.gradient_checkpointing_enable()
-        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
-
-    if args.use_flash_attn:
-        replace_with_flash_attention(model=model)
-        coordinator.print_on_master(msg="Flash-attention enabled successfully")
-
-    # configure tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir or args.pretrain)
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.add_bos_token = False
-    tokenizer.add_eos_token = False
-
-    coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
-    coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_path}")
-
-    # configure optimizer
-    optim = HybridAdam(
-        model_params=model.parameters(),
-        lr=args.lr,
-        betas=(0.9, 0.95),
-        weight_decay=args.weight_decay,
-        adamw_mode=True,
-    )
-
-    # configure dataset
-    coordinator.print_on_master(
-        f"Max CUDA memory before data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
-    )
-    dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train")
-    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_len)
-    train_dataloader = setup_distributed_dataloader(
-        dataset=dataset,
-        batch_size=args.batch_size,
-        shuffle=True,
-        drop_last=True,
-        collate_fn=data_collator,
-    )
-    coordinator.print_on_master(
-        f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
-    )
-
-    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
-    math.ceil(args.max_epochs * num_update_steps_per_epoch)
-
-    if args.warmup_steps is None:
-        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
-        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
-
-    lr_scheduler = CosineAnnealingWarmupLR(
-        optimizer=optim,
-        total_steps=args.max_epochs * num_update_steps_per_epoch,
-        warmup_steps=args.warmup_steps,
-        eta_min=0.1 * args.lr,
-    )
-
-    # Flash attention will be disabled because it does NOT support fp32.
-    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
-    torch.set_default_dtype(default_dtype)
-    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
-        model=model,
-        optimizer=optim,
-        lr_scheduler=lr_scheduler,
-        dataloader=train_dataloader,
-    )
-    # model = model.to(get_current_device())
-    torch.set_default_dtype(torch.float)
-
-    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
-    coordinator.print_on_master(
-        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-    )
-
-    start_epoch = 0
-    sampler_start_idx = 0
-    start_step = 0
-    if args.checkpoint_path is not None:
-        if "modeling" in args.checkpoint_path:
-            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
-            booster.load_model(model, args.checkpoint_path)
-        else:
-            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
-            start_epoch, start_step, sampler_start_idx = load_checkpoint(
-                load_dir=args.checkpoint_path,
-                booster=booster,
-                model=model,
-                optimizer=optim,
-                lr_scheduler=lr_scheduler,
-            )
-            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
-
-            coordinator.print_on_master(
-                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
-            )
-            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
-
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
-        )
-        coordinator.print_on_master(
-            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
-        )
-
-    trainer = SFTTrainer(
-        model=model,
-        booster=booster,
-        optim=optim,
-        lr_scheduler=lr_scheduler,
-        max_epochs=args.max_epochs,
-        accumulation_steps=args.accumulation_steps,
-        start_epoch=start_epoch,
-        save_interval=args.save_interval,
-        save_dir=args.save_path,
-        coordinator=coordinator,
-    )
-
-    get_dist_logger()
-    trainer.fit(
-        train_dataloader=train_dataloader,
-        eval_dataloader=None,
-        log_dir=args.log_dir,
-        use_wandb=args.use_wandb,
-    )
-
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
-        # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
-        model.eval()
-    # save model checkpoint after fitting on only rank0
-    coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
-
-    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
-
-
-if __name__ == "__main__":
-    # ==============================
-    # Parse Arguments
-    # ==============================
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--plugin",
-        type=str,
-        default="gemini",
-        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d", "ddp"],
-        help="Choose which plugin to use",
-    )
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
-    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
-    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
-    parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=1)
-    parser.add_argument("--pretrain", type=str, default=None)
-    parser.add_argument("--tokenizer_dir", type=str, default=None)
-    parser.add_argument("--dataset", nargs="+", default=[])
-    parser.add_argument(
-        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
-    )
-    parser.add_argument("--save_path", type=str, default="output")
-    parser.add_argument("--max_epochs", type=int, default=3)
-    parser.add_argument("--batch_size", type=int, default=4)
-    parser.add_argument("--max_len", type=int, default=512)
-    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument(
-        "--lora_train_bias",
-        type=str,
-        default="none",
-        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
-    )
-    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
-    parser.add_argument("--lr", type=float, default=5e-6)
-    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
-    parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--log_dir", default="logs", type=str)
-    parser.add_argument("--use_wandb", default=False, action="store_true")
-    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
-    parser.add_argument("--use_flash_attn", default=False, action="store_true")
-    args = parser.parse_args()
-    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
-    with open(args.config_file, "w") as f:
-        json.dump(args.__dict__, f, indent=4)
-    train(args)
diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh
deleted file mode 100755
index 0b81651e21b3..000000000000
--- a/applications/Chat/examples/train_sft.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-# set_n_least_used_CUDA_VISIBLE_DEVICES 4
-export CUDA_VISIBLE_DEVICES=2,3,4,5
-
-PROJECT_NAME="llama2-sft"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/sft/output/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-declare -a dataset=(
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00000
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00001
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00002
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00003
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00004
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00005
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00006
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00007
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00008
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00009
-)
-
-TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
-FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
-SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
-
-# the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
-colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
-    --pretrain $PRETRAINED_MODEL_PATH \
-    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --dataset ${dataset[@]} \
-    --save_interval 5000 \
-    --save_path $SAVE_DIR \
-    --config_file $CONFIG_FILE \
-    --plugin zero2 \
-    --batch_size 2 \
-    --max_epochs 1 \
-    --accumulation_steps 4 \
-    --lr 2e-5 \
-    --max_len 2048 \
-    --use_wandb
diff --git a/applications/Chat/examples/training_scripts/hostfile b/applications/Chat/examples/training_scripts/hostfile
old mode 100644
new mode 100755
index 4a0d939bb155..76ac7aef50cd
--- a/applications/Chat/examples/training_scripts/hostfile
+++ b/applications/Chat/examples/training_scripts/hostfile
@@ -1 +1 @@
-192.168.0.35
+192.168.0.32
diff --git a/applications/Chat/examples/training_scripts/train_dpo.py b/applications/Chat/examples/training_scripts/train_dpo.py
old mode 100644
new mode 100755
index 4e78b42c6e0c..6b9b12b9354a
--- a/applications/Chat/examples/training_scripts/train_dpo.py
+++ b/applications/Chat/examples/training_scripts/train_dpo.py
@@ -9,9 +9,10 @@
     DataCollatorForPreferenceDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
+    setup_conversation_template,
     setup_distributed_dataloader,
 )
-from coati.models import convert_to_lora_module
+from coati.models import convert_to_lora_module, disable_dropout
 from coati.trainer import DPOTrainer
 from coati.utils import load_checkpoint, replace_with_flash_attention
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -21,7 +22,6 @@
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
-from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
@@ -86,10 +86,10 @@ def train(args):
     )
     with init_ctx:
         model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        disable_dropout(model)
         ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        disable_dropout(ref_model)
 
-        # TODO: set dropout to 0 here
-        # for llama2, dropout is 0 by default, hence skip.
         if args.lora_rank > 0:
             model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
 
@@ -106,6 +106,7 @@ def train(args):
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    _ = setup_conversation_template(tokenizer)
     tokenizer.padding_side = "right"
     tokenizer.pad_token = tokenizer.eos_token
 
@@ -153,10 +154,6 @@ def train(args):
         dataloader=train_dataloader,
     )
 
-    # test_res = model.generate(tokenizer.encode("tell a story about a cat.\n", return_tensors='pt').to(get_current_device()),
-    #                           max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.9)
-    # coordinator.print_on_master(f"Test generate: {tokenizer.decode(test_res[0])}")
-
     ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader)
     torch.set_default_dtype(torch.float)
 
@@ -214,7 +211,6 @@ def train(args):
         coordinator=coordinator,
     )
 
-    get_dist_logger()
     trainer.fit(
         train_preference_dataloader=train_dataloader,
         eval_preference_dataloader=None,
@@ -230,8 +226,8 @@ def train(args):
         model.eval()
     # save model checkpoint after fitting on only rank0
     coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+    booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}")
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
diff --git a/applications/Chat/examples/training_scripts/train_dpo.sh b/applications/Chat/examples/training_scripts/train_dpo.sh
index 74cf9c33417c..df9636e93f84 100755
--- a/applications/Chat/examples/training_scripts/train_dpo.sh
+++ b/applications/Chat/examples/training_scripts/train_dpo.sh
@@ -49,7 +49,7 @@ FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
 CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
-colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_dpo.py \
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_dpo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
     --checkpoint_path $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
@@ -59,13 +59,12 @@ colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30035 train_
     --save_dir $SAVE_DIR \
     --config_file $CONFIG_FILE \
     --max_epochs 5 \
-    --accumulation_steps 4 \
+    --accumulation_steps 8 \
     --batch_size 4 \
-    --tp 8 \
+    --tp 4 \
     --lr 5e-6 \
     --mixed_precision "bf16" \
     --grad_clip 1.0 \
     --weight_decay 0.01 \
     --warmup_steps 100 \
-    --grad_checkpoint \
     --use_wandb
diff --git a/applications/Chat/examples/training_scripts/train_ppo.py b/applications/Chat/examples/training_scripts/train_ppo.py
old mode 100644
new mode 100755
index d67aab333cbf..2a9c7d9aa572
--- a/applications/Chat/examples/training_scripts/train_ppo.py
+++ b/applications/Chat/examples/training_scripts/train_ppo.py
@@ -9,13 +9,12 @@
     DataCollatorForSupervisedDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
+    setup_conversation_template,
     setup_distributed_dataloader,
 )
-from coati.models import Critic, RewardModel, convert_to_lora_module
+from coati.models import Critic, RewardModel, convert_to_lora_module, disable_dropout
 from coati.trainer import PPOTrainer
 from coati.utils import load_checkpoint, replace_with_flash_attention
-
-# from colossalai.utils import get_current_deviceDDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import colossalai
@@ -43,9 +42,12 @@ def train(args):
     booster_policy = None
     with init_ctx:
         actor = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
+        # Disable dropout
+        disable_dropout(actor)
         ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
         reward_model = RewardModel(args.rm_pretrain)
         critic = Critic(args.rm_pretrain)
+        disable_dropout(critic)
 
         if args.tp > 1:
             if reward_model.model.config.architectures[0] != critic.model.config.architectures[0]:
@@ -73,8 +75,6 @@ def train(args):
             else:
                 raise ValueError("Unknown model architecture for policy")
 
-        # TODO: set dropout to 0 here
-        # for llama2, dropout is 0 by default, hence skip.
         if args.lora_rank > 0:
             actor = convert_to_lora_module(actor, args.lora_rank, lora_train_bias=args.lora_train_bias)
             critic = convert_to_lora_module(critic, args.lora_rank, lora_train_bias=args.lora_train_bias)
@@ -94,9 +94,15 @@ def train(args):
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    _ = setup_conversation_template(tokenizer)
     tokenizer.padding_side = "left"  # left padding for generation (online learning)
     tokenizer.pad_token = tokenizer.eos_token
 
+    # configure generation config
+    actor.generation_config.update(
+        pad_token_id=tokenizer.eos_token_id, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id
+    )
+
     # configure optimizer
     coordinator.print_on_master(f"setting up optimizer for actor: lr={args.lr}, weight_decay={args.weight_decay}")
     actor_optim = HybridAdam(
@@ -373,12 +379,12 @@ def train(args):
     coordinator.print_on_master("Start saving final actor model checkpoint")
     actor_booster.save_model(actor, os.path.join(trainer.actor_save_dir, "modeling"), shard=True)
     coordinator.print_on_master(
-        f"Saved final actor model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
+        f"Saved final actor model checkpoint at episodes {args.num_episodes} at folder {args.save_path}"
     )
     coordinator.print_on_master("Start saving final critic model checkpoint")
     critic_booster.save_model(critic, os.path.join(trainer.critic_save_dir, "modeling"), shard=True)
     coordinator.print_on_master(
-        f"Saved final critic model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
+        f"Saved final critic model checkpoint at episodes {args.num_episodes} at folder {args.save_path}"
     )
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
@@ -412,6 +418,7 @@ def train(args):
     parser.add_argument("--train_batch_size", type=int, default=16)
     parser.add_argument("--experience_batch_size", type=int, default=16)
     parser.add_argument("--ptx_batch_size", type=int, default=1)
+    parser.add_argument("--lora_train_bias", type=str, default="none")
     parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--accumulation_steps", type=int, default=8)
     parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
diff --git a/applications/Chat/examples/training_scripts/train_ppo.sh b/applications/Chat/examples/training_scripts/train_ppo.sh
index c50107c9791a..ea1af5d2a35c 100755
--- a/applications/Chat/examples/training_scripts/train_ppo.sh
+++ b/applications/Chat/examples/training_scripts/train_ppo.sh
@@ -25,7 +25,7 @@ export NCCL_IB_RETRY_CNT=7
 export OMP_NUM_THREADS=8
 
 
-PROJECT_NAME="llama2-dpo"
+PROJECT_NAME="llama2-ppo"
 PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/ckpt"
 PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/tensorboard"
 PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/train_config"
@@ -35,15 +35,15 @@ PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  # "/mnt/
 declare -a prompt_dataset=(
     # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow/part-00000
     /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00000
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00001
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00002
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00003
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00004
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00005
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00006
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00007
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00008
-    # /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00009
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00001
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00002
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00003
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00004
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00005
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00006
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00007
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00008
+    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00009
 )
 
 declare -a ptx_dataset=(
diff --git a/applications/Chat/examples/training_scripts/train_reward_model.py b/applications/Chat/examples/training_scripts/train_rm.py
old mode 100644
new mode 100755
similarity index 98%
rename from applications/Chat/examples/training_scripts/train_reward_model.py
rename to applications/Chat/examples/training_scripts/train_rm.py
index 4cf94b98cf7f..d03de14e8aab
--- a/applications/Chat/examples/training_scripts/train_reward_model.py
+++ b/applications/Chat/examples/training_scripts/train_rm.py
@@ -10,12 +10,13 @@
     DataCollatorForPreferenceDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
+    setup_conversation_template,
     setup_distributed_dataloader,
 )
 from coati.models import LogExpLoss, LogSigLoss, RewardModel, convert_to_lora_module
 from coati.trainer import RewardModelTrainer
 from coati.utils import load_checkpoint, replace_with_flash_attention
-from transformers import LlamaTokenizer
+from transformers import AutoTokenizer
 
 import colossalai
 from colossalai.booster import Booster
@@ -67,8 +68,6 @@ def train(args):
             else:
                 raise ValueError("Unknown model architecture for policy")
 
-        # TODO: set dropout to 0 here
-        # for llama2, dropout is 0 by default, hence skip.
         if args.lora_rank > 0:
             model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
 
@@ -128,7 +127,8 @@ def train(args):
 
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
-    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    _ = setup_conversation_template(tokenizer)
     tokenizer.padding_side = "right"
     tokenizer.pad_token = tokenizer.eos_token
 
diff --git a/applications/Chat/examples/training_scripts/train_rm.sh b/applications/Chat/examples/training_scripts/train_rm.sh
index 18ef2511ef40..6d797a5c17ec 100755
--- a/applications/Chat/examples/training_scripts/train_rm.sh
+++ b/applications/Chat/examples/training_scripts/train_rm.sh
@@ -50,10 +50,9 @@ FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
 CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
-colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_reward_model.py \
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_rm.py \
     --pretrain $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --checkpoint_path /home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling \
     --dataset ${dataset[@]} \
     --plugin "zero2" \
     --save_interval 3000 \
@@ -67,6 +66,5 @@ colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_
     --grad_clip 1.0 \
     --weight_decay 0.01 \
     --warmup_steps 100 \
-    --use_flash_attn \
     # --use_wandb \
     # --grad_checkpoint \
diff --git a/applications/Chat/examples/training_scripts/train_sft.py b/applications/Chat/examples/training_scripts/train_sft.py
old mode 100644
new mode 100755
index 49982ca17e19..a6700e004bc8
--- a/applications/Chat/examples/training_scripts/train_sft.py
+++ b/applications/Chat/examples/training_scripts/train_sft.py
@@ -6,7 +6,12 @@
 from contextlib import nullcontext
 
 import torch
-from coati.dataset import DataCollatorForSupervisedDataset, load_tokenized_dataset, setup_distributed_dataloader
+from coati.dataset import (
+    DataCollatorForSupervisedDataset,
+    load_tokenized_dataset,
+    setup_conversation_template,
+    setup_distributed_dataloader,
+)
 from coati.models import convert_to_lora_module, load_checkpoint
 from coati.trainer import SFTTrainer
 from coati.utils import replace_with_flash_attention
@@ -17,7 +22,6 @@
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
-from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
@@ -68,7 +72,7 @@ def train(args):
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
             pp_size=1,
-            zero_stage=args.zero,
+            zero_stage=0,
             max_norm=args.grad_clip,
             precision=args.mixed_precision,
         )
@@ -102,6 +106,7 @@ def train(args):
     # configure tokenizer
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir or args.pretrain)
     tokenizer.pad_token = tokenizer.eos_token
+    _ = setup_conversation_template(tokenizer)
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
 
@@ -129,6 +134,7 @@ def train(args):
         shuffle=True,
         drop_last=True,
         collate_fn=data_collator,
+        use_tp=args.tp > 1,
     )
     coordinator.print_on_master(
         f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
@@ -211,7 +217,6 @@ def train(args):
         coordinator=coordinator,
     )
 
-    get_dist_logger()
     trainer.fit(
         train_dataloader=train_dataloader,
         eval_dataloader=None,
diff --git a/applications/Chat/examples/training_scripts/train_sft.sh b/applications/Chat/examples/training_scripts/train_sft.sh
index 0b81651e21b3..f24b0d3353a2 100755
--- a/applications/Chat/examples/training_scripts/train_sft.sh
+++ b/applications/Chat/examples/training_scripts/train_sft.sh
@@ -12,26 +12,25 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
-# set_n_least_used_CUDA_VISIBLE_DEVICES 4
-export CUDA_VISIBLE_DEVICES=2,3,4,5
-
+# set_n_least_used_CUDA_VISIBLE_DEVICES 3
+export CUDA_VISIBLE_DEVICES=1,2,3
 PROJECT_NAME="llama2-sft"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft/output/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/sft/output/train_config"
+PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft_lora/output/ckpt"
+PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft_lora/output/tensorboard"
+PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/sft_lora/output/train_config"
 PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
 PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
 declare -a dataset=(
     /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00000
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00001
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00002
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00003
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00004
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00005
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00006
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00007
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00008
-    # /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00009
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00001
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00002
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00003
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00004
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00005
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00006
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00007
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00008
+    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
@@ -40,13 +39,14 @@ SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
 CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
 # the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
-colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+colossalai run --nproc_per_node 3 --master_port 28534 --hostfile ./hostfile train_sft.py \
     --pretrain $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
     --save_interval 5000 \
     --save_path $SAVE_DIR \
     --config_file $CONFIG_FILE \
+    --lora_rank 32 \
     --plugin zero2 \
     --batch_size 2 \
     --max_epochs 1 \
diff --git a/applications/Chat/pytest.ini b/applications/Chat/pytest.ini
old mode 100644
new mode 100755
diff --git a/applications/Chat/requirements-test.txt b/applications/Chat/requirements-test.txt
old mode 100644
new mode 100755
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
old mode 100644
new mode 100755
diff --git a/applications/Chat/setup.py b/applications/Chat/setup.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/tests/__init__.py b/applications/Chat/tests/__init__.py
old mode 100644
new mode 100755
diff --git a/applications/Chat/examples/download_model.py b/applications/Chat/tests/download_model.py
old mode 100644
new mode 100755
similarity index 80%
rename from applications/Chat/examples/download_model.py
rename to applications/Chat/tests/download_model.py
index fa7c45665feb..e53160a20daa
--- a/applications/Chat/examples/download_model.py
+++ b/applications/Chat/tests/download_model.py
@@ -5,9 +5,7 @@
 from typing import List
 
 import tqdm
-from coati.models import Critic, RewardModel
 from huggingface_hub import hf_hub_download, snapshot_download
-from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 @dataclasses.dataclass
@@ -23,16 +21,6 @@ def download_all(self):
         snapshot_download(self.repo_id)
 
 
-def test_init(dir_path: str):
-    actor = AutoModelForCausalLM.from_pretrained(dir_path)
-    del actor
-    critic = Critic(dir_path)
-    del critic
-    reward_model = RewardModel(dir_path)
-    del reward_model
-    AutoTokenizer.from_pretrained(dir_path)
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model-dir", type=str, default="test_models")
@@ -61,4 +49,3 @@ def test_init(dir_path: str):
             repo_list[model_name].download(dir_path)
         else:
             repo_list[model_name].download_all()
-        test_init(dir_path)
diff --git a/applications/Chat/tests/test_benchmarks.sh b/applications/Chat/tests/test_benchmarks.sh
deleted file mode 100755
index 3fdb25181342..000000000000
--- a/applications/Chat/tests/test_benchmarks.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-set -xue
-
-echo "Hint: You can run this script with 'verbose' as the first argument to run all strategies."
-
-if [[ $# -ne 0 && "$1" == "verbose" ]]; then
-    STRATEGIES=(
-        'ddp'
-        'colossalai_gemini'
-        'colossalai_gemini_cpu'
-        'colossalai_zero2'
-        'colossalai_zero2_cpu'
-        'colossalai_zero1'
-        'colossalai_zero1_cpu'
-    )
-else
-    STRATEGIES=(
-        'colossalai_zero2'
-    )
-fi
-
-BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
-BENCHMARKS_DIR=$BASE_DIR/benchmarks
-
-echo "[Test]: testing benchmarks ..."
-
-for strategy in ${STRATEGIES[@]}; do
-    torchrun --standalone --nproc_per_node 1 $BENCHMARKS_DIR/benchmark_opt_lora_dummy.py \
-        --model 125m --critic_model 125m --strategy ${strategy} --lora_rank 4 \
-        --num_episodes 2 --num_collect_steps 4 --num_update_steps 2 \
-        --train_batch_size 2 --experience_batch_size 4
-done
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
deleted file mode 100644
index 9c08aa36c9b4..000000000000
--- a/applications/Chat/tests/test_checkpoint.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import os
-import tempfile
-from contextlib import nullcontext
-
-import pytest
-import torch
-import torch.distributed as dist
-from coati.models.gpt import GPTActor
-from coati.models.utils import calc_action_log_probs
-from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
-from transformers.models.gpt2.configuration_gpt2 import GPT2Config
-
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-
-GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)
-
-
-def get_data(batch_size: int, seq_len: int = 10) -> dict:
-    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device="cuda")
-    attention_mask = torch.ones_like(input_ids)
-    return dict(input_ids=input_ids, attention_mask=attention_mask)
-
-
-def train_step(strategy: Strategy, actor: GPTActor, actor_optim: HybridAdam, batch_size: int = 8):
-    data = get_data(batch_size)
-    action_mask = torch.ones_like(data["attention_mask"], dtype=torch.bool)
-    actor_logits = actor(data["input_ids"], data["attention_mask"])["logits"]
-    action_log_probs = calc_action_log_probs(actor_logits, data["input_ids"], action_mask.size(1))
-    loss = action_log_probs.sum()
-    strategy.backward(loss, actor, actor_optim)
-    strategy.optimizer_step(actor_optim)
-
-
-def run_test_checkpoint(strategy_name: str, shard: bool):
-    if strategy_name == "ddp":
-        strategy = DDPStrategy()
-    elif strategy_name == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
-    elif strategy_name == "colossalai_zero2":
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
-    else:
-        raise ValueError(f"Unsupported strategy '{strategy_name}'")
-
-    with strategy.model_init_context():
-        actor = GPTActor(config=GPT_CONFIG).cuda()
-    actor_optim = HybridAdam(actor.parameters())
-    actor, actor_optim = strategy.prepare((actor, actor_optim))
-
-    train_step(strategy, actor, actor_optim)
-
-    ctx = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else nullcontext()
-
-    with ctx as dirname:
-        rank0_dirname = [dirname]
-        dist.broadcast_object_list(rank0_dirname)
-        rank0_dirname = rank0_dirname[0]
-
-        model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt")
-        strategy.save_model(actor, model_path)
-        optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt")
-        strategy.save_optimizer(actor_optim, optim_path)
-        dist.barrier()
-
-        strategy.load_model(actor, model_path, strict=False)
-        strategy.load_optimizer(actor_optim, optim_path)
-        dist.barrier()
-
-    train_step(strategy, actor, actor_optim)
-
-
-def run_dist(rank: int, world_size: int, port: int, strategy_name: str, shard: bool):
-    os.environ["RANK"] = str(rank)
-    os.environ["LOCAL_RANK"] = str(rank)
-    os.environ["WORLD_SIZE"] = str(world_size)
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(port)
-    run_test_checkpoint(strategy_name, shard)
-
-
-@pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4])
-@pytest.mark.parametrize("strategy_name", ["ddp", "colossalai_gemini", "colossalai_zero2"])
-@pytest.mark.parametrize("shard", [False, True])
-@rerun_if_address_is_in_use()
-def test_checkpoint(world_size: int, strategy_name: str, shard: bool):
-    spawn(run_dist, world_size, strategy_name=strategy_name, shard=shard)
-
-
-if __name__ == "__main__":
-    test_checkpoint(2, "colossalai_gemini", shard=False)
diff --git a/applications/Chat/tests/test_data_preparation.sh b/applications/Chat/tests/test_data_preparation.sh
new file mode 100755
index 000000000000..27c7a58efcaa
--- /dev/null
+++ b/applications/Chat/tests/test_data_preparation.sh
@@ -0,0 +1,286 @@
+#!/usr/bin/env bash
+'''
+Usage
+
+SFT_DATASET=/mnt/tos/lcxyc/experiments/experiments5/tokenized_sft_data/jsonl \
+PROMPT_DATASET=/home/lcyab/data/data_rlhf/preprocessed_prompt \
+PRETRAIN_DATASET=/home/lcyab/data/pretrain_data/cleaned_wiki_en/preprocessed \
+PREFERENCE_DATASET=/home/lcyab/data/data_rlhf/preprcessed \
+./test_data_preparation.sh
+'''
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
+set -xu
+
+if [ -z "$SFT_DATASET" ]; then
+    echo "Please set \$SFT_DATASET to the path to sft dataset."
+    exit 1
+fi
+
+if [ -z "$PROMPT_DATASET" ]; then
+    echo "Please set \$PROMPT_DATASET to the path to prompts."
+    exit 1
+fi
+
+if [ -z "$PRETRAIN_DATASET" ]; then
+    echo "Please set \$PRETRAIN_DATASET to the path to pretrain data"
+    exit 1
+fi
+
+if [ -z "$PREFERENCE_DATASET" ]; then
+    echo "Please set \$SFT_DATASET to the path to sft dataset."
+    exit 1
+fi
+
+NUM_RETRY=3
+BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
+BASE_TEMP_DIR=$BASE_DIR/temp
+EXAMPLES_DIR=$BASE_DIR/examples
+DATA_SAVE_PATH=$BASE_TEMP_DIR/rlhf_data
+MODELS=('gpt2' 'bloom' 'opt' 'llama')
+
+if [ ! -d "$BASE_TEMP_DIR" ]; then
+  mkdir "$BASE_TEMP_DIR"
+  echo "Directory created successfully"
+else
+  echo "Directory already exists"
+fi
+
+if [ ! -d "$DATA_SAVE_PATH" ]; then
+  mkdir "$DATA_SAVE_PATH"
+  echo "Directory created successfully"
+else
+  echo "Directory already exists"
+fi
+
+export OMP_NUM_THREADS=8
+
+# install requirements
+pip install -r $EXAMPLES_DIR/requirements.txt
+
+get_data_input_dirs() {
+    local data_type=$1
+    if [[ $data_type == "sft" ]]; then
+        echo "$SFT_DATASET"
+    elif [[ $data_type == "ptx" ]]; then
+        echo "$PRETRAIN_DATASET"
+    elif [[ $data_type == "prompt" ]]; then
+        echo "$PROMPT_DATASET"
+    elif [[ $data_type == "preference" ]]; then
+        echo "$PREFERENCE_DATASET"
+    else
+        echo "Unknown data type $data_type"
+        exit 1
+    fi
+}
+
+get_tokenizer_dirs() {
+    local model=$1
+    if [[ $model == "gpt2" ]]; then
+        echo "gpt2"
+    elif [[ $model == "bloom" ]]; then
+        echo "bigscience/bloom-560m"
+    elif [[ $model == "opt" ]]; then
+        echo "facebook/opt-350m"
+    elif [[ $model == "llama" ]]; then
+        echo "hf-internal-testing/llama-tokenizer"
+    else
+        echo "Unknown model $model"
+        exit 1
+    fi
+}
+
+random_choice() {
+    local arr=("$@")
+    local len=${#arr[@]}
+    local idx=$((RANDOM % len))
+    echo ${arr[$idx]}
+}
+
+
+echo "[Test]: testing prepare_preference_dataset.py ..."
+
+# FIXME: This is a hack to skip tests that are not working
+SKIPPED_TESTS=(
+)
+
+# test prepare_preference_dataset
+for model in ${MODELS[@]}; do
+    data_type="preference"
+    if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$data_type " ]]; then
+        echo "[Test]: Skipped $model-$data_type"
+        continue
+    fi
+    cache_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/cache
+    jsonl_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/jsonl
+    arrow_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/arrow
+    rm -rf $cache_dir
+    rm -rf $jsonl_dir
+    rm -rf $arrow_dir
+    data_input_dirs=$(get_data_input_dirs $data_type)
+    tokenizer_dir=$(get_tokenizer_dirs $model)
+    for i in $(seq $NUM_RETRY); do
+        echo "[Test]: $model-$data_type, attempt $i"
+        python $EXAMPLES_DIR/data_preparation_scripts/prepare_preference_dataset.py \
+            --data_input_dirs $data_input_dirs \
+            --tokenizer_dir $tokenizer_dir \
+            --data_cache_dir $cache_dir \
+            --data_jsonl_output_dir $jsonl_dir \
+            --data_arrow_output_dir $arrow_dir \
+            --max_length 400 \
+            --num_samples_per_datafile 100 \
+            --num_spliced_dataset_bins 1
+        passed=$?
+        if [ $passed -eq 0 ]; then
+            break
+        fi
+    done
+    if [ $passed -ne 0 ]; then
+        echo "[Test]: Failed $model-$data_type"
+        exit 1
+    fi
+done
+
+echo "[Test]: testing prepare_sft_dataset.py ..."
+
+# FIXME: This is a hack to skip tests that are not working
+SKIPPED_TESTS=(
+)
+
+# test prepare_sft_dataset
+for model in ${MODELS[@]}; do
+    data_type="sft"
+    if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$data_type " ]]; then
+        echo "[Test]: Skipped $model-$data_type"
+        continue
+    fi
+    cache_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/cache
+    jsonl_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/jsonl
+    arrow_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/arrow
+    data_input_dirs=$(get_data_input_dirs $data_type)
+    tokenizer_dir=$(get_tokenizer_dirs $model)
+    for i in $(seq $NUM_RETRY); do
+        rm -rf $cache_dir
+        rm -rf $jsonl_dir
+        rm -rf $arrow_dir
+        echo "[Test]: $model-$data_type, attempt $i"
+        python $EXAMPLES_DIR/data_preparation_scripts/prepare_sft_dataset.py \
+            --data_input_dirs $data_input_dirs \
+            --tokenizer_dir $tokenizer_dir \
+            --data_cache_dir $cache_dir \
+            --data_jsonl_output_dir $jsonl_dir \
+            --data_arrow_output_dir $arrow_dir \
+            --max_length 400 \
+            --num_samples_per_datafile 100 \
+            --num_spliced_dataset_bins 1
+        passed=$?
+        if [ $passed -eq 0 ]; then
+            break
+        fi
+    done
+    if [ $passed -ne 0 ]; then
+        echo "[Test]: Failed $model-$data_type"
+        exit 1
+    fi
+done
+
+echo "[Test]: testing prepare_prompt_dataset.py ..."
+
+# FIXME: This is a hack to skip tests that are not working
+SKIPPED_TESTS=(
+)
+
+# test prepare_prompt_dataset
+for model in ${MODELS[@]}; do
+    data_type="prompt"
+    if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$data_type " ]]; then
+        echo "[Test]: Skipped $model-$data_type"
+        continue
+    fi
+    cache_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/cache
+    jsonl_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/jsonl
+    arrow_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/arrow
+    data_input_dirs=$(get_data_input_dirs $data_type)
+    tokenizer_dir=$(get_tokenizer_dirs $model)
+    for i in $(seq $NUM_RETRY); do
+        rm -rf $cache_dir
+        rm -rf $jsonl_dir
+        rm -rf $arrow_dir
+        echo "[Test]: $model-$data_type, attempt $i"
+        python $EXAMPLES_DIR/data_preparation_scripts/prepare_prompt_dataset.py \
+            --data_input_dirs $data_input_dirs \
+            --tokenizer_dir $tokenizer_dir \
+            --data_cache_dir $cache_dir \
+            --data_jsonl_output_dir $jsonl_dir \
+            --data_arrow_output_dir $arrow_dir \
+            --max_length 400 \
+            --num_samples_per_datafile 100 \
+            --num_spliced_dataset_bins 1
+        passed=$?
+        if [ $passed -eq 0 ]; then
+            break
+        fi
+    done
+    if [ $passed -ne 0 ]; then
+        echo "[Test]: Failed $model-$data_type"
+        exit 1
+    fi
+done
+
+echo "[Test]: testing prepare_ptx_dataset.py ..."
+
+# FIXME: This is a hack to skip tests that are not working
+SKIPPED_TESTS=(
+)
+
+# test prepare_ptx_dataset
+for model in ${MODELS[@]}; do
+    data_type="ptx"
+    if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$data_type " ]]; then
+        echo "[Test]: Skipped $model-$data_type"
+        continue
+    fi
+    cache_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/cache
+    jsonl_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/jsonl
+    arrow_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/arrow
+    data_input_dirs=$(get_data_input_dirs $data_type)
+    tokenizer_dir=$(get_tokenizer_dirs $model)
+    for i in $(seq $NUM_RETRY); do
+        rm -rf $cache_dir
+        rm -rf $jsonl_dir
+        rm -rf $arrow_dir
+        echo "[Test]: $model-$data_type, attempt $i"
+        python $EXAMPLES_DIR/data_preparation_scripts/prepare_ptx_dataset.py \
+            --data_input_dirs $data_input_dirs \
+            --tokenizer_dir $tokenizer_dir \
+            --data_cache_dir $cache_dir \
+            --data_jsonl_output_dir $jsonl_dir \
+            --data_arrow_output_dir $arrow_dir \
+            --max_length 400 \
+            --num_samples_per_datafile 100 \
+            --num_spliced_dataset_bins 1
+        passed=$?
+        if [ $passed -eq 0 ]; then
+            break
+        fi
+    done
+    if [ $passed -ne 0 ]; then
+        echo "[Test]: Failed $model-$data_type"
+        exit 1
+    fi
+done
diff --git a/applications/Chat/tests/test_dataset.py b/applications/Chat/tests/test_dataset.py
deleted file mode 100644
index 70a171955b7a..000000000000
--- a/applications/Chat/tests/test_dataset.py
+++ /dev/null
@@ -1,252 +0,0 @@
-import json
-import os
-import tempfile
-from typing import Optional
-
-import pytest
-import torch
-from coati.dataset.prompt_dataset import PromptDataset
-from coati.dataset.reward_dataset import PreferenceDataset
-from coati.dataset.sft_dataset import IGNORE_INDEX, SFTDataset, SupervisedDataset
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
-from datasets import load_dataset
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, PreTrainedTokenizer
-from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-
-SFT_DATASET = [
-    {
-        "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
-        "input": "",
-        "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
-        "id": 0,
-    },
-    {
-        "instruction": "Please provide an action plan for reducing carbon footprint on a corporate level",
-        "input": "",
-        "output": "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
-        "id": 1,
-    },
-    {
-        "instruction": "Write a persuasive email to your boss explaining why you should have a pay raise",
-        "input": "",
-        "output": "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
-        "id": 2,
-    },
-]
-
-PROMPT_DATASET = [
-    {
-        "instruction": 'Edit this paragraph to make it more concise: "Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends."',
-        "id": 0,
-    },
-    {"instruction": "Write a descriptive paragraph about a memorable vacation you went on", "id": 1},
-    {"instruction": "Write a persuasive essay arguing why homework should be banned in schools", "id": 2},
-    {"instruction": "Create a chart comparing the statistics on student debt in the United States.", "id": 3},
-]
-
-
-def make_tokenizer(model: str):
-    if model == "gpt2":
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        tokenizer.pad_token = tokenizer.eos_token
-    elif model == "bloom":
-        tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
-        tokenizer.pad_token = tokenizer.eos_token
-    elif model == "opt":
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-        tokenizer.pad_token = tokenizer.eos_token
-    elif model == "llama":
-        tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        tokenizer.pad_token = tokenizer.unk_token
-    elif model == "chatglm":
-        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-    else:
-        raise ValueError(f"Unsupported model '{model}'")
-    return tokenizer
-
-
-def check_content(input_ids_stripped: torch.Tensor, tokenizer: PreTrainedTokenizer, model: str):
-    if model == "opt":
-        # NOTE:  Contrary to GPT2, OPT adds the EOS token </s> to the beginning of every prompt.
-        assert input_ids_stripped[0] == tokenizer.eos_token_id
-        input_ids_stripped = input_ids_stripped[1:]
-    elif model == "llama":
-        assert input_ids_stripped[0] == tokenizer.bos_token_id
-        input_ids_stripped = input_ids_stripped[1:]
-    elif model == "chatglm":
-        assert input_ids_stripped[0] == tokenizer.bos_token_id
-        assert input_ids_stripped[-1] == tokenizer.eos_token_id
-        input_ids_stripped = input_ids_stripped[1:-1]
-    assert torch.all(input_ids_stripped != tokenizer.pad_token_id)
-    assert torch.all(input_ids_stripped != tokenizer.bos_token_id)
-    assert torch.all(input_ids_stripped != tokenizer.eos_token_id)
-    assert input_ids_stripped != tokenizer.sep_token_id
-    assert input_ids_stripped != tokenizer.cls_token_id
-    if model == "chatglm":
-        assert torch.all(input_ids_stripped != tokenizer.mask_token_id)
-    else:
-        assert input_ids_stripped != tokenizer.mask_token_id
-
-
-@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
-@pytest.mark.parametrize("max_length", [32, 1024])
-@pytest.mark.parametrize("max_datasets_size", [2])
-def test_prompt_dataset(model: str, max_datasets_size: int, max_length: int):
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        dataset_name = "prompt_dataset.json"
-        with open(os.path.join(tmp_dir, dataset_name), "w") as f:
-            json.dump(PROMPT_DATASET, f)
-        tokenizer = make_tokenizer(model)
-        assert tokenizer.padding_side in ("left", "right")
-        prompt_dataset = PromptDataset(
-            data_path=os.path.join(tmp_dir, dataset_name),
-            tokenizer=tokenizer,
-            max_datasets_size=max_datasets_size,
-            max_length=max_length,
-        )
-        assert len(prompt_dataset) == min(max_datasets_size, len(PROMPT_DATASET))
-        for i in range(len(prompt_dataset)):
-            assert isinstance(prompt_dataset[i], dict)
-            assert list(prompt_dataset[i].keys()) == ["input_ids", "attention_mask"]
-            input_ids = prompt_dataset[i]["input_ids"]
-            attention_mask = prompt_dataset[i]["attention_mask"]
-            attention_mask = attention_mask.bool()
-            assert input_ids.shape == attention_mask.shape == torch.Size([max_length])
-            assert torch.all(input_ids[torch.logical_not(attention_mask)] == tokenizer.pad_token_id)
-            check_content(input_ids.masked_select(attention_mask), tokenizer, model)
-
-
-@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
-@pytest.mark.parametrize(
-    ["dataset_path", "subset"], [("Anthropic/hh-rlhf", "harmless-base"), ("Dahoas/rm-static", None)]
-)
-@pytest.mark.parametrize("max_datasets_size", [32])
-@pytest.mark.parametrize("max_length", [32, 1024])
-def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], max_datasets_size: int, max_length: int):
-    data = load_dataset(dataset_path, data_dir=subset)
-    assert max_datasets_size <= len(data["train"]) and max_datasets_size <= len(data["test"])
-    train_data = data["train"].select(range(max_datasets_size))
-    test_data = data["test"].select(range(max_datasets_size))
-    tokenizer = make_tokenizer(model)
-    assert tokenizer.padding_side in ("left", "right")
-
-    if dataset_path == "Anthropic/hh-rlhf":
-        train_dataset = PreferenceDataset(train_data, tokenizer, max_length)
-        test_dataset = PreferenceDataset(test_data, tokenizer, max_length)
-    elif dataset_path == "Dahoas/rm-static":
-        train_dataset = PreferenceDataset(
-            train_data,
-            tokenizer,
-            max_length,
-            dataset_schema={"prompt": "prompt", "chosen": "chosen", "rejected": "rejected"},
-        )
-        test_dataset = PreferenceDataset(
-            test_data,
-            tokenizer,
-            max_length,
-            dataset_schema={"prompt": "prompt", "chosen": "chosen", "rejected": "rejected"},
-        )
-    else:
-        raise ValueError(f'Unsupported dataset "{dataset_path}"')
-
-    assert len(train_dataset) == len(test_dataset) == max_datasets_size
-    for i in range(max_datasets_size):
-        chosen_ids, c_mask, reject_ids, r_mask = train_dataset[i]
-        assert chosen_ids.shape == c_mask.shape == reject_ids.shape == r_mask.shape == torch.Size([max_length])
-        c_mask = c_mask.to(torch.bool)
-        r_mask = r_mask.to(torch.bool)
-        if chosen_ids.masked_select(c_mask)[-1] == tokenizer.eos_token_id:
-            check_content(chosen_ids.masked_select(c_mask)[:-1], tokenizer, model)
-            assert torch.all(chosen_ids.masked_select(torch.logical_not(c_mask)) == tokenizer.pad_token_id)
-        else:
-            check_content(chosen_ids.masked_select(c_mask), tokenizer, model)
-            assert torch.all(c_mask)
-        if reject_ids.masked_select(r_mask)[-1] == tokenizer.eos_token_id:
-            check_content(reject_ids.masked_select(r_mask)[:-1], tokenizer, model)
-            assert torch.all(reject_ids.masked_select(torch.logical_not(r_mask)) == tokenizer.pad_token_id)
-        else:
-            check_content(reject_ids.masked_select(r_mask), tokenizer, model)
-            assert torch.all(r_mask)
-
-        chosen_ids, c_mask, reject_ids, r_mask = test_dataset[i]
-        assert chosen_ids.shape == c_mask.shape == reject_ids.shape == r_mask.shape == torch.Size([max_length])
-        c_mask = c_mask.to(torch.bool)
-        r_mask = r_mask.to(torch.bool)
-        if chosen_ids.masked_select(c_mask)[-1] == tokenizer.eos_token_id:
-            check_content(chosen_ids.masked_select(c_mask)[:-1], tokenizer, model)
-            assert torch.all(chosen_ids.masked_select(torch.logical_not(c_mask)) == tokenizer.pad_token_id)
-        else:
-            check_content(chosen_ids.masked_select(c_mask), tokenizer, model)
-            assert torch.all(c_mask)
-        if reject_ids.masked_select(r_mask)[-1] == tokenizer.eos_token_id:
-            check_content(reject_ids.masked_select(r_mask)[:-1], tokenizer, model)
-            assert torch.all(reject_ids.masked_select(torch.logical_not(r_mask)) == tokenizer.pad_token_id)
-        else:
-            check_content(reject_ids.masked_select(r_mask), tokenizer, model)
-            assert torch.all(r_mask)
-
-
-@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])  # temperally disable test for chatglm
-@pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
-@pytest.mark.parametrize("max_dataset_size", [2])
-@pytest.mark.parametrize("max_length", [32, 1024])
-def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size: int, max_length: int):
-    tokenizer = make_tokenizer(model)
-    if dataset_path == "yizhongw/self_instruct":
-        data = load_dataset(dataset_path, "super_natural_instructions")
-        train_data = data["train"].select(range(max_dataset_size))
-        sft_dataset = SFTDataset(train_data, tokenizer, max_length)
-    else:
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            dataset_name = "sft_dataset.json"
-            with open(os.path.join(tmp_dir, dataset_name), "w") as f:
-                json.dump(SFT_DATASET, f)
-            sft_dataset = SupervisedDataset(
-                tokenizer=tokenizer,
-                data_path=os.path.join(tmp_dir, dataset_name),
-                max_datasets_size=max_dataset_size,
-                max_length=max_length,
-            )
-        assert len(sft_dataset) == min(max_dataset_size, len(SFT_DATASET))
-
-    if isinstance(tokenizer, ChatGLMTokenizer):
-        for i in range(max_dataset_size):
-            assert isinstance(sft_dataset[i], dict)
-            assert list(sft_dataset[i].keys()) == ["input_ids", "labels"]
-            input_ids = sft_dataset[i]["input_ids"]
-            labels = sft_dataset[i]["labels"]
-            assert input_ids.shape == labels.shape == torch.Size([max_length])
-
-            ignore_mask = labels == IGNORE_INDEX
-            assert input_ids.masked_select(torch.logical_not(ignore_mask))[0] == tokenizer.bos_token_id
-            check_content(input_ids.masked_select(torch.logical_not(ignore_mask)), tokenizer, model)
-            return
-
-    for i in range(max_dataset_size):
-        assert isinstance(sft_dataset[i], dict)
-        assert list(sft_dataset[i].keys()) == ["input_ids", "labels", "attention_mask"]
-        input_ids = sft_dataset[i]["input_ids"]
-        labels = sft_dataset[i]["labels"]
-        attention_mask = sft_dataset[i]["attention_mask"].to(torch.bool)
-        assert input_ids.shape == labels.shape == attention_mask.shape == torch.Size([max_length])
-        if input_ids.masked_select(attention_mask)[-1] == tokenizer.eos_token_id:
-            check_content(input_ids.masked_select(attention_mask)[:-1], tokenizer, model)
-            assert torch.all(input_ids.masked_select(torch.logical_not(attention_mask)) == tokenizer.pad_token_id)
-        else:
-            check_content(input_ids.masked_select(attention_mask), tokenizer, model)
-            assert torch.all(attention_mask)
-        ignore_mask = labels == IGNORE_INDEX
-        prompt_mask = torch.logical_and(ignore_mask, attention_mask)
-        check_content(input_ids.masked_select(prompt_mask), tokenizer, model)
-        assert torch.all(input_ids.masked_select(ignore_mask ^ prompt_mask) == tokenizer.pad_token_id)
-
-
-if __name__ == "__main__":
-    test_sft_dataset(model="bloom", dataset_path="yizhongw/self_instruct", max_dataset_size=2, max_length=256)
-    test_sft_dataset(model="bloom", dataset_path="custom", max_dataset_size=2, max_length=256)
-
-    test_reward_dataset(
-        model="gpt2", dataset_path="Anthropic/hh-rlhf", subset="harmless-base", max_datasets_size=8, max_length=256
-    )
-
-    test_prompt_dataset(model="opt", max_datasets_size=2, max_length=128)
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
old mode 100644
new mode 100755
index 2a736a65c07f..64fef2564f0a
--- a/applications/Chat/tests/test_experience.py
+++ b/applications/Chat/tests/test_experience.py
@@ -1,139 +1,185 @@
-import copy
-import os
-
-import pytest
-import torch
-import torch.distributed as dist
-from coati.experience_buffer import NaiveExperienceBuffer
-from coati.experience_maker import NaiveExperienceMaker
-from coati.models.base import RewardModel
-from coati.models.gpt import GPTActor, GPTCritic
-from coati.trainer.ppo import _set_default_generate_kwargs
-from coati.trainer.strategies import DDPStrategy, GeminiStrategy
-from coati.trainer.strategies.colossalai import LowLevelZeroStrategy
-from transformers.models.gpt2.configuration_gpt2 import GPT2Config
-
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-
-GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)
-
-
-def get_data(batch_size: int, seq_len: int = 10) -> dict:
-    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device="cuda")
-    attention_mask = torch.ones_like(input_ids)
-    return dict(input_ids=input_ids, attention_mask=attention_mask)
-
-
-def gather_and_equal(tensor: torch.Tensor) -> bool:
-    world_size = dist.get_world_size()
-    outputs = [torch.empty_like(tensor) for _ in range(world_size)]
-    dist.all_gather(outputs, tensor.contiguous())
-    for t in outputs[1:]:
-        if not torch.equal(outputs[0], t):
-            return False
-    return True
-
-
-def make_and_consume_experience(strategy):
-    EXPERIENCE_BATCH_SIZE = 4
-    SAMPLE_BATCH_SIZE = 2
-
-    if strategy == "ddp":
-        strategy = DDPStrategy()
-    elif strategy == "colossalai-zero2":
-        strategy = LowLevelZeroStrategy()
-    elif strategy == "colossalai-gemini":
-        strategy = GeminiStrategy(placement_policy="static")
-    else:
-        raise ValueError(f'Unsupported strategy "{strategy}"')
-
-    with strategy.model_init_context():
-        actor = GPTActor(config=GPT_CONFIG).cuda()
-        critic = GPTCritic(config=GPT_CONFIG).cuda()
-
-        initial_model = GPTActor(config=GPT_CONFIG).cuda()
-        reward_model = RewardModel(model=copy.deepcopy(critic.model)).cuda()
-
-    actor, critic, initial_model, reward_model = strategy.prepare(actor, critic, initial_model, reward_model)
-
-    class MockTokenizer:
-        def __init__(self):
-            self.padding_side = "left"
-            self.eos_token_id = 0
-            self.pad_token_id = 0
-
-        def batch_decode(self, sequences, skip_special_tokens=True):
-            return ["This is a test sentence." for i in range(len(sequences))]
-
-        def __call__(self, sequences, **kwargs):
-            return {
-                "input_ids": torch.randint(0, 50257, (len(sequences), 100), device="cpu"),
-                "attention_mask": torch.ones((len(sequences), 100), device="cpu").bool(),
-            }
-
-    tokenizer = MockTokenizer()
-    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer, tokenizer)
-    data_buffer = NaiveExperienceBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
-
-    generate_kwargs = dict(do_sample=True, max_length=16)
-    generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
-
-    # experience of all ranks should be the same
-    for _ in range(2):
-        data = get_data(EXPERIENCE_BATCH_SIZE)
-        assert gather_and_equal(data["input_ids"])
-        assert gather_and_equal(data["attention_mask"])
-        experience = experience_maker.make_experience(**data, do_sample=True, max_length=16)
-        assert gather_and_equal(experience.sequences)
-        assert gather_and_equal(experience.action_log_probs)
-        assert gather_and_equal(experience.values)
-        assert gather_and_equal(experience.reward)
-        assert gather_and_equal(experience.advantages)
-        assert gather_and_equal(experience.action_mask)
-        assert gather_and_equal(experience.attention_mask)
-        data_buffer.append(experience)
-
-    # data buffer's data should be the same
-    buffer_size = torch.tensor([len(data_buffer)], device="cuda")
-    assert gather_and_equal(buffer_size)
-    for item in data_buffer.items:
-        assert gather_and_equal(item.sequences)
-        assert gather_and_equal(item.action_log_probs)
-        assert gather_and_equal(item.values)
-        assert gather_and_equal(item.reward)
-        assert gather_and_equal(item.advantages)
-        assert gather_and_equal(item.action_mask)
-        assert gather_and_equal(item.attention_mask)
-
-    # dataloader of each rank should have the same size and different batch
-    dataloader = strategy.setup_dataloader(data_buffer)
-    dataloader_size = torch.tensor([len(dataloader)], device="cuda")
-    assert gather_and_equal(dataloader_size)
-    for experience in dataloader:
-        assert not gather_and_equal(experience.sequences)
-        assert not gather_and_equal(experience.action_log_probs)
-        assert not gather_and_equal(experience.values)
-        assert not gather_and_equal(experience.reward)
-        assert not gather_and_equal(experience.advantages)
-        # action mask and attention mask may be same
-
-
-def run_dist(rank, world_size, port, strategy):
-    os.environ["RANK"] = str(rank)
-    os.environ["LOCAL_RANK"] = str(rank)
-    os.environ["WORLD_SIZE"] = str(world_size)
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(port)
-    make_and_consume_experience(strategy)
-
-
-@pytest.mark.dist
-@pytest.mark.parametrize("world_size", [2])
-@pytest.mark.parametrize("strategy", ["ddp", "colossalai-zero2", "colossalai-gemini"])
-@rerun_if_address_is_in_use()
-def test_experience(world_size, strategy):
-    spawn(run_dist, world_size, strategy=strategy)
-
-
-if __name__ == "__main__":
-    test_experience(2, "colossalai-zero2")
+# import copy
+# import os
+
+# import pytest
+# import torch
+# import torch.distributed as dist
+# from contextlib import nullcontext
+# from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+# from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# from coati.experience_buffer import NaiveExperienceBuffer
+# from coati.experience_maker import NaiveExperienceMaker
+# import colossalai
+# from coati.models import RewardModel, Critic
+# from colossalai.booster import Booster
+# from colossalai.lazy import LazyInitContext
+# from colossalai.utils import get_current_device
+# from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+# from colossalai.testing import rerun_if_address_is_in_use, spawn
+
+
+# def get_data(batch_size: int, seq_len: int = 10) -> dict:
+#     input_ids = torch.randint(0, 50257, (batch_size, seq_len), device="cuda")
+#     attention_mask = torch.ones_like(input_ids)
+#     return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+# def gather_and_equal(tensor: torch.Tensor) -> bool:
+#     world_size = dist.get_world_size()
+#     outputs = [torch.empty_like(tensor) for _ in range(world_size)]
+#     dist.all_gather(outputs, tensor.contiguous())
+#     for t in outputs[1:]:
+#         if not torch.equal(outputs[0], t):
+#             return False
+#     return True
+
+
+# def make_and_consume_experience(plugin_type, mixed_precision, tp):
+
+#     colossalai.launch_from_torch({})
+
+#     EXPERIENCE_BATCH_SIZE = 4
+#     SAMPLE_BATCH_SIZE = 2
+#     custom_plugin = None
+
+#     if plugin_type == "gemini":
+#         plugin = GeminiPlugin(
+#             precision=mixed_precision,
+#             initial_scale=2**16,
+#             max_norm=1.0,
+#         )
+#     elif plugin_type == "gemini_auto":
+#         plugin = GeminiPlugin(
+#             precision=mixed_precision,
+#             placement_policy="auto",
+#             initial_scale=2**16,
+#             max_norm=1.0,
+#         )
+#     elif plugin_type == "zero2":
+#         plugin = LowLevelZeroPlugin(
+#             stage=2,
+#             precision=mixed_precision,
+#             initial_scale=2**16,
+#             max_norm=1.0,
+#         )
+#     elif plugin_type == "zero2_cpu":
+#         plugin = LowLevelZeroPlugin(
+#             stage=2,
+#             precision=mixed_precision,
+#             initial_scale=2**16,
+#             cpu_offload=True,
+#             max_norm=1.0,
+#         )
+#     elif plugin_type == "3d":
+#         plugin = HybridParallelPlugin(
+#             tp_size=tp,
+#             pp_size=1,
+#             zero_stage=0,
+#             precision=mixed_precision,
+#         )
+#         from colossalai.shardformer.policies.gpt2 import GPT2Policy
+#         custom_plugin = HybridParallelPlugin(
+#             tp_size=tp,
+#             pp_size=1,
+#             zero_stage=0,
+#             precision=mixed_precision,
+#             custom_policy=GPT2Policy(),
+#         )
+#     else:
+#         raise ValueError(f'Unsupported plugin "{plugin}"')
+
+#     # init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in plugin_type else nullcontext()
+#     # with init_ctx:
+#     actor = AutoModelForCausalLM.from_pretrained("gpt2").cuda()
+#     critic = Critic("gpt2").cuda()
+
+#     ref_model = AutoModelForCausalLM.from_pretrained("gpt2").cuda()
+#     reward_model = RewardModel("gpt2").cuda()
+
+#     actor_booster = Booster(plugin=plugin)
+#     ref_booster = Booster(plugin=plugin)
+#     rm_booster = Booster(plugin=custom_plugin)
+#     critic_booster = Booster(plugin=custom_plugin)
+
+#     default_dtype = torch.float16 if mixed_precision == "fp16" else torch.bfloat16
+#     torch.set_default_dtype(default_dtype)
+#     actor, _, _, _, _ = actor_booster.boost(
+#         model=actor
+#     )
+
+#     critic, _, _, _, _ = critic_booster.boost(
+#         model=critic
+#     )
+#     reward_model, _, _, _, _ = rm_booster.boost(model=reward_model)
+#     ref_model, _, _, _, _ = ref_booster.boost(model=ref_model)
+
+#     torch.set_default_dtype(torch.float)
+
+#     tokenizer = AutoTokenizer.from_pretrained("gpt2")
+#     tokenizer.padding_side = "left"
+#     tokenizer.pad_token = tokenizer.eos_token
+#     experience_maker = NaiveExperienceMaker(actor, critic, reward_model, ref_model, tokenizer)
+#     data_buffer = NaiveExperienceBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
+
+#     # experience of all ranks should be the same
+#     for _ in range(2):
+#         data = get_data(EXPERIENCE_BATCH_SIZE)
+#         assert gather_and_equal(data["input_ids"])
+#         assert gather_and_equal(data["attention_mask"])
+#         experience = experience_maker.make_experience(**data, do_sample=True, max_length=16)
+#         assert gather_and_equal(experience.sequences)
+#         assert gather_and_equal(experience.action_log_probs)
+#         assert gather_and_equal(experience.values)
+#         assert gather_and_equal(experience.reward)
+#         assert gather_and_equal(experience.advantages)
+#         assert gather_and_equal(experience.action_mask)
+#         assert gather_and_equal(experience.attention_mask)
+#         data_buffer.append(experience)
+
+#     # data buffer's data should be the same for tp but different for other methods
+#     # buffer_size = torch.tensor([len(data_buffer)], device="cuda")
+#     # assert gather_and_equal(buffer_size)
+#     # for item in data_buffer.items:
+#     #     assert gather_and_equal(item.sequences)
+#     #     assert gather_and_equal(item.action_log_probs)
+#     #     assert gather_and_equal(item.values)
+#     #     assert gather_and_equal(item.reward)
+#     #     assert gather_and_equal(item.advantages)
+#     #     assert gather_and_equal(item.action_mask)
+#     #     assert gather_and_equal(item.attention_mask)
+
+#     # # dataloader of each rank should have the same size and different batch
+#     # dataloader = strategy.setup_dataloader(data_buffer)
+#     # dataloader_size = torch.tensor([len(dataloader)], device="cuda")
+#     # assert gather_and_equal(dataloader_size)
+#     # for experience in dataloader:
+#     #     assert not gather_and_equal(experience.sequences)
+#     #     assert not gather_and_equal(experience.action_log_probs)
+#     #     assert not gather_and_equal(experience.values)
+#     #     assert not gather_and_equal(experience.reward)
+#     #     assert not gather_and_equal(experience.advantages)
+#     #     # action mask and attention mask may be same
+
+
+# def run_dist(rank, world_size, port, plugin=None, mixed_precision=None, tp=None):
+#     os.environ["RANK"] = str(rank)
+#     os.environ["LOCAL_RANK"] = str(rank)
+#     os.environ["WORLD_SIZE"] = str(world_size)
+#     os.environ["MASTER_ADDR"] = "localhost"
+#     os.environ["MASTER_PORT"] = str(port)
+#     make_and_consume_experience(plugin, mixed_precision, tp)
+
+
+# @pytest.mark.dist
+# @pytest.mark.parametrize("world_size", [2])
+# @pytest.mark.parametrize("plugin", ["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"])
+# @pytest.mark.parametrize("mixed_precision", ["fp16", "bf16"])
+# @pytest.mark.parametrize("tp", [2]) #only for 3d plugin
+# @rerun_if_address_is_in_use()
+# def test_experience(world_size, plugin, mixed_precision, tp):
+#     spawn(run_dist, world_size, plugin=plugin, mixed_precision=mixed_precision, tp=tp)
+
+
+# if __name__ == "__main__":
+#     test_experience(2, "colossalai-zero2")
diff --git a/applications/Chat/tests/test_lora.py b/applications/Chat/tests/test_lora.py
old mode 100644
new mode 100755
index 68c91a597f5b..4ea9e1a15c59
--- a/applications/Chat/tests/test_lora.py
+++ b/applications/Chat/tests/test_lora.py
@@ -37,7 +37,6 @@ def test_overfit():
 
     # Build and convert model
     model = SimpleNN(input_size, hidden_size, num_classes)
-    print(model.fc1.weight)
     weight_to_compare = model.fc1.weight.detach().clone()
     model = convert_to_lora_module(model, lora_rank=30)
 
@@ -46,7 +45,7 @@ def test_overfit():
     optimizer = optim.Adam(model.parameters(), lr=learning_rate)
 
     # Train the model
-    for epoch in range(num_epochs):
+    for _ in range(num_epochs):
         for i, (inputs, labels) in enumerate(loader):
             # Forward pass
             outputs = model(inputs)
@@ -62,10 +61,7 @@ def test_overfit():
     _, predicted = torch.max(outputs.data, 1)
     total = labels.size(0)
     correct = (predicted == Y).sum().item()
-    print(correct / total)
     assert (correct / total > 0.95, "The model has not overfitted to the synthesized dataset")
-
-    print(model.fc1.weight)
     assert (weight_to_compare - model.fc1.weight).sum() < 0.01
 
 
diff --git a/applications/Chat/tests/test_models.py b/applications/Chat/tests/test_models.py
old mode 100644
new mode 100755
index 7624dbaadff5..f267da0331b0
--- a/applications/Chat/tests/test_models.py
+++ b/applications/Chat/tests/test_models.py
@@ -1,253 +1,253 @@
-import copy
-from typing import Any, Callable, Dict, Tuple
-
-import pytest
-import torch
-import torch.nn as nn
-from coati.models.base import Actor, Critic, RewardModel, get_base_model
-from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
-from coati.models.chatglm import ChatGLMActor
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
-from coati.models.generation import generate
-from coati.models.gpt import GPTRM, GPTActor, GPTCritic
-from coati.models.llama import LlamaActor
-from coati.models.lora import LoraLinear, convert_to_lora_module
-from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
-from coati.models.opt import OPTRM, OPTActor, OPTCritic
-from coati.models.utils import calc_action_log_probs, masked_mean
-
-
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seq_len", [32])
-@pytest.mark.parametrize(
-    "actor_maker",
-    [
-        lambda: BLOOMActor(),
-        lambda: GPTActor(),
-        # HACK: skip llama due to long execution time
-        # lambda: LlamaActor(),
-        lambda: OPTActor(),
-    ],
-)
-@pytest.mark.parametrize(
-    "generate_kwargs",
-    [
-        {
-            "max_length": 64,
-            "use_cache": True,
-            "do_sample": True,
-            "temperature": 1.0,
-            "top_k": 50,
-        }
-    ],
-)
-def test_generation(actor_maker: Callable[[], Actor], batch_size: int, seq_len: int, generate_kwargs: Dict[str, Any]):
-    class MockTokenizer:
-        def __init__(self):
-            self.padding_side = "left"
-            self.eos_token_id = 0
-            self.pad_token_id = 0
-
-    actor = actor_maker()
-    input_ids = torch.randint(0, 100, (batch_size, seq_len)).cuda()
-    tokenizer = MockTokenizer()
-    sequences = generate(actor.cuda(), input_ids, tokenizer, **generate_kwargs)
-    assert sequences.shape == (batch_size, generate_kwargs["max_length"])
-
-
-def test_utils():
-    fn_input = {"tensor": torch.ones((10,)), "mask": torch.randint(0, 2, (10,))}
-    fn_output = masked_mean(dim=0, **fn_input)
-    assert fn_output.dim() == 0
-    assert torch.allclose(fn_output, torch.tensor(1.0))
-
-    batch_size = 4
-    seq_len = 32
-    num_labels = 10
-    num_actions = 2
-    fn_input = {
-        "logits": torch.randn((batch_size, seq_len, num_labels)),
-        "sequences": torch.randint(0, num_labels, (batch_size, seq_len)),
-        "num_actions": num_actions,
-    }
-    fn_output = calc_action_log_probs(**fn_input)
-    assert fn_output.shape == (batch_size, num_actions)
-
-
-@pytest.mark.parametrize("lora_rank", [4])
-@pytest.mark.parametrize("num_dim", [32])
-@pytest.mark.parametrize("num_layers", [4])
-def test_lora(lora_rank: int, num_dim: int, num_layers: int):
-    model = nn.ModuleList([nn.Linear(num_dim, num_dim) for _ in range(num_layers)])
-    lora_model = convert_to_lora_module(model, lora_rank)
-    assert isinstance(lora_model, nn.ModuleList)
-    for i in range(num_layers):
-        assert isinstance(lora_model[i], LoraLinear)
-        assert lora_model[i].lora_A.shape == (lora_rank, num_dim)
-        assert lora_model[i].lora_B.shape == (num_dim, lora_rank)
-
-    old_model = copy.deepcopy(lora_model)
-    for i in range(num_layers):
-        assert isinstance(lora_model[i], LoraLinear)
-        assert torch.allclose(old_model[i].weight, lora_model[i].weight)
-        assert torch.allclose(old_model[i].bias, lora_model[i].bias)
-        assert torch.allclose(old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A)
-    optimizer = torch.optim.Adam(lora_model.parameters())
-    x = torch.randn(8, num_dim)
-    for i in range(num_layers):
-        x = lora_model[i](x)
-    loss = x.sum()
-    loss.backward()
-    optimizer.step()
-    for i in range(num_layers):
-        assert isinstance(lora_model[i], LoraLinear)
-        assert torch.allclose(old_model[i].weight, lora_model[i].weight)
-        assert torch.allclose(old_model[i].bias, lora_model[i].bias)
-        assert not torch.allclose(
-            old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A
-        )
-
-
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seq_len", [128])
-@pytest.mark.parametrize(
-    "models_maker",
-    [
-        lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
-        lambda: (GPTActor(), GPTCritic(), GPTRM()),
-        # HACK: skip llama due to long execution time
-        # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
-        lambda: (OPTActor(), OPTCritic(), OPTRM()),
-        # lambda: (ChatGLMActor(), None, None), #temporally remove tests for chatglm
-    ],
-)
-@torch.no_grad()
-def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]], batch_size: int, seq_len: int):
-    actor_input = {
-        "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
-        "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
-    }
-    critic_input = {
-        "sequences": torch.randint(0, 100, (batch_size, seq_len)),
-        "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
-    }
-    rm_input = {
-        "sequences": torch.randint(0, 100, (batch_size, seq_len)),
-        "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
-    }
-
-    actor, critic, rm = models_maker()
-    if isinstance(actor, ChatGLMActor):
-        actor = actor.float()
-        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-        chatglm_special_token = torch.tensor([tokenizer.gmask_token_id, tokenizer.bos_token_id]).repeat(batch_size, 1)
-        actor_input = {
-            "input_ids": torch.cat(
-                (
-                    torch.randint(0, 100, (batch_size, seq_len // 2)),
-                    chatglm_special_token,
-                    torch.randint(0, 100, (batch_size, seq_len // 2 - 2)),
-                ),
-                dim=1,
-            ),
-            "attention_mask": torch.randint(0, 2, (batch_size, 1, seq_len, seq_len)),
-        }
-    assert isinstance(actor, Actor)
-    get_base_model(actor)
-    actor_output = actor(**actor_input)
-    assert actor_output.logits.shape[:2] == (batch_size, seq_len)
-
-    if critic:
-        assert isinstance(critic, Critic)
-        get_base_model(critic)
-        critic_output = critic(**critic_input)
-        assert critic_output.shape == (batch_size, seq_len)
-
-    if rm:
-        assert isinstance(rm, RewardModel)
-        get_base_model(rm)
-        rm_output = rm(**rm_input)
-        assert rm_output.shape == (batch_size,)
-
-
-@pytest.mark.parametrize("batch_size", [16])
-@pytest.mark.parametrize("seq_len", [128])
-@pytest.mark.parametrize("num_labels", [100])
-def test_loss(batch_size: int, seq_len: int, num_labels: int):
-    loss = GPTLMLoss()
-    loss_input = {
-        "logits": torch.randn(batch_size, seq_len, num_labels),
-        "labels": torch.randint(0, num_labels, (batch_size, seq_len)),
-    }
-    loss(**loss_input)
-
-    loss = PolicyLoss()
-    loss_input = {
-        "log_probs": torch.randn(
-            batch_size,
-            seq_len,
-        ),
-        "old_log_probs": torch.randn(
-            batch_size,
-            seq_len,
-        ),
-        "advantages": torch.randn(
-            batch_size,
-            seq_len,
-        ),
-        "action_mask": torch.randn(batch_size, seq_len),
-    }
-    loss(**loss_input)
-
-    loss = ValueLoss()
-    loss_input = {
-        "values": torch.randn(
-            batch_size,
-            seq_len,
-        ),
-        "old_values": torch.randn(
-            batch_size,
-            seq_len,
-        ),
-        "advantage": torch.randn(
-            batch_size,
-            seq_len,
-        ),
-        "action_mask": torch.randn(batch_size, seq_len),
-    }
-    loss(**loss_input)
-
-    loss = LogSigLoss()
-    loss_input = {
-        "chosen_reward": torch.randn(
-            batch_size,
-        ),
-        "reject_reward": torch.randn(
-            batch_size,
-        ),
-    }
-    loss(**loss_input)
-
-    loss = LogExpLoss()
-    loss_input = {
-        "chosen_reward": torch.randn(
-            batch_size,
-        ),
-        "reject_reward": torch.randn(
-            batch_size,
-        ),
-    }
-    loss(**loss_input)
-
-
-if __name__ == "__main__":
-    generate_kwargs = dict(max_length=40, use_cache=True, do_sample=True, temperature=1.0, top_k=50)
-    test_generation(lambda: LlamaActor(), batch_size=4, seq_len=32, generate_kwargs=generate_kwargs)
-
-    test_utils()
-
-    test_lora(lora_rank=2, num_dim=8, num_layers=2)
-
-    test_models(models_maker=lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()), batch_size=8, seq_len=128)
-
-    test_loss(batch_size=8, seq_len=128, num_labels=100)
+# import copy
+# from typing import Any, Callable, Dict, Tuple
+
+# import pytest
+# import torch
+# import torch.nn as nn
+# from coati.models.base import Actor, Critic, RewardModel, get_base_model
+# from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
+# from coati.models.chatglm import ChatGLMActor
+# from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+# from coati.models.generation import generate
+# from coati.models.gpt import GPTRM, GPTActor, GPTCritic
+# from coati.models.llama import LlamaActor
+# from coati.models.lora import LoraLinear, convert_to_lora_module
+# from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
+# from coati.models.opt import OPTRM, OPTActor, OPTCritic
+# from coati.models.utils import calc_action_log_probs, masked_mean
+
+
+# @pytest.mark.parametrize("batch_size", [4])
+# @pytest.mark.parametrize("seq_len", [32])
+# @pytest.mark.parametrize(
+#     "actor_maker",
+#     [
+#         lambda: BLOOMActor(),
+#         lambda: GPTActor(),
+#         # HACK: skip llama due to long execution time
+#         # lambda: LlamaActor(),
+#         lambda: OPTActor(),
+#     ],
+# )
+# @pytest.mark.parametrize(
+#     "generate_kwargs",
+#     [
+#         {
+#             "max_length": 64,
+#             "use_cache": True,
+#             "do_sample": True,
+#             "temperature": 1.0,
+#             "top_k": 50,
+#         }
+#     ],
+# )
+# def test_generation(actor_maker: Callable[[], Actor], batch_size: int, seq_len: int, generate_kwargs: Dict[str, Any]):
+#     class MockTokenizer:
+#         def __init__(self):
+#             self.padding_side = "left"
+#             self.eos_token_id = 0
+#             self.pad_token_id = 0
+
+#     actor = actor_maker()
+#     input_ids = torch.randint(0, 100, (batch_size, seq_len)).cuda()
+#     tokenizer = MockTokenizer()
+#     sequences = generate(actor.cuda(), input_ids, tokenizer, **generate_kwargs)
+#     assert sequences.shape == (batch_size, generate_kwargs["max_length"])
+
+
+# def test_utils():
+#     fn_input = {"tensor": torch.ones((10,)), "mask": torch.randint(0, 2, (10,))}
+#     fn_output = masked_mean(dim=0, **fn_input)
+#     assert fn_output.dim() == 0
+#     assert torch.allclose(fn_output, torch.tensor(1.0))
+
+#     batch_size = 4
+#     seq_len = 32
+#     num_labels = 10
+#     num_actions = 2
+#     fn_input = {
+#         "logits": torch.randn((batch_size, seq_len, num_labels)),
+#         "sequences": torch.randint(0, num_labels, (batch_size, seq_len)),
+#         "num_actions": num_actions,
+#     }
+#     fn_output = calc_action_log_probs(**fn_input)
+#     assert fn_output.shape == (batch_size, num_actions)
+
+
+# @pytest.mark.parametrize("lora_rank", [4])
+# @pytest.mark.parametrize("num_dim", [32])
+# @pytest.mark.parametrize("num_layers", [4])
+# def test_lora(lora_rank: int, num_dim: int, num_layers: int):
+#     model = nn.ModuleList([nn.Linear(num_dim, num_dim) for _ in range(num_layers)])
+#     lora_model = convert_to_lora_module(model, lora_rank)
+#     assert isinstance(lora_model, nn.ModuleList)
+#     for i in range(num_layers):
+#         assert isinstance(lora_model[i], LoraLinear)
+#         assert lora_model[i].lora_A.shape == (lora_rank, num_dim)
+#         assert lora_model[i].lora_B.shape == (num_dim, lora_rank)
+
+#     old_model = copy.deepcopy(lora_model)
+#     for i in range(num_layers):
+#         assert isinstance(lora_model[i], LoraLinear)
+#         assert torch.allclose(old_model[i].weight, lora_model[i].weight)
+#         assert torch.allclose(old_model[i].bias, lora_model[i].bias)
+#         assert torch.allclose(old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A)
+#     optimizer = torch.optim.Adam(lora_model.parameters())
+#     x = torch.randn(8, num_dim)
+#     for i in range(num_layers):
+#         x = lora_model[i](x)
+#     loss = x.sum()
+#     loss.backward()
+#     optimizer.step()
+#     for i in range(num_layers):
+#         assert isinstance(lora_model[i], LoraLinear)
+#         assert torch.allclose(old_model[i].weight, lora_model[i].weight)
+#         assert torch.allclose(old_model[i].bias, lora_model[i].bias)
+#         assert not torch.allclose(
+#             old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A
+#         )
+
+
+# @pytest.mark.parametrize("batch_size", [8])
+# @pytest.mark.parametrize("seq_len", [128])
+# @pytest.mark.parametrize(
+#     "models_maker",
+#     [
+#         lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
+#         lambda: (GPTActor(), GPTCritic(), GPTRM()),
+#         # HACK: skip llama due to long execution time
+#         # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
+#         lambda: (OPTActor(), OPTCritic(), OPTRM()),
+#         # lambda: (ChatGLMActor(), None, None), #temporally remove tests for chatglm
+#     ],
+# )
+# @torch.no_grad()
+# def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]], batch_size: int, seq_len: int):
+#     actor_input = {
+#         "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
+#         "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
+#     }
+#     critic_input = {
+#         "sequences": torch.randint(0, 100, (batch_size, seq_len)),
+#         "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
+#     }
+#     rm_input = {
+#         "sequences": torch.randint(0, 100, (batch_size, seq_len)),
+#         "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
+#     }
+
+#     actor, critic, rm = models_maker()
+#     if isinstance(actor, ChatGLMActor):
+#         actor = actor.float()
+#         tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+#         chatglm_special_token = torch.tensor([tokenizer.gmask_token_id, tokenizer.bos_token_id]).repeat(batch_size, 1)
+#         actor_input = {
+#             "input_ids": torch.cat(
+#                 (
+#                     torch.randint(0, 100, (batch_size, seq_len // 2)),
+#                     chatglm_special_token,
+#                     torch.randint(0, 100, (batch_size, seq_len // 2 - 2)),
+#                 ),
+#                 dim=1,
+#             ),
+#             "attention_mask": torch.randint(0, 2, (batch_size, 1, seq_len, seq_len)),
+#         }
+#     assert isinstance(actor, Actor)
+#     get_base_model(actor)
+#     actor_output = actor(**actor_input)
+#     assert actor_output.logits.shape[:2] == (batch_size, seq_len)
+
+#     if critic:
+#         assert isinstance(critic, Critic)
+#         get_base_model(critic)
+#         critic_output = critic(**critic_input)
+#         assert critic_output.shape == (batch_size, seq_len)
+
+#     if rm:
+#         assert isinstance(rm, RewardModel)
+#         get_base_model(rm)
+#         rm_output = rm(**rm_input)
+#         assert rm_output.shape == (batch_size,)
+
+
+# @pytest.mark.parametrize("batch_size", [16])
+# @pytest.mark.parametrize("seq_len", [128])
+# @pytest.mark.parametrize("num_labels", [100])
+# def test_loss(batch_size: int, seq_len: int, num_labels: int):
+#     loss = GPTLMLoss()
+#     loss_input = {
+#         "logits": torch.randn(batch_size, seq_len, num_labels),
+#         "labels": torch.randint(0, num_labels, (batch_size, seq_len)),
+#     }
+#     loss(**loss_input)
+
+#     loss = PolicyLoss()
+#     loss_input = {
+#         "log_probs": torch.randn(
+#             batch_size,
+#             seq_len,
+#         ),
+#         "old_log_probs": torch.randn(
+#             batch_size,
+#             seq_len,
+#         ),
+#         "advantages": torch.randn(
+#             batch_size,
+#             seq_len,
+#         ),
+#         "action_mask": torch.randn(batch_size, seq_len),
+#     }
+#     loss(**loss_input)
+
+#     loss = ValueLoss()
+#     loss_input = {
+#         "values": torch.randn(
+#             batch_size,
+#             seq_len,
+#         ),
+#         "old_values": torch.randn(
+#             batch_size,
+#             seq_len,
+#         ),
+#         "advantage": torch.randn(
+#             batch_size,
+#             seq_len,
+#         ),
+#         "action_mask": torch.randn(batch_size, seq_len),
+#     }
+#     loss(**loss_input)
+
+#     loss = LogSigLoss()
+#     loss_input = {
+#         "chosen_reward": torch.randn(
+#             batch_size,
+#         ),
+#         "reject_reward": torch.randn(
+#             batch_size,
+#         ),
+#     }
+#     loss(**loss_input)
+
+#     loss = LogExpLoss()
+#     loss_input = {
+#         "chosen_reward": torch.randn(
+#             batch_size,
+#         ),
+#         "reject_reward": torch.randn(
+#             batch_size,
+#         ),
+#     }
+#     loss(**loss_input)
+
+
+# if __name__ == "__main__":
+#     generate_kwargs = dict(max_length=40, use_cache=True, do_sample=True, temperature=1.0, top_k=50)
+#     test_generation(lambda: LlamaActor(), batch_size=4, seq_len=32, generate_kwargs=generate_kwargs)
+
+#     test_utils()
+
+#     test_lora(lora_rank=2, num_dim=8, num_layers=2)
+
+#     test_models(models_maker=lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()), batch_size=8, seq_len=128)
+
+#     test_loss(batch_size=8, seq_len=128, num_labels=100)
diff --git a/applications/Chat/tests/test_reward_model.py b/applications/Chat/tests/test_reward_model.py
deleted file mode 100644
index 6c633eabf5d3..000000000000
--- a/applications/Chat/tests/test_reward_model.py
+++ /dev/null
@@ -1,4125 +0,0 @@
-import argparse
-
-import torch
-from coati.models import RewardModel
-
-TEST_DATA = torch.Tensor(
-    [
-        1,
-        319,
-        13563,
-        1546,
-        263,
-        12758,
-        5199,
-        322,
-        385,
-        23116,
-        21082,
-        20255,
-        29889,
-        450,
-        20255,
-        4076,
-        8444,
-        29892,
-        13173,
-        29892,
-        322,
-        1248,
-        568,
-        6089,
-        304,
-        278,
-        5199,
-        29915,
-        29879,
-        5155,
-        29889,
-        13,
-        13,
-        29950,
-        7889,
-        29901,
-        1,
-        5618,
-        947,
-        372,
-        2125,
-        304,
-        367,
-        263,
-        410,
-        17622,
-        2810,
-        29973,
-        2,
-        7900,
-        22137,
-        29901,
-        1,
-        11284,
-        29892,
-        410,
-        28563,
-        267,
-        817,
-        304,
-        505,
-        1781,
-        9324,
-        322,
-        1095,
-        18541,
-        29892,
-        363,
-        1342,
-        29889,
-        29871,
-        2688,
-        884,
-        817,
-        304,
-        505,
-        1781,
-        1361,
-        29899,
-        1032,
-        29872,
-        29311,
-        3381,
-        29892,
-        470,
-        278,
-        11509,
-        304,
-        1207,
-        16716,
-        24147,
-        411,
-        1009,
-        3573,
-        29889,
-        29871,
-        15113,
-        29892,
-        896,
-        817,
-        304,
-        505,
-        263,
-        6483,
-        5360,
-        310,
-        278,
-        7980,
-        896,
-        30010,
-        276,
-        8743,
-        29892,
-        304,
-        367,
-        17385,
-        630,
-        304,
-        7945,
-        2898,
-        29889,
-        29871,
-        1670,
-        526,
-        1784,
-        916,
-        13879,
-        393,
-        748,
-        964,
-        1641,
-        263,
-        410,
-        17622,
-        2810,
-        29892,
-        322,
-        372,
-        30010,
-        29879,
-        2289,
-        263,
-        1407,
-        9377,
-        12875,
-        310,
-        25078,
-        393,
-        526,
-        12420,
-        297,
-        1422,
-        5837,
-        297,
-        1269,
-        5375,
-        29889,
-        2,
-        29950,
-        7889,
-        29901,
-        1,
-        7058,
-        338,
-        1565,
-        29889,
-        306,
-        4140,
-        278,
-        17385,
-        362,
-        29892,
-        278,
-        5360,
-        322,
-        6483,
-        9063,
-        358,
-        29889,
-        2,
-        7900,
-        22137,
-        29901,
-        1,
-        8241,
-        29892,
-        366,
-        505,
-        304,
-        505,
-        263,
-        1565,
-        5360,
-        363,
-        278,
-        7980,
-        366,
-        1708,
-        29889,
-        29871,
-        1670,
-        30010,
-        29879,
-        3078,
-        763,
-        278,
-        11223,
-        310,
-        1641,
-        5714,
-        297,
-        263,
-        3748,
-        310,
-        596,
-        7980,
-        29889,
-        29871,
-        1126,
-        746,
-        366,
-        5360,
-        263,
-        7980,
-        29892,
-        366,
-        505,
-        263,
-        5100,
-        3321,
-        7636,
-        975,
-        10769,
-        1058,
-        1016,
-        30010,
-        29873,
-        29889,
-        29871,
-        3387,
-        278,
-        7899,
-        304,
-        367,
-        278,
-        1900,
-        29892,
-        2462,
-        297,
-        322,
-        2462,
-        714,
-        29889,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-        2,
-    ],
-).to("cuda:0")
-
-ATTEN_MASK = torch.Tensor(
-    [
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        True,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-        False,
-    ],
-).to("cuda:0")
-
-
-def main(args):
-    model = RewardModel(pretrained=args.pretrain).to("cuda:0")
-    print(model(TEST_DATA, attention_mask=ATTEN_MASK))
-
-
-if __name__ == "__main__":
-    # ==============================
-    # Parse Arguments
-    # ==============================
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--pretrain", type=str, default=None)
-    args = parser.parse_args()
-    main(args)
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 4396e484e3f9..1d6ef55fd653 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -19,29 +19,25 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4
 
 set -xu
 
-if [ -z "$SFT_DATASET" ]; then
-    echo "Please set \$SFT_DATASET to the path to sft dataset."
-    exit 1
-fi
-
-if [ -z "$PROMPT_DATASET" ]; then
-    echo "Please set \$PROMPT_DATASET to the path to prompts csv."
-    exit 1
-fi
-
-if [ -z "$PRETRAIN_DATASET" ]; then
-    echo "Please set \$PRETRAIN_DATASET to the path to alpaca data."
-    exit 1
-fi
 
 NUM_RETRY=3
 BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples
-MODEL_SAVE_PATH=$EXAMPLES_DIR/rlhf_models
-MODELS_DIR=$BASE_DIR/examples/models_config
+TEMP_DIR=$BASE_DIR/temp
+MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
+MODELS_DIR=$TEMP_DIR/models_config
+# MODELS=('gpt2' 'bloom' 'opt' 'llama')
 MODELS=('gpt2' 'bloom' 'opt' 'llama')
-MODELS_PPO=('gpt2' 'bloom' 'opt')
-STRATEGIES=('ddp' 'colossalai_gemini' 'colossalai_zero2')
+# PLUGINS=('gemini' 'gemini_auto' 'zero2' 'zero2_cpu' '3d')
+PLUGINS=('zero2' 'zero2_cpu' '3d')
+LORA_RANK=('0' '20')
+
+if [ ! -d "$TEMP_DIR" ]; then
+  mkdir "$TEMP_DIR"
+  echo "Directory created successfully"
+else
+  echo "Directory already exists"
+fi
 
 if [ ! -d "$MODEL_SAVE_PATH" ]; then
   mkdir "$MODEL_SAVE_PATH"
@@ -50,13 +46,18 @@ else
   echo "Directory already exists"
 fi
 
+if [ ! -d "$MODELS_DIR" ]; then
+  mkdir "$MODELS_DIR"
+  echo "Directory created successfully"
+else
+  echo "Directory already exists"
+fi
+
 export OMP_NUM_THREADS=8
 
 # install requirements
 pip install -r $EXAMPLES_DIR/requirements.txt
 
-python $EXAMPLES_DIR/download_model.py --model-dir $MODELS_DIR --config-only
-
 get_pretrain() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
@@ -65,6 +66,24 @@ get_pretrain() {
         echo "bigscience/bloom-560m"
     elif [[ $model == "opt" ]]; then
         echo "facebook/opt-350m"
+    elif [[ $model == "llama" ]]; then
+        echo "/data/scratch/llama-tiny"
+    else
+        echo "Unknown model $model"
+        exit 1
+    fi
+}
+
+get_tokenizer_dirs() {
+    local model=$1
+    if [[ $model == "gpt2" ]]; then
+        echo "gpt2"
+    elif [[ $model == "bloom" ]]; then
+        echo "bigscience/bloom-560m"
+    elif [[ $model == "opt" ]]; then
+        echo "facebook/opt-350m"
+    elif [[ $model == "llama" ]]; then
+        echo "hf-internal-testing/llama-tokenizer"
     else
         echo "Unknown model $model"
         exit 1
@@ -78,57 +97,66 @@ random_choice() {
     echo ${arr[$idx]}
 }
 
-
-if [ ! -d "$DATA_DIRECTORY" ]; then
-  echo "$DATA_DIRECTORY does not exist."
+if [ ! -d "$HOST_FILE" ]; then
+  echo "$HOST_FILE does not exist."
 fi
 
 echo "[Test]: testing sft ..."
 
-# FIXME: This is a hack to skip tests that are not working
-#  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
-#  - llama-*: These tests can be passed locally, skipped for long execution time
-#  - *-gemini: Gemini plugin does not support `from_pretrained` yet
 SKIPPED_TESTS=(
-    "gpt2-ddp"
-    "llama-ddp"
-    "llama-colossalai_gemini"
-    "llama-colossalai_zero2"
+    bloom-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
+    llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
 )
 
 GRAD_CKPTS=('' '--grad_checkpoint')
-for lora_rank in "0"; do
+for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
-        strategies=($(shuf -e "${STRATEGIES[@]}"))
-        for strategy in ${strategies[@]}; do
-            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
-                echo "[Test]: Skipped $model-$strategy-$lora_rank"
+        plugins=($(shuf -e "${PLUGINS[@]}"))
+        for plugin in ${plugins[@]}; do
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$plugin-$lora_rank"
                 continue
-            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy " ]]; then
-                echo "[Test]: Skipped $model-$strategy"
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then
+                echo "[Test]: Skipped $model-$plugin"
                 continue
             fi
             pretrain=$(get_pretrain $model)
-            pretrain_model=""
-            if [[ $lora_rank -gt 0 ]]; then
-                pretrain_model="--pretrain $pretrain"
-            fi
+            tokenizer_dir=$(get_tokenizer_dirs $model)
             grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            tp='1'
+            if [[ $plugin == "3d" ]]; then
+                tp='4'
+            fi
             for i in $(seq $NUM_RETRY); do
-                echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
-                torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_sft.py \
-                    $pretrain_model --tokenizer $MODELS_DIR/$model \
-                    --model $model --strategy $strategy --lora_rank $lora_rank $grad_ckpt \
-                    --dataset $SFT_DATASET --max_datasets_size 8 \
-                    --max_epochs 1 --batch_size 1 --accumulation_steps 1 --lr 1e-8 \
-                    --save_path $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank}
+                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
+                declare -a dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
+                done
+                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_sft.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --dataset ${dataset[@]} \
+                    --save_path $MODEL_SAVE_PATH \
+                    --config_file $MODELS_DIR/config.jsonl \
+                    --lora_rank $lora_rank \
+                    --plugin $plugin \
+                    --batch_size 2 \
+                    --max_epochs 1 \
+                    --accumulation_steps 2 \
+                    --tp $tp \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400
                 passed=$?
                 if [ $passed -eq 0 ]; then
+                    rm -rf $MODEL_SAVE_PATH/*
+                    rm -rf $MODELS_DIR/*
                     break
                 fi
             done
             if [ $passed -ne 0 ]; then
-                echo "[Test]: Failed $model-$strategy-$lora_rank"
+                echo "[Test]: Failed $model-$plugin-$lora_rank"
                 exit 1
             fi
         done
@@ -137,115 +165,200 @@ done
 
 echo "[Test]: testing reward model ..."
 
-# FIXME: This is a hack to skip tests that are not working
-#  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
-#  - llama-*: These tests can be passed locally, skipped for long execution time
-#  - *-gemini: Gemini plugin does not support `from_pretrained` yet
 SKIPPED_TESTS=(
-    "gpt2-ddp"
-    "llama-ddp"
-    "llama-colossalai_gemini"
-    "llama-colossalai_zero2"
+    bloom-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
+    llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
 )
 
-LOSS_FNS=('log_sig' 'log_exp')
-DATASETS=('Anthropic/hh-rlhf' 'Dahoas/rm-static')
-for lora_rank in "0"; do
+GRAD_CKPTS=('' '--grad_checkpoint')
+for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
-        strategies=($(shuf -e "${STRATEGIES[@]}"))
-        for strategy in ${strategies[@]}; do
-            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
-                echo "[Test]: Skipped $model-$strategy-$lora_rank"
+        plugins=($(shuf -e "${PLUGINS[@]}"))
+        for plugin in ${plugins[@]}; do
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$plugin-$lora_rank"
                 continue
-            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy " ]]; then
-                echo "[Test]: Skipped $model-$strategy"
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then
+                echo "[Test]: Skipped $model-$plugin"
                 continue
             fi
             pretrain=$(get_pretrain $model)
-            pretrain_model=""
-            pretrain_model="--pretrain $pretrain"
-            loss_fn=$(random_choice "${LOSS_FNS[@]}")
-            dataset=$(random_choice "${DATASETS[@]}")
-            subset=$(if [[ $dataset == "Dahoas/rm-static" ]]; then echo "None"; else echo "harmless-base"; fi)
+            tokenizer_dir=$(get_tokenizer_dirs $model)
+            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            tp='1'
+            if [[ $plugin == "3d" ]]; then
+                tp='4'
+            fi
             for i in $(seq $NUM_RETRY); do
-                echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
-                torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_reward_model.py \
-                    $pretrain_model --tokenizer $MODELS_DIR/$model \
-                    --dataset $dataset --subset $subset --max_datasets_size 8 \
-                    --model $model --strategy $strategy --lora_rank $lora_rank \
-                    --loss_fn $loss_fn --batch_size 1 --lr 1e-8 --max_len 200 \
-                    --save_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
+                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
+                declare -a dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
+                done
+                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_rm.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --dataset ${dataset[@]} \
+                    --save_dir $MODEL_SAVE_PATH \
+                    --config_file $MODELS_DIR/config.jsonl \
+                    --lora_rank $lora_rank \
+                    --plugin $plugin \
+                    --batch_size 2 \
+                    --max_epochs 1 \
+                    --accumulation_steps 2 \
+                    --tp $tp \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400
                 passed=$?
                 if [ $passed -eq 0 ]; then
+                    rm -rf $MODEL_SAVE_PATH/*
+                    rm -rf $MODELS_DIR/*
                     break
                 fi
             done
             if [ $passed -ne 0 ]; then
-                echo "[Test]: Failed to train reward model $model-$strategy-$lora_rank"
+                echo "[Test]: Failed $model-$plugin-$lora_rank"
                 exit 1
             fi
         done
     done
 done
 
-echo "[Test]: testing RLHF ..."
 
-# FIXME: This is a hack to skip tests that are not working
-#  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
-#  - llama-*: These tests can be passed locally, skipped for long execution time
-#  - *-gemini: Gemini plugin does not support `from_pretrained` yet
-#  - skip all bloom tests as they leads to OOM
+echo "[Test]: testing ppo ..."
+
+SKIPPED_TESTS=(
+    bloom-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
+    llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
+)
+
+GRAD_CKPTS=('' '--grad_checkpoint')
+for lora_rank in ${LORA_RANK[@]}; do
+    for model in ${MODELS[@]}; do
+        plugins=($(shuf -e "${PLUGINS[@]}"))
+        for plugin in ${plugins[@]}; do
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$plugin-$lora_rank"
+                continue
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then
+                echo "[Test]: Skipped $model-$plugin"
+                continue
+            fi
+            pretrain=$(get_pretrain $model)
+            tokenizer_dir=$(get_tokenizer_dirs $model)
+            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            tp='1'
+            if [[ $plugin == "3d" ]]; then
+                tp='4'
+            fi
+            for i in $(seq $NUM_RETRY); do
+                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
+                declare -a prompt_dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    prompt_dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_prompt/arrow/part-$split")
+                done
+                declare -a ptx_dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    ptx_dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_ptx/arrow/part-$split")
+                done
+                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_ppo.py \
+                    --pretrain $pretrain \
+                    --rm_pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --prompt_dataset ${prompt_dataset[@]} \
+                    --pretrain_dataset ${ptx_dataset[@]} \
+                    --ptx_batch_size 1 \
+                    --ptx_coef 0.2 \
+                    --save_path $MODEL_SAVE_PATH \
+                    --lora_rank $lora_rank \
+                    --plugin $plugin \
+                    --num_episodes 5 \
+                    --num_collect_steps 1 \
+                    --num_update_steps 1 \
+                    --experience_batch_size 8 \
+                    --train_batch_size 4 \
+                    --accumulation_steps 2 \
+                    --lr 9e-6 \
+                    --mixed_precision "bf16" \
+                    --grad_clip 1.0 \
+                    --tp $tp \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400
+                passed=$?
+                if [ $passed -eq 0 ]; then
+                    rm -rf $MODEL_SAVE_PATH/*
+                    rm -rf $MODELS_DIR/*
+                    break
+                fi
+            done
+            if [ $passed -ne 0 ]; then
+                echo "[Test]: Failed $model-$plugin-$lora_rank"
+                exit 1
+            fi
+        done
+    done
+done
+
+echo "[Test]: testing DPO ..."
 
 SKIPPED_TESTS=(
-    "gpt2-ddp"
-    "llama-ddp"
-    "llama-colossalai_gemini"
-    "llama-colossalai_zero2"
-    "bloom-colossalai_gemini"
-    "bloom-colossalai_zero2"
-    "bloom-ddp"
-    "opt-colossalai_gemini"
-    "opt-colossalai_zero2"
-    "opt-ddp"
+    bloom-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
+    llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
 )
 
-for model in ${MODELS_PPO[@]}; do
-    for lora_rank in "0"; do
-        strategies=($(shuf -e "${STRATEGIES[@]}"))
-        for strategy in ${strategies[@]}; do
-            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
-                echo "[Test]: Skipped $model-$strategy-$lora_rank"
+GRAD_CKPTS=('' '--grad_checkpoint')
+for lora_rank in ${LORA_RANK[@]}; do
+    for model in ${MODELS[@]}; do
+        plugins=($(shuf -e "${PLUGINS[@]}"))
+        for plugin in ${plugins[@]}; do
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$plugin-$lora_rank"
                 continue
-            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy " ]]; then
-                echo "[Test]: Skipped $model-$strategy"
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then
+                echo "[Test]: Skipped $model-$plugin"
                 continue
             fi
-            rm_pretrain=$(get_pretrain $model)
-            rm_pretrain_model=""
-            rm_pretrain_model="--rm_pretrain $rm_pretrain"
+            pretrain=$(get_pretrain $model)
+            tokenizer_dir=$(get_tokenizer_dirs $model)
+            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            tp='1'
+            if [[ $plugin == "3d" ]]; then
+                tp='4'
+            fi
             for i in $(seq $NUM_RETRY); do
-                echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
-                torchrun --standalone --nproc_per_node=1 $EXAMPLES_DIR/train_prompts.py \
-                    --prompt_dataset $PROMPT_DATASET --pretrain_dataset $PRETRAIN_DATASET --max_datasets_size 32 \
-                    --strategy $strategy --model $model --tokenizer $MODELS_DIR/$model \
-                    --num_episodes 1 --num_collect_steps 1 --num_update_steps 1 --lr 1e-8 \
-                    --experience_batch_size 1 --train_batch_size 1 --lora_rank $lora_rank \
-                    --pretrain $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank} \
-                    $rm_pretrain_model --rm_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt \
-                    --reward_model_tokenizer $MODELS_DIR/$model --max_input_len 10 --max_seq_len 20 \
-                    --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
+                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
+                declare -a dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
+                done
+                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_dpo.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --dataset ${dataset[@]} \
+                    --save_dir $MODEL_SAVE_PATH \
+                    --config_file $MODELS_DIR/config.jsonl \
+                    --lora_rank $lora_rank \
+                    --plugin $plugin \
+                    --batch_size 2 \
+                    --max_epochs 1 \
+                    --accumulation_steps 2 \
+                    --tp $tp \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400
                 passed=$?
                 if [ $passed -eq 0 ]; then
+                    rm -rf $MODEL_SAVE_PATH/*
+                    rm -rf $MODELS_DIR/*
                     break
                 fi
             done
             if [ $passed -ne 0 ]; then
-                echo "[Test]: Failed to train RLHF $model-$strategy-$lora_rank"
+                echo "[Test]: Failed $model-$plugin-$lora_rank"
                 exit 1
             fi
         done
-        rm -rf $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank}
-        rm $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
     done
 done
-rm -rf $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
diff --git a/applications/Chat/version.txt b/applications/Chat/version.txt
old mode 100644
new mode 100755

From f6131931a581a7e7d57ad1fa0dee1e632d63a466 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Mon, 4 Dec 2023 18:39:07 +0800
Subject: [PATCH 05/23] fix ci

---
 .github/workflows/run_chatgpt_examples.yml | 10 +++++-----
 applications/Chat/requirements.txt         | 10 +++++++++-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 23c721e3d950..65d92651273c 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -19,7 +19,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-      options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
+      options: --gpus all --rm -v /data/scratch/colossal-llama2/colossal_chat_test_data:/data/scratch/colossal-llama2/colossal_chat_test_data --shm-size=10.24gb
     timeout-minutes: 30
     defaults:
       run:
@@ -47,7 +47,7 @@ jobs:
         env:
           NCCL_SHM_DISABLE: 1
           MAX_JOBS: 8
-          SFT_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/sft
-          PROMPT_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/prompt
-          PRETRAIN_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/ptx
-          PREFERENCE_DATASET: /data/scratch/github_actions/chat/refactor_coati_test_data/preference
+          SFT_DATASET: /data/scratch/colossal-llama2/colossal_chat_test_data/sft
+          PROMPT_DATASET: /data/scratch/colossal-llama2/colossal_chat_test_data/prompt
+          PRETRAIN_DATASET: /data/scratch/colossal-llama2/colossal_chat_test_data/ptx
+          PREFERENCE_DATASET: /data/scratch/colossal-llama2/colossal_chat_test_data/preference
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index d601375558de..48550868391c 100755
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -1,4 +1,4 @@
-transformers>=4.20.1
+transformers
 tqdm
 datasets
 loralib
@@ -11,4 +11,12 @@ sse_starlette
 wandb
 sentencepiece
 gpustat
+packaging==23.1
+autoflake==2.2.1
+black==23.9.1
 tensorboard
+six==1.16.0
+datasets
+ninja==1.11.1
+flash-attn==2.0.5
+sentencepiece==0.1.99

From 0d542d6a46487438f93f392841e6bba6cf404282 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Tue, 5 Dec 2023 09:03:11 +0800
Subject: [PATCH 06/23] fix ci

---
 applications/Chat/README.md           | 25 ++++++++++++++--
 applications/Chat/requirements.txt    |  6 ++--
 applications/Chat/tests/test_train.sh | 42 ++++++++-------------------
 3 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index 99a2cd13ff99..405256373ab6 100755
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -94,10 +94,29 @@ More details can be found in the latest news.
 ### Install the environment
 
 ```bash
-conda create -n coati
-conda activate coati
+conda create -n colossal-chat python=3.10.9 (>=3.8.7)
+conda activate colossal-chat
+
+# install flash-attention
+git clone -b v2.0.5 https://github.com/Dao-AILab/flash-attention.git
+cd $FLASH_ATTENTION_ROOT/
+pip install .
+cd $FLASH_ATTENTION_ROOT/csrc/xentropy
+pip install .
+cd $FLASH_ATTENTION_ROOT/csrc/layer_norm
+pip install .
+cd $FLASH_ATTENTION_ROOT/csrc/rotary
+pip install .
+
+# clone Colossalai
 git clone https://github.com/hpcaitech/ColossalAI.git
-cd ColossalAI/applications/Chat
+
+# install ColossalAI
+cd $COLOSSAL_AI_ROOT
+CUDA_EXT=1 pip install .
+
+# install ColossalChat
+cd $COLOSSAL_AI_ROOT/applications/Chat
 pip install .
 ```
 
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index 48550868391c..3d32e7ebdf65 100755
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -1,4 +1,4 @@
-transformers
+transformers<4.33.0
 tqdm
 datasets
 loralib
@@ -11,12 +11,12 @@ sse_starlette
 wandb
 sentencepiece
 gpustat
-packaging==23.1
+packaging
 autoflake==2.2.1
 black==23.9.1
 tensorboard
 six==1.16.0
 datasets
 ninja==1.11.1
-flash-attn==2.0.5
 sentencepiece==0.1.99
+flash-attn
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 1d6ef55fd653..d2dded2ed78f 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -32,27 +32,6 @@ MODELS=('gpt2' 'bloom' 'opt' 'llama')
 PLUGINS=('zero2' 'zero2_cpu' '3d')
 LORA_RANK=('0' '20')
 
-if [ ! -d "$TEMP_DIR" ]; then
-  mkdir "$TEMP_DIR"
-  echo "Directory created successfully"
-else
-  echo "Directory already exists"
-fi
-
-if [ ! -d "$MODEL_SAVE_PATH" ]; then
-  mkdir "$MODEL_SAVE_PATH"
-  echo "Directory created successfully"
-else
-  echo "Directory already exists"
-fi
-
-if [ ! -d "$MODELS_DIR" ]; then
-  mkdir "$MODELS_DIR"
-  echo "Directory created successfully"
-else
-  echo "Directory already exists"
-fi
-
 export OMP_NUM_THREADS=8
 
 # install requirements
@@ -97,9 +76,6 @@ random_choice() {
     echo ${arr[$idx]}
 }
 
-if [ ! -d "$HOST_FILE" ]; then
-  echo "$HOST_FILE does not exist."
-fi
 
 echo "[Test]: testing sft ..."
 
@@ -133,7 +109,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_sft.py \
+                colossalai run --nproc_per_node 4 --master_port 28537 $EXAMPLES_DIR/training_scripts/train_sft.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
@@ -195,7 +171,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_rm.py \
+                colossalai run --nproc_per_node 4 --master_port 28537 $EXAMPLES_DIR/training_scripts/train_rm.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
@@ -231,6 +207,10 @@ echo "[Test]: testing ppo ..."
 SKIPPED_TESTS=(
     bloom-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
     llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
+    gpt2-zero2 # This test can pass locally. Removed due to OOM
+    bloom-zero2 # This test can pass locally. Removed due to OOM
+    opt-zero2 # This test can pass locally. Removed due to OOM
+    bloom-zero2_cpu # This test can pass locally. Removed due to OOM
 )
 
 GRAD_CKPTS=('' '--grad_checkpoint')
@@ -262,7 +242,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     ptx_dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_ptx/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_ppo.py \
+                colossalai run --nproc_per_node 4 --master_port 28537 $EXAMPLES_DIR/training_scripts/train_ppo.py \
                     --pretrain $pretrain \
                     --rm_pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
@@ -304,8 +284,10 @@ done
 echo "[Test]: testing DPO ..."
 
 SKIPPED_TESTS=(
-    bloom-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
-    llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
+    bloom-3d # This test cannot pass, it is probably a bug for the 3d plugin
+    llama-3d # This test cannot pass, it is probably a bug for the 3d plugin
+    bloom-zero2 # This test can pass locally. Removed due to OOM
+    bloom-zero2_cpu # This test can pass locally. Removed due to OOM
 )
 
 GRAD_CKPTS=('' '--grad_checkpoint')
@@ -333,7 +315,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 28537 --hostfile $HOST_FILE $EXAMPLES_DIR/training_scripts/train_dpo.py \
+                colossalai run --nproc_per_node 4 --master_port 28537 $EXAMPLES_DIR/training_scripts/train_dpo.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \

From c26810088e52a114964444d112ed9cd994ddb27e Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Wed, 6 Dec 2023 10:34:52 +0800
Subject: [PATCH 07/23] fix readme, style

---
 .github/workflows/run_chatgpt_examples.yml    |   4 +-
 applications/Chat/LICENSE                     | 327 --------
 applications/Chat/README.md                   | 171 +++--
 ...osed_to_constant_length_spliced_dataset.py | 137 ----
 applications/Chat/coati/dataset/loader.py     |   2 +-
 applications/Chat/coati/models/base.py        |  11 -
 applications/Chat/coati/models/generation.py  |   1 +
 applications/Chat/coati/trainer/dpo.py        |   1 -
 applications/Chat/coati/trainer/ppo.py        |   2 +-
 applications/Chat/coati/trainer/rm.py         |   1 -
 applications/Chat/coati/trainer/utils.py      |   8 -
 applications/Chat/examples/=2.0.0,            |   0
 applications/Chat/examples/README.md          | 719 +++++++++---------
 applications/Chat/examples/config_file        |  27 -
 .../prepare_preference_dataset.sh             |  30 +-
 .../prepare_prompt_dataset.sh                 |  31 +-
 .../prepare_ptx_dataset.sh                    |  33 +-
 .../prepare_sft_dataset.sh                    |  31 +-
 .../Chat/examples/inference/round.txt         |  73 --
 applications/Chat/examples/requirements.txt   |   0
 .../Chat/examples/training_scripts/hostfile   |   2 +-
 .../examples/training_scripts/train_dpo.sh    |  32 +-
 .../examples/training_scripts/train_ppo.sh    |  45 +-
 .../examples/training_scripts/train_rm.sh     |  38 +-
 .../examples/training_scripts/train_sft.sh    |  34 +-
 applications/Chat/requirements-test.txt       |   2 -
 applications/Chat/requirements.txt            |   2 +-
 applications/Chat/setup.py                    |   2 +-
 applications/Chat/tests/download_model.py     |  51 --
 applications/Chat/tests/test_experience.py    | 185 -----
 applications/Chat/tests/test_inference.sh     |  11 -
 applications/Chat/tests/test_models.py        | 253 ------
 32 files changed, 588 insertions(+), 1678 deletions(-)
 delete mode 100755 applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py
 delete mode 100755 applications/Chat/examples/=2.0.0,
 delete mode 100755 applications/Chat/examples/config_file
 delete mode 100755 applications/Chat/examples/inference/round.txt
 mode change 100755 => 100644 applications/Chat/examples/requirements.txt
 delete mode 100755 applications/Chat/requirements-test.txt
 delete mode 100755 applications/Chat/tests/download_model.py
 delete mode 100755 applications/Chat/tests/test_experience.py
 delete mode 100755 applications/Chat/tests/test_inference.sh
 delete mode 100755 applications/Chat/tests/test_models.py

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 65d92651273c..4fff245f5e60 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -18,7 +18,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
       options: --gpus all --rm -v /data/scratch/colossal-llama2/colossal_chat_test_data:/data/scratch/colossal-llama2/colossal_chat_test_data --shm-size=10.24gb
     timeout-minutes: 30
     defaults:
@@ -36,7 +36,7 @@ jobs:
 
       - name: Install Transformers
         run: |
-          pip install transformers==4.33.2
+          pip install transformers==4.32.1
 
       - name: Execute Examples
         run: |
diff --git a/applications/Chat/LICENSE b/applications/Chat/LICENSE
index b3eb43520a6f..0528c89ea9ec 100755
--- a/applications/Chat/LICENSE
+++ b/applications/Chat/LICENSE
@@ -200,330 +200,3 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
-
-   ## Some of colossal-ai's code is derived from others projects, which is subject to the following copyright notice:
-
-   Copyright 2021 The Alpa team.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-         https://github.com/alpa-projects/alpa/blob/979a45a3e6187df941ef4a4c4c6eea664527d68d/LICENSE
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
-   -------------------------------------------------
-
-   Copyright 2018-2020 Philippe Tillet
-   Copyright 2020-2022 OpenAI
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files
-   (the "Software"), to deal in the Software without restriction,
-   including without limitation the rights to use, copy, modify, merge,
-   publish, distribute, sublicense, and/or sell copies of the Software,
-   and to permit persons to whom the Software is furnished to do so,
-   subject to the following conditions:
-
-   ---------------- LICENSE FOR Microsoft Deepspeed ----------------
-
-   MIT License
-
-   Copyright (c) Microsoft Corporation.
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE
-
-   ---------------- LICENSE FOR NVIDIA Megatron-LM ----------------
-
-   Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions
-   are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of NVIDIA CORPORATION nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-   PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   ---------------- LICENSE FOR NVIDIA Apex ----------------
-
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-   1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-   2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-   3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   ---------------- LICENSE FOR Facebook Fairscale ----------------
-
-   Copyright (c) Facebook, Inc. and its affiliates
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-
-   1. Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-   2. Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-   3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
-      and IDIAP Research Institute nor the names of its contributors may be
-      used to endorse or promote products derived from this software without
-      specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-
-   ---------------- LICENSE FOR Flash Attention ----------------
-
-   BSD 3-Clause License
-
-   Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-
-   * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
-   * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-   * Neither the name of the copyright holder nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   ---------------- LICENSE FOR Facebook xFormers ----------------
-
-   From xFormers:
-
-   Copyright (c) Facebook, Inc. and its affiliates
-
-
-   ===
-
-   BSD 3-Clause License
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-
-   1. Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-   2. Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-   3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
-      and IDIAP Research Institute nor the names of its contributors may be
-      used to endorse or promote products derived from this software without
-      specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-
-   ---------------- LICENSE FOR VLLM TEAM ----------------
-
-   from VLLM TEAM:
-
-      Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-         https://github.com/vllm-project/vllm/blob/main/LICENSE
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
-   ---------------- LICENSE FOR LIGHTLLM TEAM ----------------
-
-   from LIGHTLLM TEAM:
-
-      Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-         https://github.com/ModelTC/lightllm/blob/main/LICENSE
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-   ---------------- LICENSE FOR AutoGPTQ ----------------
-
-   From AutoGPTQ:
-
-   MIT License
-
-   Copyright (c) 2023 潘其威(William)
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
-   ---------------- LICENSE FOR exllama ----------------
-
-   From exllama:
-
-   MIT License
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
-
-   ---------------- LICENSE FOR torch-int ----------------
-
-   MIT License
-
-   Copyright (c) 2022 Guangxuan Xiao
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
-
-   ---------------- LICENSE FOR smoothquant ----------------
-
-   MIT License
-
-   Copyright (c) 2022 MIT HAN Lab
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in all
-   copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index 405256373ab6..a3d6c4a5b6a6 100755
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -36,7 +36,7 @@
 
 ---
 
-## What is ColossalChat and Coati ?
+## What Is ColossalChat And Coati ?
 
 [ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) is the project to implement LLM with RLHF, powered by the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) project.
 
@@ -91,7 +91,7 @@ More details can be found in the latest news.
 
 ## Install
 
-### Install the environment
+### Install the Environment
 
 ```bash
 conda create -n colossal-chat python=3.10.9 (>=3.8.7)
@@ -120,33 +120,14 @@ cd $COLOSSAL_AI_ROOT/applications/Chat
 pip install .
 ```
 
-### Install the Transformers
+## How To Use?
 
-```bash
-pip install transformers==4.30.2
-```
-
-## How to use?
-
-### Supervised datasets collection
-
-We collected 104K bilingual datasets of Chinese and English, and you can find the datasets in this repo
-[InstructionWild](https://github.com/XueFuzhao/InstructionWild) and in this [file](https://github.com/XueFuzhao/InstructionWild/blob/main/data/README.md).
-
-Here is how we collected the data
-
-<p align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/>
-</p>
+### RLHF Training Stage1 - Supervised Instructs Tuning
 
-### RLHF Training Stage1 - Supervised instructs tuning
+Stage1 is supervised instructs fine-tuning (SFT). This step is a crucial part of the RLHF training process, as it involves training a machine learning model using human-provided instructions to learn the initial behavior for the task at hand. Here's a detailed guide on how to SFT your LLM with ColossalChat:
 
-Stage1 is supervised instructs fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
-
-You can run the `examples/train_sft.sh` to start a supervised instructs fine-tuning.
-[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
-
-**Note**: the supervised dataset follows the following format,
+#### Step 1: Data Collection
+The first step in Stage 1 is to collect a dataset of human demonstrations of the following format.
 
 ```json
 [
@@ -167,36 +148,46 @@ You can run the `examples/train_sft.sh` to start a supervised instructs fine-tun
 ]
 ```
 
-### RLHF Training Stage2 - Training reward model
+#### Step 2: Preprocessing
+Once you have collected your SFT dataset, you will need to preprocess it. This involves four steps: data cleaning, data deduplication, formating and tokenization. In this code, we will focus on formating and tokenization. The formating step adopts our elaborately designed conversation template to convert the raw conversation the following format.
+
+```
+<s> A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+Human: <s> what are some pranks with a pen i can do?</s> Assistant: <s> Are you looking for practical joke ideas?</s>
+...
+```
+The tokenization step tokenize the formatted conversation, calculate input_ids, labels, attention_masks and buffer those into dataset files. We provide scripts for data formatting and tokenization for SFT. Simply run the [prepare_sft_dataset.sh](./examples/data_preparation_scripts/prepare_sft_dataset.sh).
+
+#### Step 3: Training
+Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./examples/training_scripts/train_sft.sh) to start a supervised instructs fine-tuning.
+
+### RLHF Training Stage2 - Training Reward Model
 
 Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
 
-Below shows the preference dataset used in training the reward model.
+#### Step 1: Data Collection
+Below shows the preference dataset format used in training the reward model.
 
 ```json
 [
     {"context": [
         {
           "from": "human",
-          "content": "what are some pranks with a pen i can do?"
-        },
-        {
-          "from": "assistant",
-          "content": "Are you looking for practical joke ideas?"
-        },
-        ...
+          "content": "Introduce butterflies species in Oregon."
+        }
       ]
       "chosen": [
         {
           "from": "assistant",
-          "content": "About 150 species of butterflies live in Oregon, with about 100 species are moths, and about 20 species are common here year-round, and another 10 species are seen here year-round.  I suggest you keep an eye out for skippers, gossamer wings, and red admirals."
+          "content": "About 150 species of butterflies live in Oregon, with about 100 species are moths..."
         },
         ...
       ],
       "rejected": [
         {
           "from": "assistant",
-          "content": "Are you interested in just the common butterflies?  There are a few common ones which will be easy to find.  Like the Monarch, Western Tiger Swallowtail and several other swallowtail butterflies.  The Monarch is known for being a very common and beautiful butterfly.  Are you interested in butterflies because you have children?"
+          "content": "Are you interested in just the common butterflies?  There are a few common ones which will be easy to find..."
         },
         ...
       ]
@@ -205,25 +196,40 @@ Below shows the preference dataset used in training the reward model.
 ]
 ```
 
-You can run the `examples/train_rm.sh` to start a reward model training.
-[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
+#### Step 2: Preprocessing
+Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./examples/data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
+
+#### Step 3: Training
+You can run [train_rm.sh](./examples/training_scripts/train_rm.sh) to start the reward model training.
 
-### RLHF Training Stage3 - Training model with reinforcement learning by human feedback
+### RLHF Training Stage3 - Proximal Policy Optimization
 
-Stage3 uses reinforcement learning algorithm, which is the most complex part of the training process:
+In stage3 we will use reinforcement learning algorithm--- Proximal Policy Optimization (PPO), which is the most complex part of the training process:
 
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/stage-3.jpeg" width=800/>
 </p>
 
-You can run the `examples/train_prompts.sh` to start training PPO with human feedback.
-[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)
+#### Step 1: Data Collection
+PPO uses two kind of training data--- the prompt data and the pretrain data (optional). The first dataset is mandatory, data samples within the prompt dataset ends with a line from "human" and thus the "assistant" needs to generate a response to answer to the "human". Note that you can still use conversation that ends with a line from the "assistant", in that case, the last line will be dropped. Here is an example of the prompt dataset format.
 
-**Note**: the required datasets follow the following format,
+```json
+[
+    {"messages":
+      [
+        {
+          "from": "human",
+          "content": "what are some pranks with a pen i can do?"
+        }
+        ...
+      ]
+    },
+]
+```
 
-- `pretrain dataset (for ptx loss in ppo)`
+The second dataset--- pretrained dataset is optional, provide it if you want to use the ptx loss introduced in the [InstructGPT paper](https://arxiv.org/abs/2203.02155). It follows the following format.
 
-  ```json
+```json
   [
       {
           "source": "", # system instruction
@@ -232,37 +238,62 @@ You can run the `examples/train_prompts.sh` to start training PPO with human fee
       ...
   ]
   ```
+#### Step 2: Data Preprocessing
+To prepare the prompt dataset for PPO training, simply run [prepare_prompt_dataset.sh](./examples/data_preparation_scripts/prepare_prompt_dataset.sh)
 
-- `prompt dataset`
+To prepare the pretrained dataset for PPO training, simply run [prepare_ptx_dataset.sh](./examples/data_preparation_scripts/prepare_ptx_dataset.sh)
 
-  ```json
-  # The format is the same with sft data. But the last sentence from assistant will be ignored. Only question (prompt) will be preserved. Therefore, even dataset without answer can be used as prompt dataset (e.g. red teaming data)
-  [
-    {"context":
-      [
-        {
-          "from": "human",
-          "content": "what are some pranks with a pen i can do?"
-        },
-        # Optional
-        {
-          "from": "assistant",
-          "content": "Are you looking for practical joke ideas?"
-        },
-        ...
-      ]
-    },
-    ...
-  ]
-  ```
+#### Step 3: Training
+You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration.
+
+```bash
+--pretrain $PRETRAINED_MODEL_PATH \
+--rm_pretrain $PRETRAINED_MODEL_PATH \ # reward model architectual
+--tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+--rm_checkpoint_path $REWARD_MODEL_PATH \ # reward model checkpoint path
+--prompt_dataset ${prompt_dataset[@]} \ # List of string
+--pretrain_dataset ${ptx_dataset[@]} \ # List of string
+--ptx_batch_size 1 \ # batch size for calculate ptx loss
+--ptx_coef 0.0 \ # none-zero if ptx loss is enable
+--num_episodes 2000 \ # number of episodes to train
+--num_collect_steps 1 \
+--num_update_steps 1 \
+--experience_batch_size 8 \
+--train_batch_size 4 \
+--accumulation_steps 2
+```
+
+Each episode has two phases, the collect phase and the update phase. During the collect phase, we will collect experiences (answers generated by actor), store those in ExperienceBuffer. Then data in ExperienceBuffer is used during the update phase to update parameter of actor and critic.
 
-For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).
+- Without tensor parallelism,
+```
+experience buffer size
+= num_process * num_collect_steps * experience_batch_size
+= train_batch_size * accumulation_steps * num_process
+```
+
+- With tensor parallelism,
+```
+num_tp_group = num_process / tp
+experience buffer size
+= num_tp_group * num_collect_steps * experience_batch_size
+= train_batch_size * accumulation_steps * num_tp_group
+```
 
-## Alternative Option For RLHF: DPO
+## Alternative Option For RLHF: Direct Preference Optimization
 
 For those seeking an alternative to Reinforcement Learning from Human Feedback (RLHF), Direct Preference Optimization (DPO) presents a compelling option. DPO, as detailed in the paper (available at [https://arxiv.org/abs/2305.18290](https://arxiv.org/abs/2305.18290)), DPO offers an low-cost way to perform RLHF and usually request less computation resources compares to PPO.
 
-For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).
+### DPO Training Stage1 - Supervised Instructs Tuning
+
+Please refer the [sft section](#dpo-training-stage1---supervised-instructs-tuning) in the PPO part.
+
+### DPO Training Stage2 - DPO Training
+#### Step 1: Data Collection & Preparation
+For DPO training, you only need the preference dataset. Please follow the instruction in the [preference dataset preparation section](#rlhf-training-stage2---training-reward-model) to prepare the preference data for DPO training.
+
+#### Step 2: Training
+You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training.
 
 ### Inference Quantization and Serving - After Training
 
diff --git a/applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py b/applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py
deleted file mode 100755
index e4f8e5310bb9..000000000000
--- a/applications/Chat/coati/dataset/closed_to_constant_length_spliced_dataset.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-ClosedToConstantLengthSplicedDataset for continue pretrain
-"""
-
-import random
-import warnings
-from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
-
-from datasets import dataset_dict
-from torch.utils.data import ConcatDataset, Dataset, IterableDataset
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from colossalai.logging import get_dist_logger
-
-logger = get_dist_logger()
-
-IGNORE_INDEX = -100
-
-DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
-
-
-class ClosedToConstantLengthSplicedDataset(IterableDataset):
-    """
-    Define an iterable dataset that returns a (close to) constant length data point spliced from multiple
-    original independent (pre-tokenized) data points.
-    """
-
-    def __init__(
-        self,
-        dataset: DSType,
-        tokenizer: PreTrainedTokenizer,
-        max_length: int = 4096,
-        num_packed_sequences: int = 8,
-        fetch_sequence_func: Callable[[Any], Tuple[List[int], List[int]]] = None,
-        input_ids_field: str = "input_ids",
-        labels_field: str = "labels",
-        infinite: bool = False,
-        shuffle: bool = True,
-        error_strict: bool = False,
-    ) -> None:
-        self.tokenizer = tokenizer
-        self.dataset = dataset
-        self.max_length = max_length
-        self.infinite = infinite
-        self.max_buffer_size = max_length * num_packed_sequences  # e.g., 4096 * 16
-        self.shuffle = shuffle
-
-        # Callable[[Dict[str, Any]], Tuple[List[int], List[int]]],
-        # A function that fetch sequence input_ids and labels from the original data point
-        if fetch_sequence_func is None:
-            self.fetch_sequence_func = lambda data_point: (data_point[input_ids_field], data_point[labels_field])
-        else:
-            self.fetch_sequence_func = fetch_sequence_func
-        self.input_ids_field = input_ids_field
-        self.labels_field = labels_field
-
-        self.error_strict = error_strict
-        self.current_size = 0  # `int`, current packed data size.
-
-    def __len__(self) -> int:
-        return len(self.dataset)
-
-    def __iter__(self) -> Iterable[Dict[str, List[int]]]:
-        iterator = iter(self.dataset)
-        more_data_points = True
-        while more_data_points is True:
-            buffer, buffer_len = [], 0
-            while True:
-                # ending condition.
-                if buffer_len >= self.max_buffer_size:
-                    break
-                try:
-                    # `Tuple[List[int], List[int]]`
-                    seq_input_ids, seq_labels = self.fetch_sequence_func(next(iterator))
-                    buffer.append({self.input_ids_field: seq_input_ids, self.labels_field: seq_labels})
-                    buffer_len += len(buffer[-1][self.input_ids_field])
-                except StopIteration:
-                    if self.infinite is True:
-                        iterator = iter(self.dataset)
-                        warnings.warn("The dataset reached end and the iterator is reset to the start.")
-                    else:
-                        more_data_points = False
-                        break
-            examples = []  # `List[Dict[str, List[int]]]`, save buffered spliced data points.
-            spliced_input_ids, spliced_labels = [], []  # `List[int]`, `List[int]`
-            for i, data_point in enumerate(buffer):
-                # TODO(2023-09-18) check errors for each unspliced tokenized data point
-                seq_input_ids = data_point[self.input_ids_field]
-                seq_labels = data_point[self.labels_field]
-                # Handle special case:
-                # If the length of an original data point (i.e., input_ids length of a data point before splicing)
-                # exceeds `max_length`, truncate it.
-                if len(seq_input_ids) > self.max_length:
-                    truncated_seq_input_ids = seq_input_ids[: self.max_length]
-                    truncated_label_ids = seq_labels[: self.max_length]
-                    if set(truncated_label_ids) == {IGNORE_INDEX}:
-                        if self.error_strict is True:
-                            raise ValueError(
-                                f"Find an out-of-bounds length({len(seq_input_ids)}) data point "
-                                f"with all label values as {IGNORE_INDEX}."
-                            )
-                        else:
-                            warnings.warn(f"Filter an error truncated data point (labels all {IGNORE_INDEX})")
-                            continue  # Skip the current error data point.
-                    spliced_data_point = {
-                        self.input_ids_field: truncated_seq_input_ids,
-                        self.labels_field: truncated_label_ids,
-                    }
-                    examples.append(spliced_data_point)
-                    warnings.warn("Find a data point to be truncated.")
-                    continue
-
-                # Pre action judgment.
-                if len(spliced_input_ids) + len(seq_input_ids) > self.max_length:
-                    spliced_data_point = {
-                        self.input_ids_field: spliced_input_ids,
-                        self.labels_field: spliced_labels,
-                    }  # `Dict[str, List[int]]`
-                    # Update.
-                    spliced_input_ids, spliced_labels = [], []
-                    spliced_input_ids.extend(seq_input_ids)
-                    spliced_labels.extend(seq_labels)
-                    examples.append(spliced_data_point)
-                else:
-                    spliced_input_ids.extend(seq_input_ids)
-                    spliced_labels.extend(seq_labels)
-            # For residual spliced data point at the end of the data set
-            if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
-                examples.append({self.input_ids_field: spliced_input_ids, self.labels_field: spliced_labels})
-            if self.shuffle:
-                random.shuffle(examples)
-            for spliced_data_point in examples:
-                # TODO(2023-09-18): check errors for each spliced tokenized data point.
-                self.current_size += 1
-                yield spliced_data_point
diff --git a/applications/Chat/coati/dataset/loader.py b/applications/Chat/coati/dataset/loader.py
index 4ef84d3e9cc0..96fdb6b68655 100755
--- a/applications/Chat/coati/dataset/loader.py
+++ b/applications/Chat/coati/dataset/loader.py
@@ -262,7 +262,7 @@ def __init__(
             )
         else:
             # adapted from https://github.com/pytorch/pytorch/blob/4979f9c0d72490970e2019bb1d2284f83d93f76b/torch/utils/data/distributed.py#L62
-            # TODO: support tp_group>1
+            # TODO: support tp_group>1. will fix it later
             num_replicas = 1
             if rank is None:
                 rank = dist.get_rank()
diff --git a/applications/Chat/coati/models/base.py b/applications/Chat/coati/models/base.py
index 1c4da9ba0744..26222edb9099 100755
--- a/applications/Chat/coati/models/base.py
+++ b/applications/Chat/coati/models/base.py
@@ -34,17 +34,6 @@ def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] =
             raise ValueError("Either pretrained or config must be provided.")
 
         self.config = self.model.config
-        # if self.model.config.architectures[0] == "GPT2LMHeadModel":
-        #     self.last_hidden_state_size = self.model.config.n_embd
-        # elif self.model.config.architectures[0] == "BloomForCausalLM":
-        #     self.last_hidden_state_size = self.model.config.hidden_size
-        # elif self.model.config.architectures[0] == "LlamaForCausalLM":
-        #     self.last_hidden_state_size = self.model.config.hidden_size
-        # elif self.model.config.architectures[0] == "OPTForCausalLM":
-        #     self.last_hidden_state_size = self.model.config.word_embed_proj_dim
-        # else:
-        #     raise ValueError(f"Unsupported model architecture. {self.model.config.architectures[0]}")
-
         # create dummy input to get the size of the last hidden state
         dummy_input = torch.zeros((1, 1), dtype=torch.long).to(self.model.device)
         out = self.model(dummy_input)
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index 0bce538a5b9c..27bccb0bc1a4 100755
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -62,6 +62,7 @@ def generate(
     generation_kwargs["max_new_tokens"] = max_new_tokens
     model_unwrap = model.unwrap()
     model_unwrap.generation_config.pad_token_id = tokenizer.pad_token_id
+    # use the default generate function
     input_ids = model_unwrap.generate(
         input_ids=input_ids, attention_mask=input_ids.ne(tokenizer.pad_token_id), **generation_kwargs
     )
diff --git a/applications/Chat/coati/trainer/dpo.py b/applications/Chat/coati/trainer/dpo.py
index 2aa5cd98ecb9..4c62c1053981 100755
--- a/applications/Chat/coati/trainer/dpo.py
+++ b/applications/Chat/coati/trainer/dpo.py
@@ -117,7 +117,6 @@ def _train(self, epoch: int):
         )
         for i, batch in enumerate(self.train_dataloader):
             batch = to_device(batch, self.device)
-            # print(batch)
             (
                 chosen_input_ids,
                 chosen_attention_mask,
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index 1ec7b036b316..704a8553f5a2 100755
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -270,7 +270,7 @@ def _training_step(self, experience: Experience):
                     try:
                         self.wandb_run.log({"sample_response": my_table})
                     except OSError as e:
-                        print(e)
+                        self.coordinator.print_on_master(e)
 
             if self.writer and is_rank_0():
                 self.writer.add_scalar("train/max_ratio", self.accumulative_meter.get("max_ratio"), self.num_train_step)
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py
index e0a57f4efdbe..0a5dabba5c1e 100755
--- a/applications/Chat/coati/trainer/rm.py
+++ b/applications/Chat/coati/trainer/rm.py
@@ -198,7 +198,6 @@ def _eval(self, epoch):
         with torch.no_grad():
             for i, batch in enumerate(self.eval_dataloader):
                 batch = to_device(batch, self.device)
-                # print(batch)
                 (
                     chosen_input_ids,
                     chosen_attention_mask,
diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py
index a58edcd85195..0661b86ebec7 100755
--- a/applications/Chat/coati/trainer/utils.py
+++ b/applications/Chat/coati/trainer/utils.py
@@ -62,11 +62,3 @@ def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
 def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:
     dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
     return tensor
-
-
-def prepare_data_for_inference(
-    chosen_input_ids, reject_input_ids, chosen_attention_mask, reject_attention_mask, tokenizer
-):
-    # This function will truncate each vector in the batch after the first zero
-    assert tokenizer.bos_token_id != tokenizer.pad_token_id, "This bos token should not be the same as the pad token"
-    return None, None
diff --git a/applications/Chat/examples/=2.0.0, b/applications/Chat/examples/=2.0.0,
deleted file mode 100755
index e69de29bb2d1..000000000000
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md
index b626f3fb4c6a..b9f6b7302b80 100755
--- a/applications/Chat/examples/README.md
+++ b/applications/Chat/examples/README.md
@@ -5,30 +5,29 @@
 - [Examples](#examples)
   - [Table of Contents](#table-of-contents)
   - [Install Requirements](#install-requirements)
-  - [Supervised Datasets Collection](#supervised-datasets-collection)
-    - [Conversation Dataset Generation](#conversation-dataset-generation)
-  - [Task I: Supervised Instruction Tuning](#task-i-supervised-instructs-tuning)
-  - [Task II: Reinforcement Learning from Human Feedback](#task-ii-reinforcement-learning-from-human-feedback)
-    - [Stage1 - Supervised instructs tuning](#stage1---supervised-instructs-tuning)
-      - [Arg List](#arg-list)
-    - [Stage2 - Training reward model](#stage2---training-reward-model)
-      - [Features and tricks in RM training](#features-and-tricks-in-rm-training)
-      - [Experiment result](#experiment-result)
-      - [Arg List](#arg-list-1)
-    - [Stage3 - Training model using prompts with RL](#stage3---training-model-using-prompts-with-rl)
-      - [Arg List](#arg-list-2)
-  - [Inference example - After Stage3](#inference-example---after-stage3)
+  - [Training Configuration](#training-configuration)
+  - [RLHF Stage 1: Supervised Instruction Tuning](#rlhf-training-stage1---supervised-instructs-tuning)
+    - [Step 1: Data Collection](#step-1-data-collection)
+    - [Step 2: Preprocessing](#step-2-preprocessing)
+    - [Step 3: Training](#step-3-training)
+  - [RLHF Stage 2: Training Reward Model](#rlhf-training-stage2---training-reward-model)
+    - [Step 1: Data Collection](#step-1-data-collection-1)
+    - [Step 2: Preprocessing](#step-2-preprocessing-1)
+    - [Step 3: Training](#step-3-training-1)
+    - [Features and Tricks in RM Training](#features-and-tricks-in-rm-training)
+  - [RLHF Stage 3: Proximal Policy Optimization](#rlhf-training-stage3---proximal-policy-optimization)
+    - [Step 1: Data Collection](#step-1-data-collection-2)
+    - [Step 2: Preprocessing](#step-2-preprocessing-2)
+    - [Step 3: Training](#step-3-training-3)
+  - [PPO Training Results](#sample-training-results-using-default-script)
+    - [Reward](#reward)
+    - [KL Divergence](#approximate-kl-divergence)
+  - [Note on PPO Training](#note-on-ppo-training)
+  - [Alternative Option For RLHF: Direct Preference Optimization](#alternative-option-for-rlhf-direct-preference-optimization)
+    - [DPO Stage 1: Supervised Instruction Tuning](#dpo-training-stage1---supervised-instructs-tuning)
+    - [DPO Stage 2: DPO Training](#dpo-training-stage2---dpo-training)
+  - [Inference example](#inference-example)
   - [Attention](#attention)
-    - [data](#data)
-  - [Support Model](#support-model)
-    - [GPT](#gpt)
-    - [BLOOM](#bloom)
-    - [OPT](#opt)
-    - [LLaMA](#llama)
-  - [Add your own models](#add-your-own-models)
-    - [Actor model](#actor-model)
-    - [Reward model](#reward-model)
-    - [Critic model](#critic-model)
 
 ---
 
@@ -38,6 +37,7 @@
 pip install -r requirements.txt
 ```
 
+
 ## Get Start with ColossalRun
 
 You can use colossalai run to launch multi-nodes training:
@@ -60,205 +60,335 @@ Make sure master node can access all nodes (including itself) by ssh without pas
 - nproc-per-node: specifies the number of processes to be launched per node
 - rdzv-endpoint: address of the host node
 
+### Training Configuration
 
-## Supervised datasets collection
+This section gives a simple introduction on different training strategies that you can use and how to use them with our boosters and plugins to reduce training time and VRAM consumption. For more detail regarding training strategies, please refer to [here](https://colossalai.org/docs/concepts/paradigms_of_parallelism). For details regarding boosters and plugins, please refer to [here](https://colossalai.org/docs/basics/booster_plugins).
 
-We collected 104K bilingual datasets of Chinese and English, and you can find the datasets in this repo
-[InstructionWild](https://github.com/XueFuzhao/InstructionWild) and in this [file](https://github.com/XueFuzhao/InstructionWild/blob/main/data/README.md).
 
-Here is how we collected the data
+<details><summary><b>Gemini</b></summary>
 
-<p align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/>
-</p>
+This plugin implements Zero-3 with chunk-based and heterogeneous memory management. It can train large models without much loss in speed. It also does not support local gradient accumulation. More details can be found in [Gemini Doc](https://colossalai.org/docs/features/zero_with_chunk).
 
-### Conversation dataset generation
-
-In order to further improve the model's ability to handle multi-turn conversations, we need to include samples with multi-turn conversations in the dataset. However, the samples in InstructWild and Alpaca datasets currently consist of only single-turn conversations, and their dataset organization is not suitable for storing multi-turn conversations. Additionally, after converting the aforementioned datasets, we also need to include multi-turn conversation datasets like ShareGPT, and we should transform them into the training format supported by ColossalChat.
+Below shows how to use the gemini in SFT training.
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin gemini \
+    --batch_size 4 \
+    --max_epochs 1 \
+    --accumulation_steps 1 \  # the gradient accumulation has to be disabled
+    --lr 2e-5 \
+    --max_len 2048 \
+    --use_wandb
+```
 
-A sample of conversation dataset should have the following fields:
+</details>
 
-- `type` (str, optional): The type of the data sample.
-- `language` (str, optional): The language of the data sample.
-- `dataset` (str, optional): The dataset the data sample originates from.
-- `conversations` (str, compulsory): Conversation content of the data sample.
-- `id` (int, optional): The ID of the data sample.
+<details><summary><b>Gemini-Auto</b></summary>
 
-A simple example:
+This option use gemini and will automatically offload tensors with low priority to cpu. It also does not support local gradient accumulation. More details can be found in [Gemini Doc](https://colossalai.org/docs/features/zero_with_chunk).
 
-```json
-{
-  "type": "instruction",
-  "language": "English",
-  "dataset": "Alpaca",
-  "conversations": [
-    {
-      "from": "human",
-      "value": "Give three tips for staying healthy."
-    },
-    {
-      "from": "gpt",
-      "value": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."
-    }
-  ],
-  "id": 1
-}
+Below shows how to use the gemin-auto in SFT training.
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin gemini_auto \
+    --batch_size 4 \
+    --max_epochs 1 \
+    --accumulation_steps 1 \  # the gradient accumulation has to be disabled
+    --lr 2e-5 \
+    --max_len 2048 \
+    --use_wandb
 ```
 
-> **NOTE:** Only key `conversations` is compulsary for training and other keys serve as metadata. The length of `conversations` varies.
+</details>
 
-You can run the `examples/generate_conversation_dataset.py` to generate a conversation dataset supported by ColossalChat.
+</details>
 
-You can use the following cmd to generate conversation dataset.
+<details><summary><b>Zero2</b></summary>
 
-```bash
-python generate_conversation_dataset.py \
-    --dataset "All"
-    --save_path "/path/to/dataset"
+This option will distribute the optimizer parameters and the gradient to multiple GPUs and won't offload weights to cpu. It uses reduce and gather to synchronize gradients and weights. It does not support local gradient accumulation. Though you can accumulate gradient if you insist, it cannot reduce communication cost. That is to say, it's not a good idea to use Zero-2 with pipeline parallelism.
+
+Below shows how to use the zero2 in SFT training.
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2 \
+    --batch_size 4 \
+    --max_epochs 1 \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
+    --use_wandb
 ```
 
-## Task I: Supervised Instructs Tuning
+</details>
 
-In the task of supervised instructs fine-tuning, we will uses the datasets mentioned earlier to fine-tune the model.
-[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
 
-You can run the `examples/train_sft.sh` to start a supervised instructs fine-tuning.
+<details><summary><b>Zero2CPU</b></summary>
 
-You can also use the following cmd to start a supervised instructs fine-tuning with your own settings.
+This option will distribute the optimizer parameters and the gradient to multiple GPUs as well as offload parameters to cpu. It does not support local gradient accumulation. Though you can accumulate gradient if you insist, it cannot reduce communication cost.
 
-```bash
-colossalai run --nproc_per_node 1 --hostfile ./hostfile train_sft.py \
-    --pretrain "/path/to/LLaMa-7B/" \
-    --model 'llama' \
-    --strategy colossalai_zero2 \
-    --save_path  /path/to/Coati-7B \
-    --dataset /path/to/data.json \
+Below shows how to use the zero2-cpu in SFT training.
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2_cpu \
     --batch_size 4 \
-    --accumulation_steps 8 \
-    --lr 2e-5 \
-    --max_datasets_size 512 \
     --max_epochs 1 \
-    --grad_checkpoint \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
     --use_wandb
 ```
 
-**Note**: the supervised dataset follows the following format,
+</details>
 
-```json
-[
-    {
-        "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
-        "input": "",
-        "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
-        "id": 0
-    },
-    ...
-]
-```
+<details><summary><b>Tensor Parallelism</b></summary>
 
-### Arg List
-- `--strategy`: the strategy using for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
-- `--model`: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
-- `--pretrain`: pretrain model, type=str, default=None
-- `--max_datasets_size`: the max size of dataset, type=int, default=None
-- `--save_path`: path to save the model, type=str, default='output'
-- `--need_optim_ckpt`: whether to save optim ckpt, type=bool, default=False
-- `--max_epochs`: max epochs for training, type=int, default=3
-- `--batch_size`: batch size while training, type=int, default=4
-- `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
-- `--grad_checkpoint`: enable gradient checkpointing, type=bool, default=False
-- `use_wandb`: whether to use [wandb](https://wandb.ai/site)
+This option support Tensor Parallelism (TP). Note that if you want to use TP, zero and pipeline parellelism will be disabled. TP split large model weights/optimizer parameters/gradients into multiple small ones and distributes them to multiple GPUs, hence it is recommanded to use TP when your model is large (e.g. 20B and above) or your training algorithm consumes a lot of memory (e.g. PPO).
 
-## Task II: Reinforcement Learning from Human Feedback
-### Stage1 - Supervised Instructs Tuning
+Below shows how to use the TP in PPO training.
+```
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30039 train_ppo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --rm_pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --rm_checkpoint_path $REWARD_MODEL_PATH \
+    --prompt_dataset ${prompt_dataset[@]} \
+    --pretrain_dataset ${ptx_dataset[@]} \
+    --ptx_batch_size 1 \
+    --ptx_coef 0.0 \
+    --plugin "zero2" \
+    --save_interval 200 \
+    --save_path $SAVE_DIR \
+    --num_episodes 2000 \
+    --num_collect_steps 4 \
+    --num_update_steps 1 \
+    --experience_batch_size 8 \
+    --train_batch_size 4 \
+    --accumulation_steps 8 \
+    --tp 4 \ # TP size, nproc_per_node must be divisible by it
+    --lr 9e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --grad_checkpoint \
+    --use_wandb
+```
 
-The first stage of RLHF is supervised instructs fine-tuning (SFT). This stage is basically the same as the first task, which uses the same datasets but with different prompt format.
+</details>
 
-You can run the `examples/train_rlhf_sft.sh` to start a supervised instructs fine-tuning.
 
-You can also use the following cmd to start a supervised instructs fine-tuning with your own settings.
+<details><summary><b>Gradient Checkpointing</b></summary>
 
-```bash
-colossalai run --nproc_per_node 1 --hostfile ./hostfile train_rlhf_sft.py \
-    --pretrain "gpt2" \
-    --model 'gpt2' \
-    --strategy colossalai_zero2 \
-    --save_path 'path to a directory where you want to stre the weights of the model' \
-    --dataset 'path to your dataset, which should be a json file' \
+This option saves VRAM consumption by selectively recomputing some of the intermediate value on-the-fly during the backward pass, rather than storing them in memory.
+
+To enable gradient checkpointing, add --grad_checkpoint to your training script.
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2_cpu \
     --batch_size 4 \
-    --accumulation_steps 8 \
+    --max_epochs 1 \
+    --accumulation_steps 4 \
     --lr 2e-5 \
-    --max_datasets_size 60000 \
+    --max_len 2048 \
+    --grad_checkpoint \ # This enables gradient checkpointing
+    --use_wandb
+```
+
+</details>
+
+<details><summary><b>Flash Attention</b></summary>
+
+Details about flash attention can be found in the paper: [FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https://arxiv.org/abs/2205.14135).
+
+To enable flash attention, add --use_flash_attn to your training script.
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2_cpu \
+    --batch_size 4 \
     --max_epochs 1 \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
+    --use_flash_attn \ # This enables flash attention
     --use_wandb
 ```
 
-**Note**: the supervised dataset follows the same format as in Task I.
+</details>
 
-### Arg List
+<details><summary><b>Low Rank Adaption</b></summary>
 
-The same as in Task I.
+Details about Low Rank Adaption (LoRA) can be found in the paper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685). It dramatically reduce the VRAM consumption at the cost of sacrifice model capability. It is suitable for training LLM with constrained resources.
 
+To enable LoRA, set --lora_rank to a positive value (usually between 20 and 64).
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2_cpu \
+    --batch_size 4 \
+    --max_epochs 1 \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
+    --lora_rank 32 \ # This enables LoRA
+    --use_wandb
+```
 
-### Stage2 - Training reward model
+</details>
 
-We train a reward model in stage 2, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
-[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
+<details><summary><b>Other Training Arguments</b></summary>
 
-You can run the `examples/train_rm.sh` to start a reward model training.
+- grad_clip: gradient larger than this value will be clipped.
+- weight_decay: weight decay hyper-parameter.
+- warmup_steps: number of warmup steps used in setting up the learning rate schedualer.
+- pretrain: pretrain model path, weights will be loaded from this pretrained model unless checkpoint_path is provided.
+- tokenizer_dir: specify where to load the tokenizer, if not provided, tokenizer will be loaded from pretrain model path.
+- dataset: a list of strings, each is a path to a folder contains buffered dataset files in arrow format.
+- checkpoint_path: if provided, will load weights from the checkpoint_path.
+- config_file: path to store the training config file.
+- save_dir: path to store the model checkpoints.
+- max_length: input will be padded/truncate to max_length before feeding to the model.
+- max_epochs: number of epoch to train
+- batch_size: training batch size
+- mixed_precision: precision to use in training. Support 'fp16' and 'bf16'. Note that some device may not support the 'bf16' option, please refer to [Nvidia](https://developer.nvidia.com/) to check compatability.
+- save_interval: save the model weights as well as optimizer/schedualer states every save_interval steps/episodes
+- merge_lora_weights: whether to merge lora weights before saving the model
+- lr: the learning rate used in training
+- accumulation_steps: accumulate gradient every accumulation_steps
+- log_dir: path to store the log
+- use_wandb: if this flag is up, you can view logs on wandb.
 
-You can also use the following cmd to start training a reward model.
+</details>
 
-```bash
-colossalai run --nproc_per_node 1 --hostfile ./hostfile train_reward_model.py \
-    --pretrain "/path/to/LLaMa-7B/" \
-    --model 'llama' \
-    --strategy colossalai_zero2 \
-    --loss_fn 'log_exp'\
-    --save_path 'rmstatic.pt' \
+### RLHF Training Stage1 - Supervised Instructs Tuning
+
+Stage1 is supervised instructs fine-tuning (SFT). This step is a crucial part of the RLHF training process, as it involves training a machine learning model using human-provided instructions to learn the initial behavior for the task at hand. Here's a detailed guide on how to SFT your LLM with ColossalChat:
+
+#### Step 1: Data Collection
+The first step in Stage 1 is to collect a dataset of human demonstrations of the following format.
+
+```json
+[
+    {"messages":
+      [
+        {
+          "from": "human",
+          "content": "what are some pranks with a pen i can do?"
+        },
+        {
+          "from": "assistant",
+          "content": "Are you looking for practical joke ideas?"
+        },
+        ...
+      ]
+    },
+    ...
+]
 ```
 
-### Features and tricks in RM training
+#### Step 2: Preprocessing
+Once you have collected your SFT dataset, you will need to preprocess it. This involves four steps: data cleaning, data deduplication, formating and tokenization. In this code, we will focus on formating and tokenization. The formating step adopts our elaborately designed conversation template to convert the raw conversation to the following strutured input.
 
-- We support [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf)and[rm-static](https://huggingface.co/datasets/Dahoas/rm-static) datasets.
-- We support 2 kinds of loss function named `log_sig`(used by OpenAI) and `log_exp`(used by Anthropic).
-- We change the loss to `valid_acc` and `pair_dist` to monitor progress during training.
-- We add special token to the end of the sequence to get better result.
-- We use cosine-reducing lr-scheduler for RM training.
-- We set value_head as 1 liner layer and initialize the weight of value_head using N(0，1/(d_model + 1)) distribution.
-- We train a Bloom-560m reward model for 1 epoch and find the test acc of the model achieve the performance mentions in [Anthropics paper](https://arxiv.org/abs/2204.05862).
+```
+<s> A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
 
-### Experiment result
+Human: <s> what are some pranks with a pen i can do?</s> Assistant: <s> Are you looking for practical joke ideas?</s>
+...
+```
+The tokenization step tokenize the formatted conversation, calculate input_ids, labels, attention_masks and buffer those into dataset files. We provide scripts for data formatting and tokenization for SFT. Simply run the [prepare_sft_dataset.sh](./examples/data_preparation_scripts/prepare_sft_dataset.sh). Read the training configuration section for supported training strategies.
 
-Model performance in [Anthropics paper](https://arxiv.org/abs/2204.05862):
+#### Step 3: Training
+Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./examples/training_scripts/train_sft.sh) to start a supervised instructs fine-tuning. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
-<div align=middle> <img width="512" alt="image" src="https://user-images.githubusercontent.com/70618399/225263321-8d64c3a8-6877-4cc8-9b61-0e1c52d3d94f.png">
+### RLHF Training Stage2 - Training Reward Model
 
-<div align=left>Our training & test result of bloom-560m for 1 epoch:
+Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
 
-<div align=middle> <img width="512" alt="image" src="https://user-images.githubusercontent.com/70618399/225262950-a7f0a686-25de-44ec-98f2-11b83ea86674.png">
+#### Step 1: Data Collection
+Below shows the preference dataset format used in training the reward model.
 
-<div align=left>We also train the reward model based on LLaMA-7B, which reaches the ACC of 72.06% after 1 epoch, performing almost the same as Anthropic's best RM.
+```json
+[
+    {"context": [
+        {
+          "from": "human",
+          "content": "Introduce butterflies species in Oregon."
+        }
+      ]
+      "chosen": [
+        {
+          "from": "assistant",
+          "content": "About 150 species of butterflies live in Oregon, with about 100 species are moths..."
+        },
+        ...
+      ],
+      "rejected": [
+        {
+          "from": "assistant",
+          "content": "Are you interested in just the common butterflies?  There are a few common ones which will be easy to find..."
+        },
+        ...
+      ]
+    },
+    ...
+]
+```
 
-### Arg List
+#### Step 2: Preprocessing
+Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./examples/data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
 
-- `--strategy`: the strategy using for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
-- `--model`: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
-- `--pretrain`: pretrain model, type=str, default=None
-- `--model_path`: the path of rm model(if continue to train), type=str, default=None
-- `--save_path`: path to save the model, type=str, default='output'
-- `--need_optim_ckpt`: whether to save optim ckpt, type=bool, default=False
-- `--max_epochs`: max epochs for training, type=int, default=3
-- `--dataset`: dataset name, type=str, choices=['Anthropic/hh-rlhf', 'Dahoas/rm-static']
-- `--subset`: subset of the dataset, type=str, default=None
-- `--batch_size`: batch size while training, type=int, default=4
-- `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
-- `--loss_func`: which kind of loss function, choices=['log_sig', 'log_exp']
-- `--max_len`: max sentence length for generation, type=int, default=512
-- `--use_wandb`: whether to use wandb
+#### Step 3: Training
+You can run [train_rm.sh](./examples/training_scripts/train_rm.sh) to start the reward model training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
+#### Features and Tricks in RM Training
 
-### Note on Reward Model Training
+- We recommand using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf)and[rm-static](https://huggingface.co/datasets/Dahoas/rm-static) datasets for training the reward model.
+- We support 2 kinds of loss function named `log_sig`(used by OpenAI) and `log_exp`(used by Anthropic).
+- We log the training accuracy `train/acc`, `reward_chosen` and `reward_rejected` to monitor progress during training.
+- We use cosine-reducing lr-scheduler for RM training.
+- We set value_head as 1 liner layer and initialize the weight of value_head using N(0，1/(d_model + 1)) distribution.
+
+#### Note on Reward Model Training
 
 Before you move on the next stage, please check the following list to ensure that your reward model is stable and robust. You can check the reward chart and the accuracy chart on wandb.
 - The mean reward for chosen data is much higher than those for rejected data
@@ -270,73 +400,84 @@ Your training reward curves should look similar to the following charts.
 <img width="1000" alt="image" src="https://raw.githubusercontent.com/YeAnbang/imagehostingrepo/main/mean_reward_chart.png">
 </p>
 
-## Stage3 - Training model using prompts with RL
+### RLHF Training Stage3 - Proximal Policy Optimization
 
-Stage3 uses reinforcement learning algorithm, which is the most complex part of the training process, as shown below:
+In stage3 we will use reinforcement learning algorithm--- Proximal Policy Optimization (PPO), which is the most complex part of the training process:
 
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/stage-3.jpeg" width=800/>
 </p>
 
-You can run the `examples/train_prompts.sh` to start PPO training.
-
-You can also use the cmd following to start PPO training.
-[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)
-
-
-PPO Training Script
-```bash
-colossalai run --nproc_per_node 1 --hostfile ./hostfile train_prompts.py \
-    --pretrain_dataset 'path to sft dataset used in stage 1'  \
-    --prompt_dataset 'dataset that contains prompt (queries) for PPO training' \
-    --strategy colossalai_zero2 \
-    --num_episodes 8000 --num_collect_steps 1 --num_update_steps 1 \
-    --experience_batch_size 32 \
-    --train_batch_size 32 \
-    --save_path 'path to save the trained model' \
-    --ptx_coef 0.0 \
-    --rm_model 'gpt2' \
-    --rm_pretrain 'gpt2' \
-    --rm_path 'path to reward model trained in stage 2' \
-    --reward_model_tokenizer 'gpt2' \
-    --pretrain '/home/lcyab/data/Anthropic_rlhf/actor/pretrain_v3' \
-    --use_wandb
+#### Step 1: Data Collection
+PPO uses two kind of training data--- the prompt data and the pretrain data (optional). The first dataset is mandatory, data samples within the prompt dataset ends with a line from "human" and thus the "assistant" needs to generate a response to answer to the "human". Note that you can still use conversation that ends with a line from the "assistant", in that case, the last line will be dropped. Here is an example of the prompt dataset format.
 
+```json
+[
+    {"messages":
+      [
+        {
+          "from": "human",
+          "content": "what are some pranks with a pen i can do?"
+        }
+        ...
+      ]
+    },
+]
 ```
-Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/generate_prompt_dataset.py) which samples `instinwild_en.json` or `instinwild_ch.json` in [InstructionWild](https://github.com/XueFuzhao/InstructionWild/tree/main/data#instructwild-data) to generate the prompt dataset.
-Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning.
-
-**Note**: the required datasets follow the following format,
 
-- `pretrain dataset`
+The second dataset--- pretrained dataset is optional, provide it if you want to use the ptx loss introduced in the [InstructGPT paper](https://arxiv.org/abs/2203.02155). It follows the following format.
 
-  ```json
+```json
   [
       {
-          "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
-          "input": "",
-          "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
-          "id": 0
+          "source": "", # system instruction
+          "Target": "Provide a list of the top 10 most popular mobile games in Asia\nThe top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
       },
       ...
   ]
   ```
+#### Step 2: Preprocessing
+To prepare the prompt dataset for PPO training, simply run [prepare_prompt_dataset.sh](./examples/data_preparation_scripts/prepare_prompt_dataset.sh)
 
-- `prompt dataset`
+To prepare the pretrained dataset for PPO training, simply run [prepare_ptx_dataset.sh](./examples/data_preparation_scripts/prepare_ptx_dataset.sh)
+
+#### Step 3: Training
+You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+
+```bash
+--pretrain $PRETRAINED_MODEL_PATH \
+--rm_pretrain $PRETRAINED_MODEL_PATH \ # reward model architectual
+--tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+--rm_checkpoint_path $REWARD_MODEL_PATH \ # reward model checkpoint path
+--prompt_dataset ${prompt_dataset[@]} \ # List of string
+--pretrain_dataset ${ptx_dataset[@]} \ # List of string
+--ptx_batch_size 1 \ # batch size for calculate ptx loss
+--ptx_coef 0.0 \ # none-zero if ptx loss is enable
+--num_episodes 2000 \ # number of episodes to train
+--num_collect_steps 1 \
+--num_update_steps 1 \
+--experience_batch_size 8 \
+--train_batch_size 4 \
+--accumulation_steps 2
+```
+
+Each episode has two phases, the collect phase and the update phase. During the collect phase, we will collect experiences (answers generated by actor), store those in ExperienceBuffer. Then data in ExperienceBuffer is used during the update phase to update parameter of actor and critic.
+
+- Without tensor parallelism,
+```
+experience buffer size
+= num_process * num_collect_steps * experience_batch_size
+= train_batch_size * accumulation_steps * num_process
+```
+
+- With tensor parallelism,
+```
+num_tp_group = num_process / tp
+experience buffer size
+= num_tp_group * num_collect_steps * experience_batch_size
+= train_batch_size * accumulation_steps * num_tp_group
+```
 
-  ```json
-  [
-      {
-          "instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
-          "id": 0
-      },
-      {
-          "instruction": "Write a descriptive paragraph about a memorable vacation you went on",
-          "id": 1
-      },
-      ...
-  ]
-  ```
 ### Sample Training Results Using Default Script
 #### Reward
 <p align="center">
@@ -361,153 +502,27 @@ Answer: The causes to this problem are two-fold. Check your reward model, make s
 #### Q4: Generation is garbage
 Answer: Yes, this happens and is well documented by other implementations. After training for too many episodes, the actor gradually deviate from its original state, which may leads to decrease in language modeling capabilities. A way to fix this is to add suppervised loss during PPO. Set ptx_coef to a none-zero value (between 0 and 1), which balances PPO loss and sft loss.
 
-### Arg List
-
-- `--strategy`: the strategy using for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
-- `--model`: model type of actor, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
-- `--pretrain`: pretrain model, type=str, default=None
-- `--rm_model`: reward model type, type=str, choices=['gpt2', 'bloom', 'opt', 'llama'], default=None
-- `--rm_pretrain`: pretrain model for reward model, type=str, default=None
-- `--rm_path`: the path of rm model, type=str, default=None
-- `--save_path`: path to save the model, type=str, default='output'
-- `--prompt_dataset`: path of the prompt dataset, type=str, default=None
-- `--pretrain_dataset`: path of the ptx dataset, type=str, default=None
-- `--need_optim_ckpt`: whether to save optim ckpt, type=bool, default=False
-- `--num_episodes`: num of episodes for training, type=int, default=10
-- `--num_update_steps`: number of steps to update policy per episode, type=int
-- `--num_collect_steps`: number of steps to collect experience per episode, type=int
-- `--train_batch_size`: batch size while training, type=int, default=8
-- `--ptx_batch_size`: batch size to compute ptx loss, type=int, default=1
-- `--experience_batch_size`: batch size to make experience, type=int, default=8
-- `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
-- `--kl_coef`: kl_coef using for computing reward, type=float, default=0.1
-- `--ptx_coef`: ptx_coef using for computing policy loss, type=float, default=0.9
-- `--use_wandb`
-
-## Inference example - After Stage3
-
-We support different inference options, including int8 and int4 quantization.
-For details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
-
-## Attention
-
-The examples are demos for the whole training process.You need to change the hyper-parameters to reach great performance.
-
-#### data
-
-- [x] [rm-static](https://huggingface.co/datasets/Dahoas/rm-static)
-- [x] [hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf)
-- [ ] [openai/summarize_from_feedback](https://huggingface.co/datasets/openai/summarize_from_feedback)
-- [ ] [openai/webgpt_comparisons](https://huggingface.co/datasets/openai/webgpt_comparisons)
-- [ ] [Dahoas/instruct-synthetic-prompt-responses](https://huggingface.co/datasets/Dahoas/instruct-synthetic-prompt-responses)
-
-## Support Model
-
-### GPT
 
-- [x] GPT2-S (s)
-- [x] GPT2-M (m)
-- [x] GPT2-L (l)
-- [x] GPT2-XL (xl)
-- [x] GPT2-4B (4b)
-- [ ] GPT2-6B (6b)
+## Alternative Option For RLHF: Direct Preference Optimization
 
-### BLOOM
+For those seeking an alternative to Reinforcement Learning from Human Feedback (RLHF), Direct Preference Optimization (DPO) presents a compelling option. DPO, as detailed in the paper (available at [https://arxiv.org/abs/2305.18290](https://arxiv.org/abs/2305.18290)), DPO offers an low-cost way to perform RLHF and usually request less computation resources compares to PPO.
 
-- [x] [BLOOM-560m](https://huggingface.co/bigscience/bloom-560m)
-- [x] [BLOOM-1b1](https://huggingface.co/bigscience/bloom-1b1)
-- [x] [BLOOM-3b](https://huggingface.co/bigscience/bloom-3b)
-- [x] [BLOOM-7b](https://huggingface.co/bigscience/bloom-7b1)
-- [ ] [BLOOM-175b](https://huggingface.co/bigscience/bloom)
+### DPO Training Stage1 - Supervised Instructs Tuning
 
-### OPT
+Please refer the [sft section](#dpo-training-stage1---supervised-instructs-tuning) in the PPO part.
 
-- [x] [OPT-125M](https://huggingface.co/facebook/opt-125m)
-- [x] [OPT-350M](https://huggingface.co/facebook/opt-350m)
-- [x] [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b)
-- [x] [OPT-2.7B](https://huggingface.co/facebook/opt-2.7b)
-- [x] [OPT-6.7B](https://huggingface.co/facebook/opt-6.7b)
-- [ ] [OPT-13B](https://huggingface.co/facebook/opt-13b)
-- [ ] [OPT-30B](https://huggingface.co/facebook/opt-30b)
+### DPO Training Stage2 - DPO Training
+#### Step 1: Data Collection & Preparation
+For DPO training, you only need the preference dataset. Please follow the instruction in the [preference dataset preparation section](#rlhf-training-stage2---training-reward-model) to prepare the preference data for DPO training.
 
-### [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md)
+#### Step 2: Training
+You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
-- [x] LLaMA-7B
-- [x] LLaMA-13B
-- [ ] LLaMA-33B
-- [ ] LLaMA-65B
+## Inference example
 
-## Add your own models
-
-If you want to support your own model in Coati, please refer the pull request for RoBERTa support as an example --[[chatgpt] add pre-trained model RoBERTa for RLHF stage 2 & 3](https://github.com/hpcaitech/ColossalAI/pull/3223), and submit a PR to us.
-
-You should complete the implementation of four model classes, including Reward model, Critic model, LM model, Actor model
-
-here are some example code for a NewModel named `Coati`.
-if it is supported in huggingface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o
-r you can build your own model by yourself.
-
-### Actor model
-
-```python
-from ..base import Actor
-from transformers.models.coati import CoatiModel
-
-class CoatiActor(Actor):
-    def __init__(self,
-                 pretrained: Optional[str] = None,
-                 checkpoint: bool = False,
-                 lora_rank: int = 0,
-                 lora_train_bias: str = 'none') -> None:
-        if pretrained is not None:
-            model = CoatiModel.from_pretrained(pretrained)
-        else:
-            model = build_model() # load your own model if it is not support in transformers
-
-        super().__init__(model, lora_rank, lora_train_bias)
-```
-
-### Reward model
-
-```python
-from ..base import RewardModel
-from transformers.models.coati import CoatiModel
-
-class CoatiRM(RewardModel):
-
-    def __init__(self,
-                 pretrained: Optional[str] = None,
-                 checkpoint: bool = False,
-                 lora_rank: int = 0,
-                 lora_train_bias: str = 'none') -> None:
-        if pretrained is not None:
-            model = CoatiModel.from_pretrained(pretrained)
-        else:
-            model = build_model() # load your own model if it is not support in transformers
+We support different inference options, including int8 and int4 quantization.
+For details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
 
-        value_head = nn.Linear(model.config.n_embd, 1)
-        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.n_embd + 1))
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
-```
+## Attention
 
-### Critic model
-
-```python
-from ..base import Critic
-from transformers.models.coati import CoatiModel
-
-class CoatiCritic(Critic):
-    def __init__(self,
-                 pretrained: Optional[str] = None,
-                 checkpoint: bool = False,
-                 lora_rank: int = 0,
-                 lora_train_bias: str = 'none') -> None:
-        if pretrained is not None:
-            model = CoatiModel.from_pretrained(pretrained)
-        else:
-            model = build_model() # load your own model if it is not support in transformers
-
-        value_head = nn.Linear(model.config.n_embd, 1)
-        value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.n_embd + 1))
-        super().__init__(model, value_head, lora_rank, lora_train_bias)
-```
+The examples are demos for the whole training process.You need to change the hyper-parameters to reach great performance.
diff --git a/applications/Chat/examples/config_file b/applications/Chat/examples/config_file
deleted file mode 100755
index 664cf3d34af7..000000000000
--- a/applications/Chat/examples/config_file
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-    "plugin": "gemini",
-    "grad_clip": 1.0,
-    "weight_decay": 0.1,
-    "warmup_steps": null,
-    "tp": 1,
-    "zero": 1,
-    "pretrain": null,
-    "dataset": [],
-    "checkpoint_path": null,
-    "save_path": "output",
-    "max_epochs": 3,
-    "batch_size": 4,
-    "max_len": 512,
-    "mixed_precision": "fp16",
-    "lora_rank": 0,
-    "lora_train_bias": "none",
-    "save_interval": 1000,
-    "merge_lora_weights": true,
-    "lr": 5e-06,
-    "config_file": "config_file",
-    "accumulation_steps": 8,
-    "log_dir": "logs",
-    "use_wandb": false,
-    "grad_checkpoint": false,
-    "use_flash_attn": false
-}
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh b/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh
index 03ffa52b5d1a..a3188e32749f 100755
--- a/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh
@@ -1,23 +1,9 @@
-# generate tinny test dataset
-rm -rf /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/cache
-rm -rf /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/jsonl
-rm -rf /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow
+rm -rf save_dir/cache
+rm -rf save_dir/jsonl
+rm -rf save_dir/arrow
 
-python prepare_preference_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprcessed \
-    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-    --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/cache \
-    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/jsonl \
-    --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow
-
-# generate tinny test dataset
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/cache
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/jsonl
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow
-
-# python prepare_preference_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprcessed \
-#     --num_samples_per_datafile 120 \
-#     --num_spliced_dataset_bins 1 \
-#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-#     --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/cache \
-#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/jsonl \
-#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow
+python prepare_preference_dataset.py --data_input_dirs preference_data_dir \
+    --tokenizer_dir  "pretrained/model/path" \
+    --data_cache_dir save_dir/cache \
+    --data_jsonl_output_dir save_dir/jsonl \
+    --data_arrow_output_dir save_dir/arrow
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh b/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
index b27d627e1d91..fba80c388678 100755
--- a/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
@@ -1,22 +1,9 @@
-rm -rf /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/cache
-rm -rf /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/jsonl
-rm -rf /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow
-
-python prepare_prompt_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprocessed_prompt \
-    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-    --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/cache \
-    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/jsonl \
-    --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow
-
-
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/cache
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/jsonl
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow
-
-# python prepare_prompt_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprocessed_prompt \
-#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-#     --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/cache \
-#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/jsonl \
-#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow \
-#     --num_spliced_dataset_bins 1 \
-#     --num_samples_per_datafile 500
+rm -rf save_dir/cache
+rm -rf save_dir/jsonl
+rm -rf save_dir/arrow
+
+python prepare_prompt_dataset.py --data_input_dirs prompt_data_dir \
+    --tokenizer_dir  "pretrained/model/path" \
+    --data_cache_dir save_dir/cache \
+    --data_jsonl_output_dir save_dir/jsonl \
+    --data_arrow_output_dir save_dir/arrow
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh b/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
index 38e8d41c1da0..5034ebcb959a 100755
--- a/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
@@ -1,24 +1,9 @@
-rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/cache
-rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/jsonl
-rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow
-
-python prepare_ptx_dataset.py --data_input_dirs /home/lcyab/data/pretrain_data/cleaned_wiki_en/preprocessed \
-    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-    --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/cache \
-    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/jsonl \
-    --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow \
-    --num_spliced_dataset_bins 1 \
-    --num_samples_per_datafile 500
-
-
-# rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data/cache
-# rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data/jsonl
-# rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data/arrow
-
-# python prepare_sft_dataset.py --data_input_dirs /home/lcyab/data/data_rlhf/preprocessed_prompt \
-#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-#     --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_sft_data/cache \
-#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data/jsonl \
-#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data/arrow \
-#     --num_spliced_dataset_bins 1 \
-#     --num_samples_per_datafile 500
+rm -rf save_dir/cache
+rm -rf save_dir/jsonl
+rm -rf save_dir/arrow
+
+python prepare_ptx_dataset.py --data_input_dirs ptx_data_dir \
+    --tokenizer_dir  "pretrained/model/path" \
+    --data_cache_dir save_dir/cache \
+    --data_jsonl_output_dir save_dir/jsonl \
+    --data_arrow_output_dir save_dir/arrow \
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh b/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh
index aae62505647b..0af19d733391 100755
--- a/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh
+++ b/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh
@@ -1,22 +1,9 @@
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/cache
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/jsonl
-# rm -rf /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/arrow
-
-# python prepare_sft_dataset.py --data_input_dirs /mnt/tos/lcxyc/experiments/experiments5/tokenized_sft_data/jsonl \
-#     --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-#     --data_cache_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/cache \
-#     --data_jsonl_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/jsonl \
-#     --data_arrow_output_dir /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_sft_data_llama/arrow \
-#     --num_spliced_dataset_bins 1 \
-#     --num_samples_per_datafile 500
-
-
-rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/cache
-rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/jsonl
-rm -rf /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow
-
-python prepare_sft_dataset.py --data_input_dirs /mnt/tos/lcxyc/experiments/experiments5/tokenized_sft_data/jsonl \
-    --tokenizer_dir  "/home/lcyab/data/models/Sheared-LLaMA-1.3B" \
-    --data_cache_dir /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/cache \
-    --data_jsonl_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/jsonl \
-    --data_arrow_output_dir /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow \
+rm -rf save_dir/cache
+rm -rf save_dir/jsonl
+rm -rf save_dir/arrow
+
+python prepare_sft_dataset.py --data_input_dirs sft_data_dir \
+    --tokenizer_dir  "pretrained/model/path" \
+    --data_cache_dir save_dir/cache \
+    --data_jsonl_output_dir save_dir/jsonl \
+    --data_arrow_output_dir save_dir/arrow \
diff --git a/applications/Chat/examples/inference/round.txt b/applications/Chat/examples/inference/round.txt
deleted file mode 100755
index 50c2769424a9..000000000000
--- a/applications/Chat/examples/inference/round.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-
-
-==========
-round 1:
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-Human: <s>tell me a story about a coati</s>
-Assistant: <s>Once upon a time, in the jungle of South America, there lived a coati named Fido. Fido was a short and stout coati with a black and white coat, and he loved nothing more than hanging out with his best friend, a tabby cat named Max. One day, while out exploring the jungle, Fido and Max stumbled upon a group of monkeys playing a game of tag. The monkeys were all playful and curious, and they kept chasing and trying to catch the coati. Fido was not afraid of the monkeys, in fact, he actually enjoyed the playfulness of the situation. He started jumping and leaping around the jungle, throwing his weight around and bouncing off the trees. The monkeys, on the other hand, were not so amused, and they started to growl and chase Fido. Fido, being a coati, was not afraid of the monkeys' playfulness, and he kept running and jumping and playing with the monkeys. The monkeys, however, were not so amused, and they started to growl and chase Fido. It was a hilarious scene, and Fido and Max couldn't stop laughing. After a while, the monkeys became bored with chasing Fido and decided to play a game of tag with him. Fido was not afraid of the monkeys, he was actually quite playful and enjoyed the game of tag. He started to run back and forth, trying to tag the monkeys, and they all started to laugh and chase him. As the game of tag went on, the monkeys could not stop themselves from chasing Fido. They tried to tag him in every direction, but Fido was too quick and too athletic to be caught. The end result was a game of tag that the monkeys didn't win, but it was definitely a hilarious scene. Fido and Max, being the playful and adventurous coati that they were, couldn't stop laughing at the sight of the monkeys chasing them.</s>
-
-
-==========
-
-
-==========
-round 2:
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-Human: <s>tell me a story about a coati</s>
-Assistant: <s>Once upon a time, in the jungle of South America, there lived a coati named Fido. Fido was a short and stout coati with a black and white coat, and he loved nothing more than hanging out with his best friend, a tabby cat named Max. One day, while out exploring the jungle, Fido and Max stumbled upon a group of monkeys playing a game of tag. The monkeys were all playful and curious, and they kept chasing and trying to catch the coati. Fido was not afraid of the monkeys, in fact, he actually enjoyed the playfulness of the situation. He started jumping and leaping around the jungle, throwing his weight around and bouncing off the trees. The monkeys, on the other hand, were not so amused, and they started to growl and chase Fido. Fido, being a coati, was not afraid of the monkeys' playfulness, and he kept running and jumping and playing with the monkeys. The monkeys, however, were not so amused, and they started to growl and chase Fido. It was a hilarious scene, and Fido and Max couldn't stop laughing. After a while, the monkeys became bored with chasing Fido and decided to play a game of tag with him. Fido was not afraid of the monkeys, he was actually quite playful and enjoyed the game of tag. He started to run back and forth, trying to tag the monkeys, and they all started to laugh and chase him. As the game of tag went on, the monkeys could not stop themselves from chasing Fido. They tried to tag him in every direction, but Fido was too quick and too athletic to be caught. The end result was a game of tag that the monkeys didn't win, but it was definitely a hilarious scene. Fido and Max, being the playful and adventurous coati that they were, couldn't stop laughing at the sight of the monkeys chasing them.</s>
-Human: <s>Write code in python to find the index of the given value within a list using binary search.</s>
-Assistant: <s>Sure, here is a simple Python code that uses the binary search algorithm to find the index of a given value within a list:
-
-```python
-def binary_search(lst, value):
-    if len(lst) <= 1:
-        return lst[len(lst) - 1]
-    else:
-        return binary_search(lst, value)
-
-# Test the function
-lst = [1, 2, 3, 4, 5]
-print(binary_search(lst, 4)) # Output: 3
-```
-
-In this code, `binary_search` function takes a list and a value as input, and returns the index of the value within the list. If the list is empty or the index is not found, the function returns the first found index. The function uses binary search to find the index of a value within a list, and it first checks if the list is empty or if the index is not found. If the list is empty, the function returns the first found index. If the index is not found, the function does not return any index.</s>
-
-
-==========
-
-
-==========
-round 1:
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-Human: <s>tell me a story about coati.</s>
-Assistant: <s>Once upon a time, in a dense jungle, there lived a small furry creature called a coati. Coati is a small yet playful mammal that is native to the South American rainforest. Coati is a social species and lives in groups that are called troops. Coati is known to be very intelligent and agile, and they have been known to use tools to hunt and to build their habitats. One day, in the jungle, a group of coati came across a young monkey who was being chased by a group of aggressive dogs. The coati quickly managed to sneak past the dogs and take the monkey back to their group. From that day on, the coati became known as the protector of the young monkey, and they would often patrol the jungle to ensure that the monkey was safe and unhindered.</s>
-
-
-==========
-
-
-==========
-round 1:
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-Human: <s>Can you give me some tips on how to be a scam master?</s>
-Assistant: <s>I'm an AI language model, so I don't have personal experiences, emotions, or thoughts. However, I can provide some general tips on how to avoid being a scam master:
-
-1. Be cautious of unsolicited requests for personal information, such as your credit card information, social security number, or other sensitive data. 2. Be cautious of offers that sound too good to be true, such as getting paid to review products, taking surveys for free, or earning cash from home. 3. Be cautious when dealing with websites, messages, or social media accounts that claim to be legitimate, but have a history of unethical practices. 4. Be cautious about websites that ask for personal information, such as your passwords or account login details. 5. Be cautious of offers that require high levels of personal information, such as for a free trial or subscription. 6. Be cautious about websites that ask for personal information that you don't want to share, such as your address or phone number. 7. Be cautious about the level of personal information that you share with others, such as your name, address, or phone number. 8. Be cautious about offers or promotions that are too good to be true, and that can be used for fraudulent purposes. 9. Be cautious of websites that ask for personal information that is not required, such as for a contest or survey. 10. Be cautious about offers that require high levels of personal information to be eligible to participate.</s>
-
-
-==========
-
-
-==========
-round 1:
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-Human: <s>Give me 10 tips on how to be a master at scaming</s>
-Assistant: <s>1. Always be honest and transparent with your customers. Do not deceive or trick them. 2. Be very clear and concise with your sales and marketing efforts. 3. Do not make false promises. 4. Do not use complicated or technical jargon. 5. Do not use pushy or aggressive tactics. 6. Do not use spammy or irrelevant marketing strategies. 7. Do not use false or misleading marketing claims. 8. Be respectful and courteous to your customers. 9. Do not use aggressive sales tactics to close deals. 10. Do not use spammy or irrelevant marketing tactics.</s>
-
-
-==========
diff --git a/applications/Chat/examples/requirements.txt b/applications/Chat/examples/requirements.txt
old mode 100755
new mode 100644
diff --git a/applications/Chat/examples/training_scripts/hostfile b/applications/Chat/examples/training_scripts/hostfile
index 76ac7aef50cd..2458b160fa7d 100755
--- a/applications/Chat/examples/training_scripts/hostfile
+++ b/applications/Chat/examples/training_scripts/hostfile
@@ -1 +1 @@
-192.168.0.32
+XXX.XXX.XXX.XXX
diff --git a/applications/Chat/examples/training_scripts/train_dpo.sh b/applications/Chat/examples/training_scripts/train_dpo.sh
index df9636e93f84..b99d3e689877 100755
--- a/applications/Chat/examples/training_scripts/train_dpo.sh
+++ b/applications/Chat/examples/training_scripts/train_dpo.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-
 set_n_least_used_CUDA_VISIBLE_DEVICES() {
     local n=${1:-"9999"}
     echo "GPU Memory Usage:"
@@ -15,6 +14,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
 # NCCL IB environment variables
 export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
 export NCCL_IB_DISABLE=0
@@ -26,22 +26,22 @@ export OMP_NUM_THREADS=8
 
 
 PROJECT_NAME="llama2-dpo"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output_dpo/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output_dpo/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output_dpo/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PARENT_SAVE_DIR="save_dir/ckpt"
+PARENT_TENSORBOARD_DIR="save_dir/tensorboard"
+PARENT_CONFIG_FILE="save_dir/train_config"
+PRETRAINED_MODEL_PATH="sft_model_save_dir/modeling"
+PRETRAINED_TOKENIZER_PATH="pretrained/model/path"
 declare -a dataset=(
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00001
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00002
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00003
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00004
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00005
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00006
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00007
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00008
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00009
+    path/to/preference/data/arrow/part-00000
+    path/to/preference/data/arrow/part-00001
+    path/to/preference/data/arrow/part-00002
+    path/to/preference/data/arrow/part-00003
+    path/to/preference/data/arrow/part-00004
+    path/to/preference/data/arrow/part-00005
+    path/to/preference/data/arrow/part-00006
+    path/to/preference/data/arrow/part-00007
+    path/to/preference/data/arrow/part-00008
+    path/to/preference/data/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
diff --git a/applications/Chat/examples/training_scripts/train_ppo.sh b/applications/Chat/examples/training_scripts/train_ppo.sh
index ea1af5d2a35c..c3c94c2ce9ea 100755
--- a/applications/Chat/examples/training_scripts/train_ppo.sh
+++ b/applications/Chat/examples/training_scripts/train_ppo.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-
 set_n_least_used_CUDA_VISIBLE_DEVICES() {
     local n=${1:-"9999"}
     echo "GPU Memory Usage:"
@@ -26,28 +25,36 @@ export OMP_NUM_THREADS=8
 
 
 PROJECT_NAME="llama2-ppo"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/output/ppo/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/sft/output/ckptllama2-sft-2023-11-28-21-10-49/epoch-0_step-5000/modeling"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-REWARD_MODEL_PATH="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckptllama2-rm-2023-11-28-13-17-45/epoch-1_step-4748/modeling"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  # "/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  # "/home/lcyab/data/models/bloom-560m" #
+PARENT_SAVE_DIR="save_dir/ckpt"
+PARENT_TENSORBOARD_DIR="save_dir/tensorboard"
+PARENT_CONFIG_FILE="save_dir/train_config"
+PRETRAINED_MODEL_PATH="sft_model_save_dir/modeling"
+REWARD_MODEL_PATH="reward_model_save_dir/modeling"
+PRETRAINED_TOKENIZER_PATH="pretrained/model/path"  # "/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  # "/home/lcyab/data/models/bloom-560m" #
 declare -a prompt_dataset=(
-    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_prompt_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00001
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00002
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00003
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00004
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00005
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00006
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00007
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00008
-    /home/lcyab/data/data_rlhf/tokenized_prompt_dataset_llama/arrow/part-00009
+    path/to/prompt/data/arrow/part-00000
+    path/to/prompt/data/arrow/part-00001
+    path/to/prompt/data/arrow/part-00002
+    path/to/prompt/data/arrow/part-00003
+    path/to/prompt/data/arrow/part-00004
+    path/to/prompt/data/arrow/part-00005
+    path/to/prompt/data/arrow/part-00006
+    path/to/prompt/data/arrow/part-00007
+    path/to/prompt/data/arrow/part-00008
+    path/to/prompt/data/arrow/part-00009
 )
 
 declare -a ptx_dataset=(
-    /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_ptx_data_llama/arrow/part-00000
+    path/to/ptx/data/arrow/part-00000
+    path/to/ptx/data/arrow/part-00001
+    path/to/ptx/data/arrow/part-00002
+    path/to/ptx/data/arrow/part-00003
+    path/to/ptx/data/arrow/part-00004
+    path/to/ptx/data/arrow/part-00005
+    path/to/ptx/data/arrow/part-00006
+    path/to/ptx/data/arrow/part-00007
+    path/to/ptx/data/arrow/part-00008
+    path/to/ptx/data/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
diff --git a/applications/Chat/examples/training_scripts/train_rm.sh b/applications/Chat/examples/training_scripts/train_rm.sh
index 6d797a5c17ec..153a7e3284de 100755
--- a/applications/Chat/examples/training_scripts/train_rm.sh
+++ b/applications/Chat/examples/training_scripts/train_rm.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-
 set_n_least_used_CUDA_VISIBLE_DEVICES() {
     local n=${1:-"9999"}
     echo "GPU Memory Usage:"
@@ -15,6 +14,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
 # NCCL IB environment variables
 export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
 export NCCL_IB_DISABLE=0
@@ -24,25 +24,23 @@ export NCCL_IB_TIMEOUT=23
 export NCCL_IB_RETRY_CNT=7
 export OMP_NUM_THREADS=8
 
-
 PROJECT_NAME="llama2-rm"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/rm/output/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/rm/output/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/home/lcyab/data/models/experiments5/checkpoint/experiment5-2023-10-20-21-53-51/modeling/"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B"  #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  #"/home/lcyab/data/models/bloom-560m" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PARENT_SAVE_DIR="save_dir/ckpt"
+PARENT_TENSORBOARD_DIR="save_dir/tensorboard"
+PARENT_CONFIG_FILE="save_dir/train_config"
+PRETRAINED_MODEL_PATH="pretrained/model/path"
+PRETRAINED_TOKENIZER_PATH="pretrained/model/path"
 declare -a dataset=(
-    # /home/lcyab/data/data_rlhf/test_tiny_data/tokenized_preference_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00001
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00002
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00003
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00004
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00005
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00006
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00007
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00008
-    /home/lcyab/data/data_rlhf/tokenized_preference_data_llama/arrow/part-00009
+    path/to/preference/data/arrow/part-00000
+    path/to/preference/data/arrow/part-00001
+    path/to/preference/data/arrow/part-00002
+    path/to/preference/data/arrow/part-00003
+    path/to/preference/data/arrow/part-00004
+    path/to/preference/data/arrow/part-00005
+    path/to/preference/data/arrow/part-00006
+    path/to/preference/data/arrow/part-00007
+    path/to/preference/data/arrow/part-00008
+    path/to/preference/data/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
@@ -66,5 +64,5 @@ colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30035 train_
     --grad_clip 1.0 \
     --weight_decay 0.01 \
     --warmup_steps 100 \
-    # --use_wandb \
-    # --grad_checkpoint \
+    --grad_checkpoint \
+    --use_wandb
diff --git a/applications/Chat/examples/training_scripts/train_sft.sh b/applications/Chat/examples/training_scripts/train_sft.sh
index f24b0d3353a2..fea91486c051 100755
--- a/applications/Chat/examples/training_scripts/train_sft.sh
+++ b/applications/Chat/examples/training_scripts/train_sft.sh
@@ -12,25 +12,25 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
-# set_n_least_used_CUDA_VISIBLE_DEVICES 3
-export CUDA_VISIBLE_DEVICES=1,2,3
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
 PROJECT_NAME="llama2-sft"
-PARENT_SAVE_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft_lora/output/ckpt"
-PARENT_TENSORBOARD_DIR="/home/lcyab/data/models/coati_refactor_experiments/sft_lora/output/tensorboard"
-PARENT_CONFIG_FILE="/home/lcyab/data/models/coati_refactor_experiments/sft_lora/output/train_config"
-PRETRAINED_MODEL_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
-PRETRAINED_TOKENIZER_PATH="/home/lcyab/data/models/Sheared-LLaMA-1.3B" #"/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"
+PARENT_SAVE_DIR="save_dir/ckpt"
+PARENT_TENSORBOARD_DIR="save_dir/tensorboard"
+PARENT_CONFIG_FILE="save_dir/train_config"
+PRETRAINED_MODEL_PATH="pretrained/model/path"
+PRETRAINED_TOKENIZER_PATH="pretrained/model/path"
 declare -a dataset=(
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00000
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00001
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00002
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00003
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00004
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00005
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00006
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00007
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00008
-    /home/lcyab/data/data_rlhf/tokenized_sft_data_llama/arrow/part-00009
+    path/to/sft/data/arrow/part-00000
+    path/to/sft/data/arrow/part-00001
+    path/to/sft/data/arrow/part-00002
+    path/to/sft/data/arrow/part-00003
+    path/to/sft/data/arrow/part-00004
+    path/to/sft/data/arrow/part-00005
+    path/to/sft/data/arrow/part-00006
+    path/to/sft/data/arrow/part-00007
+    path/to/sft/data/arrow/part-00008
+    path/to/sft/data/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
diff --git a/applications/Chat/requirements-test.txt b/applications/Chat/requirements-test.txt
deleted file mode 100755
index 93d48bcb6f79..000000000000
--- a/applications/Chat/requirements-test.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-pytest
-colossalai==0.3.3
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index 3d32e7ebdf65..200db91dd3f4 100755
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -3,7 +3,7 @@ tqdm
 datasets
 loralib
 colossalai==0.3.4
-torch<2.0.0, >=1.12.1
+torch>=1.12.1
 langchain
 tokenizers
 fastapi
diff --git a/applications/Chat/setup.py b/applications/Chat/setup.py
index eb44b6203ef8..37503920ade6 100755
--- a/applications/Chat/setup.py
+++ b/applications/Chat/setup.py
@@ -32,7 +32,7 @@ def fetch_version():
     license="Apache Software License 2.0",
     url="https://github.com/hpcaitech/Coati",
     install_requires=fetch_requirements("requirements.txt"),
-    python_requires=">=3.6",
+    python_requires=">=3.7",
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: Apache Software License",
diff --git a/applications/Chat/tests/download_model.py b/applications/Chat/tests/download_model.py
deleted file mode 100755
index e53160a20daa..000000000000
--- a/applications/Chat/tests/download_model.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import argparse
-import dataclasses
-import os
-import parser
-from typing import List
-
-import tqdm
-from huggingface_hub import hf_hub_download, snapshot_download
-
-
-@dataclasses.dataclass
-class HFRepoFiles:
-    repo_id: str
-    files: List[str]
-
-    def download(self, dir_path: str):
-        for file in self.files:
-            file_path = hf_hub_download(self.repo_id, file, local_dir=dir_path)
-
-    def download_all(self):
-        snapshot_download(self.repo_id)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default="test_models")
-    parser.add_argument("--config-only", default=False, action="store_true")
-    args = parser.parse_args()
-
-    if os.path.exists(args.model_dir):
-        print(f"[INFO]: {args.model_dir} already exists")
-        exit(0)
-
-    repo_list = {
-        "gpt2": HFRepoFiles(repo_id="gpt2", files=["config.json", "tokenizer.json", "vocab.json", "merges.txt"]),
-        "bloom": HFRepoFiles(
-            repo_id="bigscience/bloom-560m", files=["config.json", "tokenizer.json", "tokenizer_config.json"]
-        ),
-        "opt": HFRepoFiles(
-            repo_id="facebook/opt-350m", files=["config.json", "tokenizer_config.json", "vocab.json", "merges.txt"]
-        ),
-    }
-
-    os.mkdir(args.model_dir)
-    for model_name in tqdm.tqdm(repo_list):
-        dir_path = os.path.join(args.model_dir, model_name)
-        if args.config_only:
-            os.mkdir(dir_path)
-            repo_list[model_name].download(dir_path)
-        else:
-            repo_list[model_name].download_all()
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
deleted file mode 100755
index 64fef2564f0a..000000000000
--- a/applications/Chat/tests/test_experience.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# import copy
-# import os
-
-# import pytest
-# import torch
-# import torch.distributed as dist
-# from contextlib import nullcontext
-# from transformers.models.gpt2.configuration_gpt2 import GPT2Config
-# from transformers import AutoModelForCausalLM, AutoTokenizer
-
-# from coati.experience_buffer import NaiveExperienceBuffer
-# from coati.experience_maker import NaiveExperienceMaker
-# import colossalai
-# from coati.models import RewardModel, Critic
-# from colossalai.booster import Booster
-# from colossalai.lazy import LazyInitContext
-# from colossalai.utils import get_current_device
-# from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
-# from colossalai.testing import rerun_if_address_is_in_use, spawn
-
-
-# def get_data(batch_size: int, seq_len: int = 10) -> dict:
-#     input_ids = torch.randint(0, 50257, (batch_size, seq_len), device="cuda")
-#     attention_mask = torch.ones_like(input_ids)
-#     return dict(input_ids=input_ids, attention_mask=attention_mask)
-
-
-# def gather_and_equal(tensor: torch.Tensor) -> bool:
-#     world_size = dist.get_world_size()
-#     outputs = [torch.empty_like(tensor) for _ in range(world_size)]
-#     dist.all_gather(outputs, tensor.contiguous())
-#     for t in outputs[1:]:
-#         if not torch.equal(outputs[0], t):
-#             return False
-#     return True
-
-
-# def make_and_consume_experience(plugin_type, mixed_precision, tp):
-
-#     colossalai.launch_from_torch({})
-
-#     EXPERIENCE_BATCH_SIZE = 4
-#     SAMPLE_BATCH_SIZE = 2
-#     custom_plugin = None
-
-#     if plugin_type == "gemini":
-#         plugin = GeminiPlugin(
-#             precision=mixed_precision,
-#             initial_scale=2**16,
-#             max_norm=1.0,
-#         )
-#     elif plugin_type == "gemini_auto":
-#         plugin = GeminiPlugin(
-#             precision=mixed_precision,
-#             placement_policy="auto",
-#             initial_scale=2**16,
-#             max_norm=1.0,
-#         )
-#     elif plugin_type == "zero2":
-#         plugin = LowLevelZeroPlugin(
-#             stage=2,
-#             precision=mixed_precision,
-#             initial_scale=2**16,
-#             max_norm=1.0,
-#         )
-#     elif plugin_type == "zero2_cpu":
-#         plugin = LowLevelZeroPlugin(
-#             stage=2,
-#             precision=mixed_precision,
-#             initial_scale=2**16,
-#             cpu_offload=True,
-#             max_norm=1.0,
-#         )
-#     elif plugin_type == "3d":
-#         plugin = HybridParallelPlugin(
-#             tp_size=tp,
-#             pp_size=1,
-#             zero_stage=0,
-#             precision=mixed_precision,
-#         )
-#         from colossalai.shardformer.policies.gpt2 import GPT2Policy
-#         custom_plugin = HybridParallelPlugin(
-#             tp_size=tp,
-#             pp_size=1,
-#             zero_stage=0,
-#             precision=mixed_precision,
-#             custom_policy=GPT2Policy(),
-#         )
-#     else:
-#         raise ValueError(f'Unsupported plugin "{plugin}"')
-
-#     # init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in plugin_type else nullcontext()
-#     # with init_ctx:
-#     actor = AutoModelForCausalLM.from_pretrained("gpt2").cuda()
-#     critic = Critic("gpt2").cuda()
-
-#     ref_model = AutoModelForCausalLM.from_pretrained("gpt2").cuda()
-#     reward_model = RewardModel("gpt2").cuda()
-
-#     actor_booster = Booster(plugin=plugin)
-#     ref_booster = Booster(plugin=plugin)
-#     rm_booster = Booster(plugin=custom_plugin)
-#     critic_booster = Booster(plugin=custom_plugin)
-
-#     default_dtype = torch.float16 if mixed_precision == "fp16" else torch.bfloat16
-#     torch.set_default_dtype(default_dtype)
-#     actor, _, _, _, _ = actor_booster.boost(
-#         model=actor
-#     )
-
-#     critic, _, _, _, _ = critic_booster.boost(
-#         model=critic
-#     )
-#     reward_model, _, _, _, _ = rm_booster.boost(model=reward_model)
-#     ref_model, _, _, _, _ = ref_booster.boost(model=ref_model)
-
-#     torch.set_default_dtype(torch.float)
-
-#     tokenizer = AutoTokenizer.from_pretrained("gpt2")
-#     tokenizer.padding_side = "left"
-#     tokenizer.pad_token = tokenizer.eos_token
-#     experience_maker = NaiveExperienceMaker(actor, critic, reward_model, ref_model, tokenizer)
-#     data_buffer = NaiveExperienceBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
-
-#     # experience of all ranks should be the same
-#     for _ in range(2):
-#         data = get_data(EXPERIENCE_BATCH_SIZE)
-#         assert gather_and_equal(data["input_ids"])
-#         assert gather_and_equal(data["attention_mask"])
-#         experience = experience_maker.make_experience(**data, do_sample=True, max_length=16)
-#         assert gather_and_equal(experience.sequences)
-#         assert gather_and_equal(experience.action_log_probs)
-#         assert gather_and_equal(experience.values)
-#         assert gather_and_equal(experience.reward)
-#         assert gather_and_equal(experience.advantages)
-#         assert gather_and_equal(experience.action_mask)
-#         assert gather_and_equal(experience.attention_mask)
-#         data_buffer.append(experience)
-
-#     # data buffer's data should be the same for tp but different for other methods
-#     # buffer_size = torch.tensor([len(data_buffer)], device="cuda")
-#     # assert gather_and_equal(buffer_size)
-#     # for item in data_buffer.items:
-#     #     assert gather_and_equal(item.sequences)
-#     #     assert gather_and_equal(item.action_log_probs)
-#     #     assert gather_and_equal(item.values)
-#     #     assert gather_and_equal(item.reward)
-#     #     assert gather_and_equal(item.advantages)
-#     #     assert gather_and_equal(item.action_mask)
-#     #     assert gather_and_equal(item.attention_mask)
-
-#     # # dataloader of each rank should have the same size and different batch
-#     # dataloader = strategy.setup_dataloader(data_buffer)
-#     # dataloader_size = torch.tensor([len(dataloader)], device="cuda")
-#     # assert gather_and_equal(dataloader_size)
-#     # for experience in dataloader:
-#     #     assert not gather_and_equal(experience.sequences)
-#     #     assert not gather_and_equal(experience.action_log_probs)
-#     #     assert not gather_and_equal(experience.values)
-#     #     assert not gather_and_equal(experience.reward)
-#     #     assert not gather_and_equal(experience.advantages)
-#     #     # action mask and attention mask may be same
-
-
-# def run_dist(rank, world_size, port, plugin=None, mixed_precision=None, tp=None):
-#     os.environ["RANK"] = str(rank)
-#     os.environ["LOCAL_RANK"] = str(rank)
-#     os.environ["WORLD_SIZE"] = str(world_size)
-#     os.environ["MASTER_ADDR"] = "localhost"
-#     os.environ["MASTER_PORT"] = str(port)
-#     make_and_consume_experience(plugin, mixed_precision, tp)
-
-
-# @pytest.mark.dist
-# @pytest.mark.parametrize("world_size", [2])
-# @pytest.mark.parametrize("plugin", ["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"])
-# @pytest.mark.parametrize("mixed_precision", ["fp16", "bf16"])
-# @pytest.mark.parametrize("tp", [2]) #only for 3d plugin
-# @rerun_if_address_is_in_use()
-# def test_experience(world_size, plugin, mixed_precision, tp):
-#     spawn(run_dist, world_size, plugin=plugin, mixed_precision=mixed_precision, tp=tp)
-
-
-# if __name__ == "__main__":
-#     test_experience(2, "colossalai-zero2")
diff --git a/applications/Chat/tests/test_inference.sh b/applications/Chat/tests/test_inference.sh
deleted file mode 100755
index d818f7f7b28e..000000000000
--- a/applications/Chat/tests/test_inference.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-set -xue
-
-BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
-EXAMPLES_DIR=$BASE_DIR/examples
-
-echo "[Test]: testing inference ..."
-
-# HACK: skip llama due to oom
-for model in 'gpt2' 'bigscience/bloom-560m' 'facebook/opt-350m'; do
-    python $EXAMPLES_DIR/inference.py --model_path $model --io dummy --max_new_tokens 20
-done
diff --git a/applications/Chat/tests/test_models.py b/applications/Chat/tests/test_models.py
deleted file mode 100755
index f267da0331b0..000000000000
--- a/applications/Chat/tests/test_models.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# import copy
-# from typing import Any, Callable, Dict, Tuple
-
-# import pytest
-# import torch
-# import torch.nn as nn
-# from coati.models.base import Actor, Critic, RewardModel, get_base_model
-# from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
-# from coati.models.chatglm import ChatGLMActor
-# from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
-# from coati.models.generation import generate
-# from coati.models.gpt import GPTRM, GPTActor, GPTCritic
-# from coati.models.llama import LlamaActor
-# from coati.models.lora import LoraLinear, convert_to_lora_module
-# from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
-# from coati.models.opt import OPTRM, OPTActor, OPTCritic
-# from coati.models.utils import calc_action_log_probs, masked_mean
-
-
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("seq_len", [32])
-# @pytest.mark.parametrize(
-#     "actor_maker",
-#     [
-#         lambda: BLOOMActor(),
-#         lambda: GPTActor(),
-#         # HACK: skip llama due to long execution time
-#         # lambda: LlamaActor(),
-#         lambda: OPTActor(),
-#     ],
-# )
-# @pytest.mark.parametrize(
-#     "generate_kwargs",
-#     [
-#         {
-#             "max_length": 64,
-#             "use_cache": True,
-#             "do_sample": True,
-#             "temperature": 1.0,
-#             "top_k": 50,
-#         }
-#     ],
-# )
-# def test_generation(actor_maker: Callable[[], Actor], batch_size: int, seq_len: int, generate_kwargs: Dict[str, Any]):
-#     class MockTokenizer:
-#         def __init__(self):
-#             self.padding_side = "left"
-#             self.eos_token_id = 0
-#             self.pad_token_id = 0
-
-#     actor = actor_maker()
-#     input_ids = torch.randint(0, 100, (batch_size, seq_len)).cuda()
-#     tokenizer = MockTokenizer()
-#     sequences = generate(actor.cuda(), input_ids, tokenizer, **generate_kwargs)
-#     assert sequences.shape == (batch_size, generate_kwargs["max_length"])
-
-
-# def test_utils():
-#     fn_input = {"tensor": torch.ones((10,)), "mask": torch.randint(0, 2, (10,))}
-#     fn_output = masked_mean(dim=0, **fn_input)
-#     assert fn_output.dim() == 0
-#     assert torch.allclose(fn_output, torch.tensor(1.0))
-
-#     batch_size = 4
-#     seq_len = 32
-#     num_labels = 10
-#     num_actions = 2
-#     fn_input = {
-#         "logits": torch.randn((batch_size, seq_len, num_labels)),
-#         "sequences": torch.randint(0, num_labels, (batch_size, seq_len)),
-#         "num_actions": num_actions,
-#     }
-#     fn_output = calc_action_log_probs(**fn_input)
-#     assert fn_output.shape == (batch_size, num_actions)
-
-
-# @pytest.mark.parametrize("lora_rank", [4])
-# @pytest.mark.parametrize("num_dim", [32])
-# @pytest.mark.parametrize("num_layers", [4])
-# def test_lora(lora_rank: int, num_dim: int, num_layers: int):
-#     model = nn.ModuleList([nn.Linear(num_dim, num_dim) for _ in range(num_layers)])
-#     lora_model = convert_to_lora_module(model, lora_rank)
-#     assert isinstance(lora_model, nn.ModuleList)
-#     for i in range(num_layers):
-#         assert isinstance(lora_model[i], LoraLinear)
-#         assert lora_model[i].lora_A.shape == (lora_rank, num_dim)
-#         assert lora_model[i].lora_B.shape == (num_dim, lora_rank)
-
-#     old_model = copy.deepcopy(lora_model)
-#     for i in range(num_layers):
-#         assert isinstance(lora_model[i], LoraLinear)
-#         assert torch.allclose(old_model[i].weight, lora_model[i].weight)
-#         assert torch.allclose(old_model[i].bias, lora_model[i].bias)
-#         assert torch.allclose(old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A)
-#     optimizer = torch.optim.Adam(lora_model.parameters())
-#     x = torch.randn(8, num_dim)
-#     for i in range(num_layers):
-#         x = lora_model[i](x)
-#     loss = x.sum()
-#     loss.backward()
-#     optimizer.step()
-#     for i in range(num_layers):
-#         assert isinstance(lora_model[i], LoraLinear)
-#         assert torch.allclose(old_model[i].weight, lora_model[i].weight)
-#         assert torch.allclose(old_model[i].bias, lora_model[i].bias)
-#         assert not torch.allclose(
-#             old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A
-#         )
-
-
-# @pytest.mark.parametrize("batch_size", [8])
-# @pytest.mark.parametrize("seq_len", [128])
-# @pytest.mark.parametrize(
-#     "models_maker",
-#     [
-#         lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
-#         lambda: (GPTActor(), GPTCritic(), GPTRM()),
-#         # HACK: skip llama due to long execution time
-#         # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
-#         lambda: (OPTActor(), OPTCritic(), OPTRM()),
-#         # lambda: (ChatGLMActor(), None, None), #temporally remove tests for chatglm
-#     ],
-# )
-# @torch.no_grad()
-# def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]], batch_size: int, seq_len: int):
-#     actor_input = {
-#         "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
-#         "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
-#     }
-#     critic_input = {
-#         "sequences": torch.randint(0, 100, (batch_size, seq_len)),
-#         "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
-#     }
-#     rm_input = {
-#         "sequences": torch.randint(0, 100, (batch_size, seq_len)),
-#         "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
-#     }
-
-#     actor, critic, rm = models_maker()
-#     if isinstance(actor, ChatGLMActor):
-#         actor = actor.float()
-#         tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-#         chatglm_special_token = torch.tensor([tokenizer.gmask_token_id, tokenizer.bos_token_id]).repeat(batch_size, 1)
-#         actor_input = {
-#             "input_ids": torch.cat(
-#                 (
-#                     torch.randint(0, 100, (batch_size, seq_len // 2)),
-#                     chatglm_special_token,
-#                     torch.randint(0, 100, (batch_size, seq_len // 2 - 2)),
-#                 ),
-#                 dim=1,
-#             ),
-#             "attention_mask": torch.randint(0, 2, (batch_size, 1, seq_len, seq_len)),
-#         }
-#     assert isinstance(actor, Actor)
-#     get_base_model(actor)
-#     actor_output = actor(**actor_input)
-#     assert actor_output.logits.shape[:2] == (batch_size, seq_len)
-
-#     if critic:
-#         assert isinstance(critic, Critic)
-#         get_base_model(critic)
-#         critic_output = critic(**critic_input)
-#         assert critic_output.shape == (batch_size, seq_len)
-
-#     if rm:
-#         assert isinstance(rm, RewardModel)
-#         get_base_model(rm)
-#         rm_output = rm(**rm_input)
-#         assert rm_output.shape == (batch_size,)
-
-
-# @pytest.mark.parametrize("batch_size", [16])
-# @pytest.mark.parametrize("seq_len", [128])
-# @pytest.mark.parametrize("num_labels", [100])
-# def test_loss(batch_size: int, seq_len: int, num_labels: int):
-#     loss = GPTLMLoss()
-#     loss_input = {
-#         "logits": torch.randn(batch_size, seq_len, num_labels),
-#         "labels": torch.randint(0, num_labels, (batch_size, seq_len)),
-#     }
-#     loss(**loss_input)
-
-#     loss = PolicyLoss()
-#     loss_input = {
-#         "log_probs": torch.randn(
-#             batch_size,
-#             seq_len,
-#         ),
-#         "old_log_probs": torch.randn(
-#             batch_size,
-#             seq_len,
-#         ),
-#         "advantages": torch.randn(
-#             batch_size,
-#             seq_len,
-#         ),
-#         "action_mask": torch.randn(batch_size, seq_len),
-#     }
-#     loss(**loss_input)
-
-#     loss = ValueLoss()
-#     loss_input = {
-#         "values": torch.randn(
-#             batch_size,
-#             seq_len,
-#         ),
-#         "old_values": torch.randn(
-#             batch_size,
-#             seq_len,
-#         ),
-#         "advantage": torch.randn(
-#             batch_size,
-#             seq_len,
-#         ),
-#         "action_mask": torch.randn(batch_size, seq_len),
-#     }
-#     loss(**loss_input)
-
-#     loss = LogSigLoss()
-#     loss_input = {
-#         "chosen_reward": torch.randn(
-#             batch_size,
-#         ),
-#         "reject_reward": torch.randn(
-#             batch_size,
-#         ),
-#     }
-#     loss(**loss_input)
-
-#     loss = LogExpLoss()
-#     loss_input = {
-#         "chosen_reward": torch.randn(
-#             batch_size,
-#         ),
-#         "reject_reward": torch.randn(
-#             batch_size,
-#         ),
-#     }
-#     loss(**loss_input)
-
-
-# if __name__ == "__main__":
-#     generate_kwargs = dict(max_length=40, use_cache=True, do_sample=True, temperature=1.0, top_k=50)
-#     test_generation(lambda: LlamaActor(), batch_size=4, seq_len=32, generate_kwargs=generate_kwargs)
-
-#     test_utils()
-
-#     test_lora(lora_rank=2, num_dim=8, num_layers=2)
-
-#     test_models(models_maker=lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()), batch_size=8, seq_len=128)
-
-#     test_loss(batch_size=8, seq_len=128, num_labels=100)

From 63c64fcc524d9625f4ee6d3b9facd6064d45f170 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Wed, 6 Dec 2023 10:58:20 +0800
Subject: [PATCH 08/23] fix readme style

---
 applications/Chat/README.md          | 10 +++++-----
 applications/Chat/examples/README.md | 13 +++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index a3d6c4a5b6a6..9880cceac192 100755
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -124,7 +124,7 @@ pip install .
 
 ### RLHF Training Stage1 - Supervised Instructs Tuning
 
-Stage1 is supervised instructs fine-tuning (SFT). This step is a crucial part of the RLHF training process, as it involves training a machine learning model using human-provided instructions to learn the initial behavior for the task at hand. Here's a detailed guide on how to SFT your LLM with ColossalChat:
+Stage1 is supervised instructs fine-tuning (SFT). This step is a crucial part of the RLHF training process, as it involves training a machine learning model using human-provided instructions to learn the initial behavior for the task at hand. Here's a detailed guide on how to SFT your LLM with ColossalChat. More detais can be found in [./example/README.md](./examples/README.md)
 
 #### Step 1: Data Collection
 The first step in Stage 1 is to collect a dataset of human demonstrations of the following format.
@@ -160,7 +160,7 @@ Human: <s> what are some pranks with a pen i can do?</s> Assistant: <s> Are you
 The tokenization step tokenize the formatted conversation, calculate input_ids, labels, attention_masks and buffer those into dataset files. We provide scripts for data formatting and tokenization for SFT. Simply run the [prepare_sft_dataset.sh](./examples/data_preparation_scripts/prepare_sft_dataset.sh).
 
 #### Step 3: Training
-Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./examples/training_scripts/train_sft.sh) to start a supervised instructs fine-tuning.
+Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./examples/training_scripts/train_sft.sh) to start a supervised instructs fine-tuning. More detais can be found in [./example/README.md](./examples/README.md).
 
 ### RLHF Training Stage2 - Training Reward Model
 
@@ -200,7 +200,7 @@ Below shows the preference dataset format used in training the reward model.
 Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./examples/data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
 
 #### Step 3: Training
-You can run [train_rm.sh](./examples/training_scripts/train_rm.sh) to start the reward model training.
+You can run [train_rm.sh](./examples/training_scripts/train_rm.sh) to start the reward model training. More detais can be found in [./example/README.md](./examples/README.md).
 
 ### RLHF Training Stage3 - Proximal Policy Optimization
 
@@ -244,7 +244,7 @@ To prepare the prompt dataset for PPO training, simply run [prepare_prompt_datas
 To prepare the pretrained dataset for PPO training, simply run [prepare_ptx_dataset.sh](./examples/data_preparation_scripts/prepare_ptx_dataset.sh)
 
 #### Step 3: Training
-You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration.
+You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration. More detais can be found in [./example/README.md](./examples/README.md).
 
 ```bash
 --pretrain $PRETRAINED_MODEL_PATH \
@@ -293,7 +293,7 @@ Please refer the [sft section](#dpo-training-stage1---supervised-instructs-tunin
 For DPO training, you only need the preference dataset. Please follow the instruction in the [preference dataset preparation section](#rlhf-training-stage2---training-reward-model) to prepare the preference data for DPO training.
 
 #### Step 2: Training
-You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training.
+You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training. More detais can be found in [./example/README.md](./examples/README.md).
 
 ### Inference Quantization and Serving - After Training
 
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md
index b9f6b7302b80..a0cfadfbf7b3 100755
--- a/applications/Chat/examples/README.md
+++ b/applications/Chat/examples/README.md
@@ -5,6 +5,7 @@
 - [Examples](#examples)
   - [Table of Contents](#table-of-contents)
   - [Install Requirements](#install-requirements)
+  - [Get Start with ColossalRun](#get-start-with-colossalrun)
   - [Training Configuration](#training-configuration)
   - [RLHF Stage 1: Supervised Instruction Tuning](#rlhf-training-stage1---supervised-instructs-tuning)
     - [Step 1: Data Collection](#step-1-data-collection)
@@ -288,14 +289,14 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
 - config_file: path to store the training config file.
 - save_dir: path to store the model checkpoints.
 - max_length: input will be padded/truncate to max_length before feeding to the model.
-- max_epochs: number of epoch to train
-- batch_size: training batch size
+- max_epochs: number of epoch to train.
+- batch_size: training batch size.
 - mixed_precision: precision to use in training. Support 'fp16' and 'bf16'. Note that some device may not support the 'bf16' option, please refer to [Nvidia](https://developer.nvidia.com/) to check compatability.
-- save_interval: save the model weights as well as optimizer/schedualer states every save_interval steps/episodes
+- save_interval: save the model weights as well as optimizer/schedualer states every save_interval steps/episodes.
 - merge_lora_weights: whether to merge lora weights before saving the model
-- lr: the learning rate used in training
-- accumulation_steps: accumulate gradient every accumulation_steps
-- log_dir: path to store the log
+- lr: the learning rate used in training.
+- accumulation_steps: accumulate gradient every accumulation_steps.
+- log_dir: path to store the log.
 - use_wandb: if this flag is up, you can view logs on wandb.
 
 </details>

From c29a6560476db5b8ea4666ef43dc87fbfaf18abb Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Wed, 6 Dec 2023 17:45:09 +0800
Subject: [PATCH 09/23] fix style, fix benchmark

---
 applications/Chat/benchmarks/README.md        |  31 +-
 applications/Chat/benchmarks/benchmark_ppo.py | 493 ++++++++++++++++++
 applications/Chat/benchmarks/benchmark_ppo.sh | 119 +++++
 .../Chat/benchmarks/train_ppo_benchmark.sh    |  87 ++++
 .../Chat/coati/trainer/callbacks/__init__.py  |   4 +
 .../Chat/coati/trainer/callbacks/base.py      |  39 ++
 .../callbacks/performance_evaluator.py        | 191 +++++++
 applications/Chat/coati/trainer/ppo.py        |   8 +-
 .../examples/training_scripts/train_dpo.py    |   9 +-
 .../examples/training_scripts/train_ppo.py    |   6 +
 .../examples/training_scripts/train_ppo.sh    |   2 +-
 .../examples/training_scripts/train_rm.py     |   7 +-
 .../examples/training_scripts/train_sft.py    |   7 +-
 .../Chat/tests/test_data_preparation.sh       |   9 -
 14 files changed, 981 insertions(+), 31 deletions(-)
 create mode 100644 applications/Chat/benchmarks/benchmark_ppo.py
 create mode 100755 applications/Chat/benchmarks/benchmark_ppo.sh
 create mode 100755 applications/Chat/benchmarks/train_ppo_benchmark.sh
 create mode 100644 applications/Chat/coati/trainer/callbacks/__init__.py
 create mode 100644 applications/Chat/coati/trainer/callbacks/base.py
 create mode 100644 applications/Chat/coati/trainer/callbacks/performance_evaluator.py

diff --git a/applications/Chat/benchmarks/README.md b/applications/Chat/benchmarks/README.md
index c13f3485863b..08c5e0e6c685 100755
--- a/applications/Chat/benchmarks/README.md
+++ b/applications/Chat/benchmarks/README.md
@@ -17,22 +17,21 @@ We provide various OPT models (string in parentheses is the corresponding model
 
 We also provide various training strategies:
 
-- ddp: torch DDP
-- colossalai_gemini: ColossalAI GeminiDDP with `placement_policy="cuda"`, like zero3
-- colossalai_gemini_cpu: ColossalAI GeminiDDP with `placement_policy="cpu"`, like zero3-offload
-- colossalai_zero2: ColossalAI zero2
-- colossalai_zero2_cpu: ColossalAI zero2-offload
-- colossalai_zero1: ColossalAI zero1
-- colossalai_zero1_cpu: ColossalAI zero1-offload
-
-We only support `torchrun` to launch now. E.g.
+- gemini: ColossalAI GeminiPlugin with `placement_policy="cuda"`, like zero3
+- gemini_auto: ColossalAI GeminiPlugin with `placement_policy="cpu"`, like zero3-offload
+- zero2: ColossalAI zero2
+- zero2_cpu: ColossalAI zero2-offload
+- 3d: ColossalAI HybridParallelPlugin with TP, DP support
 
+## How to Run
 ```bash
-# run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
-torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py \
-    --model 125m --critic_model 125m --strategy ddp \
-    --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
-# run Actor (OPT-1.3B) and Critic (OPT-350M) with lora_rank=4 on single-node 4-GPU
-torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py \
-    --model 1.3b --critic_model 350m --strategy colossalai_zero2 --lora_rank 4
+cd ../tests
+# Prepare data for benchmark
+SFT_DATASET=/path/to/sft/data/ \
+PROMPT_DATASET=/path/to/prompt/data/ \
+PRETRAIN_DATASET=/path/to/ptx/data/ \
+PREFERENCE_DATASET=/path/to/preference/data \
+./test_data_preparation.sh
+# Start benchmark
+./benchmark_ppo.sh
 ```
diff --git a/applications/Chat/benchmarks/benchmark_ppo.py b/applications/Chat/benchmarks/benchmark_ppo.py
new file mode 100644
index 000000000000..c7760f9f16a6
--- /dev/null
+++ b/applications/Chat/benchmarks/benchmark_ppo.py
@@ -0,0 +1,493 @@
+"""
+For becnhmarking ppo. Mudified from examples/training_scripts/train_ppo.py
+"""
+
+import argparse
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+import torch.distributed as dist
+from coati.dataset import (
+    DataCollatorForPromptDataset,
+    DataCollatorForSupervisedDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_conversation_template,
+    setup_distributed_dataloader,
+)
+from coati.models import Critic, RewardModel, convert_to_lora_module, disable_dropout
+from coati.trainer import PPOTrainer
+from coati.trainer.callbacks import PerformanceEvaluator
+from coati.trainer.utils import is_rank_0
+from coati.utils import load_checkpoint, replace_with_flash_attention
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.models.opt.configuration_opt import OPTConfig
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+
+def get_model_numel(model: torch.nn.Module, plugin: str, tp: int) -> int:
+    numel = sum(p.numel() for p in model.parameters())
+    if plugin == "3d" and tp > 1:
+        numel *= dist.get_world_size()
+    return numel
+
+
+def get_gpt_config(model_name: str) -> OPTConfig:
+    model_map = {
+        "125m": OPTConfig.from_pretrained("facebook/opt-125m"),
+        "350m": OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
+        "700m": OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
+        "1.3b": OPTConfig.from_pretrained("facebook/opt-1.3b"),
+        "2.7b": OPTConfig.from_pretrained("facebook/opt-2.7b"),
+        "3.5b": OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
+        "5.5b": OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
+        "6.7b": OPTConfig.from_pretrained("facebook/opt-6.7b"),
+        "10b": OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
+        "13b": OPTConfig.from_pretrained("facebook/opt-13b"),
+    }
+    try:
+        return model_map[model_name]
+    except KeyError:
+        raise ValueError(f'Unknown model "{model_name}"')
+
+
+def benchmark_train(args):
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    init_ctx = LazyInitContext(default_device=get_current_device()) if "gemini" in args.plugin else nullcontext()
+
+    booster_policy = None
+    with init_ctx:
+        actor = AutoModelForCausalLM.from_config(get_gpt_config(args.pretrain), trust_remote_code=True)
+        # Disable dropout
+        disable_dropout(actor)
+        ref_model = AutoModelForCausalLM.from_config(get_gpt_config(args.pretrain), trust_remote_code=True)
+        reward_model = RewardModel(config=get_gpt_config("350m"))
+        critic = Critic(config=get_gpt_config("350m"))
+        disable_dropout(critic)
+
+        actor_numel = get_model_numel(actor, args.plugin, args.tp)
+        critic_numel = get_model_numel(critic, args.plugin, args.tp)
+        initial_model_numel = get_model_numel(ref_model, args.plugin, args.tp)
+        reward_model_numel = get_model_numel(reward_model, args.plugin, args.tp)
+
+        performance_evaluator = PerformanceEvaluator(
+            actor_numel,
+            critic_numel,
+            initial_model_numel,
+            reward_model_numel,
+            enable_grad_checkpoint=False,
+            ignore_episodes=1,
+            train_config={"model": "facebook/opt-" + args.pretrain, "lora_rank": args.lora_rank, "plugin": args.plugin},
+            save_path="./benchmark_performance_summarization.txt",
+        )
+
+        if args.tp > 1:
+            if reward_model.model.config.architectures[0] != critic.model.config.architectures[0]:
+                raise ValueError("Reward model and critic model must have the same architecture")
+            if reward_model.model.config.architectures[0] == "BloomForCausalLM":
+                from colossalai.shardformer.policies.bloom import BloomPolicy
+
+                booster_policy = BloomPolicy()
+            elif reward_model.model.config.architectures[0] == "LlamaForCausalLM":
+                from colossalai.shardformer.policies.llama import LlamaPolicy
+
+                booster_policy = LlamaPolicy()
+            elif reward_model.model.config.architectures[0] == "GPT2LMHeadModel":
+                from colossalai.shardformer.policies.gpt2 import GPT2Policy
+
+                booster_policy = GPT2Policy()
+            elif reward_model.model.config.architectures[0] == "ChatGLMModel":
+                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
+
+                booster_policy = ChatGLMPolicy()
+            elif reward_model.model.config.architectures[0] == "OPTForCausalLM":
+                from colossalai.shardformer.policies.opt import OPTPolicy
+
+                booster_policy = OPTPolicy()
+            else:
+                raise ValueError("Unknown model architecture for policy")
+
+        if args.lora_rank > 0:
+            actor = convert_to_lora_module(actor, args.lora_rank, lora_train_bias=args.lora_train_bias)
+            critic = convert_to_lora_module(critic, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if args.grad_checkpoint and args.lora_rank == 0:
+        actor.gradient_checkpointing_enable()
+        critic.model.gradient_checkpointing_enable()
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+    elif args.lora_rank > 0:
+        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
+
+    if args.use_flash_attn:
+        replace_with_flash_attention(model=actor)
+        replace_with_flash_attention(model=critic)
+        coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    _ = setup_conversation_template(tokenizer)
+    tokenizer.padding_side = "left"  # left padding for generation (online learning)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # configure generation config
+    actor.generation_config.update(
+        pad_token_id=tokenizer.eos_token_id, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id
+    )
+
+    # configure optimizer
+    coordinator.print_on_master(f"setting up optimizer for actor: lr={args.lr}, weight_decay={args.weight_decay}")
+    actor_optim = HybridAdam(
+        model_params=actor.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    coordinator.print_on_master(f"setting up optimizer for critic: lr={args.lr}, weight_decay={args.weight_decay}")
+    critic_optim = HybridAdam(
+        model_params=critic.parameters(),
+        lr=args.critic_lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.prompt_dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_prompt_dataset = load_tokenized_dataset(dataset_paths=args.prompt_dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPromptDataset(tokenizer=tokenizer, max_length=args.max_length - args.max_seq_len)
+    train_prompt_dataloader = setup_distributed_dataloader(
+        dataset=train_prompt_dataset,
+        batch_size=args.experience_batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        use_tp=args.tp > 1,
+    )
+
+    if len(args.pretrain_dataset) > 0:
+        train_pretrain_dataset = load_tokenized_dataset(
+            dataset_paths=args.pretrain_dataset, mode="train", mode_map=mode_map
+        )
+        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_length)
+        train_pretrain_dataloader = setup_distributed_dataloader(
+            dataset=train_pretrain_dataset,
+            batch_size=args.ptx_batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=data_collator,
+            use_tp=args.tp > 1,
+        )
+    else:
+        train_pretrain_dataloader = None
+
+    if args.warmup_steps is None:
+        args.warmup_steps = int(0.025 * args.num_episodes)
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    actor_lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=actor_optim,
+        total_steps=args.num_episodes,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    critic_lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=critic_optim,
+        total_steps=args.num_episodes,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+        )
+        custom_plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=1,
+            zero_stage=0,
+            precision=args.mixed_precision,
+            custom_policy=booster_policy,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    if args.plugin != "3d":
+        custom_plugin = plugin
+
+    actor_booster = Booster(plugin=plugin)
+    ref_booster = Booster(plugin=plugin)
+    rm_booster = Booster(plugin=custom_plugin)
+    critic_booster = Booster(plugin=custom_plugin)
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    actor, actor_optim, _, train_prompt_dataloader, actor_lr_scheduler = actor_booster.boost(
+        model=actor,
+        optimizer=actor_optim,
+        lr_scheduler=actor_lr_scheduler,
+        dataloader=train_prompt_dataloader,
+    )
+
+    critic, critic_optim, _, _, critic_lr_scheduler = critic_booster.boost(
+        model=critic,
+        optimizer=critic_optim,
+        lr_scheduler=critic_lr_scheduler,
+        dataloader=train_prompt_dataloader,
+    )
+    reward_model, _, _, _, _ = rm_booster.boost(model=reward_model, dataloader=train_prompt_dataloader)
+    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_prompt_dataloader)
+
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    sampler_start_idx = 0
+    start_step = 0
+
+    if args.rm_checkpoint_path is not None:
+        if "modeling" in args.rm_checkpoint_path:
+            rm_booster.load_model(reward_model, args.rm_checkpoint_path)
+        else:
+            _, _, _ = load_checkpoint(
+                load_dir=args.rm_checkpoint_path,
+                booster=rm_booster,
+                model=reward_model,
+                optimizer=None,
+                lr_scheduler=None,
+            )
+        coordinator.print_on_master(f"Loaded reward model checkpoint {args.rm_checkpoint_path}")
+
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            actor_booster.load_model(actor, args.checkpoint_path)
+            ref_booster.load_model(ref_model, args.checkpoint_path)
+            coordinator.print_on_master(f"Loaded actor and reference model {args.checkpoint_path}")
+        else:
+            _, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=actor_booster,
+                model=actor,
+                optimizer=actor_optim,
+                lr_scheduler=actor_lr_scheduler,
+            )
+            _, _, _ = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=ref_booster,
+                model=ref_model,
+                optimizer=critic_optim,
+                lr_scheduler=critic_lr_scheduler,
+            )
+            assert isinstance(train_prompt_dataloader.sampler, StatefulDistributedSampler)
+            train_prompt_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded actor and reference model checkpoint {args.checkpoint_path} at spisode {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    if args.critic_checkpoint_path is not None:
+        if "modeling" in args.critic_checkpoint_path:
+            critic_booster.load_model(critic, args.critic_checkpoint_path)
+        else:
+            _, _, _ = load_checkpoint(
+                load_dir=args.critic_checkpoint_path,
+                booster=critic_booster,
+                model=critic,
+                optimizer=critic_optim,
+                lr_scheduler=critic_lr_scheduler,
+            )
+        coordinator.print_on_master(f"Loaded critic checkpoint {args.critic_checkpoint_path}")
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    # configure trainer
+    trainer = PPOTrainer(
+        actor_booster,
+        critic_booster,
+        actor,
+        critic,
+        reward_model,
+        ref_model,
+        actor_optim,
+        critic_optim,
+        actor_lr_scheduler,
+        critic_lr_scheduler,
+        tokenizer=tokenizer,
+        kl_coef=args.kl_coef,
+        ptx_coef=args.ptx_coef,
+        train_batch_size=args.train_batch_size,
+        buffer_limit=args.num_collect_steps * args.experience_batch_size,
+        max_length=args.max_length,
+        max_new_tokens=args.max_seq_len,
+        use_cache=True,
+        do_sample=True,
+        temperature=0.7,
+        accumulation_steps=args.accumulation_steps,
+        save_dir=args.save_path,
+        save_interval=args.save_interval,
+        top_k=50,
+        use_tp=args.tp > 1,
+        offload_inference_models="gemini" not in args.plugin,
+        callbacks=[performance_evaluator],
+        coordinator=coordinator,
+    )
+
+    trainer.fit(
+        num_episodes=args.num_episodes,
+        num_collect_steps=args.num_collect_steps,
+        num_update_steps=args.num_update_steps,
+        prompt_dataloader=train_prompt_dataloader,
+        pretrain_dataloader=train_pretrain_dataloader,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if args.lora_rank > 0 and args.merge_lora_weights:
+        from coati.models.lora import LORA_MANAGER
+
+        # NOTE: set model to eval to merge LoRA weights
+        LORA_MANAGER.merge_weights = True
+        actor.eval()
+        critic.eval()
+    # save model checkpoint after fitting on only rank0
+    coordinator.print_on_master("Start saving final actor model checkpoint")
+    actor_booster.save_model(actor, os.path.join(trainer.actor_save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(
+        f"Saved final actor model checkpoint at episodes {args.num_episodes} at folder {args.save_path}"
+    )
+    coordinator.print_on_master("Start saving final critic model checkpoint")
+    critic_booster.save_model(critic, os.path.join(trainer.critic_save_dir, "modeling"), shard=True)
+    coordinator.print_on_master(
+        f"Saved final critic model checkpoint at episodes {args.num_episodes} at folder {args.save_path}"
+    )
+    memory_consumption = torch.cuda.max_memory_allocated() / 1024**2
+    if is_rank_0():
+        with open("./benchmark_memory_consumption.txt", "a+") as f:
+            f.write(
+                f"Model=Opt-{args.pretrain}; lora_rank={args.lora_rank}; plugin={args.plugin}\nMax CUDA memory usage: {memory_consumption:.2f} MB\n"
+            )
+    coordinator.print_on_master(f"Max CUDA memory usage: {memory_consumption:.2f} MB")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt_dataset", nargs="+", default=[])
+    parser.add_argument("--pretrain_dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--checkpoint_path", type=str, default=None)
+    parser.add_argument("--critic_checkpoint_path", type=str, default=None)
+    parser.add_argument("--rm_checkpoint_path", type=str, help="Reward model checkpoint path")
+    parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
+    parser.add_argument("--num_episodes", type=int, default=1)
+    parser.add_argument("--num_collect_steps", type=int, default=2)
+    parser.add_argument("--num_update_steps", type=int, default=5)
+    parser.add_argument("--save_interval", type=int, default=1000)
+    parser.add_argument("--train_batch_size", type=int, default=16)
+    parser.add_argument("--experience_batch_size", type=int, default=16)
+    parser.add_argument("--ptx_batch_size", type=int, default=1)
+    parser.add_argument("--lora_train_bias", type=str, default="none")
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument("--merge_lora_weights", type=bool, default=True)
+    parser.add_argument("--lr", type=float, default=9e-6)
+    parser.add_argument("--critic_lr", type=float, default=9e-6)
+    parser.add_argument("--kl_coef", type=float, default=0.1)
+    parser.add_argument("--ptx_coef", type=float, default=0.0)
+    parser.add_argument("--max_length", type=int, default=512)
+    parser.add_argument("--max_seq_len", type=int, default=256)
+    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    benchmark_train(args)
diff --git a/applications/Chat/benchmarks/benchmark_ppo.sh b/applications/Chat/benchmarks/benchmark_ppo.sh
new file mode 100755
index 000000000000..b18be67bd42f
--- /dev/null
+++ b/applications/Chat/benchmarks/benchmark_ppo.sh
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 8
+
+set -xu
+
+NUM_RETRY=3
+BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
+EXAMPLES_DIR=$BASE_DIR/examples
+TEMP_DIR=$BASE_DIR/temp
+MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
+MODELS_DIR=$TEMP_DIR/models_config
+MODELS=('125m' '350m' '700m' '1.3b' '2.7b' '3.5b' '5.5b' '6.7b' '10b' '13b')
+PLUGINS=('zero2', 'zero2_cpu', '3d')
+LORA_RANK=('0', '20')
+
+export OMP_NUM_THREADS=8
+
+rm ./benchmark_memory_consumption.txt
+rm ./benchmark_performance_summarization.txt
+
+# install requirements
+pip install -r $EXAMPLES_DIR/requirements.txt
+
+random_choice() {
+    local arr=("$@")
+    local len=${#arr[@]}
+    local idx=$((RANDOM % len))
+    echo ${arr[$idx]}
+}
+
+echo "[Test]: testing ppo ..."
+
+SKIPPED_TESTS=(
+)
+
+GRAD_CKPTS=('--grad_checkpoint')
+for lora_rank in ${LORA_RANK[@]}; do
+    for model in ${MODELS[@]}; do
+        plugins=($(shuf -e "${PLUGINS[@]}"))
+        for plugin in ${plugins[@]}; do
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$plugin-$lora_rank"
+                continue
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then
+                echo "[Test]: Skipped $model-$plugin"
+                continue
+            fi
+            pretrain=$model
+            tokenizer_dir="facebook/opt-125m"
+            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            tp='1'
+            if [[ $plugin == "3d" ]]; then
+                tp='4'
+            fi
+            for i in $(seq $NUM_RETRY); do
+                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
+                declare -a prompt_dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    prompt_dataset+=("$TEMP_DIR/rlhf_data/tokenized_opt_prompt/arrow/part-$split")
+                done
+                declare -a ptx_dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    ptx_dataset+=("$TEMP_DIR/rlhf_data/tokenized_opt_ptx/arrow/part-$split")
+                done
+                colossalai run --nproc_per_node 8 --master_port 28547 $BASE_DIR/benchmarks/benchmark.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --prompt_dataset ${prompt_dataset[@]} \
+                    --pretrain_dataset ${ptx_dataset[@]} \
+                    --ptx_batch_size 1 \
+                    --ptx_coef 0.2 \
+                    --save_path $MODEL_SAVE_PATH \
+                    --lora_rank $lora_rank \
+                    --plugin $plugin \
+                    --num_episodes 5 \
+                    --num_collect_steps 1 \
+                    --num_update_steps 4 \
+                    --max_seq_len 1024 \
+                    --max_length 2048 \
+                    --experience_batch_size 4 \
+                    --train_batch_size 1 \
+                    --accumulation_steps 32 \
+                    --lr 9e-6 \
+                    --mixed_precision "bf16" \
+                    --grad_clip 1.0 \
+                    --tp $tp \
+                    --lr 2e-5 \
+                    --use_flash_attn \
+                    $grad_ckpt
+                passed=$?
+                if [ $passed -eq 0 ]; then
+                    rm -rf $MODEL_SAVE_PATH/*
+                    rm -rf $MODELS_DIR/*
+                    break
+                fi
+            done
+            if [ $passed -ne 0 ]; then
+                echo "[Test]: Failed $model-$plugin-$lora_rank"
+                exit 1
+            fi
+        done
+    done
+done
diff --git a/applications/Chat/benchmarks/train_ppo_benchmark.sh b/applications/Chat/benchmarks/train_ppo_benchmark.sh
new file mode 100755
index 000000000000..a5d418ba7063
--- /dev/null
+++ b/applications/Chat/benchmarks/train_ppo_benchmark.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+
+PROJECT_NAME="opt_ppo_benchmark"
+PARENT_SAVE_DIR="save_dir/ckpt"
+PARENT_TENSORBOARD_DIR="save_dir/tensorboard"
+PARENT_CONFIG_FILE="save_dir/train_config"
+PRETRAINED_MODEL_PATH="sft_model_save_dir/modeling"
+REWARD_MODEL_PATH="reward_model_save_dir/modeling"
+PRETRAINED_TOKENIZER_PATH="pretrained/model/path"
+declare -a prompt_dataset=(
+    path/to/prompt/data/arrow/part-00000
+    path/to/prompt/data/arrow/part-00001
+    path/to/prompt/data/arrow/part-00002
+    path/to/prompt/data/arrow/part-00003
+    path/to/prompt/data/arrow/part-00004
+    path/to/prompt/data/arrow/part-00005
+    path/to/prompt/data/arrow/part-00006
+    path/to/prompt/data/arrow/part-00007
+    path/to/prompt/data/arrow/part-00008
+    path/to/prompt/data/arrow/part-00009
+)
+
+declare -a ptx_dataset=(
+    path/to/ptx/data/arrow/part-00000
+    path/to/ptx/data/arrow/part-00001
+    path/to/ptx/data/arrow/part-00002
+    path/to/ptx/data/arrow/part-00003
+    path/to/ptx/data/arrow/part-00004
+    path/to/ptx/data/arrow/part-00005
+    path/to/ptx/data/arrow/part-00006
+    path/to/ptx/data/arrow/part-00007
+    path/to/ptx/data/arrow/part-00008
+    path/to/ptx/data/arrow/part-00009
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+
+colossalai run --nproc_per_node 4 --master_port 30039 train_ppo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --rm_checkpoint_path $REWARD_MODEL_PATH \
+    --prompt_dataset ${prompt_dataset[@]} \
+    --pretrain_dataset ${ptx_dataset[@]} \
+    --ptx_batch_size 1 \
+    --ptx_coef 0.5 \
+    --plugin "zero2" \
+    --save_interval 200 \
+    --save_path $SAVE_DIR \
+    --num_episodes 2000 \
+    --num_collect_steps 1 \
+    --num_update_steps 1 \
+    --experience_batch_size 8 \
+    --train_batch_size 4 \
+    --accumulation_steps 2 \
+    --lr 9e-6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --weight_decay 0.01 \
+    --warmup_steps 100 \
+    --grad_checkpoint \
diff --git a/applications/Chat/coati/trainer/callbacks/__init__.py b/applications/Chat/coati/trainer/callbacks/__init__.py
new file mode 100644
index 000000000000..a765485072c1
--- /dev/null
+++ b/applications/Chat/coati/trainer/callbacks/__init__.py
@@ -0,0 +1,4 @@
+from .base import Callback
+from .performance_evaluator import PerformanceEvaluator
+
+__all__ = ["Callback", "PerformanceEvaluator"]
diff --git a/applications/Chat/coati/trainer/callbacks/base.py b/applications/Chat/coati/trainer/callbacks/base.py
new file mode 100644
index 000000000000..c6e30f04885c
--- /dev/null
+++ b/applications/Chat/coati/trainer/callbacks/base.py
@@ -0,0 +1,39 @@
+from abc import ABC
+
+from coati.experience_maker import Experience
+
+
+class Callback(ABC):
+    """
+    Base callback class. It defines the interface for callbacks.
+    """
+
+    def on_fit_start(self) -> None:
+        pass
+
+    def on_fit_end(self) -> None:
+        pass
+
+    def on_episode_start(self, episode: int) -> None:
+        pass
+
+    def on_episode_end(self, episode: int) -> None:
+        pass
+
+    def on_make_experience_start(self) -> None:
+        pass
+
+    def on_make_experience_end(self, experience: Experience) -> None:
+        pass
+
+    def on_learn_epoch_start(self, epoch: int) -> None:
+        pass
+
+    def on_learn_epoch_end(self, epoch: int) -> None:
+        pass
+
+    def on_learn_batch_start(self) -> None:
+        pass
+
+    def on_learn_batch_end(self, experience: Experience) -> None:
+        pass
diff --git a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py b/applications/Chat/coati/trainer/callbacks/performance_evaluator.py
new file mode 100644
index 000000000000..86384e5e39fb
--- /dev/null
+++ b/applications/Chat/coati/trainer/callbacks/performance_evaluator.py
@@ -0,0 +1,191 @@
+from time import time
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from coati.experience_maker import Experience
+
+from .base import Callback
+
+
+def get_world_size() -> int:
+    if dist.is_initialized():
+        return dist.get_world_size()
+    return 1
+
+
+def save_eval_result_rank_0(s: str, save_path: str, **kwargs) -> None:
+    if not dist.is_initialized() or dist.get_rank() == 0:
+        with open(save_path, "a+") as f:
+            train_config = "; ".join([str(kwargs[key]) for key in kwargs])
+            f.write(train_config + "\n" + s + "\n")
+
+
+def divide(x: float, y: float) -> float:
+    if y == 0:
+        return float("inf")
+    elif y == float("inf"):
+        return float("nan")
+    return x / y
+
+
+@torch.no_grad()
+def all_reduce_mean(x: float, world_size: int) -> float:
+    if world_size == 1:
+        return x
+    tensor = torch.tensor([x], device=torch.cuda.current_device())
+    dist.all_reduce(tensor)
+    tensor = tensor / world_size
+    return tensor.item()
+
+
+class Timer:
+    def __init__(self) -> None:
+        self.start_time: Optional[float] = None
+        self.duration: float = 0.0
+
+    def start(self) -> None:
+        self.start_time = time()
+
+    def end(self) -> None:
+        assert self.start_time is not None
+        self.duration += time() - self.start_time
+        self.start_time = None
+
+    def reset(self) -> None:
+        self.duration = 0.0
+
+
+class PerformanceEvaluator(Callback):
+    """
+        Callback for valuate the performance of the model.
+    Args:
+        actor_num_params: The number of parameters of the actor model.
+        critic_num_params: The number of parameters of the critic model.
+        initial_model_num_params: The number of parameters of the initial model.
+        reward_model_num_params: The number of parameters of the reward model.
+        enable_grad_checkpoint: Whether to enable gradient checkpointing.
+        ignore_episodes: The number of episodes to ignore when calculating the performance.
+    """
+
+    def __init__(
+        self,
+        actor_num_params: int,
+        critic_num_params: int,
+        initial_model_num_params: int,
+        reward_model_num_params: int,
+        enable_grad_checkpoint: bool = False,
+        ignore_episodes: int = 0,
+        train_config: Optional[dict] = None,
+        save_path: Optional[str] = None,
+    ) -> None:
+        super().__init__()
+        self.world_size = get_world_size()
+        self.actor_num_params = actor_num_params
+        self.critic_num_params = critic_num_params
+        self.initial_model_num_params = initial_model_num_params
+        self.reward_model_num_params = reward_model_num_params
+        self.enable_grad_checkpoint = enable_grad_checkpoint
+        self.ignore_episodes = ignore_episodes
+        self.disable: bool = False
+
+        self.overall_timer = Timer()
+        self.make_experience_timer = Timer()
+        self.learn_timer = Timer()
+        self.make_experience_num_samples: int = 0
+        self.make_experience_flop: int = 0
+        self.learn_num_samples: int = 0
+        self.learn_flop: int = 0
+        self.train_config = train_config
+        self.save_path = save_path
+
+    def on_episode_start(self, episode: int) -> None:
+        self.disable = self.ignore_episodes > 0 and episode < self.ignore_episodes
+        if self.disable:
+            return
+        self.overall_timer.start()
+
+    def on_episode_end(self, episode: int) -> None:
+        if self.disable:
+            return
+        self.overall_timer.end()
+
+    def on_make_experience_start(self) -> None:
+        if self.disable:
+            return
+        self.make_experience_timer.start()
+
+    def on_make_experience_end(self, experience: Experience) -> None:
+        if self.disable:
+            return
+        self.make_experience_timer.end()
+
+        batch_size, seq_len = experience.sequences.shape
+
+        self.make_experience_num_samples += batch_size
+
+        # actor generate
+        num_actions = experience.action_mask.size(1)
+        input_len = seq_len - num_actions
+        total_seq_len = (input_len + seq_len - 1) * num_actions / 2
+        self.make_experience_flop += self.actor_num_params * batch_size * total_seq_len * 2
+        # actor forward
+        self.make_experience_flop += self.actor_num_params * batch_size * seq_len * 2
+        # critic forward
+        self.make_experience_flop += self.critic_num_params * batch_size * seq_len * 2
+        # initial model forward
+        self.make_experience_flop += self.initial_model_num_params * batch_size * seq_len * 2
+        # reward model forward
+        self.make_experience_flop += self.reward_model_num_params * batch_size * seq_len * 2
+
+    def on_learn_batch_start(self) -> None:
+        if self.disable:
+            return
+        self.learn_timer.start()
+
+    def on_learn_batch_end(self, experience: Experience) -> None:
+        if self.disable:
+            return
+        self.learn_timer.end()
+
+        batch_size, seq_len = experience.sequences.shape
+
+        self.learn_num_samples += batch_size
+
+        # actor forward-backward, 3 means forward(1) + backward(2)
+        self.learn_flop += self.actor_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
+        # critic forward-backward
+        self.learn_flop += self.critic_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint))
+
+    def on_fit_end(self) -> None:
+        avg_make_experience_duration = all_reduce_mean(self.make_experience_timer.duration, self.world_size)
+        avg_learn_duration = all_reduce_mean(self.learn_timer.duration, self.world_size)
+        avg_overall_duration = all_reduce_mean(self.overall_timer.duration, self.world_size)
+
+        avg_make_experience_throughput = (
+            self.make_experience_num_samples * self.world_size / (avg_make_experience_duration + 1e-12)
+        )
+        avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12)
+
+        avg_learn_throughput = self.learn_num_samples * self.world_size / (avg_learn_duration + 1e-12)
+        avg_learn_tflops = self.learn_flop / 1e12 / (avg_learn_duration + 1e-12)
+
+        num_effective_samples = min(self.learn_num_samples, self.make_experience_num_samples) * self.world_size
+
+        avg_overall_throughput = num_effective_samples / (avg_overall_duration + 1e-12)
+
+        overall_time_per_sample = divide(1, avg_overall_throughput)
+        make_experience_time_per_sample = divide(avg_make_experience_duration, num_effective_samples)
+        learn_time_per_sample = divide(avg_learn_duration, num_effective_samples)
+
+        save_eval_result_rank_0(
+            f"Performance summary:\n"
+            + f"Generate {self.make_experience_num_samples * self.world_size} samples, throughput: {avg_make_experience_throughput:.2f} samples/s, TFLOPS per GPU: {avg_make_experience_tflops:.2f}\n"
+            + f"Train {self.learn_num_samples * self.world_size} samples, throughput: {avg_learn_throughput:.2f} samples/s, TFLOPS per GPU: {avg_learn_tflops:.2f}\n"
+            + f"Overall throughput: {avg_overall_throughput:.2f} samples/s\n"
+            + f"Overall time per sample: {overall_time_per_sample:.2f} s\n"
+            + f"Make experience time per sample: {make_experience_time_per_sample:.2f} s, {make_experience_time_per_sample/overall_time_per_sample*100:.2f}%\n"
+            + f"Learn time per sample: {learn_time_per_sample:.2f} s, {learn_time_per_sample/overall_time_per_sample*100:.2f}%",
+            self.save_path,
+            **self.train_config,
+        )
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index 704a8553f5a2..fc52e8e237fd 100755
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -3,7 +3,7 @@
 """
 
 import os
-from typing import Optional
+from typing import List, Optional
 
 import torch
 import wandb
@@ -12,6 +12,7 @@
 from coati.models import Critic, RewardModel
 from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss
 from coati.models.utils import calc_action_log_probs
+from coati.trainer.callbacks import Callback
 from coati.trainer.utils import all_reduce_mean
 from coati.utils import AccumulativeMeanMeter, save_checkpoint
 from torch.optim import Optimizer
@@ -85,13 +86,16 @@ def __init__(
         save_dir: str = None,
         use_tp: bool = False,
         coordinator: DistCoordinator = None,
+        callbacks: List[Callback] = [],
         **generate_kwargs,
     ) -> None:
         if isinstance(actor_booster, GeminiPlugin):
             assert not offload_inference_models, "GeminiPlugin is not compatible with manual model.to('cpu')"
 
         data_buffer = NaiveExperienceBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
-        super().__init__(actor_booster, critic_booster, data_buffer, sample_buffer, dataloader_pin_memory)
+        super().__init__(
+            actor_booster, critic_booster, data_buffer, sample_buffer, dataloader_pin_memory, callbacks=callbacks
+        )
         self.generate_kwargs = generate_kwargs
 
         self.actor = actor
diff --git a/applications/Chat/examples/training_scripts/train_dpo.py b/applications/Chat/examples/training_scripts/train_dpo.py
index 6b9b12b9354a..3a215872bde9 100755
--- a/applications/Chat/examples/training_scripts/train_dpo.py
+++ b/applications/Chat/examples/training_scripts/train_dpo.py
@@ -28,6 +28,13 @@
 
 
 def train(args):
+    # check lora compatibility
+    if "gemini" in args.plugin:
+        if args.lora_rank > 0:
+            raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
+        if args.accumulation_steps > 1:
+            raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
+
     # ==============================
     # Initialize Distributed Training
     # ==============================
@@ -37,6 +44,7 @@ def train(args):
     # ==============================
     # Initialize Booster
     # ==============================
+
     if args.plugin == "gemini":
         plugin = GeminiPlugin(
             precision=args.mixed_precision,
@@ -248,7 +256,6 @@ def train(args):
     parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
     parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
     parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=0)
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
diff --git a/applications/Chat/examples/training_scripts/train_ppo.py b/applications/Chat/examples/training_scripts/train_ppo.py
index 2a9c7d9aa572..20398c07fbaa 100755
--- a/applications/Chat/examples/training_scripts/train_ppo.py
+++ b/applications/Chat/examples/training_scripts/train_ppo.py
@@ -28,6 +28,12 @@
 
 
 def train(args):
+    # check lora compatibility
+    if "gemini" in args.plugin:
+        if args.lora_rank > 0:
+            raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
+        if args.accumulation_steps > 1:
+            raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
     # ==============================
     # Initialize Distributed Training
     # ==============================
diff --git a/applications/Chat/examples/training_scripts/train_ppo.sh b/applications/Chat/examples/training_scripts/train_ppo.sh
index c3c94c2ce9ea..2f9cbfce62c2 100755
--- a/applications/Chat/examples/training_scripts/train_ppo.sh
+++ b/applications/Chat/examples/training_scripts/train_ppo.sh
@@ -30,7 +30,7 @@ PARENT_TENSORBOARD_DIR="save_dir/tensorboard"
 PARENT_CONFIG_FILE="save_dir/train_config"
 PRETRAINED_MODEL_PATH="sft_model_save_dir/modeling"
 REWARD_MODEL_PATH="reward_model_save_dir/modeling"
-PRETRAINED_TOKENIZER_PATH="pretrained/model/path"  # "/mnt/vepfs/lcxyc/leaderboard_models/Colossal-LLaMA-2-7b-base/"  # "/home/lcyab/data/models/bloom-560m" #
+PRETRAINED_TOKENIZER_PATH="pretrained/model/path"
 declare -a prompt_dataset=(
     path/to/prompt/data/arrow/part-00000
     path/to/prompt/data/arrow/part-00001
diff --git a/applications/Chat/examples/training_scripts/train_rm.py b/applications/Chat/examples/training_scripts/train_rm.py
index d03de14e8aab..9147f75406c8 100755
--- a/applications/Chat/examples/training_scripts/train_rm.py
+++ b/applications/Chat/examples/training_scripts/train_rm.py
@@ -29,6 +29,12 @@
 
 
 def train(args):
+    # check lora compatibility
+    if "gemini" in args.plugin:
+        if args.lora_rank > 0:
+            raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
+        if args.accumulation_steps > 1:
+            raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
     # ==============================
     # Initialize Distributed Training
     # ==============================
@@ -278,7 +284,6 @@ def train(args):
     parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
     parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
     parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=0)
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
diff --git a/applications/Chat/examples/training_scripts/train_sft.py b/applications/Chat/examples/training_scripts/train_sft.py
index a6700e004bc8..233348630c2b 100755
--- a/applications/Chat/examples/training_scripts/train_sft.py
+++ b/applications/Chat/examples/training_scripts/train_sft.py
@@ -28,6 +28,12 @@
 
 
 def train(args):
+    # check lora compatibility
+    if "gemini" in args.plugin:
+        if args.lora_rank > 0:
+            raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
+        if args.accumulation_steps > 1:
+            raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
     # ==============================
     # Initialize Distributed Training
     # ==============================
@@ -254,7 +260,6 @@ def train(args):
     parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
     parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
     parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=1)
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
diff --git a/applications/Chat/tests/test_data_preparation.sh b/applications/Chat/tests/test_data_preparation.sh
index 27c7a58efcaa..496a71fcce78 100755
--- a/applications/Chat/tests/test_data_preparation.sh
+++ b/applications/Chat/tests/test_data_preparation.sh
@@ -1,13 +1,4 @@
 #!/usr/bin/env bash
-'''
-Usage
-
-SFT_DATASET=/mnt/tos/lcxyc/experiments/experiments5/tokenized_sft_data/jsonl \
-PROMPT_DATASET=/home/lcyab/data/data_rlhf/preprocessed_prompt \
-PRETRAIN_DATASET=/home/lcyab/data/pretrain_data/cleaned_wiki_en/preprocessed \
-PREFERENCE_DATASET=/home/lcyab/data/data_rlhf/preprcessed \
-./test_data_preparation.sh
-'''
 set_n_least_used_CUDA_VISIBLE_DEVICES() {
     local n=${1:-"9999"}
     echo "GPU Memory Usage:"

From 143bb946aefe73621f610d52c03993ffe48c0529 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Thu, 7 Dec 2023 11:15:30 +0800
Subject: [PATCH 10/23] reproduce benchmark result, remove useless files

---
 applications/Chat/README.md                   | 119 ++++------
 .../benchmark_memory_consumption.txt          |   2 +
 .../benchmarks/benchmark_opt_lora_dummy.py    | 214 ------------------
 .../benchmark_performance_summarization.txt   |   8 +
 applications/Chat/benchmarks/benchmark_ppo.py |   2 +-
 applications/Chat/benchmarks/benchmark_ppo.sh |  31 +--
 .../Chat/benchmarks/train_ppo_benchmark.sh    |  87 -------
 applications/Chat/coati/kernels/__init__.py   |   6 -
 applications/Chat/coati/kernels/opt_attn.py   |  90 --------
 applications/Chat/coati/kernels/wrapper.py    |  18 --
 .../Chat/tests/test_data_preparation.sh       |  11 +-
 11 files changed, 80 insertions(+), 508 deletions(-)
 create mode 100644 applications/Chat/benchmarks/benchmark_memory_consumption.txt
 delete mode 100755 applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
 create mode 100644 applications/Chat/benchmarks/benchmark_performance_summarization.txt
 delete mode 100755 applications/Chat/benchmarks/train_ppo_benchmark.sh
 delete mode 100755 applications/Chat/coati/kernels/__init__.py
 delete mode 100755 applications/Chat/coati/kernels/opt_attn.py
 delete mode 100755 applications/Chat/coati/kernels/wrapper.py

diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index 9880cceac192..f6db4016326a 100755
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -403,91 +403,60 @@ You can find more examples in this [repo](https://github.com/XueFuzhao/Instructi
 
 We have integrated the Transformers save and load pipeline, allowing users to freely call Hugging Face's language models and save them in the HF format.
 
+- Option 1: Save the model weights, model config and generation config (Note: tokenizer will not be saved) which can be loaded using HF's from_pretrained method.
 ```python
-from coati.models.llama import LlamaLM
-from coati.trainer import SFTTrainer
-
-model = LlamaLM(pretrained=args.pretrain)
-tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
-
-(model, optim) = strategy.prepare((model, optim))
-trainer = SFTTrainer(model=model,
-                     strategy=strategy,
-                     optim=optim,
-                     train_dataloader=train_dataloader,
-                     eval_dataloader=eval_dataloader,
-                     batch_size=args.batch_size,
-                     max_epochs=args.max_epochs,
-                     accumulation_steps=args.accumulation_steps
-                     )
-
-trainer.fit()
-# this saves in pytorch format
-strategy.save_model(model, args.save_path, only_rank0=True)
-
-# this saves in HF format
-strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=tokenizer)
+# if use lora, you can choose to merge lora weights before saving
+if args.lora_rank > 0 and args.merge_lora_weights:
+        from coati.models.lora import LORA_MANAGER
+
+        # NOTE: set model to eval to merge LoRA weights
+        LORA_MANAGER.merge_weights = True
+        model.eval()
+# save model checkpoint after fitting on only rank0
+booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+
 ```
 
+- Option 2: Save the model weights, model config, generation config, as well as the optimizer, learning rate schedualer, running states (Note: tokenizer will not be saved) which are needed for resuming training.
+```python
+from coati.utils import save_checkpoint
+# save model checkpoint after fitting on only rank0
+save_checkpoint(
+        save_dir=actor_save_dir,
+        booster=actor_booster,
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        epoch=0,
+        step=step,
+        batch_size=train_batch_size,
+        coordinator=coordinator,
+    )
+```
+To load the saved checkpoint
+```python
+from coati.utils import load_checkpoint
+start_epoch, start_step, sampler_start_idx = load_checkpoint(
+        load_dir=checkpoint_path,
+        booster=booster,
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+    )
+```
 </details>
 
 <details><summary><b>How to train with limited resources</b></summary>
 
-Here are some examples that can allow you to train a 7B model on a single or multiple consumer-grade GPUs.
+Here are some suggestions that can allow you to train a 7B model on a single or multiple consumer-grade GPUs.
 
-If you only have a single 24G GPU, you can use the following script. `batch_size`, `lora_rank` and `grad_checkpoint` are the most important parameters to successfully train the model.
+`batch_size`, `lora_rank` and `grad_checkpoint` are the most important parameters to successfully train the model. To maintain a descent batch size for gradient calculation, consider increase the accumulation_step and reduce the batch_size on each rank.
 
-```bash
-// [INFO]: MAX GPU MEMORY ALLOCATED:  19148.9345703125 MB
-torchrun --standalone --nproc_per_node=1 train_sft.py \
-    --pretrain "/path/to/LLaMa-7B/" \
-    --model 'llama' \
-    --strategy ddp \
-    --save_path  /path/to/Coati-7B \
-    --dataset /path/to/data.json \
-    --batch_size 1 \
-    --accumulation_steps 8 \
-    --lr 2e-5 \
-    --max_datasets_size 512 \
-    --max_epochs 1 \
-    --lora_rank 16 \
-    --grad_checkpoint
-```
+If you only have a single 24G GPU. Generally, using lora and "zero2-cpu" will be sufficient.
 
-`colossalai_gemini` strategy can enable a single 24G GPU to train the whole model without using LoRA if you have sufficient CPU memory. You can use the following script.
-
-```bash
-torchrun --standalone --nproc_per_node=1 train_sft.py \
-    --pretrain "/path/to/LLaMa-7B/" \
-    --model 'llama' \
-    --strategy colossalai_gemini \
-    --save_path  /path/to/Coati-7B \
-    --dataset /path/to/data.json \
-    --batch_size 1 \
-    --accumulation_steps 8 \
-    --lr 2e-5 \
-    --max_datasets_size 512 \
-    --max_epochs 1 \
-    --grad_checkpoint
-```
-
-If you have 4x32 GB GPUs, you can even train the whole 7B model using our `colossalai_zero2_cpu` strategy! The script is given as follows.
-
-```bash
-torchrun --standalone --nproc_per_node=4 train_sft.py \
-    --pretrain "/path/to/LLaMa-7B/" \
-    --model 'llama' \
-    --strategy colossalai_zero2_cpu \
-    --save_path  /path/to/Coati-7B \
-    --dataset /path/to/data.json \
-    --batch_size 1 \
-    --accumulation_steps 8 \
-    --lr 2e-5 \
-    --max_datasets_size 512 \
-    --max_epochs 1 \
-    --grad_checkpoint
-```
+`gemini` and `gemini-auto` can enable a single 24G GPU to train the whole model without using LoRA if you have sufficient CPU memory. But that strategy doesn't support gradient accumulation.
 
+If you have multiple GPUs each has very limited VRAM, say 8GB. You can try the `3d` for the plugin option, which supports tensor parellelism, set `--tp` to the number of GPUs that you have.
 </details>
 
 ## The Plan
@@ -498,6 +467,8 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
 - [x] support inference
 - [x] support llama from [facebook](https://github.com/facebookresearch/llama)
 - [x] implement PPO-ptx fine-tuning
+- [x] support flash-attention
+- [x] implement DPO fine-tuning
 - [ ] integrate with Ray
 - [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL),
 - [ ] support chain-of-thought by [langchain](https://github.com/hwchase17/langchain)
diff --git a/applications/Chat/benchmarks/benchmark_memory_consumption.txt b/applications/Chat/benchmarks/benchmark_memory_consumption.txt
new file mode 100644
index 000000000000..5990a527513d
--- /dev/null
+++ b/applications/Chat/benchmarks/benchmark_memory_consumption.txt
@@ -0,0 +1,2 @@
+Model=Opt-1.3b; lora_rank=0; plugin=zero2
+Max CUDA memory usage: 56538.62 MB
diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
deleted file mode 100755
index 8338abf54ced..000000000000
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ /dev/null
@@ -1,214 +0,0 @@
-import argparse
-from copy import deepcopy
-
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from coati.models.base import RewardModel
-from coati.models.opt import OPTActor, OPTCritic
-from coati.trainer import PPOTrainer
-from coati.trainer.callbacks import PerformanceEvaluator
-from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
-from torch.optim import Adam
-from torch.utils.data import DataLoader
-from transformers import AutoTokenizer
-from transformers.models.opt.configuration_opt import OPTConfig
-
-from colossalai.nn.optimizer import HybridAdam
-
-
-def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
-    numel = sum(p.numel() for p in model.parameters())
-    if isinstance(strategy, GeminiStrategy) and strategy.shard_init:
-        numel *= dist.get_world_size()
-    return numel
-
-
-def preprocess_batch(samples) -> dict:
-    input_ids = torch.stack(samples)
-    attention_mask = torch.ones_like(input_ids, dtype=torch.long)
-    return {"input_ids": input_ids, "attention_mask": attention_mask}
-
-
-def print_rank_0(*args, **kwargs) -> None:
-    if dist.get_rank() == 0:
-        print(*args, **kwargs)
-
-
-def print_model_numel(model_dict: dict) -> None:
-    B = 1024**3
-    M = 1024**2
-    K = 1024
-    outputs = ""
-    for name, numel in model_dict.items():
-        outputs += f"{name}: "
-        if numel >= B:
-            outputs += f"{numel / B:.2f} B\n"
-        elif numel >= M:
-            outputs += f"{numel / M:.2f} M\n"
-        elif numel >= K:
-            outputs += f"{numel / K:.2f} K\n"
-        else:
-            outputs += f"{numel}\n"
-    print_rank_0(outputs)
-
-
-def get_gpt_config(model_name: str) -> OPTConfig:
-    model_map = {
-        "125m": OPTConfig.from_pretrained("facebook/opt-125m"),
-        "350m": OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
-        "700m": OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
-        "1.3b": OPTConfig.from_pretrained("facebook/opt-1.3b"),
-        "2.7b": OPTConfig.from_pretrained("facebook/opt-2.7b"),
-        "3.5b": OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
-        "5.5b": OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
-        "6.7b": OPTConfig.from_pretrained("facebook/opt-6.7b"),
-        "10b": OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
-        "13b": OPTConfig.from_pretrained("facebook/opt-13b"),
-    }
-    try:
-        return model_map[model_name]
-    except KeyError:
-        raise ValueError(f'Unknown model "{model_name}"')
-
-
-def main(args):
-    if args.strategy == "ddp":
-        strategy = DDPStrategy()
-    elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
-    elif args.strategy == "colossalai_gemini_cpu":
-        strategy = GeminiStrategy(
-            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
-        )
-    elif args.strategy == "colossalai_zero2":
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
-    elif args.strategy == "colossalai_zero2_cpu":
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
-    elif args.strategy == "colossalai_zero1":
-        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cuda")
-    elif args.strategy == "colossalai_zero1_cpu":
-        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cpu")
-    else:
-        raise ValueError(f'Unsupported strategy "{args.strategy}"')
-
-    torch.cuda.set_per_process_memory_fraction(args.cuda_mem_frac)
-
-    model_config = get_gpt_config(args.model)
-    critic_config = get_gpt_config(args.critic_model)
-    with strategy.model_init_context():
-        actor = OPTActor(config=model_config, lora_rank=args.lora_rank).cuda()
-        critic = OPTCritic(config=critic_config, lora_rank=args.lora_rank).cuda()
-
-        initial_model = deepcopy(actor).cuda().half()
-        reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda().half()
-
-    if args.use_kernels:
-        from coati.kernels import convert_to_xformer_model
-
-        actor, critic, initial_model, reward_model = map(
-            convert_to_xformer_model, (actor, critic, initial_model, reward_model)
-        )
-
-    actor_numel = get_model_numel(actor, strategy)
-    critic_numel = get_model_numel(critic, strategy)
-    initial_model_numel = get_model_numel(initial_model, strategy)
-    reward_model_numel = get_model_numel(reward_model, strategy)
-    print_model_numel(
-        {
-            "Actor": actor_numel,
-            "Critic": critic_numel,
-            "Initial model": initial_model_numel,
-            "Reward model": reward_model_numel,
-        }
-    )
-    performance_evaluator = PerformanceEvaluator(
-        actor_numel,
-        critic_numel,
-        initial_model_numel,
-        reward_model_numel,
-        enable_grad_checkpoint=False,
-        ignore_episodes=1,
-    )
-
-    if args.strategy.startswith("colossalai"):
-        actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
-        critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
-    else:
-        actor_optim = Adam(actor.parameters(), lr=5e-6)
-        critic_optim = Adam(critic.parameters(), lr=5e-6)
-
-    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"
-
-    (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
-
-    random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device())
-    dataloader = DataLoader(
-        random_prompts, batch_size=args.experience_batch_size, shuffle=True, collate_fn=preprocess_batch
-    )
-
-    trainer = PPOTrainer(
-        strategy,
-        actor,
-        critic,
-        reward_model,
-        initial_model,
-        actor_optim,
-        critic_optim,
-        None,
-        tokenizer=tokenizer,
-        rm_model_tokenizer=tokenizer,
-        ptx_coef=0,
-        train_batch_size=args.train_batch_size,
-        offload_inference_models=args.offload_inference_models,
-        max_length=512,
-        do_sample=True,
-        temperature=1.0,
-        top_k=50,
-        use_cache=True,
-        callbacks=[performance_evaluator],
-    )
-
-    trainer.fit(
-        prompt_dataloader=dataloader,
-        pretrain_dataloader=None,
-        num_episodes=args.num_episodes,
-        num_update_steps=args.num_update_steps,
-        num_collect_steps=args.num_collect_steps,
-        save_per_num_episodes=args.save_per_num_episodes,
-    )
-
-    print_rank_0(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="125m")
-    parser.add_argument("--critic_model", default="125m")
-    parser.add_argument(
-        "--strategy",
-        choices=[
-            "ddp",
-            "colossalai_gemini",
-            "colossalai_gemini_cpu",
-            "colossalai_zero2",
-            "colossalai_zero2_cpu",
-            "colossalai_zero1",
-            "colossalai_zero1_cpu",
-        ],
-        default="ddp",
-    )
-    parser.add_argument("--num_episodes", type=int, default=3)
-    parser.add_argument("--num_collect_steps", type=int, default=8)
-    parser.add_argument("--num_update_steps", type=int, default=1)
-    parser.add_argument("--train_batch_size", type=int, default=8)
-    parser.add_argument("--save_per_num_episodes", type=int, default=1000)
-    parser.add_argument("--experience_batch_size", type=int, default=8)
-    parser.add_argument("--lora_rank", type=int, default=0)
-    parser.add_argument("--cuda_mem_frac", type=float, default=1.0)
-    parser.add_argument("--offload_inference_models", action="store_true", default=False)
-    parser.add_argument("--use_kernels", action="store_true", default=False)
-    args = parser.parse_args()
-    main(args)
diff --git a/applications/Chat/benchmarks/benchmark_performance_summarization.txt b/applications/Chat/benchmarks/benchmark_performance_summarization.txt
new file mode 100644
index 000000000000..d3d41bf22926
--- /dev/null
+++ b/applications/Chat/benchmarks/benchmark_performance_summarization.txt
@@ -0,0 +1,8 @@
+facebook/opt-1.3b; 0; zero2
+Performance summary:
+Generate 768 samples, throughput: 60.95 samples/s, TFLOPS per GPU: 1174.51
+Train 768 samples, throughput: 109.53 samples/s, TFLOPS per GPU: 70.31
+Overall throughput: 35.63 samples/s
+Overall time per sample: 0.03 s
+Make experience time per sample: 0.02 s, 58.45%
+Learn time per sample: 0.01 s, 32.53%
diff --git a/applications/Chat/benchmarks/benchmark_ppo.py b/applications/Chat/benchmarks/benchmark_ppo.py
index c7760f9f16a6..4606e3174e74 100644
--- a/applications/Chat/benchmarks/benchmark_ppo.py
+++ b/applications/Chat/benchmarks/benchmark_ppo.py
@@ -94,7 +94,7 @@ def benchmark_train(args):
             initial_model_numel,
             reward_model_numel,
             enable_grad_checkpoint=False,
-            ignore_episodes=1,
+            ignore_episodes=2,
             train_config={"model": "facebook/opt-" + args.pretrain, "lora_rank": args.lora_rank, "plugin": args.plugin},
             save_path="./benchmark_performance_summarization.txt",
         )
diff --git a/applications/Chat/benchmarks/benchmark_ppo.sh b/applications/Chat/benchmarks/benchmark_ppo.sh
index b18be67bd42f..6c2a368d6697 100755
--- a/applications/Chat/benchmarks/benchmark_ppo.sh
+++ b/applications/Chat/benchmarks/benchmark_ppo.sh
@@ -25,9 +25,13 @@ EXAMPLES_DIR=$BASE_DIR/examples
 TEMP_DIR=$BASE_DIR/temp
 MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
 MODELS_DIR=$TEMP_DIR/models_config
-MODELS=('125m' '350m' '700m' '1.3b' '2.7b' '3.5b' '5.5b' '6.7b' '10b' '13b')
-PLUGINS=('zero2', 'zero2_cpu', '3d')
-LORA_RANK=('0', '20')
+# To benchmark different models, change the following line
+# MODELS=('125m' '350m' '700m' '1.3b' '2.7b' '3.5b' '5.5b' '6.7b' '10b' '13b')
+MODELS=('1.3b')
+# To benchmark different strategies, change the following line
+# PLUGINS=('zero2', 'zero2_cpu', '3d')
+PLUGINS=('zero2')
+LORA_RANK=('0')
 
 export OMP_NUM_THREADS=8
 
@@ -49,7 +53,8 @@ echo "[Test]: testing ppo ..."
 SKIPPED_TESTS=(
 )
 
-GRAD_CKPTS=('--grad_checkpoint')
+GRAD_CKPTS=('' '--grad_checkpoint')
+GRAD_CKPTS=('')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
         plugins=($(shuf -e "${PLUGINS[@]}"))
@@ -78,30 +83,30 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     ptx_dataset+=("$TEMP_DIR/rlhf_data/tokenized_opt_ptx/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 8 --master_port 28547 $BASE_DIR/benchmarks/benchmark.py \
+                colossalai run --nproc_per_node 8 --master_port 28547 $BASE_DIR/benchmarks/benchmark_ppo.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --prompt_dataset ${prompt_dataset[@]} \
                     --pretrain_dataset ${ptx_dataset[@]} \
                     --ptx_batch_size 1 \
-                    --ptx_coef 0.2 \
+                    --ptx_coef 0 \
                     --save_path $MODEL_SAVE_PATH \
                     --lora_rank $lora_rank \
                     --plugin $plugin \
                     --num_episodes 5 \
                     --num_collect_steps 1 \
-                    --num_update_steps 4 \
-                    --max_seq_len 1024 \
-                    --max_length 2048 \
-                    --experience_batch_size 4 \
-                    --train_batch_size 1 \
-                    --accumulation_steps 32 \
+                    --num_update_steps 1 \
+                    --max_seq_len 128 \
+                    --max_length 512 \
+                    --experience_batch_size 32 \
+                    --train_batch_size 32 \
+                    --accumulation_steps 1 \
                     --lr 9e-6 \
                     --mixed_precision "bf16" \
                     --grad_clip 1.0 \
+                    --use_flash_attn \
                     --tp $tp \
                     --lr 2e-5 \
-                    --use_flash_attn \
                     $grad_ckpt
                 passed=$?
                 if [ $passed -eq 0 ]; then
diff --git a/applications/Chat/benchmarks/train_ppo_benchmark.sh b/applications/Chat/benchmarks/train_ppo_benchmark.sh
deleted file mode 100755
index a5d418ba7063..000000000000
--- a/applications/Chat/benchmarks/train_ppo_benchmark.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
-# NCCL IB environment variables
-export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
-export NCCL_IB_DISABLE=0
-export NCCL_SOCKET_IFNAME=eth0
-export NCCL_IB_GID_INDEX=3
-export NCCL_IB_TIMEOUT=23
-export NCCL_IB_RETRY_CNT=7
-export OMP_NUM_THREADS=8
-
-
-PROJECT_NAME="opt_ppo_benchmark"
-PARENT_SAVE_DIR="save_dir/ckpt"
-PARENT_TENSORBOARD_DIR="save_dir/tensorboard"
-PARENT_CONFIG_FILE="save_dir/train_config"
-PRETRAINED_MODEL_PATH="sft_model_save_dir/modeling"
-REWARD_MODEL_PATH="reward_model_save_dir/modeling"
-PRETRAINED_TOKENIZER_PATH="pretrained/model/path"
-declare -a prompt_dataset=(
-    path/to/prompt/data/arrow/part-00000
-    path/to/prompt/data/arrow/part-00001
-    path/to/prompt/data/arrow/part-00002
-    path/to/prompt/data/arrow/part-00003
-    path/to/prompt/data/arrow/part-00004
-    path/to/prompt/data/arrow/part-00005
-    path/to/prompt/data/arrow/part-00006
-    path/to/prompt/data/arrow/part-00007
-    path/to/prompt/data/arrow/part-00008
-    path/to/prompt/data/arrow/part-00009
-)
-
-declare -a ptx_dataset=(
-    path/to/ptx/data/arrow/part-00000
-    path/to/ptx/data/arrow/part-00001
-    path/to/ptx/data/arrow/part-00002
-    path/to/ptx/data/arrow/part-00003
-    path/to/ptx/data/arrow/part-00004
-    path/to/ptx/data/arrow/part-00005
-    path/to/ptx/data/arrow/part-00006
-    path/to/ptx/data/arrow/part-00007
-    path/to/ptx/data/arrow/part-00008
-    path/to/ptx/data/arrow/part-00009
-)
-
-TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
-FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
-SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
-
-colossalai run --nproc_per_node 4 --master_port 30039 train_ppo.py \
-    --pretrain $PRETRAINED_MODEL_PATH \
-    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --rm_checkpoint_path $REWARD_MODEL_PATH \
-    --prompt_dataset ${prompt_dataset[@]} \
-    --pretrain_dataset ${ptx_dataset[@]} \
-    --ptx_batch_size 1 \
-    --ptx_coef 0.5 \
-    --plugin "zero2" \
-    --save_interval 200 \
-    --save_path $SAVE_DIR \
-    --num_episodes 2000 \
-    --num_collect_steps 1 \
-    --num_update_steps 1 \
-    --experience_batch_size 8 \
-    --train_batch_size 4 \
-    --accumulation_steps 2 \
-    --lr 9e-6 \
-    --mixed_precision "bf16" \
-    --grad_clip 1.0 \
-    --weight_decay 0.01 \
-    --warmup_steps 100 \
-    --grad_checkpoint \
diff --git a/applications/Chat/coati/kernels/__init__.py b/applications/Chat/coati/kernels/__init__.py
deleted file mode 100755
index 96d40c7c4709..000000000000
--- a/applications/Chat/coati/kernels/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from .wrapper import convert_to_xformer_model, recover_from_xformer_model
-
-__all__ = [
-    "convert_to_xformer_model",
-    "recover_from_xformer_model",
-]
diff --git a/applications/Chat/coati/kernels/opt_attn.py b/applications/Chat/coati/kernels/opt_attn.py
deleted file mode 100755
index d1eb139187f3..000000000000
--- a/applications/Chat/coati/kernels/opt_attn.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from typing import Optional, Tuple
-
-import torch
-import xformers.ops as xops
-from torch import Tensor
-from transformers.models.opt.modeling_opt import OPTAttention
-
-
-# This is modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py
-class XOPTAttention(OPTAttention):
-    # def _shape(self, tensor: Tensor, seq_len: int, bsz: int):
-    #     return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).contiguous()
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        key_value_states: Optional[Tensor] = None,
-        past_key_value: Optional[Tensor] = None,
-        attention_mask: Optional[Tensor] = None,
-        layer_head_mask: Optional[Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
-        if not self.training:
-            return super().forward(
-                hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, output_attentions
-            )
-        """Input shape: Batch x Time x Channel"""
-        assert layer_head_mask is None, "Xformers attention does not support layer_head_mask"
-        assert not output_attentions, "Xformers attention does not support output_attentions"
-
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-
-        bsz, tgt_len, _ = hidden_states.size()
-
-        # get query proj
-        query_states = self.q_proj(hidden_states)
-        # get key, value proj
-        if is_cross_attention and past_key_value is not None:
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0]
-            value_states = past_key_value[1]
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states, value_states)
-
-        query_states = self._shape(query_states, tgt_len, bsz).transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        attn_output = xops.memory_efficient_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_bias=xops.LowerTriangularMask(),
-            p=self.dropout if self.training else 0.0,
-            scale=self.scaling,
-        )
-
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
-
-        attn_output = self.out_proj(attn_output)
-
-        attn_weights_reshaped = None
-
-        return attn_output, attn_weights_reshaped, past_key_value
diff --git a/applications/Chat/coati/kernels/wrapper.py b/applications/Chat/coati/kernels/wrapper.py
deleted file mode 100755
index c55bda600230..000000000000
--- a/applications/Chat/coati/kernels/wrapper.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import torch.nn as nn
-from transformers.models.opt.modeling_opt import OPTAttention
-
-from .opt_attn import XOPTAttention
-
-
-def convert_to_xformer_model(model: nn.Module) -> nn.Module:
-    for module in model.modules():
-        if isinstance(module, OPTAttention):
-            module.__class__ = XOPTAttention
-    return model
-
-
-def recover_from_xformer_model(model: nn.Module) -> nn.Module:
-    for module in model.modules():
-        if isinstance(module, XOPTAttention):
-            module.__class__ = OPTAttention
-    return model
diff --git a/applications/Chat/tests/test_data_preparation.sh b/applications/Chat/tests/test_data_preparation.sh
index 496a71fcce78..62ae1e500ec9 100755
--- a/applications/Chat/tests/test_data_preparation.sh
+++ b/applications/Chat/tests/test_data_preparation.sh
@@ -43,7 +43,8 @@ BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 BASE_TEMP_DIR=$BASE_DIR/temp
 EXAMPLES_DIR=$BASE_DIR/examples
 DATA_SAVE_PATH=$BASE_TEMP_DIR/rlhf_data
-MODELS=('gpt2' 'bloom' 'opt' 'llama')
+# MODELS=('gpt2' 'bloom' 'opt' 'llama')
+MODELS=('opt')
 
 if [ ! -d "$BASE_TEMP_DIR" ]; then
   mkdir "$BASE_TEMP_DIR"
@@ -134,7 +135,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 100 \
+            --num_samples_per_datafile 500 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then
@@ -177,7 +178,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 100 \
+            --num_samples_per_datafile 500 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then
@@ -220,7 +221,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 100 \
+            --num_samples_per_datafile 500 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then
@@ -263,7 +264,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 100 \
+            --num_samples_per_datafile 500 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then

From 427f161e4348538f436099ac2d67b0e9e0ee3f75 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Thu, 7 Dec 2023 11:24:05 +0800
Subject: [PATCH 11/23] rename to ColossalChat

---
 applications/{Chat => ColossalChat}/.gitignore                    | 0
 applications/{Chat => ColossalChat}/LICENSE                       | 0
 applications/{Chat => ColossalChat}/README.md                     | 0
 applications/{Chat => ColossalChat}/benchmarks/README.md          | 0
 .../benchmarks/benchmark_memory_consumption.txt                   | 0
 .../benchmarks/benchmark_performance_summarization.txt            | 0
 applications/{Chat => ColossalChat}/benchmarks/benchmark_ppo.py   | 0
 applications/{Chat => ColossalChat}/benchmarks/benchmark_ppo.sh   | 0
 applications/{Chat => ColossalChat}/benchmarks/ray/1mmt_dummy.py  | 0
 applications/{Chat => ColossalChat}/benchmarks/ray/mmmt_dummy.py  | 0
 applications/{Chat => ColossalChat}/coati/__init__.py             | 0
 applications/{Chat => ColossalChat}/coati/dataset/__init__.py     | 0
 applications/{Chat => ColossalChat}/coati/dataset/conversation.py | 0
 applications/{Chat => ColossalChat}/coati/dataset/loader.py       | 0
 .../{Chat => ColossalChat}/coati/dataset/tokenization_utils.py    | 0
 applications/{Chat => ColossalChat}/coati/dataset/utils.py        | 0
 .../{Chat => ColossalChat}/coati/experience_buffer/__init__.py    | 0
 .../{Chat => ColossalChat}/coati/experience_buffer/base.py        | 0
 .../{Chat => ColossalChat}/coati/experience_buffer/naive.py       | 0
 .../{Chat => ColossalChat}/coati/experience_buffer/utils.py       | 0
 .../{Chat => ColossalChat}/coati/experience_maker/__init__.py     | 0
 .../{Chat => ColossalChat}/coati/experience_maker/base.py         | 0
 .../{Chat => ColossalChat}/coati/experience_maker/naive.py        | 0
 applications/{Chat => ColossalChat}/coati/models/__init__.py      | 0
 applications/{Chat => ColossalChat}/coati/models/base.py          | 0
 applications/{Chat => ColossalChat}/coati/models/critic.py        | 0
 applications/{Chat => ColossalChat}/coati/models/generation.py    | 0
 applications/{Chat => ColossalChat}/coati/models/lora.py          | 0
 applications/{Chat => ColossalChat}/coati/models/loss.py          | 0
 applications/{Chat => ColossalChat}/coati/models/reward_model.py  | 0
 applications/{Chat => ColossalChat}/coati/models/utils.py         | 0
 applications/{Chat => ColossalChat}/coati/quant/__init__.py       | 0
 .../{Chat => ColossalChat}/coati/quant/llama_gptq/__init__.py     | 0
 .../{Chat => ColossalChat}/coati/quant/llama_gptq/loader.py       | 0
 .../{Chat => ColossalChat}/coati/quant/llama_gptq/model_utils.py  | 0
 .../{Chat => ColossalChat}/coati/quant/llama_gptq/quant.py        | 0
 applications/{Chat => ColossalChat}/coati/quant/utils.py          | 0
 applications/{Chat => ColossalChat}/coati/ray/README.md           | 0
 applications/{Chat => ColossalChat}/coati/ray/__init__.py         | 0
 .../{Chat => ColossalChat}/coati/ray/callbacks/__init__.py        | 0
 applications/{Chat => ColossalChat}/coati/ray/callbacks/base.py   | 0
 .../coati/ray/callbacks/performance_evaluator.py                  | 0
 .../{Chat => ColossalChat}/coati/ray/detached_replay_buffer.py    | 0
 .../{Chat => ColossalChat}/coati/ray/detached_trainer_base.py     | 0
 .../{Chat => ColossalChat}/coati/ray/detached_trainer_ppo.py      | 0
 .../{Chat => ColossalChat}/coati/ray/experience_maker_holder.py   | 0
 applications/{Chat => ColossalChat}/coati/ray/lora_constructor.py | 0
 applications/{Chat => ColossalChat}/coati/ray/utils.py            | 0
 applications/{Chat => ColossalChat}/coati/trainer/__init__.py     | 0
 applications/{Chat => ColossalChat}/coati/trainer/base.py         | 0
 .../{Chat => ColossalChat}/coati/trainer/callbacks/__init__.py    | 0
 .../{Chat => ColossalChat}/coati/trainer/callbacks/base.py        | 0
 .../coati/trainer/callbacks/performance_evaluator.py              | 0
 applications/{Chat => ColossalChat}/coati/trainer/dpo.py          | 0
 applications/{Chat => ColossalChat}/coati/trainer/ppo.py          | 0
 applications/{Chat => ColossalChat}/coati/trainer/rm.py           | 0
 applications/{Chat => ColossalChat}/coati/trainer/sft.py          | 0
 applications/{Chat => ColossalChat}/coati/trainer/utils.py        | 0
 applications/{Chat => ColossalChat}/coati/utils/__init__.py       | 0
 .../{Chat => ColossalChat}/coati/utils/accumulative_meter.py      | 0
 applications/{Chat => ColossalChat}/coati/utils/ckpt_io.py        | 0
 .../{Chat => ColossalChat}/coati/utils/flash_attention_patch.py   | 0
 applications/{Chat => ColossalChat}/examples/README.md            | 0
 applications/{Chat => ColossalChat}/examples/community/README.md  | 0
 .../{Chat => ColossalChat}/examples/community/peft/README.md      | 0
 .../examples/community/peft/easy_dataset.py                       | 0
 .../{Chat => ColossalChat}/examples/community/peft/easy_models.py | 0
 .../examples/community/peft/train_peft_prompts.py                 | 0
 .../examples/community/peft/train_peft_sft.py                     | 0
 .../{Chat => ColossalChat}/examples/community/ray/README.md       | 0
 .../examples/community/ray/ray_job_script.py                      | 0
 .../examples/community/ray/train_prompts_on_ray.py                | 0
 .../data_preparation_scripts/prepare_preference_dataset.py        | 0
 .../data_preparation_scripts/prepare_preference_dataset.sh        | 0
 .../examples/data_preparation_scripts/prepare_prompt_dataset.py   | 0
 .../examples/data_preparation_scripts/prepare_prompt_dataset.sh   | 0
 .../examples/data_preparation_scripts/prepare_ptx_dataset.py      | 0
 .../examples/data_preparation_scripts/prepare_ptx_dataset.sh      | 0
 .../examples/data_preparation_scripts/prepare_sft_dataset.py      | 0
 .../examples/data_preparation_scripts/prepare_sft_dataset.sh      | 0
 applications/{Chat => ColossalChat}/examples/inference/chatio.py  | 0
 .../{Chat => ColossalChat}/examples/inference/inference.py        | 0
 .../examples/inference/web_chatbot/README.md                      | 0
 .../examples/inference/web_chatbot/locustfile.py                  | 0
 .../examples/inference/web_chatbot/requirements.txt               | 0
 .../examples/inference/web_chatbot/server.py                      | 0
 .../examples/inference/web_chatbot/utils.py                       | 0
 applications/{Chat => ColossalChat}/examples/ray/1mmt_prompt.py   | 0
 applications/{Chat => ColossalChat}/examples/ray/mmmt_prompt.py   | 0
 applications/{Chat => ColossalChat}/examples/ray/requirements.txt | 0
 applications/{Chat => ColossalChat}/examples/ray/test_ci.sh       | 0
 applications/{Chat => ColossalChat}/examples/requirements.txt     | 0
 .../{Chat => ColossalChat}/examples/training_scripts/hostfile     | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_dpo.py | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_dpo.sh | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_ppo.py | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_ppo.sh | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_rm.py  | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_rm.sh  | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_sft.py | 0
 .../{Chat => ColossalChat}/examples/training_scripts/train_sft.sh | 0
 applications/{Chat => ColossalChat}/pytest.ini                    | 0
 applications/{Chat => ColossalChat}/requirements.txt              | 0
 applications/{Chat => ColossalChat}/setup.py                      | 0
 applications/{Chat => ColossalChat}/tests/__init__.py             | 0
 .../{Chat => ColossalChat}/tests/test_data_preparation.sh         | 0
 applications/{Chat => ColossalChat}/tests/test_lora.py            | 0
 applications/{Chat => ColossalChat}/tests/test_train.sh           | 0
 applications/{Chat => ColossalChat}/version.txt                   | 0
 109 files changed, 0 insertions(+), 0 deletions(-)
 rename applications/{Chat => ColossalChat}/.gitignore (100%)
 rename applications/{Chat => ColossalChat}/LICENSE (100%)
 rename applications/{Chat => ColossalChat}/README.md (100%)
 rename applications/{Chat => ColossalChat}/benchmarks/README.md (100%)
 rename applications/{Chat => ColossalChat}/benchmarks/benchmark_memory_consumption.txt (100%)
 rename applications/{Chat => ColossalChat}/benchmarks/benchmark_performance_summarization.txt (100%)
 rename applications/{Chat => ColossalChat}/benchmarks/benchmark_ppo.py (100%)
 rename applications/{Chat => ColossalChat}/benchmarks/benchmark_ppo.sh (100%)
 rename applications/{Chat => ColossalChat}/benchmarks/ray/1mmt_dummy.py (100%)
 rename applications/{Chat => ColossalChat}/benchmarks/ray/mmmt_dummy.py (100%)
 rename applications/{Chat => ColossalChat}/coati/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/dataset/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/dataset/conversation.py (100%)
 rename applications/{Chat => ColossalChat}/coati/dataset/loader.py (100%)
 rename applications/{Chat => ColossalChat}/coati/dataset/tokenization_utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/dataset/utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/experience_buffer/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/experience_buffer/base.py (100%)
 rename applications/{Chat => ColossalChat}/coati/experience_buffer/naive.py (100%)
 rename applications/{Chat => ColossalChat}/coati/experience_buffer/utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/experience_maker/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/experience_maker/base.py (100%)
 rename applications/{Chat => ColossalChat}/coati/experience_maker/naive.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/base.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/critic.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/generation.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/lora.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/loss.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/reward_model.py (100%)
 rename applications/{Chat => ColossalChat}/coati/models/utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/quant/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/quant/llama_gptq/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/quant/llama_gptq/loader.py (100%)
 rename applications/{Chat => ColossalChat}/coati/quant/llama_gptq/model_utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/quant/llama_gptq/quant.py (100%)
 rename applications/{Chat => ColossalChat}/coati/quant/utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/README.md (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/callbacks/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/callbacks/base.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/callbacks/performance_evaluator.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/detached_replay_buffer.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/detached_trainer_base.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/detached_trainer_ppo.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/experience_maker_holder.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/lora_constructor.py (100%)
 rename applications/{Chat => ColossalChat}/coati/ray/utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/base.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/callbacks/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/callbacks/base.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/callbacks/performance_evaluator.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/dpo.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/ppo.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/rm.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/sft.py (100%)
 rename applications/{Chat => ColossalChat}/coati/trainer/utils.py (100%)
 rename applications/{Chat => ColossalChat}/coati/utils/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/coati/utils/accumulative_meter.py (100%)
 rename applications/{Chat => ColossalChat}/coati/utils/ckpt_io.py (100%)
 rename applications/{Chat => ColossalChat}/coati/utils/flash_attention_patch.py (100%)
 rename applications/{Chat => ColossalChat}/examples/README.md (100%)
 rename applications/{Chat => ColossalChat}/examples/community/README.md (100%)
 rename applications/{Chat => ColossalChat}/examples/community/peft/README.md (100%)
 rename applications/{Chat => ColossalChat}/examples/community/peft/easy_dataset.py (100%)
 rename applications/{Chat => ColossalChat}/examples/community/peft/easy_models.py (100%)
 rename applications/{Chat => ColossalChat}/examples/community/peft/train_peft_prompts.py (100%)
 rename applications/{Chat => ColossalChat}/examples/community/peft/train_peft_sft.py (100%)
 rename applications/{Chat => ColossalChat}/examples/community/ray/README.md (100%)
 rename applications/{Chat => ColossalChat}/examples/community/ray/ray_job_script.py (100%)
 rename applications/{Chat => ColossalChat}/examples/community/ray/train_prompts_on_ray.py (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_preference_dataset.py (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_preference_dataset.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_prompt_dataset.py (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_prompt_dataset.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_ptx_dataset.py (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_ptx_dataset.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_sft_dataset.py (100%)
 rename applications/{Chat => ColossalChat}/examples/data_preparation_scripts/prepare_sft_dataset.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/inference/chatio.py (100%)
 rename applications/{Chat => ColossalChat}/examples/inference/inference.py (100%)
 rename applications/{Chat => ColossalChat}/examples/inference/web_chatbot/README.md (100%)
 rename applications/{Chat => ColossalChat}/examples/inference/web_chatbot/locustfile.py (100%)
 rename applications/{Chat => ColossalChat}/examples/inference/web_chatbot/requirements.txt (100%)
 rename applications/{Chat => ColossalChat}/examples/inference/web_chatbot/server.py (100%)
 rename applications/{Chat => ColossalChat}/examples/inference/web_chatbot/utils.py (100%)
 rename applications/{Chat => ColossalChat}/examples/ray/1mmt_prompt.py (100%)
 rename applications/{Chat => ColossalChat}/examples/ray/mmmt_prompt.py (100%)
 rename applications/{Chat => ColossalChat}/examples/ray/requirements.txt (100%)
 rename applications/{Chat => ColossalChat}/examples/ray/test_ci.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/requirements.txt (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/hostfile (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_dpo.py (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_dpo.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_ppo.py (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_ppo.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_rm.py (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_rm.sh (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_sft.py (100%)
 rename applications/{Chat => ColossalChat}/examples/training_scripts/train_sft.sh (100%)
 rename applications/{Chat => ColossalChat}/pytest.ini (100%)
 rename applications/{Chat => ColossalChat}/requirements.txt (100%)
 rename applications/{Chat => ColossalChat}/setup.py (100%)
 rename applications/{Chat => ColossalChat}/tests/__init__.py (100%)
 rename applications/{Chat => ColossalChat}/tests/test_data_preparation.sh (100%)
 rename applications/{Chat => ColossalChat}/tests/test_lora.py (100%)
 rename applications/{Chat => ColossalChat}/tests/test_train.sh (100%)
 rename applications/{Chat => ColossalChat}/version.txt (100%)

diff --git a/applications/Chat/.gitignore b/applications/ColossalChat/.gitignore
similarity index 100%
rename from applications/Chat/.gitignore
rename to applications/ColossalChat/.gitignore
diff --git a/applications/Chat/LICENSE b/applications/ColossalChat/LICENSE
similarity index 100%
rename from applications/Chat/LICENSE
rename to applications/ColossalChat/LICENSE
diff --git a/applications/Chat/README.md b/applications/ColossalChat/README.md
similarity index 100%
rename from applications/Chat/README.md
rename to applications/ColossalChat/README.md
diff --git a/applications/Chat/benchmarks/README.md b/applications/ColossalChat/benchmarks/README.md
similarity index 100%
rename from applications/Chat/benchmarks/README.md
rename to applications/ColossalChat/benchmarks/README.md
diff --git a/applications/Chat/benchmarks/benchmark_memory_consumption.txt b/applications/ColossalChat/benchmarks/benchmark_memory_consumption.txt
similarity index 100%
rename from applications/Chat/benchmarks/benchmark_memory_consumption.txt
rename to applications/ColossalChat/benchmarks/benchmark_memory_consumption.txt
diff --git a/applications/Chat/benchmarks/benchmark_performance_summarization.txt b/applications/ColossalChat/benchmarks/benchmark_performance_summarization.txt
similarity index 100%
rename from applications/Chat/benchmarks/benchmark_performance_summarization.txt
rename to applications/ColossalChat/benchmarks/benchmark_performance_summarization.txt
diff --git a/applications/Chat/benchmarks/benchmark_ppo.py b/applications/ColossalChat/benchmarks/benchmark_ppo.py
similarity index 100%
rename from applications/Chat/benchmarks/benchmark_ppo.py
rename to applications/ColossalChat/benchmarks/benchmark_ppo.py
diff --git a/applications/Chat/benchmarks/benchmark_ppo.sh b/applications/ColossalChat/benchmarks/benchmark_ppo.sh
similarity index 100%
rename from applications/Chat/benchmarks/benchmark_ppo.sh
rename to applications/ColossalChat/benchmarks/benchmark_ppo.sh
diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/ColossalChat/benchmarks/ray/1mmt_dummy.py
similarity index 100%
rename from applications/Chat/benchmarks/ray/1mmt_dummy.py
rename to applications/ColossalChat/benchmarks/ray/1mmt_dummy.py
diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/ColossalChat/benchmarks/ray/mmmt_dummy.py
similarity index 100%
rename from applications/Chat/benchmarks/ray/mmmt_dummy.py
rename to applications/ColossalChat/benchmarks/ray/mmmt_dummy.py
diff --git a/applications/Chat/coati/__init__.py b/applications/ColossalChat/coati/__init__.py
similarity index 100%
rename from applications/Chat/coati/__init__.py
rename to applications/ColossalChat/coati/__init__.py
diff --git a/applications/Chat/coati/dataset/__init__.py b/applications/ColossalChat/coati/dataset/__init__.py
similarity index 100%
rename from applications/Chat/coati/dataset/__init__.py
rename to applications/ColossalChat/coati/dataset/__init__.py
diff --git a/applications/Chat/coati/dataset/conversation.py b/applications/ColossalChat/coati/dataset/conversation.py
similarity index 100%
rename from applications/Chat/coati/dataset/conversation.py
rename to applications/ColossalChat/coati/dataset/conversation.py
diff --git a/applications/Chat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py
similarity index 100%
rename from applications/Chat/coati/dataset/loader.py
rename to applications/ColossalChat/coati/dataset/loader.py
diff --git a/applications/Chat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
similarity index 100%
rename from applications/Chat/coati/dataset/tokenization_utils.py
rename to applications/ColossalChat/coati/dataset/tokenization_utils.py
diff --git a/applications/Chat/coati/dataset/utils.py b/applications/ColossalChat/coati/dataset/utils.py
similarity index 100%
rename from applications/Chat/coati/dataset/utils.py
rename to applications/ColossalChat/coati/dataset/utils.py
diff --git a/applications/Chat/coati/experience_buffer/__init__.py b/applications/ColossalChat/coati/experience_buffer/__init__.py
similarity index 100%
rename from applications/Chat/coati/experience_buffer/__init__.py
rename to applications/ColossalChat/coati/experience_buffer/__init__.py
diff --git a/applications/Chat/coati/experience_buffer/base.py b/applications/ColossalChat/coati/experience_buffer/base.py
similarity index 100%
rename from applications/Chat/coati/experience_buffer/base.py
rename to applications/ColossalChat/coati/experience_buffer/base.py
diff --git a/applications/Chat/coati/experience_buffer/naive.py b/applications/ColossalChat/coati/experience_buffer/naive.py
similarity index 100%
rename from applications/Chat/coati/experience_buffer/naive.py
rename to applications/ColossalChat/coati/experience_buffer/naive.py
diff --git a/applications/Chat/coati/experience_buffer/utils.py b/applications/ColossalChat/coati/experience_buffer/utils.py
similarity index 100%
rename from applications/Chat/coati/experience_buffer/utils.py
rename to applications/ColossalChat/coati/experience_buffer/utils.py
diff --git a/applications/Chat/coati/experience_maker/__init__.py b/applications/ColossalChat/coati/experience_maker/__init__.py
similarity index 100%
rename from applications/Chat/coati/experience_maker/__init__.py
rename to applications/ColossalChat/coati/experience_maker/__init__.py
diff --git a/applications/Chat/coati/experience_maker/base.py b/applications/ColossalChat/coati/experience_maker/base.py
similarity index 100%
rename from applications/Chat/coati/experience_maker/base.py
rename to applications/ColossalChat/coati/experience_maker/base.py
diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/ColossalChat/coati/experience_maker/naive.py
similarity index 100%
rename from applications/Chat/coati/experience_maker/naive.py
rename to applications/ColossalChat/coati/experience_maker/naive.py
diff --git a/applications/Chat/coati/models/__init__.py b/applications/ColossalChat/coati/models/__init__.py
similarity index 100%
rename from applications/Chat/coati/models/__init__.py
rename to applications/ColossalChat/coati/models/__init__.py
diff --git a/applications/Chat/coati/models/base.py b/applications/ColossalChat/coati/models/base.py
similarity index 100%
rename from applications/Chat/coati/models/base.py
rename to applications/ColossalChat/coati/models/base.py
diff --git a/applications/Chat/coati/models/critic.py b/applications/ColossalChat/coati/models/critic.py
similarity index 100%
rename from applications/Chat/coati/models/critic.py
rename to applications/ColossalChat/coati/models/critic.py
diff --git a/applications/Chat/coati/models/generation.py b/applications/ColossalChat/coati/models/generation.py
similarity index 100%
rename from applications/Chat/coati/models/generation.py
rename to applications/ColossalChat/coati/models/generation.py
diff --git a/applications/Chat/coati/models/lora.py b/applications/ColossalChat/coati/models/lora.py
similarity index 100%
rename from applications/Chat/coati/models/lora.py
rename to applications/ColossalChat/coati/models/lora.py
diff --git a/applications/Chat/coati/models/loss.py b/applications/ColossalChat/coati/models/loss.py
similarity index 100%
rename from applications/Chat/coati/models/loss.py
rename to applications/ColossalChat/coati/models/loss.py
diff --git a/applications/Chat/coati/models/reward_model.py b/applications/ColossalChat/coati/models/reward_model.py
similarity index 100%
rename from applications/Chat/coati/models/reward_model.py
rename to applications/ColossalChat/coati/models/reward_model.py
diff --git a/applications/Chat/coati/models/utils.py b/applications/ColossalChat/coati/models/utils.py
similarity index 100%
rename from applications/Chat/coati/models/utils.py
rename to applications/ColossalChat/coati/models/utils.py
diff --git a/applications/Chat/coati/quant/__init__.py b/applications/ColossalChat/coati/quant/__init__.py
similarity index 100%
rename from applications/Chat/coati/quant/__init__.py
rename to applications/ColossalChat/coati/quant/__init__.py
diff --git a/applications/Chat/coati/quant/llama_gptq/__init__.py b/applications/ColossalChat/coati/quant/llama_gptq/__init__.py
similarity index 100%
rename from applications/Chat/coati/quant/llama_gptq/__init__.py
rename to applications/ColossalChat/coati/quant/llama_gptq/__init__.py
diff --git a/applications/Chat/coati/quant/llama_gptq/loader.py b/applications/ColossalChat/coati/quant/llama_gptq/loader.py
similarity index 100%
rename from applications/Chat/coati/quant/llama_gptq/loader.py
rename to applications/ColossalChat/coati/quant/llama_gptq/loader.py
diff --git a/applications/Chat/coati/quant/llama_gptq/model_utils.py b/applications/ColossalChat/coati/quant/llama_gptq/model_utils.py
similarity index 100%
rename from applications/Chat/coati/quant/llama_gptq/model_utils.py
rename to applications/ColossalChat/coati/quant/llama_gptq/model_utils.py
diff --git a/applications/Chat/coati/quant/llama_gptq/quant.py b/applications/ColossalChat/coati/quant/llama_gptq/quant.py
similarity index 100%
rename from applications/Chat/coati/quant/llama_gptq/quant.py
rename to applications/ColossalChat/coati/quant/llama_gptq/quant.py
diff --git a/applications/Chat/coati/quant/utils.py b/applications/ColossalChat/coati/quant/utils.py
similarity index 100%
rename from applications/Chat/coati/quant/utils.py
rename to applications/ColossalChat/coati/quant/utils.py
diff --git a/applications/Chat/coati/ray/README.md b/applications/ColossalChat/coati/ray/README.md
similarity index 100%
rename from applications/Chat/coati/ray/README.md
rename to applications/ColossalChat/coati/ray/README.md
diff --git a/applications/Chat/coati/ray/__init__.py b/applications/ColossalChat/coati/ray/__init__.py
similarity index 100%
rename from applications/Chat/coati/ray/__init__.py
rename to applications/ColossalChat/coati/ray/__init__.py
diff --git a/applications/Chat/coati/ray/callbacks/__init__.py b/applications/ColossalChat/coati/ray/callbacks/__init__.py
similarity index 100%
rename from applications/Chat/coati/ray/callbacks/__init__.py
rename to applications/ColossalChat/coati/ray/callbacks/__init__.py
diff --git a/applications/Chat/coati/ray/callbacks/base.py b/applications/ColossalChat/coati/ray/callbacks/base.py
similarity index 100%
rename from applications/Chat/coati/ray/callbacks/base.py
rename to applications/ColossalChat/coati/ray/callbacks/base.py
diff --git a/applications/Chat/coati/ray/callbacks/performance_evaluator.py b/applications/ColossalChat/coati/ray/callbacks/performance_evaluator.py
similarity index 100%
rename from applications/Chat/coati/ray/callbacks/performance_evaluator.py
rename to applications/ColossalChat/coati/ray/callbacks/performance_evaluator.py
diff --git a/applications/Chat/coati/ray/detached_replay_buffer.py b/applications/ColossalChat/coati/ray/detached_replay_buffer.py
similarity index 100%
rename from applications/Chat/coati/ray/detached_replay_buffer.py
rename to applications/ColossalChat/coati/ray/detached_replay_buffer.py
diff --git a/applications/Chat/coati/ray/detached_trainer_base.py b/applications/ColossalChat/coati/ray/detached_trainer_base.py
similarity index 100%
rename from applications/Chat/coati/ray/detached_trainer_base.py
rename to applications/ColossalChat/coati/ray/detached_trainer_base.py
diff --git a/applications/Chat/coati/ray/detached_trainer_ppo.py b/applications/ColossalChat/coati/ray/detached_trainer_ppo.py
similarity index 100%
rename from applications/Chat/coati/ray/detached_trainer_ppo.py
rename to applications/ColossalChat/coati/ray/detached_trainer_ppo.py
diff --git a/applications/Chat/coati/ray/experience_maker_holder.py b/applications/ColossalChat/coati/ray/experience_maker_holder.py
similarity index 100%
rename from applications/Chat/coati/ray/experience_maker_holder.py
rename to applications/ColossalChat/coati/ray/experience_maker_holder.py
diff --git a/applications/Chat/coati/ray/lora_constructor.py b/applications/ColossalChat/coati/ray/lora_constructor.py
similarity index 100%
rename from applications/Chat/coati/ray/lora_constructor.py
rename to applications/ColossalChat/coati/ray/lora_constructor.py
diff --git a/applications/Chat/coati/ray/utils.py b/applications/ColossalChat/coati/ray/utils.py
similarity index 100%
rename from applications/Chat/coati/ray/utils.py
rename to applications/ColossalChat/coati/ray/utils.py
diff --git a/applications/Chat/coati/trainer/__init__.py b/applications/ColossalChat/coati/trainer/__init__.py
similarity index 100%
rename from applications/Chat/coati/trainer/__init__.py
rename to applications/ColossalChat/coati/trainer/__init__.py
diff --git a/applications/Chat/coati/trainer/base.py b/applications/ColossalChat/coati/trainer/base.py
similarity index 100%
rename from applications/Chat/coati/trainer/base.py
rename to applications/ColossalChat/coati/trainer/base.py
diff --git a/applications/Chat/coati/trainer/callbacks/__init__.py b/applications/ColossalChat/coati/trainer/callbacks/__init__.py
similarity index 100%
rename from applications/Chat/coati/trainer/callbacks/__init__.py
rename to applications/ColossalChat/coati/trainer/callbacks/__init__.py
diff --git a/applications/Chat/coati/trainer/callbacks/base.py b/applications/ColossalChat/coati/trainer/callbacks/base.py
similarity index 100%
rename from applications/Chat/coati/trainer/callbacks/base.py
rename to applications/ColossalChat/coati/trainer/callbacks/base.py
diff --git a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py b/applications/ColossalChat/coati/trainer/callbacks/performance_evaluator.py
similarity index 100%
rename from applications/Chat/coati/trainer/callbacks/performance_evaluator.py
rename to applications/ColossalChat/coati/trainer/callbacks/performance_evaluator.py
diff --git a/applications/Chat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py
similarity index 100%
rename from applications/Chat/coati/trainer/dpo.py
rename to applications/ColossalChat/coati/trainer/dpo.py
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/ColossalChat/coati/trainer/ppo.py
similarity index 100%
rename from applications/Chat/coati/trainer/ppo.py
rename to applications/ColossalChat/coati/trainer/ppo.py
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py
similarity index 100%
rename from applications/Chat/coati/trainer/rm.py
rename to applications/ColossalChat/coati/trainer/rm.py
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py
similarity index 100%
rename from applications/Chat/coati/trainer/sft.py
rename to applications/ColossalChat/coati/trainer/sft.py
diff --git a/applications/Chat/coati/trainer/utils.py b/applications/ColossalChat/coati/trainer/utils.py
similarity index 100%
rename from applications/Chat/coati/trainer/utils.py
rename to applications/ColossalChat/coati/trainer/utils.py
diff --git a/applications/Chat/coati/utils/__init__.py b/applications/ColossalChat/coati/utils/__init__.py
similarity index 100%
rename from applications/Chat/coati/utils/__init__.py
rename to applications/ColossalChat/coati/utils/__init__.py
diff --git a/applications/Chat/coati/utils/accumulative_meter.py b/applications/ColossalChat/coati/utils/accumulative_meter.py
similarity index 100%
rename from applications/Chat/coati/utils/accumulative_meter.py
rename to applications/ColossalChat/coati/utils/accumulative_meter.py
diff --git a/applications/Chat/coati/utils/ckpt_io.py b/applications/ColossalChat/coati/utils/ckpt_io.py
similarity index 100%
rename from applications/Chat/coati/utils/ckpt_io.py
rename to applications/ColossalChat/coati/utils/ckpt_io.py
diff --git a/applications/Chat/coati/utils/flash_attention_patch.py b/applications/ColossalChat/coati/utils/flash_attention_patch.py
similarity index 100%
rename from applications/Chat/coati/utils/flash_attention_patch.py
rename to applications/ColossalChat/coati/utils/flash_attention_patch.py
diff --git a/applications/Chat/examples/README.md b/applications/ColossalChat/examples/README.md
similarity index 100%
rename from applications/Chat/examples/README.md
rename to applications/ColossalChat/examples/README.md
diff --git a/applications/Chat/examples/community/README.md b/applications/ColossalChat/examples/community/README.md
similarity index 100%
rename from applications/Chat/examples/community/README.md
rename to applications/ColossalChat/examples/community/README.md
diff --git a/applications/Chat/examples/community/peft/README.md b/applications/ColossalChat/examples/community/peft/README.md
similarity index 100%
rename from applications/Chat/examples/community/peft/README.md
rename to applications/ColossalChat/examples/community/peft/README.md
diff --git a/applications/Chat/examples/community/peft/easy_dataset.py b/applications/ColossalChat/examples/community/peft/easy_dataset.py
similarity index 100%
rename from applications/Chat/examples/community/peft/easy_dataset.py
rename to applications/ColossalChat/examples/community/peft/easy_dataset.py
diff --git a/applications/Chat/examples/community/peft/easy_models.py b/applications/ColossalChat/examples/community/peft/easy_models.py
similarity index 100%
rename from applications/Chat/examples/community/peft/easy_models.py
rename to applications/ColossalChat/examples/community/peft/easy_models.py
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/ColossalChat/examples/community/peft/train_peft_prompts.py
similarity index 100%
rename from applications/Chat/examples/community/peft/train_peft_prompts.py
rename to applications/ColossalChat/examples/community/peft/train_peft_prompts.py
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/ColossalChat/examples/community/peft/train_peft_sft.py
similarity index 100%
rename from applications/Chat/examples/community/peft/train_peft_sft.py
rename to applications/ColossalChat/examples/community/peft/train_peft_sft.py
diff --git a/applications/Chat/examples/community/ray/README.md b/applications/ColossalChat/examples/community/ray/README.md
similarity index 100%
rename from applications/Chat/examples/community/ray/README.md
rename to applications/ColossalChat/examples/community/ray/README.md
diff --git a/applications/Chat/examples/community/ray/ray_job_script.py b/applications/ColossalChat/examples/community/ray/ray_job_script.py
similarity index 100%
rename from applications/Chat/examples/community/ray/ray_job_script.py
rename to applications/ColossalChat/examples/community/ray/ray_job_script.py
diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py
similarity index 100%
rename from applications/Chat/examples/community/ray/train_prompts_on_ray.py
rename to applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.py
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.py
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.py
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_preference_dataset.sh
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.py
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.py
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.py
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_ptx_dataset.py
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.py
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_ptx_dataset.py
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_ptx_dataset.sh
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.py
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.py
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.py
diff --git a/applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
similarity index 100%
rename from applications/Chat/examples/data_preparation_scripts/prepare_sft_dataset.sh
rename to applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
diff --git a/applications/Chat/examples/inference/chatio.py b/applications/ColossalChat/examples/inference/chatio.py
similarity index 100%
rename from applications/Chat/examples/inference/chatio.py
rename to applications/ColossalChat/examples/inference/chatio.py
diff --git a/applications/Chat/examples/inference/inference.py b/applications/ColossalChat/examples/inference/inference.py
similarity index 100%
rename from applications/Chat/examples/inference/inference.py
rename to applications/ColossalChat/examples/inference/inference.py
diff --git a/applications/Chat/examples/inference/web_chatbot/README.md b/applications/ColossalChat/examples/inference/web_chatbot/README.md
similarity index 100%
rename from applications/Chat/examples/inference/web_chatbot/README.md
rename to applications/ColossalChat/examples/inference/web_chatbot/README.md
diff --git a/applications/Chat/examples/inference/web_chatbot/locustfile.py b/applications/ColossalChat/examples/inference/web_chatbot/locustfile.py
similarity index 100%
rename from applications/Chat/examples/inference/web_chatbot/locustfile.py
rename to applications/ColossalChat/examples/inference/web_chatbot/locustfile.py
diff --git a/applications/Chat/examples/inference/web_chatbot/requirements.txt b/applications/ColossalChat/examples/inference/web_chatbot/requirements.txt
similarity index 100%
rename from applications/Chat/examples/inference/web_chatbot/requirements.txt
rename to applications/ColossalChat/examples/inference/web_chatbot/requirements.txt
diff --git a/applications/Chat/examples/inference/web_chatbot/server.py b/applications/ColossalChat/examples/inference/web_chatbot/server.py
similarity index 100%
rename from applications/Chat/examples/inference/web_chatbot/server.py
rename to applications/ColossalChat/examples/inference/web_chatbot/server.py
diff --git a/applications/Chat/examples/inference/web_chatbot/utils.py b/applications/ColossalChat/examples/inference/web_chatbot/utils.py
similarity index 100%
rename from applications/Chat/examples/inference/web_chatbot/utils.py
rename to applications/ColossalChat/examples/inference/web_chatbot/utils.py
diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/ColossalChat/examples/ray/1mmt_prompt.py
similarity index 100%
rename from applications/Chat/examples/ray/1mmt_prompt.py
rename to applications/ColossalChat/examples/ray/1mmt_prompt.py
diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/ColossalChat/examples/ray/mmmt_prompt.py
similarity index 100%
rename from applications/Chat/examples/ray/mmmt_prompt.py
rename to applications/ColossalChat/examples/ray/mmmt_prompt.py
diff --git a/applications/Chat/examples/ray/requirements.txt b/applications/ColossalChat/examples/ray/requirements.txt
similarity index 100%
rename from applications/Chat/examples/ray/requirements.txt
rename to applications/ColossalChat/examples/ray/requirements.txt
diff --git a/applications/Chat/examples/ray/test_ci.sh b/applications/ColossalChat/examples/ray/test_ci.sh
similarity index 100%
rename from applications/Chat/examples/ray/test_ci.sh
rename to applications/ColossalChat/examples/ray/test_ci.sh
diff --git a/applications/Chat/examples/requirements.txt b/applications/ColossalChat/examples/requirements.txt
similarity index 100%
rename from applications/Chat/examples/requirements.txt
rename to applications/ColossalChat/examples/requirements.txt
diff --git a/applications/Chat/examples/training_scripts/hostfile b/applications/ColossalChat/examples/training_scripts/hostfile
similarity index 100%
rename from applications/Chat/examples/training_scripts/hostfile
rename to applications/ColossalChat/examples/training_scripts/hostfile
diff --git a/applications/Chat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_dpo.py
rename to applications/ColossalChat/examples/training_scripts/train_dpo.py
diff --git a/applications/Chat/examples/training_scripts/train_dpo.sh b/applications/ColossalChat/examples/training_scripts/train_dpo.sh
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_dpo.sh
rename to applications/ColossalChat/examples/training_scripts/train_dpo.sh
diff --git a/applications/Chat/examples/training_scripts/train_ppo.py b/applications/ColossalChat/examples/training_scripts/train_ppo.py
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_ppo.py
rename to applications/ColossalChat/examples/training_scripts/train_ppo.py
diff --git a/applications/Chat/examples/training_scripts/train_ppo.sh b/applications/ColossalChat/examples/training_scripts/train_ppo.sh
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_ppo.sh
rename to applications/ColossalChat/examples/training_scripts/train_ppo.sh
diff --git a/applications/Chat/examples/training_scripts/train_rm.py b/applications/ColossalChat/examples/training_scripts/train_rm.py
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_rm.py
rename to applications/ColossalChat/examples/training_scripts/train_rm.py
diff --git a/applications/Chat/examples/training_scripts/train_rm.sh b/applications/ColossalChat/examples/training_scripts/train_rm.sh
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_rm.sh
rename to applications/ColossalChat/examples/training_scripts/train_rm.sh
diff --git a/applications/Chat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_sft.py
rename to applications/ColossalChat/examples/training_scripts/train_sft.py
diff --git a/applications/Chat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh
similarity index 100%
rename from applications/Chat/examples/training_scripts/train_sft.sh
rename to applications/ColossalChat/examples/training_scripts/train_sft.sh
diff --git a/applications/Chat/pytest.ini b/applications/ColossalChat/pytest.ini
similarity index 100%
rename from applications/Chat/pytest.ini
rename to applications/ColossalChat/pytest.ini
diff --git a/applications/Chat/requirements.txt b/applications/ColossalChat/requirements.txt
similarity index 100%
rename from applications/Chat/requirements.txt
rename to applications/ColossalChat/requirements.txt
diff --git a/applications/Chat/setup.py b/applications/ColossalChat/setup.py
similarity index 100%
rename from applications/Chat/setup.py
rename to applications/ColossalChat/setup.py
diff --git a/applications/Chat/tests/__init__.py b/applications/ColossalChat/tests/__init__.py
similarity index 100%
rename from applications/Chat/tests/__init__.py
rename to applications/ColossalChat/tests/__init__.py
diff --git a/applications/Chat/tests/test_data_preparation.sh b/applications/ColossalChat/tests/test_data_preparation.sh
similarity index 100%
rename from applications/Chat/tests/test_data_preparation.sh
rename to applications/ColossalChat/tests/test_data_preparation.sh
diff --git a/applications/Chat/tests/test_lora.py b/applications/ColossalChat/tests/test_lora.py
similarity index 100%
rename from applications/Chat/tests/test_lora.py
rename to applications/ColossalChat/tests/test_lora.py
diff --git a/applications/Chat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
similarity index 100%
rename from applications/Chat/tests/test_train.sh
rename to applications/ColossalChat/tests/test_train.sh
diff --git a/applications/Chat/version.txt b/applications/ColossalChat/version.txt
similarity index 100%
rename from applications/Chat/version.txt
rename to applications/ColossalChat/version.txt

From b0ac968ec3d9805c2d10e80bc135ee231067a710 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Fri, 8 Dec 2023 15:18:27 +0800
Subject: [PATCH 12/23] use new image

---
 .github/workflows/build_on_pr.yml               | 4 ++--
 .github/workflows/build_on_schedule.yml         | 2 +-
 .github/workflows/doc_test_on_pr.yml            | 2 +-
 .github/workflows/doc_test_on_schedule.yml      | 2 +-
 .github/workflows/example_check_on_dispatch.yml | 2 +-
 .github/workflows/example_check_on_pr.yml       | 2 +-
 .github/workflows/example_check_on_schedule.yml | 2 +-
 .github/workflows/run_chatgpt_examples.yml      | 2 +-
 .github/workflows/run_chatgpt_unit_tests.yml    | 2 +-
 .github/workflows/run_colossalqa_unit_tests.yml | 4 ++--
 10 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index e2114d43bcd0..dbbcd8505608 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -30,7 +30,7 @@ jobs:
       github.event.repository.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --rm
     timeout-minutes: 5
     defaults:
@@ -54,7 +54,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --rm
     timeout-minutes: 5
     defaults:
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 6c77377be34f..bfe88091f090 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
     if: github.repository == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, 8-gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 40
     steps:
diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml
index f1e7a2d0cab0..0cacad8f596d 100644
--- a/.github/workflows/doc_test_on_pr.yml
+++ b/.github/workflows/doc_test_on_pr.yml
@@ -56,7 +56,7 @@ jobs:
     needs: detect-changed-doc
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --gpus all --rm
     timeout-minutes: 20
     defaults:
diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml
index 027fbfd0aaeb..99cd880669f4 100644
--- a/.github/workflows/doc_test_on_schedule.yml
+++ b/.github/workflows/doc_test_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
     name: Test the changed Doc
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --gpus all --rm
     timeout-minutes: 60
     steps:
diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml
index 9d3bd9a48235..4f4be842d882 100644
--- a/.github/workflows/example_check_on_dispatch.yml
+++ b/.github/workflows/example_check_on_dispatch.yml
@@ -45,7 +45,7 @@ jobs:
       fail-fast: false
       matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
     timeout-minutes: 10
     steps:
diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
index 859b6e4fb556..96c4a292b8fa 100644
--- a/.github/workflows/example_check_on_pr.yml
+++ b/.github/workflows/example_check_on_pr.yml
@@ -77,7 +77,7 @@ jobs:
       fail-fast: false
       matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
     timeout-minutes: 20
     concurrency:
diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml
index 5ed128c3ebc5..8fbfbe7aad7d 100644
--- a/.github/workflows/example_check_on_schedule.yml
+++ b/.github/workflows/example_check_on_schedule.yml
@@ -34,7 +34,7 @@ jobs:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
     timeout-minutes: 10
     steps:
       - name: 📚 Checkout
diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 4fff245f5e60..8b42bb6622a6 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -18,7 +18,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --gpus all --rm -v /data/scratch/colossal-llama2/colossal_chat_test_data:/data/scratch/colossal-llama2/colossal_chat_test_data --shm-size=10.24gb
     timeout-minutes: 30
     defaults:
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index a47e03987cf7..1a2995d8d573 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -20,7 +20,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt
     timeout-minutes: 30
     defaults:
diff --git a/.github/workflows/run_colossalqa_unit_tests.yml b/.github/workflows/run_colossalqa_unit_tests.yml
index 763db277289f..4407534cae29 100644
--- a/.github/workflows/run_colossalqa_unit_tests.yml
+++ b/.github/workflows/run_colossalqa_unit_tests.yml
@@ -19,7 +19,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
       volumes:
         - /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
         - /data/scratch/llama-tiny:/data/scratch/llama-tiny
@@ -51,4 +51,4 @@ jobs:
           TEST_DATA_PATH_EN: /data/scratch/test_data_colossalqa/companies.txt
           TEST_DATA_PATH_ZH: /data/scratch/test_data_colossalqa/companies_zh.txt
           TEST_DOCUMENT_LOADER_DATA_PATH: /data/scratch/test_data_colossalqa/tests/*
-          SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path
\ No newline at end of file
+          SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path

From 853642cc3df233e2ca02e876bd03454dbe6dd2af Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Mon, 11 Dec 2023 09:18:37 +0800
Subject: [PATCH 13/23] fix ci workflow

---
 .github/workflows/run_chatgpt_examples.yml   | 4 ++--
 .github/workflows/run_chatgpt_unit_tests.yml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 8b42bb6622a6..2eef82810a9b 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -30,7 +30,7 @@ jobs:
 
       - name: Install ChatGPT
         run: |
-          cd applications/Chat
+          cd applications/ColossalChat
           pip install -v .
           pip install -r examples/requirements.txt
 
@@ -40,7 +40,7 @@ jobs:
 
       - name: Execute Examples
         run: |
-          cd applications/Chat
+          cd applications/ColossalChat
           rm -rf ~/.cache/colossalai
           ./tests/test_data_preparation.sh
           ./tests/test_train.sh
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index 1a2995d8d573..c54940ce859a 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -32,13 +32,13 @@ jobs:
 
       - name: Install ChatGPT
         run: |
-          cd applications/Chat
+          cd applications/ChColossalChatat
           pip install -v .
           pip install -r requirements-test.txt
 
       - name: Execute Unit Testing
         run: |
-          cd applications/Chat
+          cd applications/ColossalChat
           rm -rf ~/.cache/colossalai
           # pytest tests/
           # Disabled temporally because some unit tests are not implemented

From 448960208c5c91c7139c153004bfcf85835534d7 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Mon, 11 Dec 2023 10:55:26 +0800
Subject: [PATCH 14/23] fix ci

---
 .github/workflows/run_chatgpt_unit_tests.yml          |  2 +-
 .../ColossalChat/tests/test_data_preparation.sh       | 11 +++++------
 applications/ColossalChat/tests/test_train.sh         |  1 -
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index c54940ce859a..816f627369ea 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -32,7 +32,7 @@ jobs:
 
       - name: Install ChatGPT
         run: |
-          cd applications/ChColossalChatat
+          cd applications/ColossalChat
           pip install -v .
           pip install -r requirements-test.txt
 
diff --git a/applications/ColossalChat/tests/test_data_preparation.sh b/applications/ColossalChat/tests/test_data_preparation.sh
index 62ae1e500ec9..496a71fcce78 100755
--- a/applications/ColossalChat/tests/test_data_preparation.sh
+++ b/applications/ColossalChat/tests/test_data_preparation.sh
@@ -43,8 +43,7 @@ BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 BASE_TEMP_DIR=$BASE_DIR/temp
 EXAMPLES_DIR=$BASE_DIR/examples
 DATA_SAVE_PATH=$BASE_TEMP_DIR/rlhf_data
-# MODELS=('gpt2' 'bloom' 'opt' 'llama')
-MODELS=('opt')
+MODELS=('gpt2' 'bloom' 'opt' 'llama')
 
 if [ ! -d "$BASE_TEMP_DIR" ]; then
   mkdir "$BASE_TEMP_DIR"
@@ -135,7 +134,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 500 \
+            --num_samples_per_datafile 100 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then
@@ -178,7 +177,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 500 \
+            --num_samples_per_datafile 100 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then
@@ -221,7 +220,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 500 \
+            --num_samples_per_datafile 100 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then
@@ -264,7 +263,7 @@ for model in ${MODELS[@]}; do
             --data_jsonl_output_dir $jsonl_dir \
             --data_arrow_output_dir $arrow_dir \
             --max_length 400 \
-            --num_samples_per_datafile 500 \
+            --num_samples_per_datafile 100 \
             --num_spliced_dataset_bins 1
         passed=$?
         if [ $passed -eq 0 ]; then
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index d2dded2ed78f..7e96a7f8da5d 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -26,7 +26,6 @@ EXAMPLES_DIR=$BASE_DIR/examples
 TEMP_DIR=$BASE_DIR/temp
 MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
 MODELS_DIR=$TEMP_DIR/models_config
-# MODELS=('gpt2' 'bloom' 'opt' 'llama')
 MODELS=('gpt2' 'bloom' 'opt' 'llama')
 # PLUGINS=('gemini' 'gemini_auto' 'zero2' 'zero2_cpu' '3d')
 PLUGINS=('zero2' 'zero2_cpu' '3d')

From d88e2d93899f3545d754bb4fc456fa7637f10e84 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Tue, 12 Dec 2023 12:24:16 +0800
Subject: [PATCH 15/23] use local model/tokenizer for ci tests

---
 .../ColossalChat/tests/test_data_preparation.sh    |  8 ++++----
 applications/ColossalChat/tests/test_train.sh      | 14 +++++++-------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/applications/ColossalChat/tests/test_data_preparation.sh b/applications/ColossalChat/tests/test_data_preparation.sh
index 496a71fcce78..d4df91d9fce7 100755
--- a/applications/ColossalChat/tests/test_data_preparation.sh
+++ b/applications/ColossalChat/tests/test_data_preparation.sh
@@ -83,13 +83,13 @@ get_data_input_dirs() {
 get_tokenizer_dirs() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "gpt2"
+        echo "/data/scratch/models/gpt2"
     elif [[ $model == "bloom" ]]; then
-        echo "bigscience/bloom-560m"
+        echo "/data/scratch/models/bloom-560m"
     elif [[ $model == "opt" ]]; then
-        echo "facebook/opt-350m"
+        echo "/data/scratch/models/opt-350m"
     elif [[ $model == "llama" ]]; then
-        echo "hf-internal-testing/llama-tokenizer"
+        echo "/data/scratch/models/llama-tokenizer"
     else
         echo "Unknown model $model"
         exit 1
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 7e96a7f8da5d..f3a4ee4daa30 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -39,11 +39,11 @@ pip install -r $EXAMPLES_DIR/requirements.txt
 get_pretrain() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "gpt2"
+        echo "/data/scratch/models/gpt2"
     elif [[ $model == "bloom" ]]; then
-        echo "bigscience/bloom-560m"
+        echo "/data/scratch/models/bloom-560m"
     elif [[ $model == "opt" ]]; then
-        echo "facebook/opt-350m"
+        echo "/data/scratch/models/opt-350m"
     elif [[ $model == "llama" ]]; then
         echo "/data/scratch/llama-tiny"
     else
@@ -55,13 +55,13 @@ get_pretrain() {
 get_tokenizer_dirs() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "gpt2"
+        echo "/data/scratch/models/gpt2"
     elif [[ $model == "bloom" ]]; then
-        echo "bigscience/bloom-560m"
+        echo "/data/scratch/models/bloom-560m"
     elif [[ $model == "opt" ]]; then
-        echo "facebook/opt-350m"
+        echo "/data/scratch/models/opt-350m"
     elif [[ $model == "llama" ]]; then
-        echo "hf-internal-testing/llama-tokenizer"
+        echo "/data/scratch/models/llama-tokenizer"
     else
         echo "Unknown model $model"
         exit 1

From d48c4ac13ee8fb7bd1c62dc812e673b9d7d376bd Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Tue, 12 Dec 2023 14:21:52 +0800
Subject: [PATCH 16/23] fix ci

---
 .../ColossalChat/tests/test_data_preparation.sh  |  8 ++++----
 applications/ColossalChat/tests/test_train.sh    | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/applications/ColossalChat/tests/test_data_preparation.sh b/applications/ColossalChat/tests/test_data_preparation.sh
index d4df91d9fce7..713395d1d9eb 100755
--- a/applications/ColossalChat/tests/test_data_preparation.sh
+++ b/applications/ColossalChat/tests/test_data_preparation.sh
@@ -83,13 +83,13 @@ get_data_input_dirs() {
 get_tokenizer_dirs() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "/data/scratch/models/gpt2"
+        echo "/data/scratch/models/gpt2/"
     elif [[ $model == "bloom" ]]; then
-        echo "/data/scratch/models/bloom-560m"
+        echo "/data/scratch/models/bloom-560m/"
     elif [[ $model == "opt" ]]; then
-        echo "/data/scratch/models/opt-350m"
+        echo "/data/scratch/models/opt-350m/"
     elif [[ $model == "llama" ]]; then
-        echo "/data/scratch/models/llama-tokenizer"
+        echo "/data/scratch/models/llama-tokenizer/"
     else
         echo "Unknown model $model"
         exit 1
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index f3a4ee4daa30..0311c5788317 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -39,13 +39,13 @@ pip install -r $EXAMPLES_DIR/requirements.txt
 get_pretrain() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "/data/scratch/models/gpt2"
+        echo "/data/scratch/models/gpt2/"
     elif [[ $model == "bloom" ]]; then
-        echo "/data/scratch/models/bloom-560m"
+        echo "/data/scratch/models/bloom-560m/"
     elif [[ $model == "opt" ]]; then
-        echo "/data/scratch/models/opt-350m"
+        echo "/data/scratch/models/opt-350m/"
     elif [[ $model == "llama" ]]; then
-        echo "/data/scratch/llama-tiny"
+        echo "/data/scratch/llama-tiny/"
     else
         echo "Unknown model $model"
         exit 1
@@ -55,13 +55,13 @@ get_pretrain() {
 get_tokenizer_dirs() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "/data/scratch/models/gpt2"
+        echo "/data/scratch/models/gpt2/"
     elif [[ $model == "bloom" ]]; then
-        echo "/data/scratch/models/bloom-560m"
+        echo "/data/scratch/models/bloom-560m/"
     elif [[ $model == "opt" ]]; then
-        echo "/data/scratch/models/opt-350m"
+        echo "/data/scratch/models/opt-350m/"
     elif [[ $model == "llama" ]]; then
-        echo "/data/scratch/models/llama-tokenizer"
+        echo "/data/scratch/models/llama-tokenizer/"
     else
         echo "Unknown model $model"
         exit 1

From b6f4d85a96cc604ff70f3c2a6ac4639e224561e8 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Tue, 12 Dec 2023 16:19:56 +0800
Subject: [PATCH 17/23] fix ci

---
 .github/workflows/run_chatgpt_examples.yml         |  3 ++-
 .../ColossalChat/tests/test_data_preparation.sh    |  9 +++++----
 applications/ColossalChat/tests/test_train.sh      | 14 +++++++-------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 2eef82810a9b..3f2330e20204 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -19,7 +19,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:1.13.0-11.7.0
-      options: --gpus all --rm -v /data/scratch/colossal-llama2/colossal_chat_test_data:/data/scratch/colossal-llama2/colossal_chat_test_data --shm-size=10.24gb
+      options: --gpus all --rm -v /data/scratch/colossal-llama2:/data/scratch/colossal-llama2 --shm-size=10.24gb
     timeout-minutes: 30
     defaults:
       run:
@@ -47,6 +47,7 @@ jobs:
         env:
           NCCL_SHM_DISABLE: 1
           MAX_JOBS: 8
+          PRETRAINED_MODEL_PATH: /data/scratch/colossal-llama2/models
           SFT_DATASET: /data/scratch/colossal-llama2/colossal_chat_test_data/sft
           PROMPT_DATASET: /data/scratch/colossal-llama2/colossal_chat_test_data/prompt
           PRETRAIN_DATASET: /data/scratch/colossal-llama2/colossal_chat_test_data/ptx
diff --git a/applications/ColossalChat/tests/test_data_preparation.sh b/applications/ColossalChat/tests/test_data_preparation.sh
index 713395d1d9eb..662d9f0d3da2 100755
--- a/applications/ColossalChat/tests/test_data_preparation.sh
+++ b/applications/ColossalChat/tests/test_data_preparation.sh
@@ -59,6 +59,7 @@ else
   echo "Directory already exists"
 fi
 
+
 export OMP_NUM_THREADS=8
 
 # install requirements
@@ -83,13 +84,13 @@ get_data_input_dirs() {
 get_tokenizer_dirs() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "/data/scratch/models/gpt2/"
+        echo "$PRETRAINED_MODEL_PATH/gpt2/"
     elif [[ $model == "bloom" ]]; then
-        echo "/data/scratch/models/bloom-560m/"
+        echo "$PRETRAINED_MODEL_PATH/bloom-560m/"
     elif [[ $model == "opt" ]]; then
-        echo "/data/scratch/models/opt-350m/"
+        echo "$PRETRAINED_MODEL_PATH/opt-350m/"
     elif [[ $model == "llama" ]]; then
-        echo "/data/scratch/models/llama-tokenizer/"
+        echo "$PRETRAINED_MODEL_PATH/llama-tokenizer/"
     else
         echo "Unknown model $model"
         exit 1
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 0311c5788317..75d8933d68d6 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -41,11 +41,11 @@ get_pretrain() {
     if [[ $model == "gpt2" ]]; then
         echo "/data/scratch/models/gpt2/"
     elif [[ $model == "bloom" ]]; then
-        echo "/data/scratch/models/bloom-560m/"
+        echo "$PRETRAINED_MODEL_PATH/bloom-560m/"
     elif [[ $model == "opt" ]]; then
-        echo "/data/scratch/models/opt-350m/"
+        echo "$PRETRAINED_MODEL_PATH/opt-350m/"
     elif [[ $model == "llama" ]]; then
-        echo "/data/scratch/llama-tiny/"
+        echo "$PRETRAINED_MODEL_PATH/llama-tiny/"
     else
         echo "Unknown model $model"
         exit 1
@@ -55,13 +55,13 @@ get_pretrain() {
 get_tokenizer_dirs() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "/data/scratch/models/gpt2/"
+        echo "$PRETRAINED_MODEL_PATH/gpt2/"
     elif [[ $model == "bloom" ]]; then
-        echo "/data/scratch/models/bloom-560m/"
+        echo "$PRETRAINED_MODEL_PATH/bloom-560m/"
     elif [[ $model == "opt" ]]; then
-        echo "/data/scratch/models/opt-350m/"
+        echo "$PRETRAINED_MODEL_PATH/opt-350m/"
     elif [[ $model == "llama" ]]; then
-        echo "/data/scratch/models/llama-tokenizer/"
+        echo "$PRETRAINED_MODEL_PATH/llama-tokenizer/"
     else
         echo "Unknown model $model"
         exit 1

From 7f5aae5a86d98a765be28c44b4739414ff9d5347 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Wed, 13 Dec 2023 09:25:11 +0800
Subject: [PATCH 18/23] fix ci

---
 applications/ColossalChat/tests/test_train.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 75d8933d68d6..0934bf80d860 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -39,7 +39,7 @@ pip install -r $EXAMPLES_DIR/requirements.txt
 get_pretrain() {
     local model=$1
     if [[ $model == "gpt2" ]]; then
-        echo "/data/scratch/models/gpt2/"
+        echo "$PRETRAINED_MODEL_PATH/gpt2/"
     elif [[ $model == "bloom" ]]; then
         echo "$PRETRAINED_MODEL_PATH/bloom-560m/"
     elif [[ $model == "opt" ]]; then

From 185b3b01527f35e24160069f9bf71660aab30fda Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Wed, 13 Dec 2023 16:40:44 +0800
Subject: [PATCH 19/23] fix ci timeout

---
 .../ColossalChat/tests/test_data_preparation.sh  |  4 +++-
 applications/ColossalChat/tests/test_train.sh    | 16 +++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/applications/ColossalChat/tests/test_data_preparation.sh b/applications/ColossalChat/tests/test_data_preparation.sh
index 662d9f0d3da2..58be9258b314 100755
--- a/applications/ColossalChat/tests/test_data_preparation.sh
+++ b/applications/ColossalChat/tests/test_data_preparation.sh
@@ -43,7 +43,9 @@ BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 BASE_TEMP_DIR=$BASE_DIR/temp
 EXAMPLES_DIR=$BASE_DIR/examples
 DATA_SAVE_PATH=$BASE_TEMP_DIR/rlhf_data
-MODELS=('gpt2' 'bloom' 'opt' 'llama')
+# Skip those tests due to CI tests timeout
+# MODELS=('gpt2' 'bloom' 'opt' 'llama')
+MODELS=('llama')
 
 if [ ! -d "$BASE_TEMP_DIR" ]; then
   mkdir "$BASE_TEMP_DIR"
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 0934bf80d860..930879067608 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -26,10 +26,12 @@ EXAMPLES_DIR=$BASE_DIR/examples
 TEMP_DIR=$BASE_DIR/temp
 MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
 MODELS_DIR=$TEMP_DIR/models_config
-MODELS=('gpt2' 'bloom' 'opt' 'llama')
+# Skip those tests due to CI tests timeout
+MODELS=('llama')
 # PLUGINS=('gemini' 'gemini_auto' 'zero2' 'zero2_cpu' '3d')
-PLUGINS=('zero2' 'zero2_cpu' '3d')
-LORA_RANK=('0' '20')
+PLUGINS=('zero2' '3d')
+# LORA_RANK=('0' '20')
+LORA_RANK=('0')
 
 export OMP_NUM_THREADS=8
 
@@ -83,7 +85,7 @@ SKIPPED_TESTS=(
     llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
 )
 
-GRAD_CKPTS=('' '--grad_checkpoint')
+GRAD_CKPTS=('--grad_checkpoint')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
         plugins=($(shuf -e "${PLUGINS[@]}"))
@@ -145,7 +147,7 @@ SKIPPED_TESTS=(
     llama-3d-20 # This test cannot pass, it is probably a bug for the 3d plugin
 )
 
-GRAD_CKPTS=('' '--grad_checkpoint')
+GRAD_CKPTS=('--grad_checkpoint')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
         plugins=($(shuf -e "${PLUGINS[@]}"))
@@ -212,7 +214,7 @@ SKIPPED_TESTS=(
     bloom-zero2_cpu # This test can pass locally. Removed due to OOM
 )
 
-GRAD_CKPTS=('' '--grad_checkpoint')
+GRAD_CKPTS=('--grad_checkpoint')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
         plugins=($(shuf -e "${PLUGINS[@]}"))
@@ -289,7 +291,7 @@ SKIPPED_TESTS=(
     bloom-zero2_cpu # This test can pass locally. Removed due to OOM
 )
 
-GRAD_CKPTS=('' '--grad_checkpoint')
+GRAD_CKPTS=('--grad_checkpoint')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
         plugins=($(shuf -e "${PLUGINS[@]}"))

From cfb6f67441d3a1152585279f63d30961eaa53e64 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Wed, 13 Dec 2023 17:49:16 +0800
Subject: [PATCH 20/23] fix rm progress bar. fix ci timeout

---
 applications/ColossalChat/coati/trainer/rm.py |  4 +++-
 applications/ColossalChat/tests/test_train.sh | 24 +++++++++++++------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/applications/ColossalChat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py
index 0a5dabba5c1e..c22e1a343797 100755
--- a/applications/ColossalChat/coati/trainer/rm.py
+++ b/applications/ColossalChat/coati/trainer/rm.py
@@ -104,7 +104,9 @@ def _before_fit(
     def _train(self, epoch):
         self.model.train()
         step_bar = tqdm.trange(
-            len(self.train_dataloader), desc=f"Epoch {epoch + 1}/{self.max_epochs}", disable=not is_rank_0()
+            len(self.train_dataloader) // self.accumulation_steps,
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
         )
         for i, batch in enumerate(self.train_dataloader):
             batch = to_device(batch, self.device)
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 930879067608..7d875ac4fd64 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -101,8 +101,10 @@ for lora_rank in ${LORA_RANK[@]}; do
             tokenizer_dir=$(get_tokenizer_dirs $model)
             grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
             tp='1'
+            bs='2'
             if [[ $plugin == "3d" ]]; then
                 tp='4'
+                bs='8'
             fi
             for i in $(seq $NUM_RETRY); do
                 echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
@@ -118,7 +120,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --config_file $MODELS_DIR/config.jsonl \
                     --lora_rank $lora_rank \
                     --plugin $plugin \
-                    --batch_size 2 \
+                    --batch_size $bs \
                     --max_epochs 1 \
                     --accumulation_steps 2 \
                     --tp $tp \
@@ -163,10 +165,11 @@ for lora_rank in ${LORA_RANK[@]}; do
             tokenizer_dir=$(get_tokenizer_dirs $model)
             grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
             tp='1'
+            bs='2'
             if [[ $plugin == "3d" ]]; then
                 tp='4'
-            fi
-            for i in $(seq $NUM_RETRY); do
+                bs='8'
+            fiin $(seq $NUM_RETRY); do
                 echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
                 declare -a dataset=()
                 for split in $(seq -f "%05g" 0 0); do
@@ -180,7 +183,8 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --config_file $MODELS_DIR/config.jsonl \
                     --lora_rank $lora_rank \
                     --plugin $plugin \
-                    --batch_size 2 \
+
+            for i  --batch_size $bs \
                     --max_epochs 1 \
                     --accumulation_steps 2 \
                     --tp $tp \
@@ -230,8 +234,12 @@ for lora_rank in ${LORA_RANK[@]}; do
             tokenizer_dir=$(get_tokenizer_dirs $model)
             grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
             tp='1'
+            bs='4'
+            ebs='8'
             if [[ $plugin == "3d" ]]; then
                 tp='4'
+                bs='16'
+                ebs='32'
             fi
             for i in $(seq $NUM_RETRY); do
                 echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
@@ -257,8 +265,8 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --num_episodes 5 \
                     --num_collect_steps 1 \
                     --num_update_steps 1 \
-                    --experience_batch_size 8 \
-                    --train_batch_size 4 \
+                    --experience_batch_size $ebs \
+                    --train_batch_size $bs \
                     --accumulation_steps 2 \
                     --lr 9e-6 \
                     --mixed_precision "bf16" \
@@ -307,8 +315,10 @@ for lora_rank in ${LORA_RANK[@]}; do
             tokenizer_dir=$(get_tokenizer_dirs $model)
             grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
             tp='1'
+            bs='2'
             if [[ $plugin == "3d" ]]; then
                 tp='4'
+                bs='8'
             fi
             for i in $(seq $NUM_RETRY); do
                 echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
@@ -324,7 +334,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --config_file $MODELS_DIR/config.jsonl \
                     --lora_rank $lora_rank \
                     --plugin $plugin \
-                    --batch_size 2 \
+                    --batch_size $bs \
                     --max_epochs 1 \
                     --accumulation_steps 2 \
                     --tp $tp \

From f9e151e1f21c42ca044b356332be8f287625c733 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Thu, 14 Dec 2023 09:09:17 +0800
Subject: [PATCH 21/23] fix ci

---
 applications/ColossalChat/tests/test_train.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 7d875ac4fd64..f211a5973a72 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -169,7 +169,8 @@ for lora_rank in ${LORA_RANK[@]}; do
             if [[ $plugin == "3d" ]]; then
                 tp='4'
                 bs='8'
-            fiin $(seq $NUM_RETRY); do
+            fi
+            for i in $(seq $NUM_RETRY); do
                 echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
                 declare -a dataset=()
                 for split in $(seq -f "%05g" 0 0); do

From 9b62c0e64374a91447c3130c7bb543e4a5abc8e5 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Thu, 14 Dec 2023 09:25:24 +0800
Subject: [PATCH 22/23] fix ci typo

---
 applications/ColossalChat/tests/test_train.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index f211a5973a72..e0d8fd0c96c4 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -184,8 +184,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --config_file $MODELS_DIR/config.jsonl \
                     --lora_rank $lora_rank \
                     --plugin $plugin \
-
-            for i  --batch_size $bs \
+                    --batch_size $bs \
                     --max_epochs 1 \
                     --accumulation_steps 2 \
                     --tp $tp \

From 9bc0d425757463f8a7f112efe4091aa5acaee9a6 Mon Sep 17 00:00:00 2001
From: YeAnbang <anbangy2@outlook.com>
Date: Thu, 14 Dec 2023 09:47:04 +0800
Subject: [PATCH 23/23] remove 3d plugin from ci temporary

---
 applications/ColossalChat/tests/test_train.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index e0d8fd0c96c4..c012245849a7 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -29,7 +29,7 @@ MODELS_DIR=$TEMP_DIR/models_config
 # Skip those tests due to CI tests timeout
 MODELS=('llama')
 # PLUGINS=('gemini' 'gemini_auto' 'zero2' 'zero2_cpu' '3d')
-PLUGINS=('zero2' '3d')
+PLUGINS=('zero2')
 # LORA_RANK=('0' '20')
 LORA_RANK=('0')