From da470b723261c9831606d9fb0ce31926cca8cebe Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Fri, 12 Jul 2024 06:14:06 +0000
Subject: [PATCH 01/12] add ignore and tiny llama

---
 applications/ColossalChat/.gitignore              | 3 +++
 .../config/conversation_template/tiny-llama.json  | 8 ++++++++
 2 files changed, 11 insertions(+)
 create mode 100644 applications/ColossalChat/config/conversation_template/tiny-llama.json

diff --git a/applications/ColossalChat/.gitignore b/applications/ColossalChat/.gitignore
index 33950adc0bb5..757cbb5da051 100755
--- a/applications/ColossalChat/.gitignore
+++ b/applications/ColossalChat/.gitignore
@@ -146,6 +146,9 @@ docs/.build
 examples/wandb/
 examples/logs/
 examples/output/
+examples/training_scripts/logs
+examples/training_scripts/wandb
+examples/training_scripts/output
 examples/awesome-chatgpt-prompts/
 temp/
 
diff --git a/applications/ColossalChat/config/conversation_template/tiny-llama.json b/applications/ColossalChat/config/conversation_template/tiny-llama.json
new file mode 100644
index 000000000000..fe927d1575e7
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/tiny-llama.json
@@ -0,0 +1,8 @@
+{
+    "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        2
+    ],
+    "end_of_assistant": "</s>"
+}
\ No newline at end of file

From bdc3df945e01cd1d0874647a066f8fa740951d82 Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Fri, 12 Jul 2024 06:18:03 +0000
Subject: [PATCH 02/12] fix path issue

---
 applications/ColossalChat/examples/README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index bdf4d23f1ad3..d6114c8d557b 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -490,7 +490,7 @@ In this code we provide a flexible way for users to set the conversation templat
 
 On your first run of the data preparation script, you only need to define the "chat_template" (if you want to use custom chat template) and the "system message" (if you want to use a custom system message),
 
-- Step 2: Run the data preparation script--- [prepare_sft_dataset.sh](./examples/data_preparation_scripts/prepare_sft_dataset.sh). Note that whether or not you have skipped the first step, you need to provide the path to the conversation template config file (via the conversation_template_config arg). If you skipped the first step, an auto-generated conversation template will be stored at the designated file path.
+- Step 2: Run the data preparation script--- [prepare_sft_dataset.sh](./data_preparation_scripts/prepare_sft_dataset.sh). Note that whether or not you have skipped the first step, you need to provide the path to the conversation template config file (via the conversation_template_config arg). If you skipped the first step, an auto-generated conversation template will be stored at the designated file path.
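For concreteness, a hedged sketch of what Step 2 could look like once the tiny-llama template from PATCH 01 is in place is shown below. The variable names and paths are illustrative assumptions, not values taken from the scripts; check `prepare_sft_dataset.sh` for the authoritative ones.

```bash
# Hedged sketch of Step 2 (variable names and paths are illustrative assumptions;
# see prepare_sft_dataset.sh for the real ones). The key point is that the
# conversation template config now points at the tiny-llama file from PATCH 01.
CONVERSATION_TEMPLATE_CONFIG="applications/ColossalChat/config/conversation_template/tiny-llama.json"
SAVE_DIR="./sft_data"  # Step 3 then inspects $SAVE_DIR/jsonl/part-XXXX.jsonl

cd applications/ColossalChat/examples/data_preparation_scripts
bash prepare_sft_dataset.sh  # after pointing its conversation_template_config
                             # argument at $CONVERSATION_TEMPLATE_CONFIG
```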
 
 - Step 3: (Optional) Check the correctness of the processed data. We provided an easy way for you to do a manual checking on the processed data by checking the "$SAVE_DIR/jsonl/part-XXXX.jsonl" files.
 
@@ -510,7 +510,7 @@ Human: what are some pranks with a pen i can do? Assistant: Are you
 
 #### Step 3: Training
 
-Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./examples/training_scripts/train_sft.sh) to start a supervised instructs fine-tuning. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./training_scripts/train_sft.sh) to start supervised instruction fine-tuning. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
 ### RLHF Training Stage2 - Training Reward Model
 
@@ -552,11 +552,11 @@ Below shows the preference dataset format used in training the reward model.
 
 #### Step 2: Preprocessing
 
-Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./examples/data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
+Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
 
 #### Step 3: Training
 
-You can run [train_rm.sh](./examples/training_scripts/train_rm.sh) to start the reward model training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+You can run [train_rm.sh](./training_scripts/train_rm.sh) to start the reward model training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
 #### Features and Tricks in RM Training
 
@@ -627,14 +627,14 @@ The second dataset--- pretrained dataset is optional, provide it if you want to
 ]
 ```
 #### Step 2: Preprocessing
-To prepare the prompt dataset for PPO training, simply run [prepare_prompt_dataset.sh](./examples/data_preparation_scripts/prepare_prompt_dataset.sh)
+To prepare the prompt dataset for PPO training, simply run [prepare_prompt_dataset.sh](./data_preparation_scripts/prepare_prompt_dataset.sh)
 
 You can use the SFT dataset you prepared in the SFT stage or prepare a new one from different source for the ptx dataset. The ptx data is used to calculate ptx loss, which stabilizes the training according to the [InstructGPT paper](https://arxiv.org/pdf/2203.02155.pdf).
 
 #### Step 3: Training
 
-You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+You can run the [train_ppo.sh](./training_scripts/train_ppo.sh) script to start PPO training. Here are some arguments unique to PPO; please refer to the [training configuration](#training-configuration) section for details regarding the other supported training options.
 
 
 ```bash
@@ -718,7 +718,7 @@ For DPO training, you only need the preference dataset. Please follow the instru
 
 #### Step 2: Training
 
-You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options. Following the trend of recent research on DPO-like alignment methods, we added option for the user to choose from, including whether to do length normalization , reward shaping and whether to use a reference model in calculating implicit reward. Here are those options,
+You can run the [train_dpo.sh](./training_scripts/train_dpo.sh) script to start DPO training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options. Following the trend of recent research on DPO-like alignment methods, we added options for the user to choose from, including whether to do length normalization, whether to do reward shaping, and whether to use a reference model when calculating the implicit reward. Here are those options:
 
 ```
 --beta 0.1 \ # the temperature in DPO loss, Default to 0.1
@@ -735,7 +735,7 @@ You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to star
 
 ### Alternative Option For RLHF: Simple Preference Optimization
 
 We support the method introduced in the paper [SimPO: Simple Preference Optimization
-with a Reference-Free Reward](https://arxiv.org/pdf/2405.14734) (SimPO). Which is a reference model free aligment method that add length normalization and reward shaping to the DPO loss to enhance training stability and efficiency. As the method doesn't deviate too much from DPO, we add support for length normalization and SimPO reward shaping in our DPO implementation. To use SimPO in alignment, use the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) script, set the `loss_type` to `simpo_loss`, you can also set the value for temperature (`beta`) and reward target margin (`gamma`) but it is optional.
+with a Reference-Free Reward](https://arxiv.org/pdf/2405.14734) (SimPO). It is a reference-model-free alignment method that adds length normalization and reward shaping to the DPO loss to enhance training stability and efficiency. As the method doesn't deviate too much from DPO, we add support for length normalization and SimPO reward shaping in our DPO implementation. To use SimPO in alignment, use the [train_dpo.sh](./training_scripts/train_dpo.sh) script and set the `loss_type` to `simpo_loss`; you can also set the value for temperature (`beta`) and reward target margin (`gamma`), but they are optional.
 
 #### SimPO Result

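As a concrete illustration of the SimPO-style options described above, a hypothetical invocation is sketched below. Only `--loss_type`, `--beta`, and `--gamma` are named in the text above; the other flags mirror the training scripts later in this series, and the `--length_normalization` and `--disable_reference_model` spellings are assumptions about how those switches are exposed.

```bash
# Hedged SimPO sketch: swaps the DPO loss for the reference-free SimPO loss.
# Flag spellings other than --loss_type/--beta/--gamma are assumptions.
colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 31313 train_dpo.py \
    --pretrain $PRETRAINED_MODEL_PATH \
    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
    --dataset ${dataset[@]} \
    --plugin "zero2" \
    --loss_type "simpo_loss" \
    --beta 0.1 \
    --gamma 0.6 \
    --length_normalization \
    --disable_reference_model
```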
@@ -744,7 +744,7 @@ with a Reference-Free Reward](https://arxiv.org/pdf/2405.14734) (SimPO). Which i
 
 ### Alternative Option For RLHF: Odds Ratio Preference Optimization
 
-We support the method introduced in the paper [ORPO: Monolithic Preference Optimization without Reference Model](https://arxiv.org/abs/2403.07691) (ORPO). Which is a reference model free aligment method that mixes the SFT loss with a reinforcement learning loss that uses odds ratio as the implicit reward to enhance training stability and efficiency. Simply set the flag to disable the use of the reference model, set the reward target margin and enable length normalization in the DPO training script. To use ORPO in alignment, use the [train_orpo.sh](./examples/training_scripts/train_orpo.sh) script, You can set the value for `lambda` (which determine how strongly the reinforcement learning loss affect the training) but it is optional.
+We support the method introduced in the paper [ORPO: Monolithic Preference Optimization without Reference Model](https://arxiv.org/abs/2403.07691) (ORPO). It is a reference-model-free alignment method that mixes the SFT loss with a reinforcement learning loss that uses the odds ratio as the implicit reward, to enhance training stability and efficiency. Simply set the flag to disable the use of the reference model, set the reward target margin, and enable length normalization in the DPO training script. To use ORPO in alignment, use the [train_orpo.sh](./training_scripts/train_orpo.sh) script; you can set the value for `lambda` (which determines how strongly the reinforcement learning loss affects the training), but it is optional.
 
 #### ORPO Result

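In the same spirit, an ORPO run might look like the sketch below. Every flag except the `lambda` weight appears verbatim in the `train_orpo.sh` diffs later in this series; the `--lam` spelling of that weight is an assumption.

```bash
# Hedged ORPO sketch; --lam stands in for the `lambda` weight described above
# and its spelling is an assumption, as is the 0.5 value.
colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31313 train_orpo.py \
    --pretrain $PRETRAINED_MODEL_PATH \
    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
    --dataset ${dataset[@]} \
    --plugin "zero2" \
    --lam 0.5
```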
From 14bd4d2a71d4f7d2718eeb16e398cd25c6c211c1 Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Fri, 12 Jul 2024 06:21:19 +0000
Subject: [PATCH 03/12] run style

---
 .../ColossalChat/config/conversation_template/tiny-llama.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/ColossalChat/config/conversation_template/tiny-llama.json b/applications/ColossalChat/config/conversation_template/tiny-llama.json
index fe927d1575e7..59196159f930 100644
--- a/applications/ColossalChat/config/conversation_template/tiny-llama.json
+++ b/applications/ColossalChat/config/conversation_template/tiny-llama.json
@@ -5,4 +5,4 @@
         2
     ],
     "end_of_assistant": "</s>"
-}
\ No newline at end of file
+}

From bd861a14ad1c69c53348b87827d3ae32c8a36e5e Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Fri, 12 Jul 2024 06:30:26 +0000
Subject: [PATCH 04/12] fix issue

---
 .../ColossalChat/examples/training_scripts/train_dpo.sh  | 2 +-
 .../ColossalChat/examples/training_scripts/train_orpo.sh | 4 ++--
 .../ColossalChat/examples/training_scripts/train_ppo.sh  | 2 +-
 .../ColossalChat/examples/training_scripts/train_rm.sh   | 2 +-
 .../ColossalChat/examples/training_scripts/train_sft.sh  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.sh b/applications/ColossalChat/examples/training_scripts/train_dpo.sh
index f7bb456584c2..48c5f7181288 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.sh
@@ -38,7 +38,7 @@ declare -a dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 31313 train_dpo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.sh b/applications/ColossalChat/examples/training_scripts/train_orpo.sh
index ca80a14c1f7d..1253fa71988c 100755
--- a/applications/ColossalChat/examples/training_scripts/train_orpo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_orpo.sh
@@ -13,7 +13,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
-set_n_least_used_CUDA_VISIBLE_DEVICES 8
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
 PROJECT_NAME="dpo"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
@@ -38,7 +38,7 @@ declare -a dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31313 train_orpo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
diff --git a/applications/ColossalChat/examples/training_scripts/train_ppo.sh b/applications/ColossalChat/examples/training_scripts/train_ppo.sh
index 91633978e6ff..559f630a8835 100755
--- a/applications/ColossalChat/examples/training_scripts/train_ppo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_ppo.sh
@@ -54,7 +54,7 @@ declare -a ptx_dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_ppo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.sh b/applications/ColossalChat/examples/training_scripts/train_rm.sh
index e06d9092fe4c..571c650a489c 100755
--- a/applications/ColossalChat/examples/training_scripts/train_rm.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_rm.sh
@@ -38,7 +38,7 @@ declare -a dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_rm.py \
     --pretrain $PRETRAINED_MODEL_PATH \
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh
index 18df0929327e..678bfc809819 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.sh
@@ -36,7 +36,7 @@ declare -a dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 echo $(which colossalai)
 echo $(which python)

From 94ddbf33b55eff85b44d297600f31360d2a8a422 Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Fri, 12 Jul 2024 07:09:20 +0000
Subject: [PATCH 05/12] update bash

---
 .../ColossalChat/examples/training_scripts/train_dpo.sh  | 4 +---
 .../ColossalChat/examples/training_scripts/train_orpo.sh | 4 +---
 .../ColossalChat/examples/training_scripts/train_ppo.sh  | 3 +--
 .../ColossalChat/examples/training_scripts/train_rm.sh   | 4 +---
 .../ColossalChat/examples/training_scripts/train_sft.sh  | 3 +--
 5 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.sh b/applications/ColossalChat/examples/training_scripts/train_dpo.sh
index 48c5f7181288..082d54ff0d89 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.sh
@@ -15,9 +15,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 4
 
-PROJECT_NAME="dpo"
+PROJECT_NAME="DPO"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
@@ -42,7 +41,6 @@ CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 31313 train_dpo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
-    --checkpoint_path $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
     --plugin "zero2" \
diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.sh b/applications/ColossalChat/examples/training_scripts/train_orpo.sh
index 1253fa71988c..482956b21c7e 100755
--- a/applications/ColossalChat/examples/training_scripts/train_orpo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_orpo.sh
@@ -15,9 +15,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
-PROJECT_NAME="dpo"
+PROJECT_NAME="ORPO"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
@@ -42,7 +41,6 @@ CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31313 train_orpo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
-    --checkpoint_path $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
     --plugin "zero2" \
diff --git a/applications/ColossalChat/examples/training_scripts/train_ppo.sh b/applications/ColossalChat/examples/training_scripts/train_ppo.sh
index 559f630a8835..277e75e6de56 100755
--- a/applications/ColossalChat/examples/training_scripts/train_ppo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_ppo.sh
@@ -15,10 +15,9 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 8
 
-PROJECT_NAME="ppo"
+PROJECT_NAME="PPO"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # local pretrained model path (from RLHF step 1: SFT)
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.sh b/applications/ColossalChat/examples/training_scripts/train_rm.sh
index 571c650a489c..cd42afcc8957 100755
--- a/applications/ColossalChat/examples/training_scripts/train_rm.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_rm.sh
@@ -15,9 +15,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 8
 
-PROJECT_NAME="rm"
+PROJECT_NAME="RM"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
@@ -42,7 +41,6 @@ CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_rm.py \
     --pretrain $PRETRAINED_MODEL_PATH \
-    --checkpoint_path /home/yeanbang/data/experiments/rm/hhh_aligh/ckptllama2-rm-2024-01-17-14-43-24/epoch-1_step-1317/modeling \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
     --plugin "zero2" \
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh
index 678bfc809819..c7d38c1d8c9b 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.sh
@@ -14,9 +14,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 4
 
-PROJECT_NAME="sft"
+PROJECT_NAME="SFT"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path

From ab358c0f6e9ad3dd2796be191d2f94196311438f Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Tue, 16 Jul 2024 08:28:33 +0000
Subject: [PATCH 11/12] fix ddp issue

---
 applications/ColossalChat/coati/trainer/sft.py           | 2 +-
 .../ColossalChat/examples/training_scripts/train_sft.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py
index 1484f5057a83..c09d61034984 100755
--- a/applications/ColossalChat/coati/trainer/sft.py
+++ b/applications/ColossalChat/coati/trainer/sft.py
@@ -102,7 +102,6 @@ def _train(self, epoch: int):
             batch_size = batch["input_ids"].size(0)
             outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
             loss = outputs.loss
-            step_bar.set_description(f"Epoch {epoch + 1}/{self.max_epochs} Loss: {loss.detach().cpu().item():.4f}")
 
             self.booster.backward(loss=loss, optimizer=self.optimizer)
 
@@ -115,6 +114,7 @@
                 self.optimizer.zero_grad()
                 self.scheduler.step()
 
+                step_bar.set_postfix({"train/loss": self.accumulative_meter.get("loss")})
                 if self.writer:
                     self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
                     self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step)
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py
index fe15065594d4..b89cbeb917be 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.py
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.py
@@ -61,7 +61,7 @@ def train(args):
         Default torch ddp plugin without any acceleration, for debugging purpose
         acceleration, for debugging purpose
         """
-        plugin = TorchDDPPlugin(find_unused_parameters=True)
+        plugin = TorchDDPPlugin(find_unused_parameters=True if args.grad_checkpoint is False else False)
     elif args.plugin == "gemini":
         plugin = GeminiPlugin(
             precision=args.mixed_precision,

From e4c92fa9ac2a5d0249c4d067766311871fb5c2c8 Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Wed, 17 Jul 2024 06:02:33 +0000
Subject: [PATCH 12/12] add Qwen 1.5 32B

---
 .../conversation_template/Qwen_Qwen1.5-32B-Chat.json | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json

diff --git a/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json
new file mode 100644
index 000000000000..58941a5918ff
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json
@@ -0,0 +1,9 @@
+{
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        151645,
+        151643
+    ],
+    "end_of_assistant": "<|im_end|>"
+}
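A quick, hedged way to sanity-check the new Qwen template (assuming `transformers` is installed and the Qwen1.5 tokenizer can be downloaded): the two `stop_ids` should decode to the chat template's `<|im_end|>` turn terminator and the tokenizer's end-of-text token.

```bash
# Hedged sanity check for the Qwen_Qwen1.5-32B-Chat.json template above.
python - <<'EOF'
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-32B-Chat")
# Expect ['<|im_end|>', '<|endoftext|>'] to match stop_ids [151645, 151643].
print(tok.convert_ids_to_tokens([151645, 151643]))
EOF
```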