From 06a982d8eaa3e636f76e336a1b9306e9235b9a5e Mon Sep 17 00:00:00 2001 From: yynil Date: Tue, 11 Apr 2023 23:47:59 +0800 Subject: [PATCH 1/3] save changes --- applications/Chat/examples/community/peft/trainer.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 applications/Chat/examples/community/peft/trainer.py diff --git a/applications/Chat/examples/community/peft/trainer.py b/applications/Chat/examples/community/peft/trainer.py new file mode 100644 index 000000000000..e69de29bb2d1 From 897a99d7c559a55e871cc0cc26e4d98c2ac7bb0b Mon Sep 17 00:00:00 2001 From: yynil Date: Fri, 14 Apr 2023 18:19:53 +0800 Subject: [PATCH 2/3] Add support for ChatGLM-6B and impplement vram friendly training. --- .../peft/convert_rewards_ds_to_sft.py | 18 + .../examples/community/peft/easy_dataset.py | 449 ++++++++++++++--- .../examples/community/peft/easy_models.py | 129 ++++- .../examples/community/peft/lora_inference.py | 45 ++ .../examples/community/peft/save_vram_em.py | 116 +++++ .../examples/community/peft/toys_pretrain.txt | 2 + .../examples/community/peft/toys_prompts.txt | 2 + .../community/peft/toys_rewards_test.jsonl | 1 + .../community/peft/toys_rewards_train.jsonl | 1 + .../Chat/examples/community/peft/toys_sft.txt | 2 + .../community/peft/train_peft_prompts.py | 124 +++-- .../examples/community/peft/train_peft_sft.py | 104 ++-- .../examples/community/peft/train_peft_sft.sh | 13 + .../community/peft/train_peft_sft_prompts.sh | 11 + .../examples/community/peft/train_prompts.sh | 21 + .../community/peft/train_reward_model.py | 197 ++++++++ .../community/peft/train_reward_model.sh | 18 + .../Chat/examples/community/peft/trainer.py | 464 ++++++++++++++++++ 18 files changed, 1516 insertions(+), 201 deletions(-) create mode 100644 applications/Chat/examples/community/peft/convert_rewards_ds_to_sft.py create mode 100755 applications/Chat/examples/community/peft/lora_inference.py create mode 100644 applications/Chat/examples/community/peft/save_vram_em.py create mode 100644 applications/Chat/examples/community/peft/toys_pretrain.txt create mode 100644 applications/Chat/examples/community/peft/toys_prompts.txt create mode 100644 applications/Chat/examples/community/peft/toys_rewards_test.jsonl create mode 100644 applications/Chat/examples/community/peft/toys_rewards_train.jsonl create mode 100644 applications/Chat/examples/community/peft/toys_sft.txt create mode 100644 applications/Chat/examples/community/peft/train_peft_sft.sh create mode 100644 applications/Chat/examples/community/peft/train_peft_sft_prompts.sh create mode 100755 applications/Chat/examples/community/peft/train_prompts.sh create mode 100644 applications/Chat/examples/community/peft/train_reward_model.py create mode 100755 applications/Chat/examples/community/peft/train_reward_model.sh diff --git a/applications/Chat/examples/community/peft/convert_rewards_ds_to_sft.py b/applications/Chat/examples/community/peft/convert_rewards_ds_to_sft.py new file mode 100644 index 000000000000..089f4108c3bf --- /dev/null +++ b/applications/Chat/examples/community/peft/convert_rewards_ds_to_sft.py @@ -0,0 +1,18 @@ +import json +import os +import sys +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--input_file', type=str, default=None, help='input file') + parser.add_argument('--output_file', type=str, default=None, help='output file') + args = parser.parse_args() + with open(args.input_file, 'r',encoding='UTF-8') as f: + with open(args.output_file, 'w',encoding='UTF-8') as f2: 
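+            # Each input line is a JSON record with at least "prompt" and "chosen"
+            # fields (the reward-model jsonl format); only the chosen answer is kept
+            # and flattened into one SFT line of the form
+            # "请根据法律知识回答。提问:<prompt>\t回答\t<chosen>".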
+ for line in f: + line = line.strip() + data = json.loads(line) + prompt = data['prompt'] + response = data['chosen'] + instructions = "请根据法律知识回答。提问:"+prompt+"\t回答\t"+response+'\n' + f2.write(instructions) \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/easy_dataset.py b/applications/Chat/examples/community/peft/easy_dataset.py index 13dceef79145..9b6899781388 100644 --- a/applications/Chat/examples/community/peft/easy_dataset.py +++ b/applications/Chat/examples/community/peft/easy_dataset.py @@ -1,13 +1,14 @@ import copy import json -from typing import Dict, Sequence +import time +from typing import Dict, List, Sequence import torch from datasets import load_dataset from torch.utils.data import Dataset from tqdm import tqdm from transformers import AutoTokenizer - +import gc IGNORE_INDEX = -100 @@ -55,14 +56,25 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 5 all_lines = f.readlines() #split to source and target ,source the characters before "回答:" including "回答:", target the characters after "回答:" sources, targets = [], [] + random_print_legal=3 + random_print_illegal=3 + import random for line in all_lines: - if "回答:" in line: - sep_index = line.index("回答:") - sources.append(line[:sep_index + 3]) - targets.append(line[sep_index + 3:] + tokenizer.eos_token) + if "\t回答\t" in line: + sep_index = line.index("\t回答\t") + sources.append(line[:sep_index + len("\t回答\t")]) + targets.append(line[sep_index + len("\t回答\t"):] + tokenizer.eos_token) + if random_print_legal>0: + if random.random()>0.8: + print("legal line :",line) + random_print_legal-=1 else: sources.append(line) targets.append("" + tokenizer.eos_token) + if random_print_illegal>0: + if random.random()>0.8: + print("illegal line :",line) + random_print_illegal-=1 data_dict = preprocess(sources, targets, tokenizer, max_length) self.input_ids = data_dict["input_ids"] @@ -88,7 +100,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 9 super(EasyPromptsDataset, self).__init__() with open(data_file, "r", encoding="UTF-8") as f: all_lines = f.readlines() - all_lines = [line if "回答:" not in line else line[:line.index("回答:") + 3] for line in all_lines] + all_lines = [line if "\t回答\t" not in line else line[:line.index("\t回答\t") + len("\t回答\t")] for line in all_lines] self.prompts = [ tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0) @@ -108,54 +120,135 @@ def __repr__(self): def __str__(self): return f"LawPromptsDataset(data_file={self.data_file}, prompts_len={len(self.prompts)})" - +from multiprocessing import Pool, Process,Queue +from contextlib import closing +import os class EasyRewardDataset(Dataset): - def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None: + def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512,concurrency = 20) -> None: super(EasyRewardDataset, self).__init__() + self.current_index = 0 self.chosen = [] self.reject = [] if special_token is None: self.end_token = tokenizer.eos_token else: self.end_token = special_token - print(self.end_token) #read all lines in the train_file to a list with open(train_file, "r", encoding="UTF-8") as f: all_lines = f.readlines() + self.concurrency = concurrency + max_tasks_in_queue = 100 + from pyarrow import parquet as pq + from pandas import DataFrame + import pandas as pd + def 
mp_function(task_queue:Queue,index:int): + #get a task from task_queue, catch the exception if the queue is closed + parquet_dir = 'parquet' + if not os.path.exists(parquet_dir): + os.makedirs(parquet_dir,exist_ok=True) + index = 0 + df = DataFrame(columns=['chosen_input_ids','chosen_attention_mask','reject_input_ids','reject_attention_mask']) + while True: + try: + line = task_queue.get() + if line == 'Exit': + break + data = json.loads(line) + prompt = "请根据法律回答。提问:" + data['prompt'] + "\t回答:\t" + chosen = prompt + data['chosen'] + self.end_token + chosen_token = tokenizer(chosen, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt") + reject = prompt + data['rejected'] + self.end_token + reject_token = tokenizer(reject, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt") + data = { + "chosen_input_ids": chosen_token['input_ids'].tolist(), + "chosen_attention_mask": chosen_token['attention_mask'].tolist(), + "reject_input_ids": reject_token['input_ids'].tolist(), + "reject_attention_mask": reject_token['attention_mask'].tolist() + } + new_df = DataFrame.from_dict(data) + df = pd.concat([df,new_df],ignore_index=True) + if len(df) > 100: + print(f"Write {len(df)} to parquet file {parquet_dir}/{os.getpid()}_{index}.parquet}}") + df.to_parquet(f'{parquet_dir}/{os.getpid()}_{index}.parquet') + index += 1 + df = DataFrame(columns=['chosen_input_ids','chosen_attention_mask','reject_input_ids','reject_attention_mask']) + #print the process id and the task + # print(f"Process {os.getpid()} has processed task") + except: + print(f"Process {os.getpid()} has no more tasks") + break + + if len(df) > 0: + print(f"Write {len(df)} to parquet file {parquet_dir}/{os.getpid()}_{index}.parquet}}") + df.to_parquet(f'{parquet_dir}/{os.getpid()}_{index}.parquet') + processes = [] + task_queues = [] + for i in range(concurrency): + task_queue = Queue(max_tasks_in_queue) + p = Process(target=mp_function, args=(task_queue,i)) + p.start() + processes.append(p) + task_queues.append(task_queue) + index = 0 for line in tqdm(all_lines): - data = json.loads(line) - prompt = "提问:" + data['prompt'] + " 回答:" - - chosen = prompt + data['chosen'] + self.end_token - chosen_token = tokenizer(chosen, - max_length=max_length, - padding="max_length", - truncation=True, - return_tensors="pt") - self.chosen.append({ - "input_ids": chosen_token['input_ids'], - "attention_mask": chosen_token['attention_mask'] - }) - - reject = prompt + data['rejected'] + self.end_token - reject_token = tokenizer(reject, - max_length=max_length, - padding="max_length", - truncation=True, - return_tensors="pt") - self.reject.append({ - "input_ids": reject_token['input_ids'], - "attention_mask": reject_token['attention_mask'] - }) + task_queues[index % concurrency].put(line) + index += 1 + + print('waiting for all tasks to be done') + for task_queue in task_queues: + task_queue.put("Exit") + for task_queue in task_queues: + while True: + if task_queue.empty(): + print('all tasks are done, closing the queue') + task_queue.close() + break + print(f'still have {task_queue.qsize()} tasks to be done,waiting for 1s') + time.sleep(1) + print('get all results from processes') + time.sleep(5) + print(f"waiting for response in main process {os.getpid()}") + #terminate all processes + for p in processes: + p.terminate() + p.join() + print('all tasks done') def __len__(self): length = len(self.chosen) return length def __getitem__(self, idx): - return self.chosen[idx]["input_ids"], 
self.chosen[idx]["attention_mask"], self.reject[idx][ - "input_ids"], self.reject[idx]["attention_mask"] + return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx]["input_ids"], self.reject[idx]["attention_mask"] + + + + def gen(self): + #yeild from the dataset + while self.current_index < len(self.chosen): + chosen_ids, c_mask, reject_ids, r_mask =self.chosen[self.current_index]["input_ids"], self.chosen[self.current_index]["attention_mask"], self.reject[self.current_index]["input_ids"], self.reject[self.current_index]["attention_mask"] + yield dict(chosen_input_ids=chosen_ids, chosen_attention_mask=c_mask, reject_input_ids=reject_ids, reject_attention_mask=r_mask) + self.current_index += 1 + + @staticmethod + def collate_fn(batch): + size = len(batch) + chosen_input_ids = [d['chosen_input_ids'] for d in batch] + chosen_attention_mask = [d['chosen_attention_mask'] for d in batch] + reject_input_ids = [d['reject_input_ids'] for d in batch] + reject_attention_mask = [d['reject_attention_mask'] for d in batch] + + return (torch.tensor(chosen_input_ids,dtype=torch.long), + torch.tensor(chosen_attention_mask), torch.tensor(reject_input_ids,dtype=torch.long), torch.tensor(reject_attention_mask)) #python representation of the object and the string representation of the object def __repr__(self): @@ -169,60 +262,187 @@ def __str__(self): Easy SFT just accept a text file which can be read line by line. However the datasest will group texts together to max_length so LLM will learn the texts meaning better. If individual lines are not related, just set is_group_texts to False. ''' - - +def split_texts(input_text :str, max_length) -> List[str]: + chinese_punctuations = ",。!?;:" + english_punctuations = ",.!?;:" + punctuations = chinese_punctuations + english_punctuations + texts = [] + #first split the text by punctuations + for punctuation in punctuations: + input_text = input_text.replace(punctuation, "\t" + punctuation + "\t") + #then split the text by \t + input_text = input_text.split("\t") + #remove empty strings + input_text = [text for text in input_text if text.strip() != ""] + #group texts together + current_text = "" + for text in input_text: + if len(current_text) + len(text) > max_length: + #if current_text is still too long, just split it by max_length + if len(current_text) > max_length: + for i in range(0, len(current_text), max_length): + texts.append(current_text[i:i + max_length]) + else: + texts.append(current_text) + current_text = text + else: + current_text += text + if current_text != "": + texts.append(current_text) + return texts class EasySFTDataset(Dataset): - def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None: + def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True,concurrency = 20,mask_prompts = False,prompts_sep :str = "\t回答\t") -> None: super().__init__() #read the data_file line by line with open(data_file, "r", encoding="UTF-8") as f: - #encode the text data line by line and put raw python list input_ids only to raw_input_ids list - raw_input_ids = [] - for line in f: - encoded_ids = tokenizer.encode(line) - #if the encoded_ids is longer than max_length, then split it into several parts - if len(encoded_ids) > max_length: - for i in range(0, len(encoded_ids), max_length): - raw_input_ids.append(encoded_ids[i:i + max_length]) + all_lines = f.readlines() + print(f"total lines: {len(all_lines)}") + if is_group_texts: + grouped_lines = [] + 
current_line = "" + for line in all_lines: + line = line.strip() + current_line += line + if len(current_line) > max_length: + splitted_lines = split_texts(current_line, max_length) + grouped_lines.extend(splitted_lines) + current_line = "" + else: + grouped_lines = [] + for line in all_lines: + line = line.strip() + if not mask_prompts: + splitted_lines = split_texts(line, max_length) + grouped_lines.extend(splitted_lines) else: - raw_input_ids.append(encoded_ids) - - grouped_inpup_ids = [] - current_input_ids = [] - attention_mask = [] + #if the line does not contain the prompts_sep, just skip it + if prompts_sep not in line: + continue + #if the length before prompts_sep is too long, just skip it + if len(line.split(prompts_sep)[0]) > max_length-len(prompts_sep): + print(f'prompts is too long, skip this line: {line}') + continue + grouped_lines.append(line) if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id - if is_group_texts: - for input_ids in raw_input_ids: - if len(current_input_ids) + len(input_ids) > max_length: - #pad the current_input_ids to max_length with tokenizer.pad_token_id - padded_length = max_length - len(current_input_ids) - current_input_ids.extend([tokenizer.pad_token_id] * padded_length) - grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long)) - attention_mask.append( - torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)) - current_input_ids = [] - else: - current_input_ids.extend(input_ids) - if len(current_input_ids) > 0: - padded_length = max_length - len(current_input_ids) - current_input_ids.extend([tokenizer.pad_token_id] * padded_length) - grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long)) - attention_mask.append( - torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)) - else: - #just append the raw_input_ids to max_length - for input_ids in raw_input_ids: - padded_length = max_length - len(input_ids) - input_ids.extend([tokenizer.pad_token_id] * padded_length) - attention_mask.append( - torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)) - grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long)) - self.input_ids = grouped_inpup_ids - self.labels = copy.deepcopy(self.input_ids) + + print(f"total lines: {len(grouped_lines)}") + print('samples from grouped_lines') + print(grouped_lines[:10]) + + from pyarrow import parquet as pq + from pandas import DataFrame + import pandas as pd + def mp_function(task_queue:Queue,index:int): + #get a task from task_queue, catch the exception if the queue is closed + parquet_dir = 'parquet' + if not os.path.exists(parquet_dir): + os.makedirs(parquet_dir,exist_ok=True) + index = 0 + df = DataFrame(columns=['input_ids','labels','attention_mask']) + if mask_prompts: + prompts_sep_ids = tokenizer.encode(prompts_sep,add_special_tokens=False)[1:] + while True: + try: + line = task_queue.get() + if line == 'Exit': + break + token = tokenizer(line, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt") + labels = token['input_ids'].tolist()[0] + labels = [-100 if label == tokenizer.pad_token_id else label for label in labels] + if mask_prompts: + def find_first_sub_list(full_list, sub_list): + index = -1 + len_of_full = len(full_list) + len_of_sub = len(sub_list) + for i in range(len_of_full): + if full_list[i:i+len_of_sub] == sub_list: + index = i + break + return index + prompts_sep_index = 
find_first_sub_list(token['input_ids'].tolist()[0],prompts_sep_ids) + if prompts_sep_index != -1: + #mask the ids before prompts_sep + labels = [-100 if i < prompts_sep_index else label for i,label in enumerate(labels)] + else: + print(f'wrong prompts_sep {prompts_sep} in {line}, {labels}') + + labels = [labels] + data = { + "input_ids": token['input_ids'].tolist(), + "labels": labels, + "attention_mask": token['attention_mask'].tolist() + } + new_df = DataFrame.from_dict(data) + df = pd.concat([df,new_df],ignore_index=True) + if len(df) > 100: + print(f"Write {len(df)} to parquet file {parquet_dir}/{os.getpid()}_{index}.parquet}}") + df.to_parquet(f'{parquet_dir}/{os.getpid()}_{index}.parquet') + index += 1 + df = DataFrame(columns=['input_ids','labels','attention_mask']) + #print the process id and the task + # print(f"Process {os.getpid()} has processed task") + except: + print(f"Process {os.getpid()} has no more tasks") + break + + if len(df) > 0: + print(f"Write {len(df)} to parquet file {parquet_dir}/{os.getpid()}_{index}.parquet}}") + df.to_parquet(f'{parquet_dir}/{os.getpid()}_{index}.parquet') + from tqdm import tqdm + progress_bar = tqdm(grouped_lines, desc="Tokenizing texts") + processes = [] + task_queues = [] + self.concurrency = concurrency + max_tasks_in_queue = 100 + for i in range(concurrency): + task_queue = Queue(max_tasks_in_queue) + p = Process(target=mp_function, args=(task_queue,i)) + p.start() + processes.append(p) + task_queues.append(task_queue) + index = 0 + for line in progress_bar: + task_queues[index % concurrency].put(line) + index += 1 + print('waiting for all tasks to be done') + for task_queue in task_queues: + task_queue.put("Exit") + for task_queue in task_queues: + while True: + if task_queue.empty(): + print('all tasks are done, closing the queue') + task_queue.close() + break + print(f'still have {task_queue.qsize()} tasks to be done,waiting for 1s') + time.sleep(1) + print('get all results from processes') + time.sleep(5) + print(f"waiting for response in main process {os.getpid()}") + #terminate all processes + for p in processes: + p.terminate() + p.join() + print('all tasks done') + self.input_ids = [] + self.labels = [] self.file_name = data_file - self.attention_mask = attention_mask + self.attention_mask = [] + + @staticmethod + def collate_fn(batch): + size = len(batch) + input_ids = [d['input_ids'] for d in batch] + labels = [d['labels'] for d in batch] + attention_mask = [d['attention_mask'] for d in batch] + + return dict(input_ids=torch.tensor(input_ids,dtype=torch.long), + attention_mask=torch.tensor(attention_mask), labels=torch.tensor(labels,dtype=torch.long)) def __len__(self): return len(self.input_ids) @@ -238,3 +458,70 @@ def __repr__(self): #generate the dataset description to be printed by print in python def __str__(self): return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})" + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--input_file", type=str) + parser.add_argument("--output_file", type=str) + parser.add_argument("--max_length", type=int, default=256) + parser.add_argument("--is_group_texts", action='store_true', default=False) + parser.add_argument("--tokenizer_name", type=str) + parser.add_argument("--need_trust_code", action='store_true',default=False) + parser.add_argument("--dataset_type", type=str, default="easy_sft") + parser.add_argument("--task_type", type=str, default="convert_2_binary") + parser.add_argument("--mask_prompts", 
action='store_true', default=False) + + args = parser.parse_args() + print(args) + if args.task_type == "convert_2_binary": + from transformers import AutoTokenizer + if args.need_trust_code: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,trust_remote_code=True) + else: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) + + if args.dataset_type == "easy_sft": + dataset = EasySFTDataset(args.input_file, tokenizer, max_length =args.max_length, is_group_texts= args.is_group_texts,mask_prompts=args.mask_prompts) + elif args.dataset_type == "reward": + dataset = EasyRewardDataset(args.input_file, tokenizer, max_length = args.max_length) + + #save the dataset + print(f'Finish loading and now saving...to {args.output_file}') + import datasets + dir = 'parquet' + datafiles = [os.path.join(dir, f) for f in os.listdir(dir) if f.endswith('.parquet')] + ds = datasets.load_dataset('parquet', data_files=datafiles) + ds.save_to_disk(args.output_file) + #delete all the temporary files + for file in datafiles: + os.remove(file) + elif args.task_type == "load_from_binary": + import datasets + ds_train = datasets.load_from_disk(args.output_file) + print(ds_train) + + from torch.utils.data import DataLoader + if args.dataset_type == "reward": + dataloader = DataLoader(ds_train, batch_size=1, shuffle=True,collate_fn=EasyRewardDataset.collate_fn) + print(dataloader) + for chosen_ids, c_mask, reject_ids, r_mask in dataloader: + print(chosen_ids.tolist()) + print(chosen_ids.shape) + print(c_mask.shape) + print(reject_ids.shape) + print(r_mask.shape) + break + elif args.dataset_type == "easy_sft": + dataloader = DataLoader(ds_train, batch_size=1, shuffle=True,collate_fn=EasySFTDataset.collate_fn) + print(dataloader) + for batch in dataloader: + input_ids = batch['input_ids'] + labels = batch['labels'] + attention_mask = batch['attention_mask'] + print(input_ids.tolist()) + print(input_ids.shape) + print(labels.shape) + print(attention_mask.shape) + print(labels.tolist()) + break \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/easy_models.py b/applications/Chat/examples/community/peft/easy_models.py index fe294868159d..46dddf25db3b 100644 --- a/applications/Chat/examples/community/peft/easy_models.py +++ b/applications/Chat/examples/community/peft/easy_models.py @@ -5,11 +5,109 @@ import torch.nn.functional as F from coati.models.generation import generate from coati.models.utils import log_probs_from_logits, masked_mean -from peft import PeftModel +from peft import LoraConfig, PeftModel, TaskType, get_peft_model from torch.nn.modules import Module -from transformers import BloomConfig, BloomForCausalLM +from transformers import BloomConfig, BloomForCausalLM,AutoModel +from coati.models.base.reward_model import RewardModel +import os +class ChatGLMRM(Module): + """ + ChatGLMRM Reward model. + + Args: + pretrained (str): Pretrained model name or path. + config (BloomConfig): Model config. + checkpoint (bool): Enable gradient checkpointing. + lora_rank (int): LoRA rank. + lora_train_bias (str): LoRA bias training mode. 
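+        lora_path (str): Optional path to a saved PEFT LoRA adapter (with value_head.bin) to load on top of the base model.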
+ """ + + def __init__(self, + pretrained: str = None, + lora_path :str = None, + lora_rank :int = 0) -> None: + super().__init__() + if pretrained is not None: + model = AutoModel.from_pretrained( + pretrained, + trust_remote_code=True, + ).half() # load model to cpu and half + if lora_path is not None and os.path.exists(lora_path+'/adapter_config.json') \ + and os.path.exists(lora_path+'/adapter_model.bin'): + print('load lora from ',lora_path) + model = PeftModel.from_pretrained(model, lora_path).half().cpu() + self.model = model + else: + #we'll use peft lora library to do the lora + lora_rank = lora_rank if lora_rank > 0 else 32 + #config lora with rank of lora_rank + lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=lora_rank, + lora_alpha=32, + lora_dropout=0.1) + model = get_peft_model(model, lora_config) + self.model = model + else: + raise ValueError("No pretrained model provided!") + value_head = nn.Linear(model.config.hidden_size, 1) + if lora_path is not None and os.path.exists(os.path.join(lora_path,'value_head.bin')): + print('load value_head from ',os.path.exists(os.path.join(lora_path,'value_head.bin'))) + value_head.load_state_dict(torch.load(os.path.join(lora_path,'value_head.bin'))) + else: + value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.hidden_size + 1)) + self.value_head = value_head + + + def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + outputs = self.model(sequences, attention_mask=None,return_dict=True, output_hidden_states=True) + last_hidden_states = outputs['hidden_states'][-1] + # print(last_hidden_states.shape) + values = self.value_head(last_hidden_states)[:-1, :] + # print(values.shape) + values = values.transpose(0,1)#change from (seq,B) to (B,seq) + value = values.mean(dim=1).squeeze(1) # ensure shape is (B) + # print(value.shape) + return value + + def get_base_model(self): + return self.model + + def save_pretrained(self,save_directory): + self.model.save_pretrained(save_directory) + torch.save(self.value_head.state_dict(),os.path.join(save_directory,'value_head.bin')) + + +class ChatGLMCritic(ChatGLMRM): + def __init__(self, pretrained: str = None, lora_path: str = None, lora_rank: int = 0,use_action_mask: bool = True,pad_token_id :int=3) -> None: + super().__init__(pretrained, lora_path, lora_rank) + self.use_action_mask = use_action_mask + self.pad_token_id = pad_token_id + + def forward(self, + sequences: torch.LongTensor, + action_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + outputs = self.model(sequences, attention_mask=None,return_dict=True, output_hidden_states=True) + last_hidden_states = outputs['hidden_states'][-1] + + values = self.value_head(last_hidden_states).squeeze(-1) + values = values.transpose(0,1)#change from (seq,B) to (B,seq) + + if action_mask is not None and self.use_action_mask: + num_actions = action_mask.size(1) + #create a prompt_mask with is the same size as the values + prompt_mask = attention_mask[:, :-num_actions] + values = values[:, :-num_actions] + value = masked_mean(values, prompt_mask, dim=1) + return value + + values = values[:, :-1] + value = values.mean(dim=1) + return value + class Actor(Module): """ Actor model base class. 
@@ -57,11 +155,36 @@ def forward(self, output = self.model(sequences, attention_mask=attention_mask) logits = output['logits'] log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:]) - return log_probs[:, -num_actions:] + return log_probs[:, -num_actions:]#shape is (B,num_actions) def get_base_model(self): return self.model + + def save_pretrained(self,save_directory): + self.model.save_pretrained(save_directory) +class ChatGlmActor(Actor): + def __init__(self, + pretrained: str = None, + lora_path: str = None) -> None: + if pretrained is not None: + model = AutoModel.from_pretrained( + pretrained, + trust_remote_code=True, + ).half().cpu() # load model to cpu and half precision + if lora_path is not None: + model = PeftModel.from_pretrained(model, lora_path).half().cpu() + super().__init__(model) + def forward(self, + sequences: torch.LongTensor, + num_actions: int, + attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + """ignor attention_mask + """ + return super().forward(sequences, num_actions,attention_mask=None) + + def print_trainable_parameters(self): + self.get_base_model().print_trainable_parameters() class BLOOMActor(Actor): """ diff --git a/applications/Chat/examples/community/peft/lora_inference.py b/applications/Chat/examples/community/peft/lora_inference.py new file mode 100755 index 000000000000..e61b0e38eb53 --- /dev/null +++ b/applications/Chat/examples/community/peft/lora_inference.py @@ -0,0 +1,45 @@ +from peft import PeftModel +from transformers import AutoTokenizer,AutoModelForCausalLM +import sys +import torch +if __name__ == '__main__': + model_path = sys.argv[1] + peft_id = sys.argv[2] + device = sys.argv[3] + #init the AutoModelForCausalLM from model_path + #init the AutoTokenizer from model_path + #init the PeftModel from model and peft_id + model = AutoModelForCausalLM.from_pretrained(model_path).to(device) + if not peft_id == 'None': + tokenizer = AutoTokenizer.from_pretrained(model_path) + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + model = PeftModel.from_pretrained(model,peft_id).to(device) + print(model) + generation_kwargs = { + "min_length": -1, + "top_k": 0.0, + "top_p": 1.0, + "do_sample": True, + "pad_token_id": tokenizer.eos_token_id, + "eos_token_id": -1, + "max_new_tokens":256, + } + + + questions = [] + while True: + question = input("Enter a question: ") + if question == "launch": + inputs = tokenizer(questions, return_tensors='pt',padding="longest",truncation=True) + with torch.no_grad(): + outputs = model.generate(input_ids=inputs["input_ids"].to(device),attention_mask=inputs["attention_mask"] , **generation_kwargs) + print(outputs) + responses = [output for output in outputs] + print(responses) + print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) + questions = [] + else: + input_prompts = "提问: " + question + " 回答: " + questions.append(input_prompts) + \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/save_vram_em.py b/applications/Chat/examples/community/peft/save_vram_em.py new file mode 100644 index 000000000000..71dc236a8a6a --- /dev/null +++ b/applications/Chat/examples/community/peft/save_vram_em.py @@ -0,0 +1,116 @@ +from typing import Optional, Union +import torch +import torch.nn as nn +from coati.models.utils import normalize,compute_reward +from coati.experience_maker.base import Experience +from easy_models import Actor +from transformers import AutoTokenizer +import torch.nn.functional as F + + +class 
SaveVramExperienceMaker: + + def __init__(self, + actor: Actor, + critic: nn.Module, + reward_model: nn.Module, + initial_model: Actor, + kl_coef: float = 0.1) -> None: + self.actor = actor + self.critic = critic + self.reward_model = reward_model + self.initial_model = initial_model + self.kl_coef = kl_coef + + """ + Naive experience maker. + """ + + @torch.no_grad() + def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience: + #first we move the initial_model,reward_model and critic to cpu,only actor in gpu + self.initial_model.to("cpu") + self.reward_model.to("cpu") + self.critic.to("cpu") + + self.actor.half().to(torch.cuda.current_device()) + self.actor.eval() + print(f"cuda memeor usage actor generate: {torch.cuda.memory_allocated(0)/1024/1024} MB") + sequences, attention_mask, action_mask = self.actor.generate(input_ids.to(torch.cuda.current_device()), + return_action_mask=True, + **generate_kwargs) + num_actions = action_mask.size(1) + print(f"cuda memeor usage actor log probs: {torch.cuda.memory_allocated(0)/1024/1024} MB") + + action_log_probs = self.actor(sequences, num_actions, attention_mask) + self.actor.to("cpu") + self.initial_model.half().to(torch.cuda.current_device()) + self.initial_model.eval() + print(f"cuda memeor usage initial_model log_probs: {torch.cuda.memory_allocated(0)/1024/1024} MB") + base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask) + self.initial_model.to("cpu") + + self.critic.to(torch.cuda.current_device()) + self.critic.eval() + print(f"cuda memeor usage critic : {torch.cuda.memory_allocated(0)/1024/1024} MB") + value = self.critic(sequences, action_mask, attention_mask) + self.critic.to("cpu") + + self.reward_model.to(torch.cuda.current_device()) + self.reward_model.eval() + print(f"cuda memeor reward critic : {torch.cuda.memory_allocated(0)/1024/1024} MB") + r = self.reward_model(sequences, attention_mask) + self.reward_model.to("cpu") + print(f"cuda memeor reward after reward : {torch.cuda.memory_allocated(0)/1024/1024} MB") + reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask) + + advantage = reward - value + # TODO(ver217): maybe normalize adv + if advantage.ndim == 1: + advantage = advantage.unsqueeze(-1) + + return Experience(sequences, action_log_probs, value, reward, advantage, attention_mask, action_mask) + + +if __name__ == '__main__': + #tmp test ! 
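+    # Standalone smoke test for the VRAM-saving experience maker: models are swapped
+    # between CPU and GPU one at a time. The model and LoRA paths below are
+    # machine-specific examples and should be replaced with your own ChatGLM-6B
+    # checkpoint and adapter directories.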
+ model_path = "/home/yueyulin/pretrained_models/chatglm-6b" + lora_path = "/home/yueyulin/models/sft_law_chatglm6b_ask_law_prompts" + from easy_models import ChatGlmActor,ChatGLMCritic,ChatGLMRM + from transformers import AutoTokenizer + actor = ChatGlmActor(model_path, lora_path=lora_path).half().cpu() + print(f'cuda vram usage after loading actor: {torch.cuda.memory_allocated(0)/1024/1024} MB') + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + print(tokenizer) + initial_model = ChatGlmActor(model_path, lora_path=lora_path).half().cpu() + print(f'cuda vram usage after loading initial_model: {torch.cuda.memory_allocated(0)/1024/1024} MB') + + reward_model_lora = "/home/yueyulin/models/chatglmrm" + reward_model = ChatGLMRM(model_path,reward_model_lora).half().cpu() + print(f'cuda vram usage after loading reward_model: {torch.cuda.memory_allocated(0)/1024/1024} MB') + critic = ChatGLMCritic(model_path,reward_model_lora).half().cpu() + print(f'cuda vram usage after loading critic: {torch.cuda.memory_allocated(0)/1024/1024} MB') + + input_prompts = ["提问:正在按揭还贷款的房子在不花钱的情况下如何改到未成年孩子的名下 回答:", + "提问:长期不运动会怎么样?回答:"] + input_ids = tokenizer.batch_encode_plus(input_prompts, return_tensors="pt", padding=True, truncation=True) + print(input_ids) + attention_mask = input_ids['attention_mask'] + input_ids = input_ids["input_ids"] + + generate_kwargs = {"max_length": 32, + "do_sample": True, + "temperature": 1.0, + "top_k":50, + "pad_token_id":tokenizer.pad_token_id, + "eos_token_id":tokenizer.eos_token_id} + + exp_maker = SaveVramExperienceMaker(actor,critic,reward_model, initial_model, kl_coef=0.1) + exp = exp_maker.make_experience(input_ids, **generate_kwargs) + print(exp) + + + + + + diff --git a/applications/Chat/examples/community/peft/toys_pretrain.txt b/applications/Chat/examples/community/peft/toys_pretrain.txt new file mode 100644 index 000000000000..932cd5a761cf --- /dev/null +++ b/applications/Chat/examples/community/peft/toys_pretrain.txt @@ -0,0 +1,2 @@ +提问:居民区不远处,现在按装了一台大型破石头机器!他们白天不工作.一到晚上就开始了!楼上晃得厉害,睡不好 回答: 你好,可以及时向政府主管部门投诉处理。 +提问:睡眠不好怎么办? 回答: 继续睡。 \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/toys_prompts.txt b/applications/Chat/examples/community/peft/toys_prompts.txt new file mode 100644 index 000000000000..dd2a8169c906 --- /dev/null +++ b/applications/Chat/examples/community/peft/toys_prompts.txt @@ -0,0 +1,2 @@ +提问:正在按揭还贷款的房子在不花钱的情况下如何改到未成年孩子的名下 回答: 父母正在按揭中的房子肯定是不可以换成未成年子女的名字。贷款尚未还清的话,房屋尚处于抵押阶段。父母正在按揭中的房子肯定是不可以换成未成年子女的名字。因此不能转到孩子名下,但是如果孩子有偿还能力并取得银行同意的可以转让。父母需要将贷款结清,方可将房子过户给子女。房子过户可以通过赠与或者买卖的方式。 +提问:长期不运动会怎么样? 
回答: 过的很开心。 \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/toys_rewards_test.jsonl b/applications/Chat/examples/community/peft/toys_rewards_test.jsonl new file mode 100644 index 000000000000..4736e5eb46a4 --- /dev/null +++ b/applications/Chat/examples/community/peft/toys_rewards_test.jsonl @@ -0,0 +1 @@ +{"prompt": "testquestiosfefen", "chosen": "testabababa chosen", "rejected": "test rejected"} \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/toys_rewards_train.jsonl b/applications/Chat/examples/community/peft/toys_rewards_train.jsonl new file mode 100644 index 000000000000..c0d96f6d0f81 --- /dev/null +++ b/applications/Chat/examples/community/peft/toys_rewards_train.jsonl @@ -0,0 +1 @@ +{"prompt": "blablabla question", "chosen": "abababa chosen", "rejected": "dfefefe rejected"} \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/toys_sft.txt b/applications/Chat/examples/community/peft/toys_sft.txt new file mode 100644 index 000000000000..d911712bdb0b --- /dev/null +++ b/applications/Chat/examples/community/peft/toys_sft.txt @@ -0,0 +1,2 @@ +Blalabal sfe 2017-08-10: I'm not sure if this is what you're looking for, but I've been using this for a while now and it's been working great. It's a little pricey, but it's worth it. +You might be shorter, might be less stronger, but you'll be able to do more reps and sets. \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py index 0e277021e917..c05c3be77c7a 100644 --- a/applications/Chat/examples/community/peft/train_peft_prompts.py +++ b/applications/Chat/examples/community/peft/train_peft_prompts.py @@ -8,16 +8,16 @@ from coati.models.gpt import GPTRM, GPTActor, GPTCritic from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM from coati.models.opt import OPTRM, OPTActor, OPTCritic -from coati.trainer import PPOTrainer +from trainer import PPOTrainer from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from easy_dataset import EasyPromptsDataset, EasySupervisedDataset -from easy_models import BLOOMActor +from easy_models import BLOOMActor,ChatGlmActor,ChatGLMRM,ChatGLMCritic from peft import PeftModel from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer +from transformers import AutoTokenizer, BloomTokenizerFast from colossalai.nn.optimizer import HybridAdam @@ -35,73 +35,76 @@ def main(args): else: raise ValueError(f'Unsupported strategy "{args.strategy}"') - if args.rm_path is not None: - state_dict = torch.load(args.rm_path, map_location='cpu') # configure model if args.model == 'bloom': # initial_model = BLOOMActor(pretrained=args.pretrain) print('Using peft lora to load Bloom model as inital_model') initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path) + #first offload the model to cpu and half precision + initial_model = initial_model.half().cpu() print('Using peft lora to load Bloom model as initial_model (Done)') + elif args.model == 'chatglm': + initial_model = ChatGlmActor(pretrained=args.pretrain, lora_path=args.sft_lora_path) + initial_model = initial_model.half().cpu() else: raise ValueError(f'Unsupported actor model 
"{args.model}"') - - if args.rm_model == None: - rm_model_name = args.model - else: - rm_model_name = args.rm_model - - if rm_model_name == 'gpt2': - reward_model = GPTRM(pretrained=args.rm_pretrain) - elif rm_model_name == 'bloom': - print("load bloom reward model ", args.rm_pretrain) - reward_model = BLOOMRM(pretrained=args.rm_pretrain) - elif rm_model_name == 'opt': - reward_model = OPTRM(pretrained=args.rm_pretrain) - elif rm_model_name == 'llama': - reward_model = LlamaRM(pretrained=args.rm_pretrain) + + #print current cuda memory usage + print(f'Step 0. load initial model. Current cuda memory usage: {torch.cuda.memory_allocated()/1024/1024} MB') + + #we must make rm_model the same as model because in PPO stage, the Actor's output ids must be the from the same tokenizer that reward model has + #TODO:: I'll change the bloom model to the same implementation as ChatGLMM later + rm_model_name = args.model + if rm_model_name == 'bloom': + print("load bloom reward model ", args.pretrain) + reward_model = BLOOMRM(pretrained=args.pretrain) + #first offload the model to cpu and half precision + reward_model = reward_model.half().cpu() + print("load bloom reward model (Done) ") + elif rm_model_name == 'chatglm': + print("load chatglm reward model ", args.pretrain," with lora path ",args.rm_lora_path) + reward_model = ChatGLMRM(pretrained=args.pretrain,lora_path=args.rm_lora_path) + #first offload the model to cpu and half precision + reward_model = reward_model.half().cpu() else: raise ValueError(f'Unsupported reward model "{rm_model_name}"') + #print current cuda memory usage + print(f'Step 1. load reward model. Current cuda memory usage: {torch.cuda.memory_allocated()/1024/1024} MB') - if args.rm_path is not None: - print('Loading reward model from', args.rm_path) - reward_model.load_state_dict(state_dict) - - if args.strategy != 'colossalai_gemini': - initial_model.to(torch.float16).to(torch.cuda.current_device()) - reward_model.to(torch.float16).to(torch.cuda.current_device()) with strategy.model_init_context(): if args.model == 'bloom': # actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank) print('Using peft lora to load Bloom model as Actor') actor = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path) + #first offload the model to cpu and half precision + actor = actor.half().cpu() print('Using peft lora to load Bloom model as Actor (Done)') + elif args.model == 'chatglm': + print(f'load glm actor model from {args.pretrain} with loar path {args.sft_lora_path}') + actor = ChatGlmActor(pretrained=args.pretrain, lora_path=args.sft_lora_path) + actor = initial_model.half().cpu() else: raise ValueError(f'Unsupported actor model "{args.model}"') - - if rm_model_name == 'gpt2': - critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True) - elif rm_model_name == 'bloom': + #print current cuda memory usage + print(f'Step 2. actor model is loaded. 
Current cuda memory usage: {torch.cuda.memory_allocated()/1024/1024} MB') + if rm_model_name == 'bloom': print("load bloom critic ", args.rm_pretrain, " lora_rank ", args.lora_rank, " use_action_mask ", True) - critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True) + critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=False) + #first offload the model to cpu and half precision + critic = critic.half().cpu() print("load bloom critic (Done) ") - elif rm_model_name == 'opt': - critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True) - elif rm_model_name == 'llama': - critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True) + elif rm_model_name == 'chatglm': + print("load chatglm critic ", args.pretrain," with lora path ",args.rm_lora_path) + critic = ChatGLMCritic(pretrained=args.pretrain,lora_path=args.rm_lora_path) + #first offload the model to cpu and half precision + critic = critic.half().cpu() else: raise ValueError(f'Unsupported reward model "{rm_model_name}"') + #print current cuda memory usage + print(f'Step 3. critic model is loaded. Current cuda memory usage: {torch.cuda.memory_allocated()/1024/1024} MB') - if args.rm_path is not None: - print('Loading reward model from', args.rm_path) - critic.load_state_dict(state_dict) - del state_dict - - if args.strategy != 'colossalai_gemini': - critic.to(torch.float16).to(torch.cuda.current_device()) - actor.to(torch.float16).to(torch.cuda.current_device()) # configure optimizer if args.strategy.startswith('colossalai'): @@ -112,22 +115,15 @@ def main(args): critic_optim = Adam(critic.parameters(), lr=1e-7) # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained(args.rm_pretrain) - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.rm_pretrain) - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained(args.rm_pretrain) - elif args.model == 'llama': - tokenizer = LlamaTokenizer.from_pretrained(args.pretrain) - tokenizer.eos_token = '<\s>' + if args.model == 'bloom': + tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) + elif args.model == 'chatglm': + tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=True) else: raise ValueError(f'Unsupported model "{args.model}"') + + - if args.model == 'llama': - tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, actor) - else: - tokenizer.pad_token = tokenizer.eos_token data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) @@ -175,12 +171,12 @@ def tokenize_fn(texts): train_batch_size=args.train_batch_size, experience_batch_size=args.experience_batch_size, tokenizer=tokenize_fn, - max_length=512, + max_length=256, do_sample=True, temperature=1.0, top_k=50, pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, + eos_token_id=tokenizer.eos_token_id ) trainer.fit(prompt_dataloader=prompt_dataloader, @@ -206,21 +202,19 @@ def tokenize_fn(texts): choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='naive', help='strategy to use') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--model', default='gpt2', choices=['bloom', 'chatglm']) parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--sft_lora_path', type=str, default=None) - parser.add_argument('--rm_model', default=None, 
choices=['gpt2', 'bloom', 'opt', 'llama']) - parser.add_argument('--rm_path', type=str, default=None) - parser.add_argument('--rm_pretrain', type=str, default=None) + parser.add_argument('--rm_lora_path', type=str, default=None) parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts') parser.add_argument('--need_optim_ckpt', type=bool, default=False) parser.add_argument('--num_episodes', type=int, default=10) parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) + parser.add_argument('--update_timesteps', type=int, default=2) parser.add_argument('--max_epochs', type=int, default=5) parser.add_argument('--train_batch_size', type=int, default=2) parser.add_argument('--ptx_batch_size', type=int, default=1) - parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--experience_batch_size', type=int, default=2) parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") parser.add_argument('--kl_coef', type=float, default=0.1) parser.add_argument('--ptx_coef', type=float, default=0.9) diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py index fcc65e24478a..0a3971ace06e 100644 --- a/applications/Chat/examples/community/peft/train_peft_sft.py +++ b/applications/Chat/examples/community/peft/train_peft_sft.py @@ -14,18 +14,20 @@ from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from datasets import load_dataset -from easy_dataset import EasyDataset +from easy_dataset import EasySFTDataset from peft import LoraConfig, PeftModel, TaskType, get_peft_model from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler -from transformers import AutoModelForCausalLM, AutoTokenizer, BloomTokenizerFast +from transformers import AutoModelForCausalLM, AutoTokenizer, BloomTokenizerFast,AutoModel from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer from colossalai.logging import get_dist_logger from colossalai.nn.optimizer import HybridAdam from colossalai.tensor import ColoParameter +import datasets +import os def train(args): @@ -37,64 +39,62 @@ def train(args): elif args.strategy == 'colossalai_gemini': strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') else: raise ValueError(f'Unsupported strategy "{args.strategy}"') # configure model with strategy.model_init_context(): - print('Warning: currently only bloom is tested, gpt2,llama and opt are not tested') - model = AutoModelForCausalLM.from_pretrained(args.pretrain).to(torch.cuda.current_device()) - #if the args.save_path exists and args.save_path+'/adapter_config.json' exists, we'll load the adapter_config.json - if os.path.exists(args.save_path) and os.path.exists(args.save_path+'/adapter_config.json') \ - and os.path.exists(args.save_path+'/adapter_model.bin'): - print("loading from saved peft model ", args.save_path) - model = PeftModel.from_pretrained(model, args.save_path) - else: - #we'll use peft lora library to do the lora - lora_rank = args.lora_rank if args.lora_rank > 0 else 32 - 
#config lora with rank of lora_rank - lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, - inference_mode=False, - r=lora_rank, - lora_alpha=32, - lora_dropout=0.1) - model = get_peft_model(model, lora_config) + print('Warning: currently only bloom/chatglm is tested, gpt2,llama and opt are not tested') + if args.model == 'bloom': + model = AutoModelForCausalLM.from_pretrained(args.pretrain).to(torch.cuda.current_device()) + #if the args.save_path exists and args.save_path+'/adapter_config.json' exists, we'll load the adapter_config.json + if args.model_path is not None and os.path.exists(args.model_path) and os.path.exists(args.model_path+'/adapter_config.json') \ + and os.path.exists(args.model_path+'/adapter_model.bin'): + print("loading from saved peft model ", args.model_path) + model = PeftModel.from_pretrained(model, args.model_path) + else: + #we'll use peft lora library to do the lora + lora_rank = args.lora_rank if args.lora_rank > 0 else 32 + #config lora with rank of lora_rank + lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=lora_rank, + lora_alpha=32, + lora_dropout=0.1) + model = get_peft_model(model, lora_config) + elif args.model == 'chatglm': + model = AutoModel.from_pretrained( + args.pretrain, + trust_remote_code=True, + ).to(torch.cuda.current_device()) + print("check if load from peft model_path ", args.model_path) + if args.model_path is not None and os.path.exists(args.model_path) and os.path.exists(args.model_path+'/adapter_config.json') \ + and os.path.exists(args.model_path+'/adapter_model.bin'): + print("loading from saved peft model ", args.model_path) + model = PeftModel.from_pretrained(model, args.model_path) + else: + print("peft from scratch") + #we'll use peft lora library to do the lora + lora_rank = args.lora_rank if args.lora_rank > 0 else 32 + #config lora with rank of lora_rank + lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=lora_rank, + lora_alpha=32, + lora_dropout=0.1) + model = get_peft_model(model, lora_config) model.print_trainable_parameters() # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': + if args.model == 'bloom': tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - elif args.model == 'llama': - tokenizer = AutoTokenizer.from_pretrained( - args.pretrain, - padding_side="right", - use_fast=False, - ) - tokenizer.eos_token = '<\s>' + elif args.model == 'chatglm': + tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=True) else: raise ValueError(f'Unsupported model "{args.model}"') tokenizer.pad_token = tokenizer.eos_token - if args.model == 'llama': - tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, model) - - if args.strategy == 'colossalai_gemini': - # this is a hack to deal with the resized embedding - # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity - for name, param in model.named_parameters(): - if not isinstance(param, ColoParameter): - sub_module_name = '.'.join(name.split('.')[:-1]) - weight_name = name.split('.')[-1] - sub_module = model.get_submodule(sub_module_name) - setattr(sub_module, weight_name, ColoParameter(param)) - else: - tokenizer.pad_token = tokenizer.eos_token # configure optimizer if 
args.strategy.startswith('colossalai'): @@ -106,13 +106,12 @@ def train(args): logger.set_level('WARNING') # configure dataset - law_dataset = EasyDataset(args.dataset, tokenizer=tokenizer, is_group_texts=not args.is_short_text) - train_dataset = law_dataset + train_dataset = datasets.load_from_disk(args.dataset) print(train_dataset) eval_dataset = None if args.eval_dataset is not None: - eval_dataset = EasyDataset(args.eval_dataset, tokenizer=tokenizer, is_group_texts=not args.is_short_text) - data_collator = default_collate + eval_dataset = datasets.load_from_disk(args.dataset) + data_collator = EasySFTDataset.collate_fn if dist.is_initialized() and dist.get_world_size() > 1: train_sampler = DistributedSampler(train_dataset, shuffle=True, @@ -172,8 +171,9 @@ def train(args): parser.add_argument('--strategy', choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='naive') - parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom') + parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama','chatglm'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--model_path', type=str, default=None) parser.add_argument('--dataset', type=str, default=None) parser.add_argument('--eval_dataset', type=str, default=None) parser.add_argument('--save_path', type=str, default='output') diff --git a/applications/Chat/examples/community/peft/train_peft_sft.sh b/applications/Chat/examples/community/peft/train_peft_sft.sh new file mode 100644 index 000000000000..f6fd3183d74e --- /dev/null +++ b/applications/Chat/examples/community/peft/train_peft_sft.sh @@ -0,0 +1,13 @@ +TRAIN_SET='toys_sft/train' +SAVE_PATH='toys_sft_lora' +PRETRAINED_MODEL='/home/yueyulin/pretrained_models/chatglm-6b' +#change to THUDM/chatglm-6b if you don't have the pretrained model +torchrun --standalone --nproc_per_node=1 \ + train_peft_sft.py \ + --dataset $TRAIN_SET \ + --model chatglm \ + --pretrain $PRETRAINED_MODEL \ + --save_path $SAVE_PATH \ + --strategy colossalai_zero2 \ + --batch_size 2 \ + --max_epochs 1 \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/train_peft_sft_prompts.sh b/applications/Chat/examples/community/peft/train_peft_sft_prompts.sh new file mode 100644 index 000000000000..d5b9400386ad --- /dev/null +++ b/applications/Chat/examples/community/peft/train_peft_sft_prompts.sh @@ -0,0 +1,11 @@ + +torchrun --standalone --nproc_per_node=1 \ + train_peft_sft.py \ + --dataset /home/yueyulin/ask_law_data_dir/ask_law_sft_train/train/ \ + --model chatglm \ + --pretrain /home/yueyulin/pretrained_models/chatglm-6b \ + --model_path /home/yueyulin/models/sft_law_chatglm6b_ask_law_prompts \ + --strategy colossalai_zero2 \ + --batch_size 2 \ + --max_epochs 1 \ + --save_path /home/yueyulin/models/sft_law_chatglm6b_ask_law_prompts \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/train_prompts.sh b/applications/Chat/examples/community/peft/train_prompts.sh new file mode 100755 index 000000000000..908821d629b0 --- /dev/null +++ b/applications/Chat/examples/community/peft/train_prompts.sh @@ -0,0 +1,21 @@ +PROMPT_PATH=toys_prompts.txt +PRETRAIN_PATH=toys_pretrain.txt +PRETRAINED_MODEL='/home/yueyulin/pretrained_models/chatglm-6b' +SFT_LORA='toys_sft_lora' +RM_LORA='chatglmrm' +SAVE_PATH='lora_ppo' +#change to THUDM/chatglm-6b if you don't have the pretrained model +torchrun --standalone --nproc_per_nod=1 \ + train_peft_prompts.py \ + --prompt_path 
$PROMPT_PATH \ + --pretrain_dataset $PRETRAIN_PATH \ + --model chatglm \ + --pretrain $PRETRAINED_MODEL \ + --sft_lora_path $SFT_LORA \ + --rm_lora_path $RM_LORA \ + --save_path $SAVE_PATH \ + --strategy colossalai_zero2 \ + --num_episodes 1 \ + --max_timesteps 4 \ + --update_timesteps 2 \ + --train_batch_size 1 \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/train_reward_model.py b/applications/Chat/examples/community/peft/train_reward_model.py new file mode 100644 index 000000000000..bf0cb250fcd5 --- /dev/null +++ b/applications/Chat/examples/community/peft/train_reward_model.py @@ -0,0 +1,197 @@ +import argparse +from random import randint + +import loralib as lora +import torch +from coati.dataset import HhRlhfDataset, RmStaticDataset +from coati.models import LogExpLoss, LogSigLoss +from coati.models.base import RewardModel +from coati.models.bloom import BLOOMRM +from coati.models.deberta import DebertaRM +from coati.models.gpt import GPTRM +from coati.models.llama import LlamaRM +from coati.models.opt import OPTRM +from coati.models.roberta import RoBERTaRM +from trainer import RewardModelTrainer +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.utils import prepare_llama_tokenizer_and_embedding +from datasets import load_dataset +from torch.optim import Adam +from transformers import AutoTokenizer, BloomTokenizerFast, DebertaV2Tokenizer, LlamaTokenizer, RobertaTokenizer +from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer + +from colossalai.nn.optimizer import HybridAdam +from easy_models import ChatGLMRM +from easy_dataset import EasyRewardDataset +import os +import datasets + +def train(args): + # configure strategy + if args.strategy == 'naive': + strategy = NaiveStrategy() + elif args.strategy == 'ddp': + strategy = DDPStrategy() + elif args.strategy == 'colossalai_gemini': + strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') + elif args.strategy == 'colossalai_zero2': + strategy = ColossalAIStrategy(stage=2, placement_policy='cuda',precision='fp16') + else: + raise ValueError(f'Unsupported strategy "{args.strategy}"') + + # configure model + with strategy.model_init_context(): + if args.model == 'bloom': + model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + elif args.model == 'opt': + model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + elif args.model == 'gpt2': + model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + elif args.model == 'deberta': + model = DebertaRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + elif args.model == 'llama': + model = LlamaRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + elif args.model == 'roberta': + model = RoBERTaRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device()) + elif args.model == 'chatglm': + model = ChatGLMRM(pretrained=args.pretrain, lora_rank=args.lora_rank,lora_path=args.model_path).to(torch.cuda.current_device()) + else: + raise ValueError(f'Unsupported model "{args.model}"') + + # if args.model_path is not None and os.path.exists(args.model_path): + # print(model.state_dict().keys()) + # print(f'loading model from {args.model_path}') + # state_dict = torch.load(args.model_path,map_location=torch.device('cpu')) + # print("-----------") + # print(state_dict.keys()) + # 
model.to("cpu") + # model.load_state_dict(state_dict, strict=True) + # del state_dict + # model.to(torch.cuda.current_device()) + print(model) + def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + print_trainable_parameters(model) + print(f'current cuda memory is {torch.cuda.memory_allocated()/1024/1024} MB') + # configure tokenizer + if args.model == 'gpt2': + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + elif args.model == 'bloom': + tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m') + elif args.model == 'opt': + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") + elif args.model == 'deberta': + tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large') + elif args.model == 'llama': + tokenizer = LlamaTokenizer.from_pretrained(args.pretrain) + elif args.model == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + elif args.model == 'chatglm': + tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=True) + else: + raise ValueError(f'Unsupported model "{args.model}"') + max_len = args.max_len + + if args.model == 'llama': + tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, model) + else: + tokenizer.pad_token = tokenizer.eos_token + + # configure optimizer + if args.strategy.startswith('colossalai'): + optim = HybridAdam(model.parameters(), lr=5e-6) + else: + optim = Adam(model.parameters(), lr=5e-6) + + # configure loss function + if args.loss_fn == 'log_sig': + loss_fn = LogSigLoss() + elif args.loss_fn == 'log_exp': + loss_fn = LogExpLoss() + else: + raise ValueError(f'Unsupported loss function "{args.loss_fn}"') + + # configure dataset + if args.train_file is not None: + #if the train_file is a file , we use the EasyRewardDataset + if os.path.isfile(args.train_file): + train_dataset = EasyRewardDataset(args.train_file, tokenizer,max_length= max_len) + #if the train_file is a directory, we use datasets.load_from_disk + elif os.path.isdir(args.train_file): + train_dataset = datasets.load_from_disk(args.train_file) + print(f'train dataset {train_dataset}') + if args.valid_file is not None: + if os.path.isfile(args.valid_file): + valid_dataset = EasyRewardDataset(args.valid_file, tokenizer,max_length= max_len) + elif os.path.isdir(args.valid_file): + valid_dataset = datasets.load_from_disk(args.valid_file) + print(f'valid dataset {valid_dataset}') + else: + valid_dataset = None + if args.eval_file is not None: + if os.path.isfile(args.eval_file): + eval_dataset = EasyRewardDataset(args.eval_file, tokenizer,max_length= max_len) + elif os.path.isdir(args.eval_file): + eval_dataset = datasets.load_from_disk(args.eval_file) + print(f'eval dataset {eval_dataset}') + else: + eval_dataset = None + + print(f'current cuda memory is {torch.cuda.memory_allocated()/1024/1024} MB') + trainer = RewardModelTrainer(model=model, + strategy=strategy, + optim=optim, + loss_fn=loss_fn, + train_dataset=train_dataset, + valid_dataset=valid_dataset, + 
eval_dataset=eval_dataset, + batch_size=args.batch_size, + max_epochs=args.max_epochs) + + trainer.fit() + # save model checkpoint after fitting on only rank0 + trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer) + # save optimizer checkpoint on all ranks + if args.need_optim_ckpt: + strategy.save_optimizer(trainer.optimizer, + 'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()), + only_rank0=False) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--strategy', + choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], + default='naive') + parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'deberta', 'llama', 'roberta','chatglm'], default='chatglm') + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--model_path', type=str, default=None) + parser.add_argument('--need_optim_ckpt', type=bool, default=False) + parser.add_argument('--train_file',type=str, default=None) + parser.add_argument('--valid_file',type=str, default=None) + parser.add_argument('--eval_file',type=str, default=None) + parser.add_argument('--subset', type=str, default=None) + parser.add_argument('--save_path', type=str, default='rm_ckpt') + parser.add_argument('--max_epochs', type=int, default=3) + parser.add_argument('--batch_size', type=int, default=4) + parser.add_argument('--max_len', type=int, default=256) + parser.add_argument('--lora_rank', type=int, default=32, help="low-rank adaptation matrices rank") + parser.add_argument('--loss_fn', type=str, default='log_sig', choices=['log_sig', 'log_exp']) + args = parser.parse_args() + train(args) diff --git a/applications/Chat/examples/community/peft/train_reward_model.sh b/applications/Chat/examples/community/peft/train_reward_model.sh new file mode 100755 index 000000000000..da24012c2879 --- /dev/null +++ b/applications/Chat/examples/community/peft/train_reward_model.sh @@ -0,0 +1,18 @@ +TRAIN_SET='toys_rewards_train/train' +EVAL_SET='toys_rewards_test/train' +SAVE_PATH='chatglmrm' +PRETRAINED_MODEL='/home/yueyulin/pretrained_models/chatglm-6b' +#change to THUDM/chatglm-6b if you don't have the pretrained model +torchrun --standalone --nproc_per_node=1 \ + train_reward_model.py --pretrain $PRETRAINED_MODEL \ + --model 'chatglm' \ + --strategy colossalai_zero2 \ + --loss_fn 'log_exp'\ + --save_path $SAVE_PATH \ + --train_file $TRAIN_SET \ + --eval_file $EVAL_SET \ + --valid_file $EVAL_SET \ + --batch_size 1 \ + --lora_rank 32 \ + --max_len 256 \ + --max_epochs 1 \ No newline at end of file diff --git a/applications/Chat/examples/community/peft/trainer.py b/applications/Chat/examples/community/peft/trainer.py index e69de29bb2d1..42318657aad7 100644 --- a/applications/Chat/examples/community/peft/trainer.py +++ b/applications/Chat/examples/community/peft/trainer.py @@ -0,0 +1,464 @@ +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +import torch.nn as nn +from coati.experience_maker import Experience, ExperienceMaker +from coati.replay_buffer import ReplayBuffer +from coati.experience_maker import Experience +from save_vram_em import SaveVramExperienceMaker +from coati.models.base import Actor, Critic +from coati.models.generation_utils import update_model_kwargs_fn +from coati.models.loss import PolicyLoss, ValueLoss +from coati.replay_buffer import NaiveReplayBuffer +from torch.optim import Optimizer, lr_scheduler +from transformers.tokenization_utils_base import 
PreTrainedTokenizerBase +from torch import Tensor +from torch.utils.data import DataLoader, Dataset, DistributedSampler +from tqdm import tqdm + +from coati.trainer.callbacks.base import Callback +from coati.trainer.strategies.base import Strategy +from coati.trainer.utils import is_rank_0 +from coati.models.utils import compute_reward, normalize +from transformers import AutoTokenizer +import wandb +import time +import torch.distributed as dist +from easy_dataset import EasyRewardDataset +import pandas as pd +import os +class RewardModelTrainer(ABC): + """ + Trainer to use while training reward model. + + Args: + model (torch.nn.Module): the model to train + strategy (Strategy): the strategy to use for training + optim(Optimizer): the optimizer to use for training + loss_fn (callable): the loss function to use for training + train_dataset (Dataset): the dataset to use for training + valid_dataset (Dataset): the dataset to use for validation + eval_dataset (Dataset): the dataset to use for evaluation + batch_size (int, defaults to 1): the batch size while training + max_epochs (int, defaults to 2): the number of epochs to train + """ + + def __init__( + self, + model, + strategy: Strategy, + optim: Optimizer, + loss_fn, + train_dataset: Dataset, + valid_dataset: Dataset, + eval_dataset: Dataset, + batch_size: int = 1, + max_epochs: int = 1, + save_every_step: int = 1000, + ) -> None: + super().__init__() + self.strategy = strategy + self.epochs = max_epochs + train_sampler = None + + if dist.is_initialized() and dist.get_world_size() > 1: + train_sampler = DistributedSampler(train_dataset, shuffle=True, seed=42, drop_last=True) + self.train_dataloader = DataLoader(train_dataset, + shuffle=(train_sampler is None), + sampler=train_sampler, + batch_size=batch_size,collate_fn=EasyRewardDataset.collate_fn) + self.valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True,collate_fn=EasyRewardDataset.collate_fn) + self.eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=True,collate_fn=EasyRewardDataset.collate_fn) + + self.model = strategy.setup_model(model) + self.loss_fn = loss_fn + self.optimizer = strategy.setup_optimizer(optim, self.model) + self.scheduler = lr_scheduler.CosineAnnealingLR(self.optimizer, self.train_dataloader.__len__() // 100) + self.save_every_step = save_every_step + + def eval_acc(self, dataloader): + dist = 0 + on = 0 + cnt = 0 + self.model.eval() + with torch.no_grad(): + for chosen_ids, c_mask, reject_ids, r_mask in dataloader: + chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device()) + c_mask = c_mask.squeeze(1).to(torch.cuda.current_device()) + reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device()) + r_mask = r_mask.squeeze(1).to(torch.cuda.current_device()) + chosen_reward = self.model(chosen_ids, attention_mask=c_mask) + reject_reward = self.model(reject_ids, attention_mask=r_mask) + for i in range(len(chosen_reward)): + cnt += 1 + if chosen_reward[i] > reject_reward[i]: + on += 1 + dist += (chosen_reward - reject_reward).mean().item() + dist_mean = dist / len(dataloader) + acc = on / cnt + self.model.train() + return dist_mean, acc + + def fit(self): + #init a string with YYYY_MM_DD_HH_MM_SS + time_str = time.strftime("%Y_%m_%d_%H_%M_%S") + epoch_bar = tqdm(range(self.epochs), desc='Train epoch', disable=not is_rank_0()) + for epoch in range(self.epochs): + step_bar = tqdm(range(self.train_dataloader.__len__()), + desc='Train step of epoch %d' % epoch, + disable=not is_rank_0()) + # train + 
self.model.train() + cnt = 0 + acc = 0 + dist = 0 + for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader: + chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device()) + c_mask = c_mask.squeeze(1).to(torch.cuda.current_device()) + reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device()) + r_mask = r_mask.squeeze(1).to(torch.cuda.current_device()) + # print(f'0.cuda memory usage {torch.cuda.memory_allocated()/1024/1024} MB') + chosen_reward = self.model(chosen_ids, attention_mask=c_mask) + # print(f'1.cuda memory usage {torch.cuda.memory_allocated()/1024/1024} MB') + reject_reward = self.model(reject_ids, attention_mask=r_mask) + # print(f'1.cuda memory usage {torch.cuda.memory_allocated()/1024/1024} MB') + loss = self.loss_fn(chosen_reward, reject_reward) + self.strategy.backward(loss, self.model, self.optimizer) + self.strategy.optimizer_step(self.optimizer) + self.optimizer.zero_grad() + cnt += 1 + if cnt == 100: + self.scheduler.step() + dist, acc = self.eval_acc(self.valid_dataloader) + cnt = 0 + if is_rank_0(): + log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]], + columns=['step', 'loss', 'dist', 'acc']) + log_file = f'log_{time_str}.csv' + if not os.path.exists(log_file): + mode = 'w' + else: + mode = 'a' + log.to_csv(log_file, mode=mode, header=False, index=False) + step_bar.update() + step_bar.set_postfix({'dist': dist, 'acc': acc}) + + # eval + dist, acc = self.eval_acc(self.eval_dataloader) + if is_rank_0(): + log = pd.DataFrame([[step_bar.n, loss.item(), dist, acc]], columns=['step', 'loss', 'dist', 'acc']) + log_file = 'log.csv' + if not os.path.exists(log_file): + mode = 'w' + else: + mode = 'a' + log.to_csv(log_file, mode=mode, header=False, index=False) + epoch_bar.update() + step_bar.set_postfix({'dist': dist, 'acc': acc}) + step_bar.close() + + def save_model(self, + path: str, + only_rank0: bool = False, + tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + self.strategy.save_model(model=self.model, path=path, only_rank0=only_rank0, tokenizer=tokenizer) + + +class Trainer(ABC): + """ + Base class for rlhf trainers. 
+ + Args: + strategy (Strategy):the strategy to use for training + experience_maker (ExperienceMaker): the experience maker to use for produce experience to fullfill replay buffer + replay_buffer (ReplayBuffer): the replay buffer to use for training + experience_batch_size (int, defaults to 8): the batch size to use for experience generation + max_epochs (int, defaults to 1): the number of epochs of training process + tokenizer (Callable, optional): the tokenizer to use for tokenizing the input + sample_replay_buffer (bool, defaults to False): whether to sample from replay buffer + data_loader_pin_memory (bool, defaults to True): whether to pin memory for data loader + callbacks (List[Callback], defaults to []): the callbacks to call during training process + generate_kwargs (dict, optional): the kwargs to use while model generating + """ + + def __init__(self, + strategy: Strategy, + experience_maker: ExperienceMaker, + replay_buffer: ReplayBuffer, + experience_batch_size: int = 8, + max_epochs: int = 1, + tokenizer: Optional[Callable[[Any], dict]] = None, + sample_replay_buffer: bool = False, + dataloader_pin_memory: bool = True, + callbacks: List[Callback] = [], + **generate_kwargs) -> None: + super().__init__() + self.strategy = strategy + self.experience_maker = experience_maker + self.replay_buffer = replay_buffer + self.experience_batch_size = experience_batch_size + self.max_epochs = max_epochs + self.tokenizer = tokenizer + self.generate_kwargs = generate_kwargs + self.sample_replay_buffer = sample_replay_buffer + self.dataloader_pin_memory = dataloader_pin_memory + self.callbacks = callbacks + + @abstractmethod + def training_step(self, experience: Experience) -> Dict[str, Any]: + pass + + def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience: + if isinstance(inputs, Tensor): + return self.experience_maker.make_experience(inputs, **self.generate_kwargs) + elif isinstance(inputs, dict): + return self.experience_maker.make_experience(**inputs, **self.generate_kwargs) + else: + raise ValueError(f'Unsupported input type "{type(inputs)}"') + + def _sample_prompts(self, prompts) -> list: + indices = list(range(len(prompts))) + sampled_indices = self.strategy.experience_sampler.choice(indices, self.experience_batch_size, replace=False) + return [prompts[i] for i in sampled_indices] + + def _learn(self): + # replay buffer may be empty at first, we should rebuild at each training + if not self.sample_replay_buffer: + dataloader = self.strategy.setup_dataloader(self.replay_buffer, self.dataloader_pin_memory) + device = torch.cuda.current_device() + if self.sample_replay_buffer: + pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0()) + for _ in pbar: + experience = self.replay_buffer.sample() + metrics = self.training_step(experience) + pbar.set_postfix(metrics) + else: + for epoch in range(self.max_epochs): + self._on_learn_epoch_start(epoch) + if isinstance(dataloader.sampler, DistributedSampler): + dataloader.sampler.set_epoch(epoch) + pbar = tqdm(dataloader, desc=f'Train epoch [{epoch+1}/{self.max_epochs}]', disable=not is_rank_0()) + for experience in pbar: + self._on_learn_batch_start() + experience.to_device(device) + metrics = self.training_step(experience) + self._on_learn_batch_end(metrics, experience) + pbar.set_postfix(metrics) + self._on_learn_epoch_end(epoch) + + def fit(self, + prompt_dataloader, + pretrain_dataloader, + num_episodes: int = 50000, + max_timesteps: int = 500, + update_timesteps: int = 5000) -> None: + 
time_str = time.strftime("%Y_%m_%d_%H_%M_%S") + wandb.init(project="CHAT-PPO", name=time_str) + wandb.watch(self.actor) + timeppo = 0 + self.pretrain_dataloader = pretrain_dataloader + self.prompt_dataloader = prompt_dataloader + self._on_fit_start() + for episode in range(num_episodes): + self._on_episode_start(episode) + for timestep in tqdm(range(max_timesteps), + desc=f'Episode [{episode+1}/{num_episodes}]', + disable=not is_rank_0()): + timeppo += 1 + prompts = next(iter(self.prompt_dataloader)) + if isinstance(prompts, dict): + prompts = prompts['input_ids'].to(torch.cuda.current_device()) + self._on_make_experience_start() + torch.cuda.empty_cache() + # print(f'experience_maker cuda memeor usage now: {torch.cuda.memory_allocated(0)/1024/1024} MB') + # self.experience_maker.initial_model.to(torch.cuda.current_device()) + # self.experience_maker.reward_model.to(torch.cuda.current_device()) + # print(f'experience_maker cuda memeor usage then: {torch.cuda.memory_allocated(0)/1024/1024} MB') + experience = self._make_experience(prompts) + self._on_make_experience_end(experience) + self.replay_buffer.append(experience) + if timeppo % update_timesteps == 0: + self.experience_maker.initial_model.to('cpu') + self.experience_maker.reward_model.to('cpu') + torch.cuda.empty_cache() + print(f'ppo cuda memeor usage now: {torch.cuda.memory_allocated(0)/1024/1024} MB') + print(f'ppo cuda memeor usage then: {torch.cuda.memory_allocated(0)/1024/1024} MB') + self._learn() + self.replay_buffer.clear() + self.actor.to('cpu') + self.critic.to('cpu') + self._on_episode_end(episode) + self._on_fit_end() + wandb.finish() + + # TODO(ver217): maybe simplify these code using context + def _on_fit_start(self) -> None: + for callback in self.callbacks: + callback.on_fit_start() + + def _on_fit_end(self) -> None: + for callback in self.callbacks: + callback.on_fit_end() + + def _on_episode_start(self, episode: int) -> None: + for callback in self.callbacks: + callback.on_episode_start(episode) + + def _on_episode_end(self, episode: int) -> None: + for callback in self.callbacks: + callback.on_episode_end(episode) + + def _on_make_experience_start(self) -> None: + for callback in self.callbacks: + callback.on_make_experience_start() + + def _on_make_experience_end(self, experience: Experience) -> None: + for callback in self.callbacks: + callback.on_make_experience_end(experience) + + def _on_learn_epoch_start(self, epoch: int) -> None: + for callback in self.callbacks: + callback.on_learn_epoch_start(epoch) + + def _on_learn_epoch_end(self, epoch: int) -> None: + for callback in self.callbacks: + callback.on_learn_epoch_end(epoch) + + def _on_learn_batch_start(self) -> None: + for callback in self.callbacks: + callback.on_learn_batch_start() + + def _on_learn_batch_end(self, metrics: dict, experience: Experience) -> None: + for callback in self.callbacks: + callback.on_learn_batch_end(metrics, experience) + +class PPOTrainer(Trainer): + """ + Trainer for PPO algorithm. 
+ + Args: + strategy (Strategy): the strategy to use for training + actor (Actor): the actor model in ppo algorithm + critic (Critic): the critic model in ppo algorithm + reward_model (nn.Module): the reward model in rlhf algorithm to make reward of sentences + initial_model (Actor): the initial model in rlhf algorithm to generate reference logits to limit the update of actor + actor_optim (Optimizer): the optimizer to use for actor model + critic_optim (Optimizer): the optimizer to use for critic model + kl_coef (float, defaults to 0.1): the coefficient of kl divergence loss + train_batch_size (int, defaults to 8): the batch size to use for training + buffer_limit (int, defaults to 0): the max_size limitaiton of replay buffer + buffer_cpu_offload (bool, defaults to True): whether to offload replay buffer to cpu + eps_clip (float, defaults to 0.2): the clip coefficient of policy loss + value_clip (float, defaults to 0.4): the clip coefficient of value loss + experience_batch_size (int, defaults to 8): the batch size to use for experience generation + max_epochs (int, defaults to 1): the number of epochs of training process + tokenier (Callable, optional): the tokenizer to use for tokenizing the input + sample_replay_buffer (bool, defaults to False): whether to sample from replay buffer + dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader + callbacks (List[Callback], defaults to []): the callbacks to call during training process + generate_kwargs (dict, optional): the kwargs to use while model generating + """ + + def __init__(self, + strategy: Strategy, + actor: Actor, + critic: Critic, + reward_model: nn.Module, + initial_model: Actor, + actor_optim: Optimizer, + critic_optim: Optimizer, + kl_coef: float = 0.1, + ptx_coef: float = 0.9, + train_batch_size: int = 8, + buffer_limit: int = 0, + buffer_cpu_offload: bool = True, + eps_clip: float = 0.2, + value_clip: float = 0.4, + experience_batch_size: int = 8, + max_epochs: int = 1, + tokenizer: Optional[Callable[[Any], dict]] = None, + sample_replay_buffer: bool = False, + dataloader_pin_memory: bool = True, + callbacks: List[Callback] = [], + **generate_kwargs) -> None: + experience_maker = SaveVramExperienceMaker(actor, critic, reward_model, initial_model, kl_coef) + replay_buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload) + generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor) + super().__init__(strategy, experience_maker, replay_buffer, experience_batch_size, max_epochs, tokenizer, + sample_replay_buffer, dataloader_pin_memory, callbacks, **generate_kwargs) + #two tokenizers will help to decouple different stages' model type and tokenizer + self.actor = actor + self.critic = critic + + self.actor_loss_fn = PolicyLoss(eps_clip) + self.critic_loss_fn = ValueLoss(value_clip) + self.ptx_loss_fn = nn.CrossEntropyLoss(ignore_index=-100) + self.ptx_coef = ptx_coef + self.actor_optim = actor_optim + self.critic_optim = critic_optim + + def training_step(self, experience: Experience) -> Dict[str, float]: + self.critic.cpu() + self.actor.to(torch.cuda.current_device()) + self.actor.train() + # policy loss + num_actions = experience.action_mask.size(1) + action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask) + actor_loss = self.actor_loss_fn(action_log_probs, + experience.action_log_probs, + experience.advantages, + action_mask=experience.action_mask) + + # ptx loss + if self.ptx_coef != 0: + batch = 
next(iter(self.pretrain_dataloader)) + ptx = batch['input_ids'].to(torch.cuda.current_device()) + label = batch['labels'].to(torch.cuda.current_device())[:, 1:] + attention_mask = batch['attention_mask'].to(torch.cuda.current_device()) + ptx_log_probs = self.actor.get_base_model()(ptx, attention_mask=attention_mask)['logits'][..., :-1, :] + ptx_loss = self.ptx_loss_fn(ptx_log_probs.view(-1, ptx_log_probs.size(-1)), label.view(-1)) + actor_loss = ptx_loss * self.ptx_coef + actor_loss * (1 - self.ptx_coef) + + self.strategy.backward(actor_loss, self.actor, self.actor_optim) + self.strategy.optimizer_step(self.actor_optim) + self.actor_optim.zero_grad() + self.actor.cpu() + print(f'current cuda memory allocated after actor is optimized: {torch.cuda.memory_allocated()/1024**3:.2f}GB') + # value loss + self.critic.to(torch.cuda.current_device()) + self.critic.train() + values = self.critic(experience.sequences, + action_mask=experience.action_mask, + attention_mask=experience.attention_mask) + print("values",values) + critic_loss = self.critic_loss_fn(values, + experience.values, + experience.reward, + action_mask=experience.action_mask) + print("critic_loss",critic_loss) + self.strategy.backward(critic_loss, self.critic, self.critic_optim) + self.strategy.optimizer_step(self.critic_optim) + self.critic_optim.zero_grad() + self.critic.cpu() + print(f'current cuda memory allocated after critic is optimized: {torch.cuda.memory_allocated()/1024**3:.2f}GB') + return {'reward': experience.reward.mean().item(),'actor_loss': actor_loss.item(), 'critic_loss': critic_loss.item()} + + def save_model(self, path: str, only_rank0: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + self.strategy.save_model(model=self.actor, path=path, only_rank0=only_rank0, tokenizer=tokenizer) + + def save_model(self, path: str, only_rank0: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: + self.strategy.save_model(model=self.actor, path=path, only_rank0=only_rank0, tokenizer=tokenizer) + + +def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> None: + origin_model = strategy._unwrap_actor(actor) + new_kwargs = {**generate_kwargs} + # use huggingface models method directly + if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'): + new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation + + if 'update_model_kwargs_fn' not in generate_kwargs: + new_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn + + return new_kwargs From fa3f795b1cdc77d2afffefb41b0d67d70ebdd59c Mon Sep 17 00:00:00 2001 From: yynil Date: Fri, 14 Apr 2023 18:36:18 +0800 Subject: [PATCH 3/3] Update the documents --- .../Chat/examples/community/peft/README.md | 97 ++++++++++++++++++- 1 file changed, 92 insertions(+), 5 deletions(-) diff --git a/applications/Chat/examples/community/peft/README.md b/applications/Chat/examples/community/peft/README.md index a82f02a87317..bcaa3b789b3f 100644 --- a/applications/Chat/examples/community/peft/README.md +++ b/applications/Chat/examples/community/peft/README.md @@ -1,3 +1,5 @@ +# Add support for ChatGLM-6B and graphic memory friendly PPO trainer + # Add Peft support for SFT and Prompts model training The orginal implementation just adopts the loralib and merges the layers into the final model. The huggingface peft is a better lora model implementation and can be easily training and distributed. @@ -13,12 +15,97 @@ pip install . 
``` ``` # Usage -For SFT training, just call train_peft_sft.py -Its arguments are almost identical to train_sft.py instead adding a new eval_dataset if you have a eval_dataset file. The data file is just a plain datafile, please check the format in the easy_dataset.py. +## Data preparation +Training an RLHF LLM involves 3 stages, so we need to prepare data for each of them. + +## Stage 1: fine-tune the LLM with domain knowledge + +In many cases we want the LLM to learn domain knowledge that publicly available LLMs do not have. For example, if a court wants an LLM to be familiar with law cases, it will feed law-case texts to a publicly available LLM to fine-tune it. + +You can even collect instructions generated by other LLMs and fine-tune your model on them to approach the performance of the LLM you collected from. + +Here we simply use toys_sft.txt as an example. + +Since I'm using ChatGLM-6B as the model to fine-tune, I run into the problem that its SentencePiece-based tokenizer is slow compared to a dictionary-based one. So I preprocess the large text corpus once and save it in parquet format to cut training/evaluation time. + +In the following I'll stick to THUDM/chatglm-6b as the example model. + +``` +python easy_dataset.py \ + --input_file toys_sft.txt \ + --output_file toys_sft \ + --need_trust_code \ + --tokenizer_name THUDM/chatglm-6b \ + --dataset_type easy_sft +``` +The command above generates the binary dataset used for SFT training. When it finishes, we get a directory 'toys_sft' that contains the binary data for SFT tuning. + +Run the SFT training script to get the stage-1 SFT model: +``` +sh train_peft_sft.sh +``` + +After several seconds you will get a new directory toys_sft_lora. Only the LoRA parameters are stored inside. Stage 1 is finished! + +## Stage 2: reward model training + +In theory the reward model does not have to share the same pretrained model as the SFT model. In practice, however, they have to be the same. + +The reason is that in stage-3 PPO training the actions are the generated token ids. These ids have to be evaluated by the Critic/Reward model (they are the same model when PPO training starts). If the reward model and the SFT model (a.k.a. the Actor) do not use identical tokenizers, the ids generated by the SFT model mean nothing to the reward model. + +Different models sometimes share the same tokenizer, but in my case I can't find a smaller model that shares its tokenizer with ChatGLM-6B. Even different scales of the same model are not guaranteed to share one. + +So I use the same pretrained model to train the reward model. + +We first generate the binary reward datasets: +``` +python easy_dataset.py \ + --input_file toys_rewards_train.jsonl \ + --output_file toys_rewards_train \ + --need_trust_code \ + --tokenizer_name THUDM/chatglm-6b \ + --dataset_type reward +``` +``` +python easy_dataset.py \ + --input_file toys_rewards_test.jsonl \ + --output_file toys_rewards_test \ + --need_trust_code \ + --tokenizer_name THUDM/chatglm-6b \ + --dataset_type reward +``` + +We now have two new directories, toys_rewards_train and toys_rewards_test, which contain the data used by stage-2 training. + +Now we can run the following script to train the reward model: +``` +sh train_reward_model.sh +``` + +After a short training run we get a new directory chatglmrm, which again contains only the LoRA parameters and takes up just a few megabytes! + +With the SFT and reward models trained, we can sanity-check the reward model (see the sketch right below) and then move on to the final stage-3 PPO training.
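If you want a quick sanity check before PPO, you can load the stage-2 LoRA and score a chosen-style and a rejected-style reply by hand. The snippet below is a minimal sketch: the `ChatGLMRM(pretrained=..., lora_rank=..., lora_path=...)` constructor and the `model(input_ids, attention_mask=...)` call follow their usage in train_reward_model.py and trainer.py, the paths match the scripts above, and the prompt strings are made-up placeholders.

```python
# Sanity-check sketch: load the reward model plus its LoRA adapter and compare two replies.
# Constructor and forward call mirror train_reward_model.py / trainer.py; prompts are placeholders.
import torch
from transformers import AutoTokenizer

from easy_models import ChatGLMRM

pretrained = 'THUDM/chatglm-6b'
tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
model = ChatGLMRM(pretrained=pretrained, lora_rank=32, lora_path='chatglmrm').to(torch.cuda.current_device())
model.eval()

def score(text: str) -> float:
    # Tokenize one prompt+reply string and return the scalar reward.
    enc = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
    input_ids = enc['input_ids'].to(torch.cuda.current_device())
    attention_mask = enc['attention_mask'].to(torch.cuda.current_device())
    with torch.no_grad():
        reward = model(input_ids, attention_mask=attention_mask)
    return reward.item()

print(score('Question: May I appeal this verdict? Answer: Yes, you may file an appeal within the statutory time limit.'))
print(score('Question: May I appeal this verdict? Answer: No idea.'))
```

A well-trained reward model should give the first reply a noticeably higher score than the second, which is the same comparison that eval_acc in trainer.py performs over the test set.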
-For stage-3 rlhf training, call train_peft_prompts.py. -Its arguments are almost idential to train_prompts.py. The only difference is that I use text files to indicate the prompt and pretrained data file. The models are included in easy_models.py. Currently only bloom models are tested, but technically gpt2/opt/llama should be supported. +## Stage 3: PPO training + +PPO training is a difficult process because it has to keep four models (Actor, Initial model, Critic, Reward) in memory at the same time. I modified the original trainer so that a 6B model can be trained on a single 3090 Ti with 24 GB of VRAM. + +First we prepare the training data. We need two text files: one with prompts and one used as the LLM's pretrain file. + +In this directory, toys_prompts.txt and toys_pretrain.txt are these two files. They contain the same kind of content: the prompt file provides prompts for the Actor to generate samples from, while the pretrain file keeps the Actor's parameters from drifting too far away from the original model. + +Run the following script: +``` +sh train_prompts.sh +``` + +After several minutes we get our first RLHF LLM, stored in a new directory lora_ppo! Again only the LoRA parameters are stored, and they can be used like any standard peft adapter (see the loading sketch at the end of this README). + + +--- # Dataformat -Please refer the formats in test_sft.txt, test_prompts.txt, test_pretrained.txt. +Please refer to the formats in toys_pretrain.txt, toys_sft.txt, toys_prompts.txt, toys_rewards_test.jsonl and toys_rewards_train.jsonl. + +It's easy to plug in your own data for all 3 training stages.
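Finally, the stage-3 adapter in lora_ppo can be attached back onto the base ChatGLM-6B checkpoint for generation. The sketch below reuses the `PeftModel.from_pretrained` pattern from train_peft_sft.py; the prompt, fp16 cast and sampling settings are assumptions to adapt to your own environment (see also lora_inference.py in this directory).

```python
# Generation sketch: attach the stage-3 LoRA from 'lora_ppo' onto the base ChatGLM-6B model.
# The PeftModel.from_pretrained pattern mirrors train_peft_sft.py; prompt, dtype and sampling
# settings are assumptions you may want to adjust.
import torch
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer

base = 'THUDM/chatglm-6b'
tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
model = AutoModel.from_pretrained(base, trust_remote_code=True).half().to(torch.cuda.current_device())
model = PeftModel.from_pretrained(model, 'lora_ppo')   # swap in 'toys_sft_lora' to try the stage-1 model
model.eval()

prompt = 'Put your own question here'
inputs = tokenizer(prompt, return_tensors='pt').to(torch.cuda.current_device())
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

The same loading pattern works for the stage-1 adapter, which makes it easy to compare the model's answers before and after PPO.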