From 3f2b8a069e2d37aacdb296812ea12570023b6482 Mon Sep 17 00:00:00 2001
From: zhang-yi-chi <673865549@qq.com>
Date: Fri, 28 Apr 2023 13:22:47 +0800
Subject: [PATCH 1/4] fix gemini strategy bug

---
 applications/Chat/coati/experience_maker/naive.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index 94546eeb28e7..a9416878392c 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -1,6 +1,8 @@
 import torch
 from coati.models.utils import compute_reward, normalize
 
+from colossalai.tensor import ColoTensor
+
 from .base import Experience, ExperienceMaker
 
 
@@ -22,9 +24,15 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
         num_actions = action_mask.size(1)
 
         action_log_probs = self.actor(sequences, num_actions, attention_mask)
-        base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask)
         value = self.critic(sequences, action_mask, attention_mask)
-        r = self.reward_model(sequences, attention_mask)
+        if isinstance(sequences, ColoTensor):
+            torch_sequences = sequences.data.to(torch.cuda.current_device())
+            torch_attention_mask = attention_mask.data.to(torch.cuda.current_device())
+            base_action_log_probs = self.initial_model(torch_sequences, num_actions, torch_attention_mask)
+            r = self.reward_model(torch_sequences, torch_attention_mask)
+        else:
+            base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask)
+            r = self.reward_model(sequences, attention_mask)
         reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
 
         advantage = reward - value

From 18db2e8d43280e666c17ba3990e7c02d5c901bdd Mon Sep 17 00:00:00 2001
From: zhang-yi-chi <673865549@qq.com>
Date: Fri, 28 Apr 2023 13:33:25 +0800
Subject: [PATCH 2/4] add comment

---
 applications/Chat/coati/experience_maker/naive.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index a9416878392c..95eb0ef52acf 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -25,6 +25,7 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
 
         action_log_probs = self.actor(sequences, num_actions, attention_mask)
         value = self.critic(sequences, action_mask, attention_mask)
+        # transfer to torch.tensor when using gemini strategy
         if isinstance(sequences, ColoTensor):
             torch_sequences = sequences.data.to(torch.cuda.current_device())
             torch_attention_mask = attention_mask.data.to(torch.cuda.current_device())
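The fix in PATCH 1/4 and 2/4 rests on one pattern: under the Gemini strategy, sequences and attention_mask can arrive as ColoTensor wrappers, whose payload has to be unwrapped via .data and moved to the current CUDA device before being passed to initial_model and reward_model, which the strategy does not manage. A minimal standalone sketch of that pattern, assuming only that colossalai is installed (the helper name to_plain_tensor is illustrative, not part of the patch):

    import torch
    from colossalai.tensor import ColoTensor

    def to_plain_tensor(t: torch.Tensor) -> torch.Tensor:
        # A ColoTensor carries its payload in `.data`; unwrap it and move it to
        # the current CUDA device so an unmanaged model sees a plain torch.Tensor.
        if isinstance(t, ColoTensor):
            return t.data.to(torch.cuda.current_device())
        return t

    # Usage, mirroring the patch:
    #   torch_sequences = to_plain_tensor(sequences)
    #   torch_attention_mask = to_plain_tensor(attention_mask)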
From 7aaad2c2a007bba25dca28834925b907c358b8d7 Mon Sep 17 00:00:00 2001
From: zhang-yi-chi <673865549@qq.com>
Date: Fri, 28 Apr 2023 14:35:12 +0800
Subject: [PATCH 3/4] add comment

---
 applications/Chat/coati/experience_maker/naive.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index 95eb0ef52acf..9b8b16eb2748 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -25,7 +25,7 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
 
         action_log_probs = self.actor(sequences, num_actions, attention_mask)
         value = self.critic(sequences, action_mask, attention_mask)
-        # transfer to torch.tensor when using gemini strategy
+        # converting to torch.tensor when using gemini strategy
         if isinstance(sequences, ColoTensor):
             torch_sequences = sequences.data.to(torch.cuda.current_device())
             torch_attention_mask = attention_mask.data.to(torch.cuda.current_device())
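PATCH 4/4 below takes the route the author calls a better solution: instead of special-casing ColoTensor inside the experience maker, it constructs initial_model and reward_model inside strategy.model_init_context() in train_prompts.py, the same context that already wraps the actor and critic. All four models are then managed by the same strategy, the type check becomes unnecessary, and the naive.py changes from the first three patches are reverted; a condensed sketch of the resulting flow follows the patch.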
From 4e2bab6b9caf79b6fc9e5ffe6bf9e46389bb016f Mon Sep 17 00:00:00 2001
From: zhang-yi-chi <673865549@qq.com>
Date: Fri, 28 Apr 2023 14:59:15 +0800
Subject: [PATCH 4/4] better solution

---
 .../Chat/coati/experience_maker/naive.py    | 13 +---
 applications/Chat/examples/train_prompts.py | 70 +++++++++----------
 2 files changed, 37 insertions(+), 46 deletions(-)

diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index 9b8b16eb2748..94546eeb28e7 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -1,8 +1,6 @@
 import torch
 from coati.models.utils import compute_reward, normalize
 
-from colossalai.tensor import ColoTensor
-
 from .base import Experience, ExperienceMaker
 
 
@@ -24,16 +22,9 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
         num_actions = action_mask.size(1)
 
         action_log_probs = self.actor(sequences, num_actions, attention_mask)
+        base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask)
         value = self.critic(sequences, action_mask, attention_mask)
-        # converting to torch.tensor when using gemini strategy
-        if isinstance(sequences, ColoTensor):
-            torch_sequences = sequences.data.to(torch.cuda.current_device())
-            torch_attention_mask = attention_mask.data.to(torch.cuda.current_device())
-            base_action_log_probs = self.initial_model(torch_sequences, num_actions, torch_attention_mask)
-            r = self.reward_model(torch_sequences, torch_attention_mask)
-        else:
-            base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask)
-            r = self.reward_model(sequences, attention_mask)
+        r = self.reward_model(sequences, attention_mask)
         reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
 
         advantage = reward - value

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index f4563630aad6..241ee2223f96 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -36,45 +36,45 @@ def main(args):
     if args.rm_path is not None:
         state_dict = torch.load(args.rm_path, map_location='cpu')
 
-    # configure model
-    if args.model == 'gpt2':
-        initial_model = GPTActor(pretrained=args.pretrain)
-    elif args.model == 'bloom':
-        initial_model = BLOOMActor(pretrained=args.pretrain)
-    elif args.model == 'opt':
-        initial_model = OPTActor(pretrained=args.pretrain)
-    elif args.model == 'llama':
-        initial_model = LlamaActor(pretrained=args.pretrain)
-    elif args.model == 'roberta':
-        initial_model = RoBERTaActor(pretrained=args.pretrain)
-    else:
-        raise ValueError(f'Unsupported actor model "{args.model}"')
+    with strategy.model_init_context():
+        # configure model
+        if args.model == 'gpt2':
+            initial_model = GPTActor(pretrained=args.pretrain)
+        elif args.model == 'bloom':
+            initial_model = BLOOMActor(pretrained=args.pretrain)
+        elif args.model == 'opt':
+            initial_model = OPTActor(pretrained=args.pretrain)
+        elif args.model == 'llama':
+            initial_model = LlamaActor(pretrained=args.pretrain)
+        elif args.model == 'roberta':
+            initial_model = RoBERTaActor(pretrained=args.pretrain)
+        else:
+            raise ValueError(f'Unsupported actor model "{args.model}"')
 
-    if args.rm_model == None:
-        rm_model_name = args.model
-    else:
-        rm_model_name = args.rm_model
-
-    if rm_model_name == 'gpt2':
-        reward_model = GPTRM(pretrained=args.rm_pretrain)
-    elif rm_model_name == 'bloom':
-        reward_model = BLOOMRM(pretrained=args.rm_pretrain)
-    elif rm_model_name == 'opt':
-        reward_model = OPTRM(pretrained=args.rm_pretrain)
-    elif rm_model_name == 'llama':
-        reward_model = LlamaRM(pretrained=args.rm_pretrain)
-    elif rm_model_name == 'roberta':
-        reward_model = RoBERTaRM(pretrained=args.rm_pretrain)
-    else:
-        raise ValueError(f'Unsupported reward model "{rm_model_name}"')
+        if args.rm_model == None:
+            rm_model_name = args.model
+        else:
+            rm_model_name = args.rm_model
 
-    if args.rm_path is not None:
-        reward_model.load_state_dict(state_dict)
+        if rm_model_name == 'gpt2':
+            reward_model = GPTRM(pretrained=args.rm_pretrain)
+        elif rm_model_name == 'bloom':
+            reward_model = BLOOMRM(pretrained=args.rm_pretrain)
+        elif rm_model_name == 'opt':
+            reward_model = OPTRM(pretrained=args.rm_pretrain)
+        elif rm_model_name == 'llama':
+            reward_model = LlamaRM(pretrained=args.rm_pretrain)
+        elif rm_model_name == 'roberta':
+            reward_model = RoBERTaRM(pretrained=args.rm_pretrain)
+        else:
+            raise ValueError(f'Unsupported reward model "{rm_model_name}"')
 
-    initial_model.to(torch.float16).to(torch.cuda.current_device())
-    reward_model.to(torch.float16).to(torch.cuda.current_device())
+        if args.rm_path is not None:
+            reward_model.load_state_dict(state_dict)
+
+        initial_model.to(torch.float16).to(torch.cuda.current_device())
+        reward_model.to(torch.float16).to(torch.cuda.current_device())
 
-    with strategy.model_init_context():
         if args.model == 'gpt2':
             actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == 'bloom':
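A condensed sketch of the initialization flow that PATCH 4/4 establishes, reduced to the gpt2 branch. It assumes the coati package from applications/Chat is installed; build_frozen_models is an illustrative helper, and strategy, pretrain, and rm_pretrain stand in for the example's parsed arguments:

    import torch
    from coati.models.gpt import GPTActor, GPTRM

    def build_frozen_models(strategy, pretrain: str, rm_pretrain: str):
        # Build both frozen models inside the strategy's init context so that,
        # under Gemini, they consume the same tensor types as the actor/critic.
        with strategy.model_init_context():
            initial_model = GPTActor(pretrained=pretrain)
            reward_model = GPTRM(pretrained=rm_pretrain)
            # Same dtype/device placement as train_prompts.py applies.
            initial_model.to(torch.float16).to(torch.cuda.current_device())
            reward_model.to(torch.float16).to(torch.cuda.current_device())
        return initial_model, reward_model

Keeping every model's construction under one strategy context is the design point: it removes the need for any isinstance(ColoTensor) branching downstream in the experience maker.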