From d5f9f4ab19494bd829d5599f78b5f0d68067b342 Mon Sep 17 00:00:00 2001
From: ver217
Date: Sat, 6 May 2023 18:10:34 +0800
Subject: [PATCH 1/3] [chat] lora add todo

---
 applications/Chat/coati/models/lora.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index bd9cf3320818..2a9059e6901e 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -62,7 +62,7 @@ def T(w):
             # Make sure that the weights are not merged
             if self.r > 0:
                 if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"):
-                    # csric: temporary fix
+                    # FIXME(csric): temporary fix
                     self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features)))
                     self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r)))
                     self.reset_parameters()

From 6f7fd29208a1237911fbe9904f70a9e71fcefc3d Mon Sep 17 00:00:00 2001
From: ver217
Date: Sat, 6 May 2023 18:20:56 +0800
Subject: [PATCH 2/3] [chat] remove unused pipeline strategy

---
 .../Chat/coati/ray/pipeline_strategy.py | 100 ------------------
 1 file changed, 100 deletions(-)
 delete mode 100644 applications/Chat/coati/ray/pipeline_strategy.py

diff --git a/applications/Chat/coati/ray/pipeline_strategy.py b/applications/Chat/coati/ray/pipeline_strategy.py
deleted file mode 100644
index 4b01a45b176e..000000000000
--- a/applications/Chat/coati/ray/pipeline_strategy.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# WIP
-
-import os
-import random
-from functools import partial
-
-import numpy as np
-import torch
-from coati.models.base import Actor, Critic, RewardModel
-from coati.trainer.strategies import NaiveStrategy, Strategy
-from torch._C._distributed_rpc import _is_current_rpc_agent_set
-
-import colossalai
-from colossalai.fx import ColoTracer
-from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass
-from colossalai.pipeline.middleware.adaptor import get_fx_topology
-from colossalai.pipeline.pipeline_process_group import ppg
-from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
-
-rpc_is_initialized = _is_current_rpc_agent_set
-
-
-class PipelineModel(torch.nn.Module):
-    '''
-    Actor has 2 kinds of jobs: forward and generate.
-    better to just pipelinize the inner model
-    '''
-
-    def __init__(
-        self,
-        model: torch.nn.Module,
-        stage_num: int,
-        num_microbatches: int,
-        data_kwargs=None,
-    ):
-        super().__init__()
-
-        # create partition module
-        def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
-            model.eval()
-            tracer = ColoTracer()
-            meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
-            graph = tracer.trace(root=model, meta_args=meta_args)
-            gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
-            annotated_model = balanced_split_pass(gm, stage_num)
-            top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
-            topo = get_fx_topology(top_module)
-            for submodule in split_submodules:
-                if isinstance(submodule, torch.fx.GraphModule):
-                    setattr(submodule, '_topo', topo)
-            return split_submodules[pp_rank + 1]
-
-        def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int):
-            partition = create_partition_module(pp_rank, stage_num, model, data_kwargs)
-            return partition
-
-        self.inference_engine = OneFOneBPipelineEngine(
-            partition_fn=partial(partition, model, data_kwargs),
-            stage_num=stage_num,
-            num_microbatches=num_microbatches,
-            device='cuda',
-        )
-
-    def forward(self, **model_inputs):
-        return self.inference_engine.forward_backward(**model_inputs, forward_only=True)
-
-
-class PPStrategy(NaiveStrategy):
-    """
-    Strategy for Pipeline inference (inference only!)
-
-    master node only
-    """
-
-    def __init__(self, seed: int = 42):
-        self.seed = seed
-        super().__init__()
-
-    def setup_distributed(self) -> None:
-        colossalai.launch_from_torch({}, seed=self.seed)
-        ppg.set_global_info(rank=int(os.environ['RANK']),
-                            world_size=int(os.environ['WORLD_SIZE']),
-                            dp_degree=1,
-                            tp_degree=1,
-                            num_worker_threads=128,
-                            device="cuda")
-
-    def model_init_context(self):
-        return super().model_init_context()
-
-    def setup_model(self, model: torch.nn.Module) -> torch.nn.Module:
-        if isinstance(model, Actor) or \
-            isinstance(model, RewardModel) or \
-            isinstance(model, Critic):
-            model.model = PipelineModel(model.model)
-
-    def set_seed(self, seed: int) -> None:
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)

From 36e5d52f37ce06771f391e73608834929120531e Mon Sep 17 00:00:00 2001
From: ver217
Date: Sat, 6 May 2023 18:34:07 +0800
Subject: [PATCH 3/3] [chat] refactor example structure

---
 .../ray/1mmt_dummy.py                       |  3 +-
 .../ray/mmmt_dummy.py                       |  3 +-
 applications/Chat/examples/ray/.gitignore   |  1 -
 applications/Chat/examples/ray/benchmark.sh | 39 -------------------
 4 files changed, 4 insertions(+), 42 deletions(-)
 rename applications/Chat/{examples => benchmarks}/ray/1mmt_dummy.py (98%)
 rename applications/Chat/{examples => benchmarks}/ray/mmmt_dummy.py (98%)
 delete mode 100644 applications/Chat/examples/ray/.gitignore
 delete mode 100644 applications/Chat/examples/ray/benchmark.sh

diff --git a/applications/Chat/examples/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py
similarity index 98%
rename from applications/Chat/examples/ray/1mmt_dummy.py
rename to applications/Chat/benchmarks/ray/1mmt_dummy.py
index eba5213a83d3..47985d5c00aa 100644
--- a/applications/Chat/examples/ray/1mmt_dummy.py
+++ b/applications/Chat/benchmarks/ray/1mmt_dummy.py
@@ -63,7 +63,8 @@ def model_fn():
     critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
     actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
     critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
-    reward_model = get_reward_model_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
+    reward_model = get_reward_model_from_args(args.critic_model,
+                                              config=critic_cfg).requires_grad_(False).half().cuda()
     if args.initial_model_quant_ckpt is not None and args.model == 'llama':
         # quantize initial model
         with low_resource_init(), no_init_weights():
diff --git a/applications/Chat/examples/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py
similarity index 98%
rename from applications/Chat/examples/ray/mmmt_dummy.py
rename to applications/Chat/benchmarks/ray/mmmt_dummy.py
index 082f4851777e..a72eb9bb87de 100644
--- a/applications/Chat/examples/ray/mmmt_dummy.py
+++ b/applications/Chat/benchmarks/ray/mmmt_dummy.py
@@ -63,7 +63,8 @@ def model_fn():
     critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
     actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
     critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
-    reward_model = get_reward_model_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
+    reward_model = get_reward_model_from_args(args.critic_model,
+                                              config=critic_cfg).requires_grad_(False).half().cuda()
    if args.initial_model_quant_ckpt is not None and args.model == 'llama':
         # quantize initial model
         with low_resource_init(), no_init_weights():
diff --git a/applications/Chat/examples/ray/.gitignore b/applications/Chat/examples/ray/.gitignore
deleted file mode 100644
index 4cf8dd15619e..000000000000
--- a/applications/Chat/examples/ray/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-logs/*
\ No newline at end of file
diff --git a/applications/Chat/examples/ray/benchmark.sh b/applications/Chat/examples/ray/benchmark.sh
deleted file mode 100644
index 3852684007b7..000000000000
--- a/applications/Chat/examples/ray/benchmark.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-
-PROMPT_PATH=/home/lccsr/data3/awesome-chatgpt-prompts/prompts.csv
-
-num_trainers=4
-num_makers=4
-
-# "facebook/opt-2.7b"
-for pretrain in "facebook/opt-1.3b" "facebook/opt-6.7b" "facebook/opt-13b"
-do
-
-    for experience_batch_size in 16 32 64
-    do
-        for train_batch_size in 16 32 64
-        do
-            for update_steps in 8 32 128
-            do
-                # set a big enough experience_steps for twice maker-update
-                experience_steps=$((2*num_trainers*train_batch_size*update_steps/num_makers/experience_batch_size))
-
-                config_string=${num_trainers}_${num_makers}_pretrain_${pretrain##*/}_experience_batch_size_${experience_batch_size}_train_batch_size_${train_batch_size}_update_steps_${update_steps}_experience_steps_${experience_steps}
-                echo running: ${config_string}
-
-                nohup python mmmt_prompt.py \
-                    --prompt_path $PROMPT_PATH \
-                    --trainer_strategy colossalai_gemini --maker_strategy naive \
-                    --model 'opt' \
-                    --pretrain $pretrain \
-                    --critic_pretrain "facebook/opt-350m" \
-                    --num_trainers $num_trainers \
-                    --num_makers $num_makers \
-                    --experience_steps $experience_steps \
-                    --experience_batch_size $experience_batch_size \
-                    --update_steps $update_steps \
-                    --train_batch_size $train_batch_size \
-                    --debug > logs/output_${config_string}.txt 2>&1
-            done
-        done
-    done
-done
\ No newline at end of file