From 2ec052e14e62e43ca05012b198e5da4a72d477c7 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Sun, 3 Dec 2023 16:13:25 +0800
Subject: [PATCH 01/15] low level zero support lora

low level zero support lora
---
 .../booster/plugin/low_level_zero_plugin.py   | 120 +++++++++++++++++-
 colossalai/zero/low_level/low_level_optim.py  |   2 +
 .../test_plugin/test_low_level_zero_plugin.py |  34 ++++-
 3 files changed, 147 insertions(+), 9 deletions(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 57e445735649..1bf295d10b8b 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -3,7 +3,9 @@
 from functools import partial
 from pathlib import Path
 from types import MethodType
-from typing import Callable, Dict, Iterator, List, Optional, Tuple
+from typing import Callable, Dict, Iterator, List, Optional, Tuple, Dict
+
+from peft import LoraConfig, TaskType, get_peft_model
 
 import torch
 import torch.nn as nn
@@ -66,6 +68,10 @@ def forward(self, *args, **kwargs):
 
 
 class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO):
+    def __init__(self, enable_lora) -> None:
+        self.enable_lora = enable_lora
+        super().__init__()
+
     def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, gather_dtensor: bool = False):
         """Save optimizer to checkpoint but only on master process.
 
@@ -208,6 +214,83 @@ def load_sharded_model(
         super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module)
         model.update_master_params()
 
+    def save_lora_config(self, peft_model, checkpoint):
+        """
+        Save the lora adapters and adapter configuration file to checkpoint directory.
+        """
+        if os.path.isfile(checkpoint):
+            logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
+            return
+        if self.coordinator.is_master():
+            Path(checkpoint).mkdir(parents=True, exist_ok=True)
+            peft_model.create_or_update_model_card(checkpoint)
+
+        peft_config = peft_model.peft_config["default"]
+
+        # save the config and change the inference mode to `True`
+        if peft_config.base_model_name_or_path is None:
+            peft_config.base_model_name_or_path = peft_model.base_model.model.__dict__.get("name_or_path", None)
+
+        inference_mode = peft_config.inference_mode
+        peft_config.inference_mode = True
+
+        if peft_config.task_type is None:
+            # deal with auto mapping
+            base_model_class = peft_model._get_base_model_class(
+                is_prompt_tuning=peft_config.is_prompt_learning,
+            )
+            parent_library = base_model_class.__module__
+
+            auto_mapping_dict = {
+                "base_model_class": base_model_class.__name__,
+                "parent_library": parent_library,
+            }
+        else:
+            auto_mapping_dict = None
+
+        if self.coordinator.is_master():
+            peft_config.save_pretrained(checkpoint, auto_mapping_dict=auto_mapping_dict)  # save the config
+        peft_config.inference_mode = inference_mode
+
+    def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
+        "save unsharded model"
+        checkpoint_file = checkpoint
+        if self.enable_lora:
+            from peft import PeftModel
+            from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME
+            assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
+            peft_model = model.unwrap()
+            assert isinstance(
+                peft_model, PeftModel
+            ), "The model doesn't have lora adapters, please enable lora before saving."
+ self.save_lora_config(peft_model, checkpoint) + if use_safetensors: + checkpoint_file = os.path.join(checkpoint, SAFETENSORS_WEIGHTS_NAME) + else: + checkpoint_file = os.path.join(checkpoint, WEIGHTS_NAME) + return super().save_unsharded_model(model, checkpoint_file, gather_dtensor, use_safetensors) + + def save_sharded_model( + self, + model: ModelWrapper, + checkpoint_path: str, + gather_dtensor: bool = True, + prefix: Optional[str] = None, + max_shard_size: int = 1024, + use_safetensors: bool = False, + ): + if self.enable_lora: + from peft import PeftModel + from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME + assert isinstance(model, ModelWrapper), "Please boost the model before saving!" + peft_model = model.unwrap() + assert isinstance( + peft_model, PeftModel + ), "The model doesn't have lora adapters, please enable lora before saving." + self.save_lora_config(peft_model, checkpoint_path) + return super().save_sharded_model(model, checkpoint_path, gather_dtensor, prefix, max_shard_size, use_safetensors) + + class LowLevelZeroPlugin(DPPluginBase): """ @@ -287,6 +370,7 @@ def __init__( cpu_offload=cpu_offload, master_weights=master_weights, ) + self.lora_enabled = False self.verbose = verbose # set class name with stage, for better error message @@ -310,6 +394,28 @@ def control_device(self) -> bool: def supported_devices(self) -> List[str]: return ["cuda"] + + def support_lora(self) -> bool: + return True + + def enable_lora( + self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None + ) -> nn.Module: + from peft import PeftModel, get_peft_model + assert not isinstance(model, LowLevelZeroModel), "Lora should be enabled before boosting the model." + self.lora_enabled = True + + if pretrained_dir is None: + peft_model = get_peft_model(model, lora_config) + else: + peft_model = PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True) + modules_to_save = peft_model.modules_to_save + if modules_to_save is not None: + for n, p in peft_model.named_parameters(): + if any((f"{key}.original_module" in n) for key in modules_to_save): + p.requires_grad_(False) + return peft_model + def configure( self, model: nn.Module, @@ -318,6 +424,11 @@ def configure( dataloader: Optional[DataLoader] = None, lr_scheduler: Optional[LRScheduler] = None, ) -> Tuple[nn.Module, OptimizerWrapper, Callable, DataLoader, LRScheduler]: + if self.lora_enabled: + from peft import PeftModel + assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True" + self.zero_optim_kwargs["enable_lora"] = True + if not isinstance(model, ModelWrapper): model = LowLevelZeroModel(model, self.precision) @@ -334,13 +445,8 @@ def control_checkpoint_io(self) -> bool: return True def get_checkpoint_io(self) -> CheckpointIO: - return LowLevelZeroCheckpointIO() + return LowLevelZeroCheckpointIO(self.enable_lora) def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]: assert isinstance(optimizer, LowLevelZeroOptimizer) return optimizer.no_sync() - - def enable_lora( - self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None - ) -> nn.Module: - raise NotImplementedError diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index e6974a6760ce..9bd8d87c8e21 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -76,6 +76,7 @@ def __init__( 
dp_process_group: Optional[ProcessGroup] = None, # the dp pg for comm forced_dtype: Optional[torch.dtype] = None, master_weights: bool = True, # master weights + enable_lora: bool = False, ): super(LowLevelZeroOptimizer, self).__init__(optim=optimizer) self._dtype = self.optim.param_groups[0]["params"][0].dtype @@ -86,6 +87,7 @@ def __init__( self._partition_grads = partition_grad self._cpu_offload = cpu_offload + self.enable_lora = enable_lora # grad accumulation self.require_grad_sync = True diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index 104ca254c572..0ff34ff2b314 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -2,6 +2,7 @@ import torch import torch.distributed as dist +from peft import LoraConfig import colossalai from colossalai.booster import Booster @@ -18,7 +19,7 @@ _STUCK_MODELS = ["transformers_albert_for_multiple_choice"] -def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]: +def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config=None) -> Optional[str]: try: plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) @@ -31,6 +32,9 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]: k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items() } + if lora_config is not None: + model = booster.enable_lora(model, lora_config=lora_config) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) output = model(**data) @@ -81,10 +85,36 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True): assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()]) +@parameterize("stage", [1,2]) +@parameterize("model_name", ["transformers_llama"]) +def check_low_level_zero_lora(stage, model_name, early_stop: bool = True): + passed_models = [] + failed_info = {} # (model_name, error) pair + + sub_model_zoo = model_zoo.get_sub_registry(model_name) + for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): + task_type = None + if name == "transformers_llama_for_casual_lm": + task_type = "CAUSAL_LM" + if name == "transformers_llama_for_sequence_classification": + task_type = "SEQ_CLS" + lora_config = LoraConfig(task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1) + err = run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config) + + torch.cuda.empty_cache() + + if err is None: + passed_models.append(name) + else: + failed_info[name] = err + if early_stop: + break + def run_dist(rank, world_size, port, early_stop: bool = True): # init dist env colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost") - check_low_level_zero_plugin(early_stop=early_stop) + # check_low_level_zero_plugin(early_stop=early_stop) + check_low_level_zero_lora(early_stop=early_stop) @rerun_if_address_is_in_use() From edc7c36d8004ec2a7e474cbb1a4f50adfcf1a866 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Sun, 3 Dec 2023 16:49:58 +0800 Subject: [PATCH 02/15] add checkpoint test --- .../test_plugin/test_low_level_zero_plugin.py | 2 +- .../test_low_level_zero_checkpoint_io.py | 75 +++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git 
a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index 0ff34ff2b314..c87f5f0c2322 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -85,7 +85,7 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True): assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()]) -@parameterize("stage", [1,2]) +@parameterize("stage", [2]) @parameterize("model_name", ["transformers_llama"]) def check_low_level_zero_lora(stage, model_name, early_stop: bool = True): passed_models = [] diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index e7f44f97e3cf..feb058774018 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -2,6 +2,9 @@ import torch.distributed as dist from torchvision.models import resnet18 from utils import shared_tempdir +from typing import Optional +from peft import LoraConfig +from copy import deepcopy import colossalai from colossalai.booster import Booster @@ -15,6 +18,7 @@ spawn, ) from colossalai.zero import LowLevelZeroOptimizer +from tests.kit.model_zoo import model_zoo # stage 1 and 2 process the optimizer/mode the same way @@ -69,9 +73,80 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool): torch.cuda.empty_cache() +def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lora_config=None) -> Optional[str]: + try: + plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5, cpu_offload=offload) + booster = Booster(plugin=plugin) + model = model_fn() + origin_model = deepcopy(model) + optimizer = HybridAdam(model.parameters(), lr=1e-3) + origin_optimizer = deepcopy(optimizer) + criterion = lambda x: x.mean() + data = data_gen_fn() + + data = { + k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items() + } + + model = booster.enable_lora(model, lora_config=lora_config) + + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + + with shared_tempdir() as tempdir: + model_ckpt_path = f"{tempdir}/model" + optimizer_ckpt_path = f"{tempdir}/optimizer" + # lr scheduler is tested in test_torch_ddp_checkpoint_io.py and low level zero does not change it, we can skip it here + booster.save_model(model, model_ckpt_path, shard=shard) + booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard) + + dist.barrier() + new_model = booster.enable_lora(origin_model, pretrained_dir=model_ckpt_path, lora_config=lora_config) + new_model, new_optimizer, _, _, _ = booster.boost(new_model, origin_optimizer) + + check_state_dict_equal(model.state_dict(), new_model.state_dict(), False) + # check master weight + assert isinstance(new_optimizer, LowLevelZeroOptimizer) + working_param_id_set = set(id(p) for p in new_model.parameters()) + for p_id, master_param in new_optimizer._param_store.working_to_master_param.items(): + assert p_id in working_param_id_set + working_param = new_optimizer._param_store.master_to_working_param[id(master_param)] + padding = new_optimizer._param_store.get_param_padding_size(working_param) + padded_param = torch.nn.functional.pad(working_param.data.view(-1), (0, padding)) + working_shard = padded_param.chunk(dist.get_world_size())[dist.get_rank()] + 
assert torch.equal( + working_shard, master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device) + ) + + booster.load_optimizer(new_optimizer, optimizer_ckpt_path) + check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False) + + except Exception as e: + return repr(e) + +@clear_cache_before_run() +@parameterize("stage", [2]) +@parameterize("shard", [True, False]) +@parameterize("offload", [False, True]) +@parameterize("model_name", ["transformers_llama"]) +def check_low_level_zero_lora_checkpointIO(stage: int, shard: bool, offload: bool, model_name: str): + plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32, cpu_offload=offload) + booster = Booster(plugin=plugin) + sub_model_zoo = model_zoo.get_sub_registry(model_name) + + sub_model_zoo = model_zoo.get_sub_registry(model_name) + for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): + task_type = None + if name == "transformers_llama_for_casual_lm": + task_type = "CAUSAL_LM" + if name == "transformers_llama_for_sequence_classification": + task_type = "SEQ_CLS" + lora_config = LoraConfig(task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1) + err = run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lora_config) + def run_dist(rank, world_size, port): colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost") check_low_level_zero_checkpointIO() + check_low_level_zero_lora_checkpointIO() torch.cuda.empty_cache() From 176b74499885ce50a48613c6816025bf933cf2b5 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Sun, 3 Dec 2023 16:56:24 +0800 Subject: [PATCH 03/15] add checkpoint test --- colossalai/zero/low_level/low_level_optim.py | 2 -- tests/test_booster/test_plugin/test_low_level_zero_plugin.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index 9bd8d87c8e21..e6974a6760ce 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -76,7 +76,6 @@ def __init__( dp_process_group: Optional[ProcessGroup] = None, # the dp pg for comm forced_dtype: Optional[torch.dtype] = None, master_weights: bool = True, # master weights - enable_lora: bool = False, ): super(LowLevelZeroOptimizer, self).__init__(optim=optimizer) self._dtype = self.optim.param_groups[0]["params"][0].dtype @@ -87,7 +86,6 @@ def __init__( self._partition_grads = partition_grad self._cpu_offload = cpu_offload - self.enable_lora = enable_lora # grad accumulation self.require_grad_sync = True diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index c87f5f0c2322..ba09b04667c6 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -113,7 +113,7 @@ def check_low_level_zero_lora(stage, model_name, early_stop: bool = True): def run_dist(rank, world_size, port, early_stop: bool = True): # init dist env colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost") - # check_low_level_zero_plugin(early_stop=early_stop) + check_low_level_zero_plugin(early_stop=early_stop) check_low_level_zero_lora(early_stop=early_stop) From c7b25f34d326445fd19ea7d843ebe8ea8b951cea Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> 
Date: Sun, 3 Dec 2023 23:26:48 +0800 Subject: [PATCH 04/15] fix --- .../booster/plugin/low_level_zero_plugin.py | 60 ++++--------------- .../low_level/bookkeeping/gradient_store.py | 1 + colossalai/zero/low_level/low_level_optim.py | 1 + .../test_plugin/test_low_level_zero_plugin.py | 6 ++ .../test_low_level_zero_checkpoint_io.py | 54 ++++++++++++----- 5 files changed, 56 insertions(+), 66 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 1bf295d10b8b..0673f3852258 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -214,48 +214,13 @@ def load_sharded_model( super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module) model.update_master_params() - def save_lora_config(self, peft_model, checkpoint): - """ - Save the lora adapters and adapter configuration file to checkpoint directory. - """ - if os.path.isfile(checkpoint): - logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") - return - if self.coordinator.is_master(): - Path(checkpoint).mkdir(parents=True, exist_ok=True) - peft_model.create_or_update_model_card(checkpoint) - - peft_config = peft_model.peft_config["default"] - - # save the config and change the inference mode to `True` - if peft_config.base_model_name_or_path is None: - peft_config.base_model_name_or_path = peft_model.base_model.model.__dict__.get("name_or_path", None) - - inference_mode = peft_config.inference_mode - peft_config.inference_mode = True - - if peft_config.task_type is None: - # deal with auto mapping - base_model_class = peft_model._get_base_model_class( - is_prompt_tuning=peft_config.is_prompt_learning, - ) - parent_library = base_model_class.__module__ - - auto_mapping_dict = { - "base_model_class": base_model_class.__name__, - "parent_library": parent_library, - } - else: - auto_mapping_dict = None - - if self.coordinator.is_master(): - peft_config.save_pretrained(checkpoint, auto_mapping_dict=auto_mapping_dict) # save the config - peft_config.inference_mode = inference_mode - def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dtensor: bool, use_safetensors: bool): "save unsharded model" checkpoint_file = checkpoint if self.enable_lora: + if os.path.isfile(checkpoint): + logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") + return from peft import PeftModel from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME assert isinstance(model, ModelWrapper), "Please boost the model before saving!" @@ -263,11 +228,7 @@ def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dten assert isinstance( peft_model, PeftModel ), "The model doesn't have lora adapters, please enable lora before saving." 
- self.save_lora_config(peft_model, checkpoint) - if use_safetensors: - checkpoint_file = os.path.join(checkpoint, SAFETENSORS_WEIGHTS_NAME) - else: - checkpoint_file = os.path.join(checkpoint, WEIGHTS_NAME) + return peft_model.save_pretrained(checkpoint) return super().save_unsharded_model(model, checkpoint_file, gather_dtensor, use_safetensors) def save_sharded_model( @@ -280,14 +241,18 @@ def save_sharded_model( use_safetensors: bool = False, ): if self.enable_lora: + if os.path.isfile(checkpoint_path): + logging.error(f"Provided path ({checkpoint_path}) should be a directory, not a file") + return from peft import PeftModel from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME assert isinstance(model, ModelWrapper), "Please boost the model before saving!" peft_model = model.unwrap() + # print(peft_model) assert isinstance( peft_model, PeftModel ), "The model doesn't have lora adapters, please enable lora before saving." - self.save_lora_config(peft_model, checkpoint_path) + return peft_model.save_pretrained(checkpoint_path) return super().save_sharded_model(model, checkpoint_path, gather_dtensor, prefix, max_shard_size, use_safetensors) @@ -409,11 +374,6 @@ def enable_lora( peft_model = get_peft_model(model, lora_config) else: peft_model = PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True) - modules_to_save = peft_model.modules_to_save - if modules_to_save is not None: - for n, p in peft_model.named_parameters(): - if any((f"{key}.original_module" in n) for key in modules_to_save): - p.requires_grad_(False) return peft_model def configure( @@ -427,7 +387,7 @@ def configure( if self.lora_enabled: from peft import PeftModel assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True" - self.zero_optim_kwargs["enable_lora"] = True + optimizer.param_groups[0]['params'] = list(model.parameters()) if not isinstance(model, ModelWrapper): model = LowLevelZeroModel(model, self.precision) diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py index 1164532fa3a3..ae1a6cae7011 100644 --- a/colossalai/zero/low_level/bookkeeping/gradient_store.py +++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py @@ -82,6 +82,7 @@ def get_working_grads_by_group_id(self, group_id: int) -> List: """ grad_list = [] + print(f"self._grads_of_params {self._grads_of_params}") for param_grads in self._grads_of_params[group_id].values(): grad_list.append(param_grads[self._working_index]) diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index e6974a6760ce..3515706cd43c 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -627,6 +627,7 @@ def load_state_dict(self, state_dict: Dict): v_list = v.split(v.numel() // self._world_size) zero_state_dict["state"][param_idx][k] = v_list[self._local_rank].detach().clone() + print(self.optim) self.optim.load_state_dict(zero_state_dict) def state_dict_shard(self, max_shard_size: int = 1024) -> Iterator[Tuple[Dict, int]]: diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index ba09b04667c6..3692b9bb58a0 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -49,6 +49,7 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, 
lora_config=None) return repr(e) + @parameterize("stage", [2]) def check_low_level_zero_plugin(stage: int, early_stop: bool = True): """check low level zero plugin over model zoo @@ -110,6 +111,11 @@ def check_low_level_zero_lora(stage, model_name, early_stop: bool = True): if early_stop: break + if dist.get_rank() == 0: + print(f"Passed models({len(passed_models)}): {passed_models}\n\n") + print(f"Failed models({len(failed_info)}): {list(failed_info.keys())}\n\n") + assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()]) + def run_dist(rank, world_size, port, early_stop: bool = True): # init dist env colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost") diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index feb058774018..362397ac4fcc 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -44,6 +44,7 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool): model_ckpt_path = f"{tempdir}/model" optimizer_ckpt_path = f"{tempdir}/optimizer" # lr scheduler is tested in test_torch_ddp_checkpoint_io.py and low level zero does not change it, we can skip it here + print(booster.checkpoint_io.enable_lora) booster.save_model(model, model_ckpt_path, shard=shard) booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard) @@ -76,11 +77,13 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool): def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lora_config=None) -> Optional[str]: try: plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5, cpu_offload=offload) + new_plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5, cpu_offload=offload) booster = Booster(plugin=plugin) + new_booster = Booster(plugin=new_plugin) model = model_fn() - origin_model = deepcopy(model) + new_model = deepcopy(model) optimizer = HybridAdam(model.parameters(), lr=1e-3) - origin_optimizer = deepcopy(optimizer) + new_optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -88,22 +91,26 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items() } - model = booster.enable_lora(model, lora_config=lora_config) - model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + output = model(**data) + output = output_transform_fn(output) + output_key = list(output.keys())[0] + loss = criterion(output[output_key]) + + booster.backward(loss, optimizer) + optimizer.step() + with shared_tempdir() as tempdir: model_ckpt_path = f"{tempdir}/model" optimizer_ckpt_path = f"{tempdir}/optimizer" - # lr scheduler is tested in test_torch_ddp_checkpoint_io.py and low level zero does not change it, we can skip it here - booster.save_model(model, model_ckpt_path, shard=shard) - booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard) - - dist.barrier() - new_model = booster.enable_lora(origin_model, pretrained_dir=model_ckpt_path, lora_config=lora_config) - new_model, new_optimizer, _, _, _ = booster.boost(new_model, origin_optimizer) + booster.save_model(model, model_ckpt_path, shard=False) + booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=False) + new_model 
= new_booster.enable_lora(new_model, pretrained_dir=model_ckpt_path, lora_config=lora_config) + new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion) check_state_dict_equal(model.state_dict(), new_model.state_dict(), False) + # check master weight assert isinstance(new_optimizer, LowLevelZeroOptimizer) working_param_id_set = set(id(p) for p in new_model.parameters()) @@ -117,7 +124,7 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo working_shard, master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device) ) - booster.load_optimizer(new_optimizer, optimizer_ckpt_path) + new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path) check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False) except Exception as e: @@ -128,13 +135,14 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo @parameterize("shard", [True, False]) @parameterize("offload", [False, True]) @parameterize("model_name", ["transformers_llama"]) -def check_low_level_zero_lora_checkpointIO(stage: int, shard: bool, offload: bool, model_name: str): - plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32, cpu_offload=offload) - booster = Booster(plugin=plugin) - sub_model_zoo = model_zoo.get_sub_registry(model_name) +def check_low_level_zero_lora_checkpointIO(stage: int, shard: bool, offload: bool, model_name: str, early_stop: bool = True): + passed_models = [] + failed_info = {} # (model_name, error) pair sub_model_zoo = model_zoo.get_sub_registry(model_name) for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): + if name != "transformers_llama": + continue task_type = None if name == "transformers_llama_for_casual_lm": task_type = "CAUSAL_LM" @@ -143,6 +151,20 @@ def check_low_level_zero_lora_checkpointIO(stage: int, shard: bool, offload: boo lora_config = LoraConfig(task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1) err = run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lora_config) + torch.cuda.empty_cache() + + if err is None: + passed_models.append(name) + else: + failed_info[name] = err + if early_stop: + break + + if dist.get_rank() == 0: + print(f"Passed models({len(passed_models)}): {passed_models}\n\n") + print(f"Failed models({len(failed_info)}): {list(failed_info.keys())}\n\n") + assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()]) + def run_dist(rank, world_size, port): colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost") check_low_level_zero_checkpointIO() From b15ad593cb156368ce0dc8858db93d00351869d4 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 4 Dec 2023 00:06:04 +0800 Subject: [PATCH 05/15] fix --- .../booster/plugin/low_level_zero_plugin.py | 95 ++++++++++--------- .../low_level/bookkeeping/gradient_store.py | 1 - colossalai/zero/low_level/low_level_optim.py | 1 - .../test_low_level_zero_checkpoint_io.py | 10 +- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 0673f3852258..89c26ad17b54 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -68,8 +68,8 @@ def forward(self, *args, **kwargs): class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO): - def 
__init__(self, enable_lora) -> None: - self.enable_lora = enable_lora + def __init__(self, lora_enabled) -> None: + self.lora_enabled = lora_enabled super().__init__() def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, gather_dtensor: bool = False): @@ -214,47 +214,56 @@ def load_sharded_model( super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module) model.update_master_params() - def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dtensor: bool, use_safetensors: bool): - "save unsharded model" - checkpoint_file = checkpoint - if self.enable_lora: - if os.path.isfile(checkpoint): - logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") - return - from peft import PeftModel - from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME - assert isinstance(model, ModelWrapper), "Please boost the model before saving!" - peft_model = model.unwrap() - assert isinstance( - peft_model, PeftModel - ), "The model doesn't have lora adapters, please enable lora before saving." - return peft_model.save_pretrained(checkpoint) - return super().save_unsharded_model(model, checkpoint_file, gather_dtensor, use_safetensors) - - def save_sharded_model( - self, - model: ModelWrapper, - checkpoint_path: str, - gather_dtensor: bool = True, - prefix: Optional[str] = None, - max_shard_size: int = 1024, - use_safetensors: bool = False, - ): - if self.enable_lora: - if os.path.isfile(checkpoint_path): - logging.error(f"Provided path ({checkpoint_path}) should be a directory, not a file") - return - from peft import PeftModel - from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME - assert isinstance(model, ModelWrapper), "Please boost the model before saving!" - peft_model = model.unwrap() - # print(peft_model) - assert isinstance( - peft_model, PeftModel - ), "The model doesn't have lora adapters, please enable lora before saving." - return peft_model.save_pretrained(checkpoint_path) - return super().save_sharded_model(model, checkpoint_path, gather_dtensor, prefix, max_shard_size, use_safetensors) + def save_lora_config(self, peft_model, checkpoint): + """ + Save the lora adapters and adapter configuration file to checkpoint directory. 
+ """ + if os.path.isfile(checkpoint): + logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") + return + if self.coordinator.is_master(): + Path(checkpoint).mkdir(parents=True, exist_ok=True) + peft_model.create_or_update_model_card(checkpoint) + peft_config = peft_model.peft_config["default"] + + # save the config and change the inference mode to `True` + if peft_config.base_model_name_or_path is None: + peft_config.base_model_name_or_path = peft_model.base_model.model.__dict__.get("name_or_path", None) + + inference_mode = peft_config.inference_mode + peft_config.inference_mode = True + + if peft_config.task_type is None: + # deal with auto mapping + base_model_class = peft_model._get_base_model_class( + is_prompt_tuning=peft_config.is_prompt_learning, + ) + parent_library = base_model_class.__module__ + + auto_mapping_dict = { + "base_model_class": base_model_class.__name__, + "parent_library": parent_library, + } + else: + auto_mapping_dict = None + + if self.coordinator.is_master(): + peft_config.save_pretrained(checkpoint, auto_mapping_dict=auto_mapping_dict) # save the config + peft_config.inference_mode = inference_mode + + def save_lora_as_pretrained(self, model, checkpoint, use_safetensors): + if os.path.isfile(checkpoint): + logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") + return + from peft import PeftModel + assert isinstance(model, ModelWrapper), "Please boost the model before saving!" + peft_model = model.unwrap() + assert isinstance( + peft_model, PeftModel + ), "The model doesn't have lora adapters, please enable lora before saving." + self.save_lora_config(peft_model, checkpoint) + return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors) class LowLevelZeroPlugin(DPPluginBase): @@ -405,7 +414,7 @@ def control_checkpoint_io(self) -> bool: return True def get_checkpoint_io(self) -> CheckpointIO: - return LowLevelZeroCheckpointIO(self.enable_lora) + return LowLevelZeroCheckpointIO(self.lora_enabled) def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]: assert isinstance(optimizer, LowLevelZeroOptimizer) diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py index ae1a6cae7011..1164532fa3a3 100644 --- a/colossalai/zero/low_level/bookkeeping/gradient_store.py +++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py @@ -82,7 +82,6 @@ def get_working_grads_by_group_id(self, group_id: int) -> List: """ grad_list = [] - print(f"self._grads_of_params {self._grads_of_params}") for param_grads in self._grads_of_params[group_id].values(): grad_list.append(param_grads[self._working_index]) diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index 3515706cd43c..e6974a6760ce 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -627,7 +627,6 @@ def load_state_dict(self, state_dict: Dict): v_list = v.split(v.numel() // self._world_size) zero_state_dict["state"][param_idx][k] = v_list[self._local_rank].detach().clone() - print(self.optim) self.optim.load_state_dict(zero_state_dict) def state_dict_shard(self, max_shard_size: int = 1024) -> Iterator[Tuple[Dict, int]]: diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index 362397ac4fcc..b888a9ef94a0 100644 --- 
a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -44,7 +44,6 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool): model_ckpt_path = f"{tempdir}/model" optimizer_ckpt_path = f"{tempdir}/optimizer" # lr scheduler is tested in test_torch_ddp_checkpoint_io.py and low level zero does not change it, we can skip it here - print(booster.checkpoint_io.enable_lora) booster.save_model(model, model_ckpt_path, shard=shard) booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard) @@ -91,6 +90,7 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items() } + model = booster.enable_lora(model, lora_config=lora_config) model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) output = model(**data) @@ -102,10 +102,11 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo optimizer.step() with shared_tempdir() as tempdir: - model_ckpt_path = f"{tempdir}/model" + # model_ckpt_path = f"{tempdir}/model" + model_ckpt_path = f'/home/lcjmy/vepfs/ColossalAI/tests/test_checkpoint_io/model' optimizer_ckpt_path = f"{tempdir}/optimizer" - booster.save_model(model, model_ckpt_path, shard=False) + booster.save_lora_as_pretrained(model, model_ckpt_path) booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=False) new_model = new_booster.enable_lora(new_model, pretrained_dir=model_ckpt_path, lora_config=lora_config) new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion) @@ -128,7 +129,8 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False) except Exception as e: - return repr(e) + # return repr(e) + raise e @clear_cache_before_run() @parameterize("stage", [2]) From 5bf3ee2b63dfd92a34a3d7c66cf580fed9bda24f Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 4 Dec 2023 00:10:44 +0800 Subject: [PATCH 06/15] fix --- .../booster/plugin/low_level_zero_plugin.py | 45 +------------------ .../test_low_level_zero_checkpoint_io.py | 6 +-- 2 files changed, 3 insertions(+), 48 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 89c26ad17b54..9bfa122cdca8 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -68,10 +68,6 @@ def forward(self, *args, **kwargs): class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO): - def __init__(self, lora_enabled) -> None: - self.lora_enabled = lora_enabled - super().__init__() - def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, gather_dtensor: bool = False): """Save optimizer to checkpoint but only on master process. @@ -214,44 +210,6 @@ def load_sharded_model( super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module) model.update_master_params() - def save_lora_config(self, peft_model, checkpoint): - """ - Save the lora adapters and adapter configuration file to checkpoint directory. 
- """ - if os.path.isfile(checkpoint): - logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") - return - if self.coordinator.is_master(): - Path(checkpoint).mkdir(parents=True, exist_ok=True) - peft_model.create_or_update_model_card(checkpoint) - - peft_config = peft_model.peft_config["default"] - - # save the config and change the inference mode to `True` - if peft_config.base_model_name_or_path is None: - peft_config.base_model_name_or_path = peft_model.base_model.model.__dict__.get("name_or_path", None) - - inference_mode = peft_config.inference_mode - peft_config.inference_mode = True - - if peft_config.task_type is None: - # deal with auto mapping - base_model_class = peft_model._get_base_model_class( - is_prompt_tuning=peft_config.is_prompt_learning, - ) - parent_library = base_model_class.__module__ - - auto_mapping_dict = { - "base_model_class": base_model_class.__name__, - "parent_library": parent_library, - } - else: - auto_mapping_dict = None - - if self.coordinator.is_master(): - peft_config.save_pretrained(checkpoint, auto_mapping_dict=auto_mapping_dict) # save the config - peft_config.inference_mode = inference_mode - def save_lora_as_pretrained(self, model, checkpoint, use_safetensors): if os.path.isfile(checkpoint): logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") @@ -262,7 +220,6 @@ def save_lora_as_pretrained(self, model, checkpoint, use_safetensors): assert isinstance( peft_model, PeftModel ), "The model doesn't have lora adapters, please enable lora before saving." - self.save_lora_config(peft_model, checkpoint) return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors) @@ -414,7 +371,7 @@ def control_checkpoint_io(self) -> bool: return True def get_checkpoint_io(self) -> CheckpointIO: - return LowLevelZeroCheckpointIO(self.lora_enabled) + return LowLevelZeroCheckpointIO() def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]: assert isinstance(optimizer, LowLevelZeroOptimizer) diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index b888a9ef94a0..a415bd8ca046 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -102,8 +102,7 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo optimizer.step() with shared_tempdir() as tempdir: - # model_ckpt_path = f"{tempdir}/model" - model_ckpt_path = f'/home/lcjmy/vepfs/ColossalAI/tests/test_checkpoint_io/model' + model_ckpt_path = f"{tempdir}/model" optimizer_ckpt_path = f"{tempdir}/optimizer" booster.save_lora_as_pretrained(model, model_ckpt_path) @@ -129,8 +128,7 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False) except Exception as e: - # return repr(e) - raise e + return repr(e) @clear_cache_before_run() @parameterize("stage", [2]) From 49973930756dd198ea34149250ed266ad2d3bb16 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Fri, 8 Dec 2023 20:46:25 +0800 Subject: [PATCH 07/15] fix fix fix fix --- colossalai/booster/plugin/low_level_zero_plugin.py | 7 ++++++- colossalai/zero/low_level/bookkeeping/gradient_store.py | 2 ++ .../test_plugin/test_low_level_zero_plugin.py | 7 ++++--- .../test_low_level_zero_checkpoint_io.py | 9 +++++---- 4 files changed, 17 
insertions(+), 8 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 9bfa122cdca8..a9bfc79bfb5b 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -353,7 +353,12 @@ def configure( if self.lora_enabled: from peft import PeftModel assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True" - optimizer.param_groups[0]['params'] = list(model.parameters()) + + optim_params_nums = 0 + for param_group in optimizer.param_groups: + optim_params_nums += len(param_group['params']) + model_params_nums = len(list(model.named_parameters())) + assert optim_params_nums == model_params_nums, "Optimizer should be initialized after enabling lora." if not isinstance(model, ModelWrapper): model = LowLevelZeroModel(model, self.precision) diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py index 1164532fa3a3..1e969e2d9b76 100644 --- a/colossalai/zero/low_level/bookkeeping/gradient_store.py +++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py @@ -82,6 +82,8 @@ def get_working_grads_by_group_id(self, group_id: int) -> List: """ grad_list = [] + if group_id not in self._grads_of_params.keys(): + return grad_list for param_grads in self._grads_of_params[group_id].values(): grad_list.append(param_grads[self._working_index]) diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index 3692b9bb58a0..3786bcedbb7f 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -24,6 +24,10 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config=None) plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) model = model_fn() + + if lora_config is not None: + model = booster.enable_lora(model, lora_config=lora_config) + optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -32,9 +36,6 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config=None) k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items() } - if lora_config is not None: - model = booster.enable_lora(model, lora_config=lora_config) - model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) output = model(**data) diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index a415bd8ca046..64db8ed73e94 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -81,8 +81,8 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo new_booster = Booster(plugin=new_plugin) model = model_fn() new_model = deepcopy(model) + model = booster.enable_lora(model, lora_config=lora_config) optimizer = HybridAdam(model.parameters(), lr=1e-3) - new_optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -90,7 +90,6 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ 
else v for k, v in data.items() } - model = booster.enable_lora(model, lora_config=lora_config) model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) output = model(**data) @@ -108,6 +107,7 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo booster.save_lora_as_pretrained(model, model_ckpt_path) booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=False) new_model = new_booster.enable_lora(new_model, pretrained_dir=model_ckpt_path, lora_config=lora_config) + new_optimizer = HybridAdam(new_model.parameters(), lr=1e-3) new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion) check_state_dict_equal(model.state_dict(), new_model.state_dict(), False) @@ -128,7 +128,8 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False) except Exception as e: - return repr(e) + # return repr(e) + raise e @clear_cache_before_run() @parameterize("stage", [2]) @@ -167,7 +168,7 @@ def check_low_level_zero_lora_checkpointIO(stage: int, shard: bool, offload: boo def run_dist(rank, world_size, port): colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost") - check_low_level_zero_checkpointIO() + # check_low_level_zero_checkpointIO() check_low_level_zero_lora_checkpointIO() torch.cuda.empty_cache() From 3f4aeaab16225b8b1f56ca816cc3ed9626dc1891 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Fri, 8 Dec 2023 21:24:01 +0800 Subject: [PATCH 08/15] fix --- tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index 64db8ed73e94..e20bc5a6f97e 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -168,7 +168,7 @@ def check_low_level_zero_lora_checkpointIO(stage: int, shard: bool, offload: boo def run_dist(rank, world_size, port): colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost") - # check_low_level_zero_checkpointIO() + check_low_level_zero_checkpointIO() check_low_level_zero_lora_checkpointIO() torch.cuda.empty_cache() From a69c1394b30b9817fbd3316f3a9e81a4083fd968 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Fri, 8 Dec 2023 21:24:01 +0800 Subject: [PATCH 09/15] fix fix fix fix fix fix fix --- .../booster/plugin/low_level_zero_plugin.py | 43 +++++++++++++++---- .../low_level/bookkeeping/gradient_store.py | 1 + .../test_plugin/test_low_level_zero_plugin.py | 3 +- .../test_low_level_zero_checkpoint_io.py | 6 +-- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index a9bfc79bfb5b..41e50b911885 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -1,14 +1,14 @@ import logging +import warnings import os from functools import partial from pathlib import Path from types import MethodType from typing import Callable, Dict, Iterator, List, Optional, Tuple, Dict -from peft import LoraConfig, TaskType, get_peft_model - import torch import torch.nn as nn +from torch.nn import Parameter from torch.optim 
import Optimizer from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.utils._pytree import tree_map @@ -335,6 +335,7 @@ def enable_lora( from peft import PeftModel, get_peft_model assert not isinstance(model, LowLevelZeroModel), "Lora should be enabled before boosting the model." self.lora_enabled = True + warnings.warn("You have enabled LoRa training. Please check the hyperparameters such as lr") if pretrained_dir is None: peft_model = get_peft_model(model, lora_config) @@ -342,6 +343,36 @@ def enable_lora( peft_model = PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True) return peft_model + def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter): + origin_param_id = id(origin_param) + for group_id, param_group in enumerate(optimizer.param_groups): + for p in param_group['params']: + if id(p) == origin_param_id: + return group_id + return -1 + + def add_lora_para_to_optimizer(self, model, optimizer): + """ add lora parameters to optimizer """ + name2param= {} + for name, param in model.named_parameters(): + name2param[name] = param + + optimizer_param_nums = 0 + for param_group in optimizer.param_groups: + optimizer_param_nums += len(param_group['params']) + + # Check if the optimizer is created after the model is transformed into a LoRa model. + if len(name2param) != optimizer_param_nums: + for name, param in name2param.items(): + if 'lora_A' in name or 'lora_B' in name: + origin_key = name.replace("lora_A.", "") + origin_key = origin_key.replace("lora_B.", "") + origin_key = origin_key.replace(f"{model.active_adapter}.", "") + origin_param = name2param[origin_key] + group_id = self.get_param_group_id(optimizer, origin_param) + assert group_id != -1, "Parameter error, origin parameter does't exists." + optimizer.param_groups[group_id]['params'].append(param) + def configure( self, model: nn.Module, @@ -353,12 +384,8 @@ def configure( if self.lora_enabled: from peft import PeftModel assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True" - - optim_params_nums = 0 - for param_group in optimizer.param_groups: - optim_params_nums += len(param_group['params']) - model_params_nums = len(list(model.named_parameters())) - assert optim_params_nums == model_params_nums, "Optimizer should be initialized after enabling lora." + self.add_lora_para_to_optimizer(model, optimizer) + if not isinstance(model, ModelWrapper): model = LowLevelZeroModel(model, self.precision) diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py index 1e969e2d9b76..631242a4357e 100644 --- a/colossalai/zero/low_level/bookkeeping/gradient_store.py +++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py @@ -82,6 +82,7 @@ def get_working_grads_by_group_id(self, group_id: int) -> List: """ grad_list = [] + # When using LoRa and the user sets multiple param_groups, it is possible that some param_groups have no parameters with gradients. 
if group_id not in self._grads_of_params.keys(): return grad_list for param_grads in self._grads_of_params[group_id].values(): diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index 3786bcedbb7f..9ad39d0897f0 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -24,11 +24,11 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config=None) plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) model = model_fn() + optimizer = HybridAdam(model.parameters(), lr=1e-3) if lora_config is not None: model = booster.enable_lora(model, lora_config=lora_config) - optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -48,6 +48,7 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config=None) except Exception as e: return repr(e) + # raise e diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index 64db8ed73e94..ed5aa7dbdfde 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -80,9 +80,10 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo booster = Booster(plugin=plugin) new_booster = Booster(plugin=new_plugin) model = model_fn() + optimizer = HybridAdam(model.parameters(), lr=1e-3) new_model = deepcopy(model) + new_optimizer = HybridAdam(new_model.parameters(), lr=1e-3) model = booster.enable_lora(model, lora_config=lora_config) - optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -107,7 +108,6 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo booster.save_lora_as_pretrained(model, model_ckpt_path) booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=False) new_model = new_booster.enable_lora(new_model, pretrained_dir=model_ckpt_path, lora_config=lora_config) - new_optimizer = HybridAdam(new_model.parameters(), lr=1e-3) new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion) check_state_dict_equal(model.state_dict(), new_model.state_dict(), False) @@ -168,7 +168,7 @@ def check_low_level_zero_lora_checkpointIO(stage: int, shard: bool, offload: boo def run_dist(rank, world_size, port): colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost") - # check_low_level_zero_checkpointIO() + check_low_level_zero_checkpointIO() check_low_level_zero_lora_checkpointIO() torch.cuda.empty_cache() From bd4fea28987fb2f37d217613d8c5791965d0db84 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 11 Dec 2023 19:15:20 +0800 Subject: [PATCH 10/15] fix --- colossalai/booster/plugin/low_level_zero_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 41e50b911885..b6e93b35115e 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -351,7 +351,7 @@ def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter): return group_id return -1 - def add_lora_para_to_optimizer(self, model, 
optimizer):
+    def add_lora_params_to_optimizer(self, model, optimizer):
         """ add lora parameters to optimizer """
         name2param= {}
         for name, param in model.named_parameters():
@@ -384,7 +384,7 @@ def configure(
         if self.lora_enabled:
             from peft import PeftModel
             assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True"
-            self.add_lora_para_to_optimizer(model, optimizer)
+            self.add_lora_params_to_optimizer(model, optimizer)
 
 
         if not isinstance(model, ModelWrapper):

From 28c125351f11dce98499d799755ac4f37bf93b50 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 8 Dec 2023 21:24:01 +0800
Subject: [PATCH 11/15] fix fix fix fix fix fix fix

---
 .../booster/plugin/low_level_zero_plugin.py   | 43 +++++++++++++++----
 .../low_level/bookkeeping/gradient_store.py   |  1 +
 .../test_plugin/test_low_level_zero_plugin.py |  3 +-
 .../test_low_level_zero_checkpoint_io.py      |  4 +-
 4 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index a9bfc79bfb5b..41e50b911885 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -1,14 +1,14 @@
 import logging
+import warnings
 import os
 from functools import partial
 from pathlib import Path
 from types import MethodType
 from typing import Callable, Dict, Iterator, List, Optional, Tuple, Dict
 
-from peft import LoraConfig, TaskType, get_peft_model
-
 import torch
 import torch.nn as nn
+from torch.nn import Parameter
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils._pytree import tree_map
@@ -335,6 +335,7 @@ def enable_lora(
         from peft import PeftModel, get_peft_model
         assert not isinstance(model, LowLevelZeroModel), "Lora should be enabled before boosting the model."
         self.lora_enabled = True
+        warnings.warn("You have enabled LoRA training. Please check hyperparameters such as the learning rate (lr).")
 
         if pretrained_dir is None:
             peft_model = get_peft_model(model, lora_config)
@@ -342,6 +343,36 @@ def enable_lora(
             peft_model = PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True)
         return peft_model
 
+    def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter):
+        origin_param_id = id(origin_param)
+        for group_id, param_group in enumerate(optimizer.param_groups):
+            for p in param_group['params']:
+                if id(p) == origin_param_id:
+                    return group_id
+        return -1
+
+    def add_lora_para_to_optimizer(self, model, optimizer):
+        """ add lora parameters to optimizer """
+        name2param= {}
+        for name, param in model.named_parameters():
+            name2param[name] = param
+
+        optimizer_param_nums = 0
+        for param_group in optimizer.param_groups:
+            optimizer_param_nums += len(param_group['params'])
+
+        # If the optimizer was created before the model was transformed into a LoRA model, add the missing LoRA parameters to it.
+        if len(name2param) != optimizer_param_nums:
+            for name, param in name2param.items():
+                if 'lora_A' in name or 'lora_B' in name:
+                    origin_key = name.replace("lora_A.", "")
+                    origin_key = origin_key.replace("lora_B.", "")
+                    origin_key = origin_key.replace(f"{model.active_adapter}.", "")
+                    origin_param = name2param[origin_key]
+                    group_id = self.get_param_group_id(optimizer, origin_param)
+                    assert group_id != -1, "Parameter error, origin parameter doesn't exist."
+ optimizer.param_groups[group_id]['params'].append(param) + def configure( self, model: nn.Module, @@ -353,12 +384,8 @@ def configure( if self.lora_enabled: from peft import PeftModel assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True" - - optim_params_nums = 0 - for param_group in optimizer.param_groups: - optim_params_nums += len(param_group['params']) - model_params_nums = len(list(model.named_parameters())) - assert optim_params_nums == model_params_nums, "Optimizer should be initialized after enabling lora." + self.add_lora_para_to_optimizer(model, optimizer) + if not isinstance(model, ModelWrapper): model = LowLevelZeroModel(model, self.precision) diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py index 1e969e2d9b76..631242a4357e 100644 --- a/colossalai/zero/low_level/bookkeeping/gradient_store.py +++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py @@ -82,6 +82,7 @@ def get_working_grads_by_group_id(self, group_id: int) -> List: """ grad_list = [] + # When using LoRa and the user sets multiple param_groups, it is possible that some param_groups have no parameters with gradients. if group_id not in self._grads_of_params.keys(): return grad_list for param_grads in self._grads_of_params[group_id].values(): diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index 3786bcedbb7f..9ad39d0897f0 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -24,11 +24,11 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config=None) plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) model = model_fn() + optimizer = HybridAdam(model.parameters(), lr=1e-3) if lora_config is not None: model = booster.enable_lora(model, lora_config=lora_config) - optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -48,6 +48,7 @@ def run_fn(stage, model_fn, data_gen_fn, output_transform_fn, lora_config=None) except Exception as e: return repr(e) + # raise e diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index e20bc5a6f97e..ed5aa7dbdfde 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -80,9 +80,10 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo booster = Booster(plugin=plugin) new_booster = Booster(plugin=new_plugin) model = model_fn() + optimizer = HybridAdam(model.parameters(), lr=1e-3) new_model = deepcopy(model) + new_optimizer = HybridAdam(new_model.parameters(), lr=1e-3) model = booster.enable_lora(model, lora_config=lora_config) - optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -107,7 +108,6 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo booster.save_lora_as_pretrained(model, model_ckpt_path) booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=False) new_model = new_booster.enable_lora(new_model, pretrained_dir=model_ckpt_path, lora_config=lora_config) - new_optimizer = HybridAdam(new_model.parameters(), lr=1e-3) new_model, new_optimizer, 
criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion) check_state_dict_equal(model.state_dict(), new_model.state_dict(), False) From 0f5db493030028f688e1cbb564673b2a343bcdf0 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 11 Dec 2023 19:15:20 +0800 Subject: [PATCH 12/15] fix --- colossalai/booster/plugin/low_level_zero_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 41e50b911885..b6e93b35115e 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -351,7 +351,7 @@ def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter): return group_id return -1 - def add_lora_para_to_optimizer(self, model, optimizer): + def add_lora_params_to_optimizer(self, model, optimizer): """ add lora parameters to optimizer """ name2param= {} for name, param in model.named_parameters(): @@ -384,7 +384,7 @@ def configure( if self.lora_enabled: from peft import PeftModel assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True" - self.add_lora_para_to_optimizer(model, optimizer) + self.add_lora_params_to_optimizer(model, optimizer) if not isinstance(model, ModelWrapper): From d9df7f3e0803609570733ecb0a13d6fc6ed71dd8 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Tue, 12 Dec 2023 18:50:23 +0800 Subject: [PATCH 13/15] test ci --- .github/workflows/build_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index e2114d43bcd0..506c1c842b00 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -208,7 +208,7 @@ jobs: - name: Execute Unit Testing run: | - CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/ + CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/test_booster/test_plugin/test_3d_plugin.py env: DATA: /data/scratch/cifar-10 NCCL_SHM_DISABLE: 1 From 80fdc4db8b2d0f1b60a54b13bb3641e330944688 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 13 Dec 2023 12:23:39 +0800 Subject: [PATCH 14/15] git # This is a combination of 3 commits. Update low_level_zero_plugin.py Update low_level_zero_plugin.py fix fix fix --- .github/workflows/build_on_pr.yml | 2 +- .../booster/plugin/low_level_zero_plugin.py | 40 ++++++++++++------- .../test_plugin/test_dp_plugin_base.py | 8 +++- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 506c1c842b00..e2114d43bcd0 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -208,7 +208,7 @@ jobs: - name: Execute Unit Testing run: | - CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/test_booster/test_plugin/test_3d_plugin.py + CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. 
--durations=10 tests/
         env:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index b6e93b35115e..5b367379aaa5 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -351,26 +351,35 @@ def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter):
                     return group_id
         return -1
 
+    def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter, lora_param: Parameter):
+        origin_param_id = id(origin_param)
+        lora_param_id = id(lora_param)
+        target_group_id = -1
+        for group_id, param_group in enumerate(optimizer.param_groups):
+            for p in param_group['params']:
+                if id(p) == lora_param_id:
+                    # check if the lora parameter exists.
+                    return -2
+                if id(p) == origin_param_id:
+                    target_group_id = group_id
+        return target_group_id
+
     def add_lora_params_to_optimizer(self, model, optimizer):
         """ add lora parameters to optimizer """
         name2param= {}
         for name, param in model.named_parameters():
             name2param[name] = param
 
-        optimizer_param_nums = 0
-        for param_group in optimizer.param_groups:
-            optimizer_param_nums += len(param_group['params'])
-
-        # If the optimizer was created before the model was transformed into a LoRA model, add the missing LoRA parameters to it.
-        if len(name2param) != optimizer_param_nums:
-            for name, param in name2param.items():
-                if 'lora_A' in name or 'lora_B' in name:
-                    origin_key = name.replace("lora_A.", "")
-                    origin_key = origin_key.replace("lora_B.", "")
-                    origin_key = origin_key.replace(f"{model.active_adapter}.", "")
-                    origin_param = name2param[origin_key]
-                    group_id = self.get_param_group_id(optimizer, origin_param)
-                    assert group_id != -1, "Parameter error, origin parameter doesn't exist."
+ for name, param in name2param.items(): + if 'lora_A' in name or 'lora_B' in name: + origin_key = name.replace("lora_A.", "") + origin_key = origin_key.replace("lora_B.", "") + origin_key = origin_key.replace(f"{model.active_adapter}.", "") + origin_param = name2param[origin_key] + group_id = self.get_param_group_id(optimizer, origin_param, param) + if group_id == -1: + warnings.warn("Origin parameter {origin_key} related to {name} doesn't exist in optimizer param_groups.") + elif group_id >= 0: optimizer.param_groups[group_id]['params'].append(param) def configure( @@ -384,7 +393,8 @@ def configure( if self.lora_enabled: from peft import PeftModel assert isinstance(model, PeftModel), "The model should have been wrapped as a PeftModel when self.lora_enabled is True" - self.add_lora_params_to_optimizer(model, optimizer) + if optimizer is not None: + self.add_lora_params_to_optimizer(model, optimizer) if not isinstance(model, ModelWrapper): diff --git a/tests/test_booster/test_plugin/test_dp_plugin_base.py b/tests/test_booster/test_plugin/test_dp_plugin_base.py index 0ac9d0f6d409..eabe69ed3094 100644 --- a/tests/test_booster/test_plugin/test_dp_plugin_base.py +++ b/tests/test_booster/test_plugin/test_dp_plugin_base.py @@ -1,4 +1,4 @@ -from typing import Callable, Iterator, List, Tuple, Union +from typing import Callable, Iterator, List, Tuple, Union, Dict import torch import torch.distributed as dist @@ -51,6 +51,12 @@ def supported_precisions(self) -> List[str]: def no_sync(self, model: nn.Module) -> Iterator[None]: pass + def enable_lora(self, model: nn.Module, pretrained_dir: str, lora_config: Dict) -> nn.Module: + pass + + def support_lora(self) -> bool: + pass + def check_dataloader_sharding(): plugin = DPPluginWrapper() From db94ff9e63ba5084e195643cb4cf83192b2fe47e Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Tue, 19 Dec 2023 18:27:47 +0800 Subject: [PATCH 15/15] fix naming fix naming fix naming fix --- .../booster/plugin/low_level_zero_plugin.py | 23 +++++++++++++------ colossalai/pipeline/p2p.py | 16 ++++++++++++- requirements/requirements-test.txt | 2 +- requirements/requirements.txt | 1 + 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 5b367379aaa5..b2087af68437 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -1,5 +1,6 @@ import logging import warnings +import enum import os from functools import partial from pathlib import Path @@ -43,6 +44,11 @@ def _convert_floating_point(x, dtype: torch.dtype = torch.float16): SUPPORTED_PRECISION = ["fp16", "bf16", "fp32"] +class OptimizerParamCheckState(enum.Enum): + ORIGIN_PARAM_FINDED = 0 + ORIGIN_PARAM_NOT_FIND = -1 + LORA_PARM_EXISTED = -2 + class LowLevelZeroModel(ModelWrapper, AMPModelMixin): def __init__(self, module: nn.Module, precision: str) -> None: @@ -354,15 +360,18 @@ def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter): def get_param_group_id(self, optimizer: Optimizer, origin_param: Parameter, lora_param: Parameter): origin_param_id = id(origin_param) lora_param_id = id(lora_param) - target_group_id = -1 + target_group_id = None for group_id, param_group in enumerate(optimizer.param_groups): for p in param_group['params']: if id(p) == lora_param_id: # check if the lora parameter exists. 
- return -2 + return target_group_id, OptimizerParamCheckState.LORA_PARM_EXISTED if id(p) == origin_param_id: target_group_id = group_id - return target_group_id + if target_group_id is not None: + return target_group_id, OptimizerParamCheckState.ORIGIN_PARAM_FINDED + else: + return target_group_id, OptimizerParamCheckState.ORIGIN_PARAM_NOT_FIND def add_lora_params_to_optimizer(self, model, optimizer): """ add lora parameters to optimizer """ @@ -374,12 +383,12 @@ def add_lora_params_to_optimizer(self, model, optimizer): if 'lora_A' in name or 'lora_B' in name: origin_key = name.replace("lora_A.", "") origin_key = origin_key.replace("lora_B.", "") - origin_key = origin_key.replace(f"{model.active_adapter}.", "") + origin_key = origin_key.replace(f"{model.active_adapter}", "base_layer") origin_param = name2param[origin_key] - group_id = self.get_param_group_id(optimizer, origin_param, param) - if group_id == -1: + group_id, check_state = self.get_param_group_id(optimizer, origin_param, param) + if check_state == OptimizerParamCheckState.ORIGIN_PARAM_NOT_FIND: warnings.warn("Origin parameter {origin_key} related to {name} doesn't exist in optimizer param_groups.") - elif group_id >= 0: + elif check_state == OptimizerParamCheckState.ORIGIN_PARAM_FINDED and group_id is not None and group_id >= 0: optimizer.param_groups[group_id]['params'].append(param) def configure( diff --git a/colossalai/pipeline/p2p.py b/colossalai/pipeline/p2p.py index f822c1819adc..29a102be0391 100644 --- a/colossalai/pipeline/p2p.py +++ b/colossalai/pipeline/p2p.py @@ -44,6 +44,20 @@ def _cuda_safe_tensor_to_object(tensor: torch.Tensor, tensor_size: torch.Size) - return unpickle +def check_for_nccl_backend(group): + + pg = group or c10d._get_default_group() + # Gate PG wrapper check on Gloo availability. + if c10d._GLOO_AVAILABLE: + # It is not expected for PG to be wrapped many times, but support it just + # in case + while isinstance(pg, c10d._ProcessGroupWrapper): + pg = pg.wrapped_pg + + return ( + c10d.is_nccl_available() and + pg.name() == c10d.Backend.NCCL + ) def _broadcast_object_list( object_list: List[Any], src: int, group: ProcessGroup, device: Optional[Union[torch.device, str, int]] = None @@ -65,7 +79,7 @@ def _broadcast_object_list( c10d._warn_not_in_group("broadcast_object_list") return - is_nccl_backend = c10d._check_for_nccl_backend(group) + is_nccl_backend = check_for_nccl_backend(group) current_device = None if device is not None: diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 5af31177004f..29a17ce7fb1a 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -18,5 +18,5 @@ SentencePiece ninja flash_attn==2.0.5 datasets -peft +peft>=0.7.1 #auto-gptq now not support torch1.12 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 19cb7a154a01..db9c9908c554 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -14,3 +14,4 @@ einops sentencepiece google protobuf +peft>=0.7.1 \ No newline at end of file
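
The replacement target introduced in [PATCH 15/15] (swapping the active adapter name for "base_layer") follows the parameter naming that recent peft releases use for injected LoRA adapters. The following is a rough, self-contained sketch of the name mapping that add_lora_params_to_optimizer performs; the module path "base_model.model.proj" and the adapter name "default" are illustrative assumptions, not values taken from these patches:

# Sketch of the LoRA-name -> base-parameter-name mapping assumed by the patch.
# For a peft>=0.7 LoRA model, named_parameters() typically contains entries such as:
#   base_model.model.proj.base_layer.weight       <- frozen original weight
#   base_model.model.proj.lora_A.default.weight   <- trainable adapter weight
#   base_model.model.proj.lora_B.default.weight
lora_name = "base_model.model.proj.lora_A.default.weight"
active_adapter = "default"

origin_key = lora_name.replace("lora_A.", "").replace("lora_B.", "")
origin_key = origin_key.replace(active_adapter, "base_layer")
print(origin_key)  # -> base_model.model.proj.base_layer.weight

get_param_group_id then looks up the param_group that already holds this base_layer parameter, and the lora_A/lora_B tensors are appended to that same group, so user-defined param_groups (per-layer learning rates, weight-decay groups, and so on) keep covering the matching adapter weights.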
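End to end, the workflow exercised by the updated tests is: create the optimizer (now allowed before LoRA is enabled), call booster.enable_lora() before booster.boost(), train, and save only the adapter weights with booster.save_lora_as_pretrained(). Below is a minimal single-process sketch of that flow; the toy module, the target module name "proj", and the checkpoint directory are placeholder assumptions rather than values from the patches.

# Minimal usage sketch (assumptions: one CUDA device, peft>=0.7.1 installed,
# and a toy model standing in for the test suite's model_fn()).
import torch
import torch.nn as nn
from peft import LoraConfig

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.nn.optimizer import HybridAdam


class ToyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

    def forward(self, x):
        return self.proj(x)


colossalai.launch(config=dict(), rank=0, world_size=1, host="localhost", port=29500)

plugin = LowLevelZeroPlugin(stage=2, max_norm=1.0, initial_scale=2**5)
booster = Booster(plugin=plugin)

model = ToyNet().cuda()
# With this series the optimizer may be created before enable_lora();
# the plugin appends the new lora_A/lora_B parameters to the matching param_group during configure().
optimizer = HybridAdam(model.parameters(), lr=1e-3)

lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["proj"])
model = booster.enable_lora(model, lora_config=lora_config)

criterion = lambda out: out.mean()
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

x = torch.rand(4, 8).cuda()
loss = criterion(model(x))
booster.backward(loss, optimizer)
optimizer.step()

# Save only the LoRA adapters; they can be reloaded on a fresh model with
# booster.enable_lora(new_model, pretrained_dir="./lora_ckpt", lora_config=lora_config).
booster.save_lora_as_pretrained(model, "./lora_ckpt")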