From 93a7e7bbf99f67f3475a290bcc4e5aa4ac4d6f24 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Tue, 24 Oct 2023 17:57:30 +0800
Subject: [PATCH 01/11] add apis and peft requirement

---
 colossalai/booster/booster.py                     | 15 +++++++++++++++
 colossalai/checkpoint_io/general_checkpoint_io.py |  5 +++++
 requirements/requirements-test.txt                |  1 +
 tests/test_lora/test_ddp_lora.py                  |  0
 4 files changed, 21 insertions(+)
 create mode 100644 tests/test_lora/test_ddp_lora.py

diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index d73bc5babd80..78d68c9cccfe 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -221,6 +221,15 @@ def no_sync(self, model: nn.Module = None, optimizer: OptimizerWrapper = None) -
         assert self.plugin.support_no_sync(), f"The plugin {self.plugin.__class__.__name__} does not support no_sync."
         return self.plugin.no_sync(model, optimizer)
 
+    def enable_lora(self, model: nn.Module) -> nn.Module:
+        # Arguments for lora configs should be passed to this function.
+
+        # 1. Check whether peft can be imported
+        # 2. Check whether plugin supports LoRA
+        # 3. Enable lora for ckpt_io and plugin
+        # 4. Create LORAConfig and wrap model with get_peft_model()
+        pass
+
     def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True) -> None:
         """Load model from checkpoint.
 
@@ -323,3 +332,9 @@ def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str) -> None:
             checkpoint (str): Path to the checkpoint. It must be a local file path.
         """
         self.checkpoint_io.load_lr_scheduler(lr_scheduler, checkpoint)
+
+    def load_lora(self, model: nn.Module, checkpoint: str) -> None:
+        pass
+
+    def save_lora(self, model: nn.Module, checkpoint: str) -> None:
+        pass
diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py
index a652d9b4538e..954ed06ff894 100644
--- a/colossalai/checkpoint_io/general_checkpoint_io.py
+++ b/colossalai/checkpoint_io/general_checkpoint_io.py
@@ -228,3 +228,8 @@ def load_sharded_model(
                 self.__class__.__name__, "\n\t".join(error_msgs)
             )
         )
+
+    def is_lora_model(self, model: nn.Module) -> bool:
+        # import PeftModel from peft (remember to check ImportError)
+        # and then check model using isinstance(model, PeftModel)
+        pass
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 467f83610eb0..dd3da281f7fd 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -18,4 +18,5 @@ SentencePiece
 ninja
 flash_attn==2.0.5
 datasets
+peft
 #auto-gptq now not support torch1.12
diff --git a/tests/test_lora/test_ddp_lora.py b/tests/test_lora/test_ddp_lora.py
new file mode 100644
index 000000000000..e69de29bb2d1

From 8322a4e5c7c0b089c10f1a7b2cef711ae56422a3 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Wed, 25 Oct 2023 12:27:44 +0800
Subject: [PATCH 02/11] add license and implement apis

---
 LICENSE                                       | 16 +++++++
 colossalai/booster/booster.py                 | 46 ++++++++++++++++---
 colossalai/booster/plugin/plugin_base.py      | 12 ++++-
 colossalai/booster/plugin/torch_ddp_plugin.py | 15 +++++-
 4 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/LICENSE b/LICENSE
index b3eb43520a6f..24c36ae10f22 100644
--- a/LICENSE
+++ b/LICENSE
@@ -527,3 +527,19 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
+
+   ---------------- LICENSE FOR peft ----------------
+
+   from PEFT TEAM:
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       https://github.com/huggingface/peft/blob/main/LICENSE
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 78d68c9cccfe..2a08645536a8 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -221,14 +221,46 @@ def no_sync(self, model: nn.Module = None, optimizer: OptimizerWrapper = None) -
         assert self.plugin.support_no_sync(), f"The plugin {self.plugin.__class__.__name__} does not support no_sync."
         return self.plugin.no_sync(model, optimizer)
 
-    def enable_lora(self, model: nn.Module) -> nn.Module:
-        # Arguments for lora configs should be passed to this function.
+    def enable_lora(
+        self,
+        model: nn.Module,
+        r: int = 8,
+        target_modules: Optional[Union[List[str], str]] = None,
+        lora_alpha: int = 8,
+        lora_dropout: float = 0.0,
+        fan_in_fan_out: bool = False,
+        bias: str = "none",
+    ) -> nn.Module:
+        """
+        Wrap the passed in model with LoRA modules for training.
+        Lora in ColossalAI is implemented using Huggingface peft library, so the arguments for Lora configuration are identical to those of peft.
 
-        # 1. Check whether peft can be imported
-        # 2. Check whether plugin supports LoRA
-        # 3. Enable lora for ckpt_io and plugin
-        # 4. Create LORAConfig and wrap model with get_peft_model()
-        pass
+        Args:
+            model (nn.Module): The model to be appended with LoRA modules.
+            r (int, optional): Lora attention dimension. Defaults to 8.
+            target_modules (Union[List[str],str], optional): List of names or regex expressions of the modules to apply Lora to.
+                For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. Defaults to None.
+            lora_alpha (int, optional): The alpha parameter for Lora scaling. Defaults to 8.
+            lora_dropout (float, optional): The dropout probability for Lora layers. Defaults to 0.0.
+            fan_in_fan_out (bool, optional): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
+                For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set
+                to `True`. Defaults to False.
+            bias (str, optional): Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the
+                corresponding biases will be updated during training. Be aware that this means that, even when disabling
+                the adapters, the model will not produce the same output as the base model would have without adaptation.
+                Defaults to "none".
+        """
+        assert self.plugin is not None, f"Lora can only enabled when a plugin is provided."
+        assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
+        lora_config = dict(
+            r=r,
+            target_modules=target_modules,
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+            fan_in_fan_out=fan_in_fan_out,
+            bias=bias,
+        )
+        return self.plugin.enable_lora(model, lora_config)
 
     def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True) -> None:
         """Load model from checkpoint.
diff --git a/colossalai/booster/plugin/plugin_base.py b/colossalai/booster/plugin/plugin_base.py
index 4e570cbe8abc..c9c9406769c8 100644
--- a/colossalai/booster/plugin/plugin_base.py
+++ b/colossalai/booster/plugin/plugin_base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Callable, Iterator, List, Optional, Tuple
+from typing import Callable, Dict, Iterator, List, Optional, Tuple
 
 import torch.nn as nn
 from torch.optim import Optimizer
@@ -33,6 +33,10 @@ def control_device(self) -> bool:
     def support_no_sync(self) -> bool:
         pass
 
+    @abstractmethod
+    def support_lora(self) -> bool:
+        pass
+
     @abstractmethod
     def configure(
         self,
@@ -63,6 +67,12 @@ def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[Non
         Context manager to disable gradient synchronization.
         """
 
+    @abstractmethod
+    def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
+        """
+        Add LoRA modules to the model passed in.
+        """
+
     @abstractmethod
     def prepare_dataloader(
         self,
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 738634473dbc..0ccf11c59463 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterator, List, Optional, Tuple
+from typing import Callable, Dict, Iterator, List, Optional, Tuple
 
 import torch.nn as nn
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -173,6 +173,9 @@ def __init__(
     def support_no_sync(self) -> bool:
         return True
 
+    def support_lora(self) -> bool:
+        return True
+
     def control_precision(self) -> bool:
         return False
 
@@ -216,3 +219,13 @@ def get_checkpoint_io(self) -> CheckpointIO:
     def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]:
         assert isinstance(model, TorchDDPModel), "Model is not boosted by TorchDDPPlugin."
         return model.module.no_sync()
+
+    def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
+        assert not isinstance(model, TorchDDPModel), "Lora should be enabled before boosting the model."
+
+        try:
+            from peft import LoraConfig
+        except ImportError:
+            raise ImportError("Please install Huggingface Peft library to enable lora feature in ColossalAI!")
+
+        LoraConfig(**lora_config)
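The peft primitives this patch builds on can be exercised on their own. A minimal sketch, assuming a toy model whose Linear submodules are named "0" and "2" (the model and all names below are illustrative, not from the patches):

    import torch.nn as nn
    from peft import LoraConfig, get_peft_model

    base = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 2))
    # target_modules is matched against submodule names, just like Booster.enable_lora above.
    config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["0", "2"])
    peft_model = get_peft_model(base, config)
    peft_model.print_trainable_parameters()  # only the injected lora_A/lora_B matrices are trainable

Note that at this point in the series TorchDDPPlugin.enable_lora only validates the config and returns None; the actual wrapping with get_peft_model lands in the next commit.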
From 8559144105ef355a9eb5468e299cca61e0adaff3 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Wed, 25 Oct 2023 17:55:22 +0800
Subject: [PATCH 03/11] add checkpointio apis

---
 colossalai/booster/booster.py                 | 48 +++++++++++++++++--
 colossalai/booster/plugin/plugin_base.py      |  2 +-
 colossalai/booster/plugin/torch_ddp_plugin.py | 36 ++++++++++----
 .../checkpoint_io/checkpoint_io_base.py       | 26 ++++++++++
 .../checkpoint_io/general_checkpoint_io.py    |  5 --
 5 files changed, 99 insertions(+), 18 deletions(-)

diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 2a08645536a8..6c5b90127851 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -8,6 +8,14 @@
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
 
+SUPPORT_PEFT = False
+try:
+    import peft
+
+    SUPPORT_PEFT = True
+except ImportError:
+    pass
+
 import colossalai.interface.pretrained as pretrained_utils
 from colossalai.checkpoint_io import GeneralCheckpointIO
 from colossalai.interface import ModelWrapper, OptimizerWrapper
@@ -224,12 +232,15 @@ def no_sync(self, model: nn.Module = None, optimizer: OptimizerWrapper = None) -
     def enable_lora(
         self,
         model: nn.Module,
+        task_type: Optional[Union["peft.TaskType", str]] = None,
         r: int = 8,
         target_modules: Optional[Union[List[str], str]] = None,
         lora_alpha: int = 8,
         lora_dropout: float = 0.0,
         fan_in_fan_out: bool = False,
         bias: str = "none",
+        modules_to_save: Optional[List[str]] = None,
+        inference_mode: bool = False,
     ) -> nn.Module:
         """
         Wrap the passed in model with LoRA modules for training.
@@ -237,6 +248,7 @@ def enable_lora(
 
         Args:
             model (nn.Module): The model to be appended with LoRA modules.
+            task_type (Union[peft.TaskType, str], optional): The type of task to perform in peft(For example, TaskType.CAUSAL_LM). Defaults to None.
             r (int, optional): Lora attention dimension. Defaults to 8.
             target_modules (Union[List[str],str], optional): List of names or regex expressions of the modules to apply Lora to.
                 For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. Defaults to None.
@@ -249,16 +261,24 @@ def enable_lora(
                 corresponding biases will be updated during training. Be aware that this means that, even when disabling
                 the adapters, the model will not produce the same output as the base model would have without adaptation.
                 Defaults to "none".
+            modules_to_save (List[str], optional):List of modules apart from LoRA layers to be set as trainable
+                and saved in the final checkpoint. Defaults to None.
+            inference_mode (bool, optional): Whether to use the Peft model in inference mode. Defaults to False.
         """
+        if not SUPPORT_PEFT:
+            raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
         assert self.plugin is not None, f"Lora can only enabled when a plugin is provided."
         assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
         lora_config = dict(
+            task_type=task_type,
             r=r,
             target_modules=target_modules,
             lora_alpha=lora_alpha,
             lora_dropout=lora_dropout,
             fan_in_fan_out=fan_in_fan_out,
             bias=bias,
+            modules_to_save=modules_to_save,
+            inference_mode=inference_mode,
         )
         return self.plugin.enable_lora(model, lora_config)
 
@@ -365,8 +385,28 @@ def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str) -> None:
         """
         self.checkpoint_io.load_lr_scheduler(lr_scheduler, checkpoint)
 
-    def load_lora(self, model: nn.Module, checkpoint: str) -> None:
-        pass
+    def save_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False) -> None:
+        """
+        Save the lora adapters and adapter configuration file to checkpoint directory.
+
+        Args:
+            model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
+            checkpoint (str): Path to the checkpoint directory. It must be a local path.
+            use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
+        """
+        if not SUPPORT_PEFT:
+            raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
+        self.checkpoint_io.save_lora(model, checkpoint, use_safetensors)
+
+    def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
+        """
+        Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
 
-    def save_lora(self, model: nn.Module, checkpoint: str) -> None:
-        pass
+        Args:
+            model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
+            checkpoint (str): Path to the checkpoint directory. It must be a local path.
+            use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
+        """
+        if not SUPPORT_PEFT:
+            raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
+        self.checkpoint_io.load_lora(model, checkpoint)
diff --git a/colossalai/booster/plugin/plugin_base.py b/colossalai/booster/plugin/plugin_base.py
index c9c9406769c8..d815914bf491 100644
--- a/colossalai/booster/plugin/plugin_base.py
+++ b/colossalai/booster/plugin/plugin_base.py
@@ -70,7 +70,7 @@ def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[Non
     @abstractmethod
     def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
         """
-        Add LoRA modules to the model passed in.
+        Add LoRA modules to the model passed in. Should only be called in booster.enable_lora().
         """
 
     @abstractmethod
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 0ccf11c59463..77eb23a984ef 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -1,4 +1,4 @@
-from typing import Callable, Dict, Iterator, List, Optional, Tuple
+from typing import Callable, Dict, Iterator, List, Optional, Tuple, Union
 
 import torch.nn as nn
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -116,6 +116,30 @@ def load_sharded_optimizer(
         assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!"
         super().load_sharded_optimizer(optimizer.unwrap(), index_file_path, prefix)
 
+    def save_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False) -> None:
+        """
+        Save the lora adapters and adapter configuration file to checkpoint directory.
+        """
+        from peft import PeftModel
+
+        assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
+        if self.coordinator.is_master():
+            peft_model = model.unwrap()
+            assert isinstance(peft_model, PeftModel), "Please use save_lora method when lora is enabled."
+            peft_model.save_pretrained(save_directory=checkpoint, safe_serialization=use_safetensors)
+
+    def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
+        """
+        Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
+        """
+        from peft import PeftModel
+
+        assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
+        if self.coordinator.is_master():
+            peft_model = model.unwrap()
+            assert isinstance(peft_model, PeftModel), "Please use load_lora method when lora is enabled."
+            # peft_model.from_pretrained()
+
 
 class TorchDDPModel(ModelWrapper):
     def __init__(self, module: nn.Module, *args, **kwargs) -> None:
@@ -221,11 +245,7 @@ def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[Non
         return model.module.no_sync()
 
     def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
-        assert not isinstance(model, TorchDDPModel), "Lora should be enabled before boosting the model."
-
-        try:
-            from peft import LoraConfig
-        except ImportError:
-            raise ImportError("Please install Huggingface Peft library to enable lora feature in ColossalAI!")
+        from peft import LoraConfig, get_peft_model
 
-        LoraConfig(**lora_config)
+        assert not isinstance(model, TorchDDPModel), "Lora should be enabled before boosting the model."
+        return get_peft_model(model, LoraConfig(**lora_config))
diff --git a/colossalai/checkpoint_io/checkpoint_io_base.py b/colossalai/checkpoint_io/checkpoint_io_base.py
index 780117598e18..58f5688a307c 100644
--- a/colossalai/checkpoint_io/checkpoint_io_base.py
+++ b/colossalai/checkpoint_io/checkpoint_io_base.py
@@ -327,3 +327,29 @@ def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
         """
         state_dict = torch.load(checkpoint)
         lr_scheduler.load_state_dict(state_dict)
+
+    # ========================================================
+    # Abstract methods for lora loading/saving implementation
+    # ========================================================
+
+    @abstractmethod
+    def save_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False) -> None:
+        """
+        Save the lora adapters and adapter configuration file to checkpoint directory.
+
+        Args:
+            model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
+            checkpoint (str): Path to the checkpoint directory. It must be a local path.
+            use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
+        """
+
+    @abstractmethod
+    def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
+        """
+        Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
+
+        Args:
+            model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
+            checkpoint (str): Path to the checkpoint directory. It must be a local path.
+            use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
+        """
diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py
index 954ed06ff894..a652d9b4538e 100644
--- a/colossalai/checkpoint_io/general_checkpoint_io.py
+++ b/colossalai/checkpoint_io/general_checkpoint_io.py
@@ -228,8 +228,3 @@ def load_sharded_model(
                 self.__class__.__name__, "\n\t".join(error_msgs)
             )
         )
-
-    def is_lora_model(self, model: nn.Module) -> bool:
-        # import PeftModel from peft (remember to check ImportError)
-        # and then check model using isinstance(model, PeftModel)
-        pass
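The SUPPORT_PEFT probe introduced above is the standard optional-dependency pattern; isolated here for clarity (the require_peft helper is made up for illustration, not part of the patch):

    SUPPORT_PEFT = False
    try:
        import peft  # noqa: F401

        SUPPORT_PEFT = True
    except ImportError:
        pass


    def require_peft() -> None:  # illustrative helper, not in the PR
        if not SUPPORT_PEFT:
            raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")

Probing once at import time keeps peft optional for users who never touch LoRA, while each public entry point re-checks and fails with an actionable message.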
From 4ca87bfb765b282e69c31a57bd6dd08f760b049a Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Thu, 26 Oct 2023 14:29:15 +0800
Subject: [PATCH 04/11] add torchddp fwd_bwd test

---
 colossalai/booster/booster.py          | 18 +++++--
 tests/test_lora/test_ddp_lora.py       |  0
 tests/test_lora/test_torch_ddp_lora.py | 74 ++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 4 deletions(-)
 delete mode 100644 tests/test_lora/test_ddp_lora.py
 create mode 100644 tests/test_lora/test_torch_ddp_lora.py

diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 6c5b90127851..a4a904e59792 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -233,14 +233,16 @@ def enable_lora(
         self,
         model: nn.Module,
         task_type: Optional[Union["peft.TaskType", str]] = None,
+        inference_mode: bool = False,
         r: int = 8,
         target_modules: Optional[Union[List[str], str]] = None,
         lora_alpha: int = 8,
         lora_dropout: float = 0.0,
         fan_in_fan_out: bool = False,
         bias: str = "none",
         modules_to_save: Optional[List[str]] = None,
-        inference_mode: bool = False,
+        layers_to_transform: Optional[Union[List[int], int]] = None,
+        layers_pattern: Optional[str] = None,
     ) -> nn.Module:
         """
         Wrap the passed in model with LoRA modules for training.
@@ -250,7 +252,9 @@ def enable_lora(
 
         Args:
             model (nn.Module): The model to be appended with LoRA modules.
-            task_type (Union[peft.TaskType, str], optional): The type of task to perform in peft(For example, TaskType.CAUSAL_LM). Defaults to None.
+            task_type (Union[peft.TaskType, str], optional): The type of task to perform in peft. Available task types in string include "SEQ_CLS", "SEQ_2_SEQ_LM", "CAUSAL_LM",
+                "TOKEN_CLS", "QUESTION_ANS", and "FEATURE_EXTRACTION". Defaults to None.
+            inference_mode (bool, optional): Whether to use the Peft model in inference mode. Defaults to False.
             r (int, optional): Lora attention dimension. Defaults to 8.
             target_modules (Union[List[str],str], optional): List of names or regex expressions of the modules to apply Lora to.
@@ -267,11 +271,15 @@ def enable_lora(
                 Defaults to "none".
             modules_to_save (List[str], optional):List of modules apart from LoRA layers to be set as trainable
                 and saved in the final checkpoint. Defaults to None.
-            inference_mode (bool, optional): Whether to use the Peft model in inference mode. Defaults to False.
+            layers_to_transform (Union[List[int],int], optional): The layer indexes to transform, if this argument is specified,
+                it will apply the LoRA transformations on the layer indexes that are specified in this list. If a single integer
+                is passed, it will apply the LoRA transformations on the layer at this index. Defaults to None.
+            layers_pattern (str, optional): The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
+                pattern is not in the common layers pattern. Defaults to None.
         """
         if not SUPPORT_PEFT:
             raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
@@ -281,13 +289,15 @@ def enable_lora(
         assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
         lora_config = dict(
             task_type=task_type,
+            inference_mode=inference_mode,
             r=r,
             target_modules=target_modules,
             lora_alpha=lora_alpha,
             lora_dropout=lora_dropout,
             fan_in_fan_out=fan_in_fan_out,
             bias=bias,
             modules_to_save=modules_to_save,
-            inference_mode=inference_mode,
+            layers_to_transform=layers_to_transform,
+            layers_pattern=layers_pattern,
         )
         return self.plugin.enable_lora(model, lora_config)
diff --git a/tests/test_lora/test_ddp_lora.py b/tests/test_lora/test_ddp_lora.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/test_lora/test_torch_ddp_lora.py b/tests/test_lora/test_torch_ddp_lora.py
new file mode 100644
index 000000000000..84b97be374fe
--- /dev/null
+++ b/tests/test_lora/test_torch_ddp_lora.py
@@ -0,0 +1,74 @@
+import copy
+
+import torch
+from torch import distributed as dist
+from torch.optim import AdamW
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import TorchDDPPlugin
+from colossalai.testing import assert_equal, assert_not_equal, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import model_zoo
+
+
+def check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type):
+    model = model_fn()
+
+    plugin = TorchDDPPlugin()
+    booster = Booster(plugin=plugin)
+    model = booster.enable_lora(model, task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1)
+    model_copy = copy.deepcopy(model)
+
+    optimizer = AdamW(model.parameters(), lr=0.001)
+    criterion = loss_fn
+
+    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+    data = data_gen_fn()
+    data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
+
+    output = model(**data)
+    output = output_transform_fn(output)
+    loss = criterion(output)
+
+    booster.backward(loss, optimizer)
+    optimizer.clip_grad_by_norm(1.0)
+    optimizer.step()
+
+    if dist.get_rank() == 0:
+        for (n1, p1), (n2, p2) in zip(model.named_parameters(), model_copy.named_parameters()):
+            if "lora_" in n1:
+                # lora modules require gradients, thus updated
+                assert p1.requires_grad
+                assert_not_equal(p1.to(p2.device), p2)
+            else:
+                if not p1.requires_grad:
+                    assert_equal(p1.to(p2.device), p2)
+
+    # # test saving and loading
+    # with shared_tempdir() as tempdir:
+    #     booster.save_lora(model, f"{tempdir}/model")
+    torch.cuda.empty_cache()
+
+
+def run_lora_test():
+    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
+    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        task_type = None
+        if name == "transformers_llama_for_casual_lm":
+            task_type = "CAUSAL_LM"
+        if name == "transformers_llama_for_sequence_classification":
+            task_type = "SEQ_CLS"
+        check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type)
+        # check_checkpoint(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type)
+
+
+def run_dist(rank, world_size, port):
+    config = {}
+    colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_lora_test()
+
+
+@rerun_if_address_is_in_use()
+def test_torch_ddp_lora():
+    spawn(run_dist, 2)
From 400f848a7e6323b31aa7681f82812a0585f2484b Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Thu, 26 Oct 2023 14:36:04 +0800
Subject: [PATCH 05/11] add support_lora methods

---
 colossalai/booster/plugin/gemini_plugin.py          | 3 +++
 colossalai/booster/plugin/hybrid_parallel_plugin.py | 3 +++
 colossalai/booster/plugin/low_level_zero_plugin.py  | 3 +++
 colossalai/booster/plugin/torch_fsdp_plugin.py      | 5 ++++-
 4 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index 20a931b816ea..caaf59e59f0b 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -360,6 +360,9 @@ def __init__(
     def support_no_sync(self) -> bool:
         return False
 
+    def support_lora(self) -> bool:
+        return False
+
     def control_precision(self) -> bool:
         return True
 
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 72c3ec46ae75..0686a5c0ea0e 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -753,6 +753,9 @@ def control_precision(self) -> bool:
     def support_no_sync(self) -> bool:
         return False
 
+    def support_lora(self) -> bool:
+        return False
+
     def control_checkpoint_io(self) -> bool:
         return True
 
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index dc78fe8c094c..3c330815cdf3 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -295,6 +295,9 @@ def __init__(
     def support_no_sync(self) -> bool:
         return self.stage == 1
 
+    def support_lora(self) -> bool:
+        return False
+
     def control_precision(self) -> bool:
         return True
 
diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py
index 2ea7593a5cc5..3a574e3dac21 100644
--- a/colossalai/booster/plugin/torch_fsdp_plugin.py
+++ b/colossalai/booster/plugin/torch_fsdp_plugin.py
@@ -190,7 +190,10 @@ def __init__(
             raise RuntimeError("FSDP is not supported while torch version under 1.12.0.")
 
     def support_no_sync(self) -> bool:
-        False
+        return False
+
+    def support_lora(self) -> bool:
+        return False
 
     def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]:
         raise NotImplementedError("Torch fsdp no_sync func not supported yet.")
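Aside from adding support_lora, the FSDP hunk above also fixes a silent pre-existing bug: a bare `False` is an expression statement, so the old method evaluated it, discarded it, and returned None. A self-contained illustration:

    class Before:
        def support_no_sync(self) -> bool:
            False  # no-op statement; the method falls through and returns None

    class After:
        def support_no_sync(self) -> bool:
            return False

    assert Before().support_no_sync() is None
    assert After().support_no_sync() is False

The old version only behaved correctly by accident, because None is falsy.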
From 002635dcb5ade7dc94b25eb746337f6833797372 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Thu, 26 Oct 2023 17:20:42 +0800
Subject: [PATCH 06/11] add checkpointio test and debug

---
 colossalai/booster/booster.py                 | 45 ++++++++------
 colossalai/booster/plugin/torch_ddp_plugin.py | 34 +++++++----
 .../checkpoint_io/checkpoint_io_base.py       | 26 ++++++++
 tests/test_lora/test_torch_ddp_lora.py        | 60 ++++++++++++++-----
 4 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index a4a904e59792..68eb810a08c6 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -232,6 +232,7 @@ def enable_lora(
     def enable_lora(
         self,
         model: nn.Module,
+        pretrained_dir: Optional[str] = None,
         task_type: Optional[Union["peft.TaskType", str]] = None,
         inference_mode: bool = False,
         r: int = 8,
@@ -245,11 +246,14 @@ def enable_lora(
     ) -> nn.Module:
         """
-        Wrap the passed in model with LoRA modules for training.
-        Lora in ColossalAI is implemented using Huggingface peft library, so the arguments for Lora configuration are identical to those of peft.
+        Wrap the passed in model with LoRA modules for training. If pretrained directory is provided, load lora configs and weights are loaded from that directory.
+        Lora in ColossalAI is implemented using Huggingface peft library, so the arguments for Lora configuration are same as those of peft.
 
         Args:
             model (nn.Module): The model to be appended with LoRA modules.
+            pretrained_dir(str, optional): The path to the pretrained directory, can be a local directory
+                or model_id of a PEFT configuration hosted inside a model repo on the Hugging Face Hub.
+                When set to None, create new lora configs and weights for the model. Defaults to None.
             task_type (Union[peft.TaskType, str], optional): The type of task to perform in peft. Available task types in string include "SEQ_CLS", "SEQ_2_SEQ_LM", "CAUSAL_LM",
                 "TOKEN_CLS", "QUESTION_ANS", and "FEATURE_EXTRACTION". Defaults to None.
@@ -282,6 +286,7 @@ def enable_lora(
         assert self.plugin is not None, f"Lora can only enabled when a plugin is provided."
         assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
         lora_config = dict(
+            pretrained_dir=pretrained_dir,
             task_type=task_type,
             inference_mode=inference_mode,
             r=r,
@@ -400,22 +405,28 @@ def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str) -> None:
         """
         self.checkpoint_io.load_lr_scheduler(lr_scheduler, checkpoint)
 
-    def save_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False) -> None:
+    def save_lora_as_pretrained(
+        self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False
+    ) -> None:
         """
-        Save the lora adapters and adapter configuration file to checkpoint directory.
+        Save the lora adapters and adapter configuration file to a pretrained checkpoint directory.
 
         Args:
             model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
             checkpoint (str): Path to the checkpoint directory. It must be a local path.
             use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
         """
         if not SUPPORT_PEFT:
             raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
-        self.checkpoint_io.save_lora(model, checkpoint, use_safetensors)
-
-    def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
-        """
-        Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
-
-        Args:
-            model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
-            checkpoint (str): Path to the checkpoint directory. It must be a local path.
-            use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
-        """
-        if not SUPPORT_PEFT:
-            raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
-        self.checkpoint_io.load_lora(model, checkpoint)
+        self.checkpoint_io.save_lora_as_pretrained(model, checkpoint, use_safetensors)
+
+    # def load_lora_from_pretrained(self, model: nn.Module, checkpoint: str) -> None:
+    #     """
+    #     Instantiate a PEFT model from a pretrained model and load PEFT weights from pretrained checkpoint.
+
+    #     Args:
+    #         model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
+    #         checkpoint (str): Path to the checkpoint directory. It must be a local path.
+    #     """
+    #     if not SUPPORT_PEFT:
+    #         raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
+
+    #     from peft import PeftModel
+    #     assert (not isinstance(model, PeftModel)) and (not isinstance(model, ModelWrapper)), f"Lora should be loaded before boosting or enabling lora."
+    #     self.checkpoint_io.load_lora_from_pretrained(model, checkpoint)
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 77eb23a984ef..7851290370e8 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -116,7 +116,9 @@ def load_sharded_optimizer(
         assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!"
         super().load_sharded_optimizer(optimizer.unwrap(), index_file_path, prefix)
 
-    def save_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False) -> None:
+    def save_lora_as_pretrained(
+        self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False
+    ) -> None:
         """
         Save the lora adapters and adapter configuration file to checkpoint directory.
         """
@@ -128,17 +130,17 @@ def save_lora_as_pretrained(
         assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
         if self.coordinator.is_master():
             peft_model = model.unwrap()
             assert isinstance(peft_model, PeftModel), "Please use save_lora method when lora is enabled."
             peft_model.save_pretrained(save_directory=checkpoint, safe_serialization=use_safetensors)
 
-    def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
-        """
-        Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
-        """
-        from peft import PeftModel
-
-        assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
-        if self.coordinator.is_master():
-            peft_model = model.unwrap()
-            assert isinstance(peft_model, PeftModel), "Please use load_lora method when lora is enabled."
-            # peft_model.from_pretrained()
+    # def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
+    #     """
+    #     Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
+    #     """
+    #     from peft import PeftModel
+
+    #     assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
+    #     if self.coordinator.is_master():
+    #         peft_model = model.unwrap()
+    #         assert isinstance(peft_model, PeftModel), "Please use load_lora method when lora is enabled."
+    #         PeftModel.from_pretrained(peft_model, checkpoint)
 
 
 class TorchDDPModel(ModelWrapper):
     def __init__(self, module: nn.Module, *args, **kwargs) -> None:
@@ -245,9 +247,15 @@ def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[Non
         return model.module.no_sync()
 
     def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
-        from peft import LoraConfig, get_peft_model
+        from peft import LoraConfig, PeftModel, get_peft_model
 
         assert not isinstance(model, TorchDDPModel), "Lora should be enabled before boosting the model."
-        return get_peft_model(model, LoraConfig(**lora_config))
+
+        pretrained_dir = lora_config.pop("pretrained_dir")
+
+        if pretrained_dir is None:
+            return get_peft_model(model, LoraConfig(**lora_config))
+        else:
+            return PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True)
diff --git a/colossalai/checkpoint_io/checkpoint_io_base.py b/colossalai/checkpoint_io/checkpoint_io_base.py
index 58f5688a307c..b843b3a21583 100644
--- a/colossalai/checkpoint_io/checkpoint_io_base.py
+++ b/colossalai/checkpoint_io/checkpoint_io_base.py
@@ -328,14 +328,17 @@ def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
         state_dict = torch.load(checkpoint)
         lr_scheduler.load_state_dict(state_dict)
 
-    # ========================================================
-    # Abstract methods for lora loading/saving implementation
-    # ========================================================
+    # ================================================================================
+    # Abstract methods for lora saving implementation.
+    # Loading lora is standard for different CheckpointIOs, so it needn't be abstract.
+    # ================================================================================
 
     @abstractmethod
-    def save_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False) -> None:
+    def save_lora_as_pretrained(
+        self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False
+    ) -> None:
         """
-        Save the lora adapters and adapter configuration file to checkpoint directory.
+        Save the lora adapters and adapter configuration file to a pretrained checkpoint directory.
 
         Args:
             model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
@@ -346,12 +349,11 @@ def save_lora_as_pretrained(
             use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
         """
 
-    @abstractmethod
-    def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
-        """
-        Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
-
-        Args:
-            model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
-            checkpoint (str): Path to the checkpoint directory. It must be a local path.
-            use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
-        """
+    # def load_lora_from_pretrained(self, model: nn.Module, checkpoint: str) -> None:
+    #     """
+    #     Instantiate a PEFT model from a pretrained model and load PEFT weights from pretrained checkpoint.
+
+    #     Args:
+    #         model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
+    #         checkpoint (str): Path to the checkpoint directory. It must be a local path.
+    #     """
diff --git a/tests/test_lora/test_torch_ddp_lora.py b/tests/test_lora/test_torch_ddp_lora.py
index 84b97be374fe..fa6966167b5c 100644
--- a/tests/test_lora/test_torch_ddp_lora.py
+++ b/tests/test_lora/test_torch_ddp_lora.py
@@ -1,4 +1,5 @@
 import copy
+import os
 
 import torch
 from torch import distributed as dist
@@ -7,10 +8,19 @@
 import colossalai
 from colossalai.booster import Booster
 from colossalai.booster.plugin import TorchDDPPlugin
-from colossalai.testing import assert_equal, assert_not_equal, rerun_if_address_is_in_use, spawn
+from colossalai.testing import (
+    assert_equal,
+    assert_not_equal,
+    check_state_dict_equal,
+    clear_cache_before_run,
+    rerun_if_address_is_in_use,
+    spawn,
+)
 from tests.kit.model_zoo import model_zoo
+from tests.test_checkpoint_io.utils import shared_tempdir
 
 
+@clear_cache_before_run()
 def check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type):
     model = model_fn()
 
@@ -35,20 +45,40 @@ def check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type
     optimizer.clip_grad_by_norm(1.0)
     optimizer.step()
 
-    if dist.get_rank() == 0:
-        for (n1, p1), (n2, p2) in zip(model.named_parameters(), model_copy.named_parameters()):
-            if "lora_" in n1:
-                # lora modules require gradients, thus updated
-                assert p1.requires_grad
-                assert_not_equal(p1.to(p2.device), p2)
-            else:
-                if not p1.requires_grad:
-                    assert_equal(p1.to(p2.device), p2)
+    for (n1, p1), (n2, p2) in zip(model.named_parameters(), model_copy.named_parameters()):
+        if "lora_" in n1:
+            # lora modules require gradients, thus updated
+            assert p1.requires_grad
+            assert_not_equal(p1.to(p2.device), p2)
+        else:
+            if not p1.requires_grad:
+                assert_equal(p1.to(p2.device), p2)
 
-    # # test saving and loading
-    # with shared_tempdir() as tempdir:
-    #     booster.save_lora(model, f"{tempdir}/model")
-    torch.cuda.empty_cache()
+
+@clear_cache_before_run()
+def check_checkpoint(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type):
+    plugin = TorchDDPPlugin()
+
+    model_save = model_fn()
+    model_load = copy.deepcopy(model_save)
+
+    booster = Booster(plugin=plugin)
+    model_save = booster.enable_lora(model_save, task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1)
+    model_save, _, _, _, _ = booster.boost(model_save)
+
+    with shared_tempdir() as tempdir:
+        lora_ckpt_path = os.path.join(tempdir, "ckpt")
+        booster.save_lora_as_pretrained(model_save, lora_ckpt_path)
+        dist.barrier()
+
+        # The Lora checkpoint should be small in size
+        checkpoint_size_mb = os.path.getsize(os.path.join(lora_ckpt_path, "adapter_model.bin")) / (1024 * 1024)
+        assert checkpoint_size_mb < 1
+
+        model_load = booster.enable_lora(model_load, pretrained_dir=lora_ckpt_path)
+        model_load, _, _, _, _ = booster.boost(model_load)
+
+        check_state_dict_equal(model_save.state_dict(), model_load.state_dict())
 
 
 def run_lora_test():
@@ -60,7 +90,7 @@ def run_lora_test():
         if name == "transformers_llama_for_sequence_classification":
             task_type = "SEQ_CLS"
         check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type)
-        # check_checkpoint(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type)
+        check_checkpoint(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type)
From 53229b7eda432bc99c411e3b6f175046de04ca29 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Thu, 26 Oct 2023 17:25:55 +0800
Subject: [PATCH 07/11] delete unneeded code

---
 colossalai/booster/booster.py                  | 15 ---------------
 colossalai/booster/plugin/torch_ddp_plugin.py  | 12 ------------
 colossalai/checkpoint_io/checkpoint_io_base.py | 12 +-----------
 3 files changed, 1 insertion(+), 38 deletions(-)

diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 68eb810a08c6..e695c54e5033 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -414,18 +414,3 @@ def save_lora_as_pretrained(
         if not SUPPORT_PEFT:
             raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
         self.checkpoint_io.save_lora_as_pretrained(model, checkpoint, use_safetensors)
-
-    # def load_lora_from_pretrained(self, model: nn.Module, checkpoint: str) -> None:
-    #     """
-    #     Instantiate a PEFT model from a pretrained model and load PEFT weights from pretrained checkpoint.
-
-    #     Args:
-    #         model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
-    #         checkpoint (str): Path to the checkpoint directory. It must be a local path.
-    #     """
-    #     if not SUPPORT_PEFT:
-    #         raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
-
-    #     from peft import PeftModel
-    #     assert (not isinstance(model, PeftModel)) and (not isinstance(model, ModelWrapper)), f"Lora should be loaded before boosting or enabling lora."
-    #     self.checkpoint_io.load_lora_from_pretrained(model, checkpoint)
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 7851290370e8..46504705819b 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -130,18 +130,6 @@ def save_lora_as_pretrained(
             assert isinstance(peft_model, PeftModel), "Please use save_lora method when lora is enabled."
             peft_model.save_pretrained(save_directory=checkpoint, safe_serialization=use_safetensors)
 
-    # def load_lora(self, model: Union[nn.Module, ModelWrapper], checkpoint: str) -> None:
-    #     """
-    #     Instantiate a PEFT model from a pretrained model and loaded PEFT weights.
-    #     """
-    #     from peft import PeftModel
-
-    #     assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
-    #     if self.coordinator.is_master():
-    #         peft_model = model.unwrap()
-    #         assert isinstance(peft_model, PeftModel), "Please use load_lora method when lora is enabled."
-    #         PeftModel.from_pretrained(peft_model, checkpoint)
-
 
 class TorchDDPModel(ModelWrapper):
     def __init__(self, module: nn.Module, *args, **kwargs) -> None:
diff --git a/colossalai/checkpoint_io/checkpoint_io_base.py b/colossalai/checkpoint_io/checkpoint_io_base.py
index b843b3a21583..7afc4da38da4 100644
--- a/colossalai/checkpoint_io/checkpoint_io_base.py
+++ b/colossalai/checkpoint_io/checkpoint_io_base.py
@@ -329,8 +329,7 @@ def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
         lr_scheduler.load_state_dict(state_dict)
 
     # ================================================================================
-    # Abstract methods for lora saving implementation.
-    # Loading lora is standard for different CheckpointIOs, so it needn't be abstract.
+    # Abstract method for lora saving implementation.
     # ================================================================================
 
     @abstractmethod
@@ -345,12 +344,3 @@ def save_lora_as_pretrained(
             checkpoint (str): Path to the checkpoint directory. It must be a local path.
             use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
         """
-
-    # def load_lora_from_pretrained(self, model: nn.Module, checkpoint: str) -> None:
-    #     """
-    #     Instantiate a PEFT model from a pretrained model and load PEFT weights from pretrained checkpoint.
-
-    #     Args:
-    #         model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
-    #         checkpoint (str): Path to the checkpoint directory. It must be a local path.
-    #     """

From 9d733f37c9603cacba03c305e7556ce613f3bf76 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Fri, 27 Oct 2023 11:11:33 +0800
Subject: [PATCH 08/11] remove peft from LICENSE

---
 LICENSE                       | 16 ----------------
 colossalai/booster/booster.py |  6 ++++--
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/LICENSE b/LICENSE
index 24c36ae10f22..b3eb43520a6f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -527,19 +527,3 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
-
-   ---------------- LICENSE FOR peft ----------------
-
-   from PEFT TEAM:
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       https://github.com/huggingface/peft/blob/main/LICENSE
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index e695c54e5033..7c414a137052 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -246,7 +246,7 @@ def enable_lora(
     ) -> nn.Module:
         """
-        Wrap the passed in model with LoRA modules for training. If pretrained directory is provided, load lora configs and weights are loaded from that directory.
+        Wrap the passed in model with LoRA modules for training. If pretrained directory is provided, lora configs and weights are loaded from that directory.
         Lora in ColossalAI is implemented using Huggingface peft library, so the arguments for Lora configuration are same as those of peft.
 
         Args:
@@ -279,7 +279,7 @@ def enable_lora(
         """
         if not SUPPORT_PEFT:
             raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
-        assert self.plugin is not None, f"Lora can only enabled when a plugin is provided."
+        assert self.plugin is not None, f"Lora can only be enabled when a plugin is provided."
         assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
         lora_config = dict(
             pretrained_dir=pretrained_dir,
@@ -413,4 +413,6 @@ def save_lora_as_pretrained(
         """
         if not SUPPORT_PEFT:
             raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
+        assert self.plugin is not None, f"Lora can only be enabled when a plugin is provided."
+        assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
         self.checkpoint_io.save_lora_as_pretrained(model, checkpoint, use_safetensors)
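At this point the plugin-side contract has stabilized. Reduced to a sketch (the class name is invented and the body is lifted from TorchDDPPlugin above; a real plugin subclasses colossalai.booster.plugin.Plugin):

    from typing import Dict

    import torch.nn as nn


    class LoraCapablePlugin:
        def support_lora(self) -> bool:
            return True

        def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
            from peft import LoraConfig, PeftModel, get_peft_model

            pretrained_dir = lora_config.pop("pretrained_dir")
            if pretrained_dir is None:
                return get_peft_model(model, LoraConfig(**lora_config))
            return PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True)

The next two commits fill this contract in for every plugin — mostly with NotImplementedError for now — and then simplify the dict into a peft.LoraConfig object.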
if self.coordinator.is_master(): peft_model = model.unwrap() - assert isinstance(peft_model, PeftModel), "Please use save_lora method when lora is enabled." + assert isinstance( + peft_model, PeftModel + ), "The model doesn't have lora adapters, please enable lora before saving." peft_model.save_pretrained(save_directory=checkpoint, safe_serialization=use_safetensors) diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py index 3a574e3dac21..884330114560 100644 --- a/colossalai/booster/plugin/torch_fsdp_plugin.py +++ b/colossalai/booster/plugin/torch_fsdp_plugin.py @@ -1,6 +1,6 @@ import warnings from pathlib import Path -from typing import Callable, Iterable, Iterator, List, Optional, Tuple +from typing import Callable, Dict, Iterable, Iterator, List, Optional, Tuple import torch import torch.nn as nn @@ -238,3 +238,6 @@ def control_checkpoint_io(self) -> bool: def get_checkpoint_io(self) -> CheckpointIO: return TorchFSDPCheckpointIO() + + def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module: + raise NotImplementedError diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py index a652d9b4538e..b9253a56dcbb 100644 --- a/colossalai/checkpoint_io/general_checkpoint_io.py +++ b/colossalai/checkpoint_io/general_checkpoint_io.py @@ -228,3 +228,6 @@ def load_sharded_model( self.__class__.__name__, "\n\t".join(error_msgs) ) ) + + def save_lora_as_pretrained(self, model: nn.Module, checkpoint: str, use_safetensors: bool = False) -> None: + raise NotImplementedError From 27ba80e44559dae4f151c58c8775b5659e6ee26a Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 27 Oct 2023 16:25:16 +0800 Subject: [PATCH 10/11] simplify enable_lora api --- colossalai/booster/booster.py | 68 +++++-------------- colossalai/booster/plugin/gemini_plugin.py | 4 +- .../booster/plugin/hybrid_parallel_plugin.py | 4 +- .../booster/plugin/low_level_zero_plugin.py | 4 +- colossalai/booster/plugin/plugin_base.py | 2 +- colossalai/booster/plugin/torch_ddp_plugin.py | 11 ++- .../booster/plugin/torch_fsdp_plugin.py | 4 +- tests/test_lora/test_torch_ddp_lora.py | 8 ++- 8 files changed, 40 insertions(+), 65 deletions(-) diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py index 7c414a137052..c2a72408498b 100644 --- a/colossalai/booster/booster.py +++ b/colossalai/booster/booster.py @@ -230,20 +230,7 @@ def no_sync(self, model: nn.Module = None, optimizer: OptimizerWrapper = None) - return self.plugin.no_sync(model, optimizer) def enable_lora( - self, - model: nn.Module, - pretrained_dir: Optional[str] = None, - task_type: Optional[Union["peft.TaskType", str]] = None, - inference_mode: bool = False, - r: int = 8, - target_modules: Optional[Union[List[str], str]] = None, - lora_alpha: int = 8, - lora_dropout: float = 0.0, - fan_in_fan_out: bool = False, - bias: str = "none", - modules_to_save: Optional[List[str]] = None, - layers_to_transform: Optional[Union[List[int], int]] = None, - layers_pattern: Optional[str] = None, + self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: "peft.LoraConfig" = None ) -> nn.Module: """ Wrap the passed in model with LoRA modules for training. If pretrained directory is provided, lora configs and weights are loaded from that directory. @@ -253,49 +240,26 @@ def enable_lora( model (nn.Module): The model to be appended with LoRA modules. 
pretrained_dir(str, optional): The path to the pretrained directory, can be a local directory or model_id of a PEFT configuration hosted inside a model repo on the Hugging Face Hub. - When set to None, create new lora configs and weights for the model. Defaults to None. - task_type (Union[peft.TaskType, str], optional): The type of task to perform in peft. Available task types in string include "SEQ_CLS", "SEQ_2_SEQ_LM", "CAUSAL_LM", - "TOKEN_CLS", "QUESTION_ANS", and "FEATURE_EXTRACTION". Defaults to None. - inference_mode (bool, optional): Whether to use the Peft model in inference mode. Defaults to False. - r (int, optional): Lora attention dimension. Defaults to 8. - target_modules (Union[List[str],str], optional): List of names or regex expressions of the modules to apply Lora to. - For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. Defaults to None. - lora_alpha (int, optional): The alpha parameter for Lora scaling. Defaults to 8. - lora_dropout (float, optional): The dropout probability for Lora layers. Defaults to 0.0. - fan_in_fan_out (bool, optional): Set this to True if the layer to replace stores weight like (fan_in, fan_out). - For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set - to `True`. Defaults to False. - bias (str, optional): Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the - corresponding biases will be updated during training. Be aware that this means that, even when disabling - the adapters, the model will not produce the same output as the base model would have without adaptation. - Defaults to "none". - modules_to_save (List[str], optional):List of modules apart from LoRA layers to be set as trainable - and saved in the final checkpoint. Defaults to None. - layers_to_transform (Union[List[int],int], optional): The layer indexes to transform, if this argument is specified, - it will apply the LoRA transformations on the layer indexes that are specified in this list. If a single integer - is passed, it will apply the LoRA transformations on the layer at this index. Defaults to None. - layers_pattern (str, optional): The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer - pattern is not in the common layers pattern. Defaults to None. + When set to None, create new lora configs and weights for the model using the passed in lora_config. Defaults to None. + lora_config: (peft.LoraConfig, optional): Passed in LoraConfig for peft. Defaults to None. """ if not SUPPORT_PEFT: raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!") + assert self.plugin is not None, f"Lora can only be enabled when a plugin is provided." assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora." - lora_config = dict( - pretrained_dir=pretrained_dir, - task_type=task_type, - inference_mode=inference_mode, - r=r, - target_modules=target_modules, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - fan_in_fan_out=fan_in_fan_out, - bias=bias, - modules_to_save=modules_to_save, - layers_to_transform=layers_to_transform, - layers_pattern=layers_pattern, - ) - return self.plugin.enable_lora(model, lora_config) + if pretrained_dir is None: + assert ( + lora_config is not None + ), "Please provide configuration for Lora when pretrained directory path isn't passed in." 
+            assert isinstance(
+                lora_config, peft.LoraConfig
+            ), "The passed-in configuration should be an instance of peft.LoraConfig."
+        if lora_config is None:
+            assert (
+                pretrained_dir is not None
+            ), "Please provide a pretrained directory path when no LoRA configuration is passed in."
+        return self.plugin.enable_lora(model, pretrained_dir, lora_config)

     def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True) -> None:
         """Load model from checkpoint.

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index fe0a40cb5fb1..c1193d397113 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -412,5 +412,7 @@ def get_checkpoint_io(self) -> CheckpointIO:
     def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]:
         raise NotImplementedError

-    def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
+    def enable_lora(
+        self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None
+    ) -> nn.Module:
         raise NotImplementedError
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 37c02335ea67..59afe99d5ce3 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -895,5 +895,7 @@ def get_checkpoint_io(self) -> CheckpointIO:
     def no_sync(self, model: Module) -> Iterator[None]:
         raise NotImplementedError

-    def enable_lora(self, model: Module, lora_config: Dict) -> Module:
+    def enable_lora(
+        self, model: Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None
+    ) -> Module:
         raise NotImplementedError
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index b1a2ab68624e..57e445735649 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -340,5 +340,7 @@ def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[Non
         assert isinstance(optimizer, LowLevelZeroOptimizer)
         return optimizer.no_sync()

-    def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
+    def enable_lora(
+        self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None
+    ) -> nn.Module:
         raise NotImplementedError
diff --git a/colossalai/booster/plugin/plugin_base.py b/colossalai/booster/plugin/plugin_base.py
index d815914bf491..6dc0c560d06d 100644
--- a/colossalai/booster/plugin/plugin_base.py
+++ b/colossalai/booster/plugin/plugin_base.py
@@ -68,7 +68,7 @@ def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[Non
         """

     @abstractmethod
-    def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
+    def enable_lora(self, model: nn.Module, pretrained_dir: str, lora_config: Dict) -> nn.Module:
         """
         Add LoRA modules to the model passed in. Should only be called in booster.enable_lora().
         """
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 4778cd0d4d28..9ba520de24f4 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -236,14 +236,13 @@ def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[Non
         assert isinstance(model, TorchDDPModel), "Model is not boosted by TorchDDPPlugin."
         return model.module.no_sync()

-    def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
-        from peft import LoraConfig, PeftModel, get_peft_model
+    def enable_lora(
+        self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None
+    ) -> nn.Module:
+        from peft import PeftModel, get_peft_model

         assert not isinstance(model, TorchDDPModel), "Lora should be enabled before boosting the model."
-
-        pretrained_dir = lora_config.pop("pretrained_dir")
         if pretrained_dir is None:
-            return get_peft_model(model, LoraConfig(**lora_config))
+            return get_peft_model(model, lora_config)
         else:
             return PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True)
diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py
index 884330114560..9bf27397b886 100644
--- a/colossalai/booster/plugin/torch_fsdp_plugin.py
+++ b/colossalai/booster/plugin/torch_fsdp_plugin.py
@@ -239,5 +239,7 @@ def control_checkpoint_io(self) -> bool:
     def get_checkpoint_io(self) -> CheckpointIO:
         return TorchFSDPCheckpointIO()

-    def enable_lora(self, model: nn.Module, lora_config: Dict) -> nn.Module:
+    def enable_lora(
+        self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None
+    ) -> nn.Module:
         raise NotImplementedError
diff --git a/tests/test_lora/test_torch_ddp_lora.py b/tests/test_lora/test_torch_ddp_lora.py
index fa6966167b5c..b3169bf86786 100644
--- a/tests/test_lora/test_torch_ddp_lora.py
+++ b/tests/test_lora/test_torch_ddp_lora.py
@@ -2,6 +2,7 @@
 import os

 import torch
+from peft import LoraConfig
 from torch import distributed as dist
 from torch.optim import AdamW

@@ -23,10 +24,12 @@
 @clear_cache_before_run()
 def check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type):
     model = model_fn()
+    lora_config = LoraConfig(task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1)

     plugin = TorchDDPPlugin()
     booster = Booster(plugin=plugin)
-    model = booster.enable_lora(model, task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1)
+
+    model = booster.enable_lora(model, lora_config=lora_config)
     model_copy = copy.deepcopy(model)

     optimizer = AdamW(model.parameters(), lr=0.001)
@@ -58,12 +61,13 @@ def check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type
 @clear_cache_before_run()
 def check_checkpoint(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type):
     plugin = TorchDDPPlugin()
+    lora_config = LoraConfig(task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1)

     model_save = model_fn()
     model_load = copy.deepcopy(model_save)

     booster = Booster(plugin=plugin)
-    model_save = booster.enable_lora(model_save, task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1)
+    model_save = booster.enable_lora(model_save, lora_config=lora_config)
     model_save, _, _, _, _ = booster.boost(model_save)

     with shared_tempdir() as tempdir:

From 82ddca188c1b77da916f024393279107020cfb32 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Tue, 31 Oct 2023 13:46:46 +0800
Subject: [PATCH 11/11] fix requirements

---
 requirements/requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index dd3da281f7fd..5af31177004f 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -7,7 +7,7 @@ torchvision
 transformers==4.33.0
 timm
 titans
-torchaudio
+torchaudio>=0.13.1
 torchx-nightly==2022.6.29 # torchrec 0.2.0 requires torchx-nightly. This package is updated every day. We fix the version to a specific date to avoid breaking changes.
 torchrec==0.2.0
 contexttimer
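
Usage sketch (not part of the patch series above): a minimal end-to-end walk through the
finalized API, assuming a TorchDDPPlugin and a distributed launch via torchrun. The base
model and checkpoint path are illustrative, and the save call goes through the booster's
checkpoint_io attribute because only the checkpoint-IO half of save_lora_as_pretrained is
visible in this series; a Booster-level wrapper, if one exists, would be the friendlier
entry point.

    # Assumes the peft/transformers pins from requirements-test.txt.
    from peft import LoraConfig
    from torch.optim import AdamW
    from transformers import AutoModelForCausalLM

    import colossalai
    from colossalai.booster import Booster
    from colossalai.booster.plugin import TorchDDPPlugin

    colossalai.launch_from_torch(config={})
    booster = Booster(plugin=TorchDDPPlugin())

    # Path 1: build fresh adapters from a peft.LoraConfig (pretrained_dir stays None).
    model = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative base model
    lora_config = LoraConfig(task_type="CAUSAL_LM", r=8, lora_alpha=32, lora_dropout=0.1)
    model = booster.enable_lora(model, lora_config=lora_config)
    model, optimizer, *_ = booster.boost(model, AdamW(model.parameters(), lr=1e-3))

    # ... training loop ...

    # Persist only the adapter weights, in peft's pretrained format.
    booster.checkpoint_io.save_lora_as_pretrained(model, "./lora_ckpt")

    # Path 2: resume from saved adapters instead of passing a config (lora_config stays None).
    base = AutoModelForCausalLM.from_pretrained("gpt2")
    model = booster.enable_lora(base, pretrained_dir="./lora_ckpt")

Note the ordering constraint baked into TorchDDPPlugin.enable_lora: it asserts that the
model is not yet a TorchDDPModel, so enable_lora must always run before booster.boost().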