From 88502370f044db726689f21c48a2cfa1bf1b8e2d Mon Sep 17 00:00:00 2001
From: ver217
Date: Fri, 31 Mar 2023 16:37:02 +0800
Subject: [PATCH 01/19] [gemini] fix nvme optimizer init

---
 colossalai/nn/optimizer/nvme_optimizer.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/colossalai/nn/optimizer/nvme_optimizer.py b/colossalai/nn/optimizer/nvme_optimizer.py
index cbb435a90f61..53e4a46c9741 100644
--- a/colossalai/nn/optimizer/nvme_optimizer.py
+++ b/colossalai/nn/optimizer/nvme_optimizer.py
@@ -1,9 +1,10 @@
-import torch
+import math
 import os
 import tempfile
-import math
+from typing import Callable, Dict, List, Optional
+
+import torch
 from torch.nn.parameter import Parameter
-from typing import Optional, List, Dict, Callable
 
 
 class NVMeOptimizer(torch.optim.Optimizer):
@@ -42,8 +43,9 @@ def __init__(self,
         self.offloader = None
         self.is_on_nvme: Dict[Parameter, bool] = {}
         self.offloaded_numel: int = 0
-        self.total_numel: int = self._get_numel()
-        self.can_offload_numel = math.floor(self.total_numel * self.nvme_offload_fraction)
+        # As params may not be materialized here, these attributes are initialized at the first step
+        self.total_numel: Optional[int] = None
+        self.can_offload_numel: Optional[int] = None
 
         self.prefetch_params: List[Parameter] = []
         self.param_to_prefetch_idx: Dict[Parameter, int] = {}
@@ -77,6 +79,9 @@ def _setup_prefetch_params(self) -> List[Parameter]:
             self.prefetch_params.append(p)
 
     def _pre_step(self, *state_keys: str) -> None:
+        if self.total_numel is None:
+            self.total_numel = self._get_numel()
+            self.can_offload_numel = math.floor(self.total_numel * self.nvme_offload_fraction)
         self._setup_prefetch_params()
         if self.offloader is None or len(self.prefetch_params) == 0:
             return

From 51de0f592bd0c9cfce746393d671292f074937ae Mon Sep 17 00:00:00 2001
From: ver217
Date: Fri, 31 Mar 2023 16:39:43 +0800
Subject: [PATCH 02/19] [gemini] gemini supports lazy init

---
 colossalai/gemini/chunk/search_utils.py |  9 ++++++---
 colossalai/nn/parallel/data_parallel.py | 26 ++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/colossalai/gemini/chunk/search_utils.py b/colossalai/gemini/chunk/search_utils.py
index fe9650721d74..ed663b25a103 100644
--- a/colossalai/gemini/chunk/search_utils.py
+++ b/colossalai/gemini/chunk/search_utils.py
@@ -46,9 +46,10 @@ def _get_unused_byte(size_list: List[int], chunk_size: int) -> int:
 
 
 def _tensor_numel(local_param: ColoParameter, strict_ddp_flag: bool):
-    if strict_ddp_flag:
+    if strict_ddp_flag and type(local_param) is ColoParameter:
         return local_param.numel_global()
     else:
+        # if local_param is not ColoParameter, we assume it's replicated
         return local_param.numel()
 
 
@@ -67,11 +68,13 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
     """
     params_dict: Dict[int, List[ColoParameter]] = dict()
     for param in param_order.generate():
-        assert isinstance(param, ColoParameter), "please init model in the ColoInitContext"
+        # assert isinstance(param, ColoParameter), "please init model in the ColoInitContext"
         if is_ddp_ignored(param):
             continue
 
-        if strict_ddp_flag:
+        if strict_ddp_flag or type(param) is not ColoParameter:
+            # if model is not initialized with ColoInitContext, we assume it's replicated
+            # TODO(ver217): integrate DTensor
             param_key = dist.get_world_size()
         else:
             param_key = param.process_group.dp_world_size()

diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py
index a9d001bd0a9c..d97c67c01816 100644
---
a/colossalai/nn/parallel/data_parallel.py +++ b/colossalai/nn/parallel/data_parallel.py @@ -1,7 +1,7 @@ import itertools from collections import OrderedDict from functools import partial -from typing import Dict, Iterable, List, Optional, Set +from typing import Dict, Iterable, List, Optional, Set, Union import torch import torch.distributed as dist @@ -17,6 +17,7 @@ from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec from colossalai.tensor.param_op_hook import ColoParamOpHookManager from colossalai.utils import get_current_device, is_ddp_ignored +from colossalai.utils.model.experimental import LazyTensor from colossalai.zero.utils.gemini_hook import GeminiZeROHook from .reducer import Reducer @@ -214,7 +215,6 @@ def __init__(self, pin_memory: bool = False, force_outputs_fp32: bool = False, strict_ddp_mode: bool = False) -> None: - super().__init__(module, process_group=ColoProcessGroup()) self.gemini_manager = gemini_manager self.chunk_manager: ChunkManager = gemini_manager.chunk_manager self.force_outputs_fp32 = force_outputs_fp32 @@ -226,7 +226,6 @@ def __init__(self, self.param2name: Dict[nn.Parameter, str] = dict() self.name2param: Dict[str, nn.Parameter] = dict() - self._cast_buffers() self._logger = get_dist_logger() if self.gemini_manager._premade_memstats_: @@ -250,6 +249,8 @@ def __init__(self, for p_name, p_var in m_var.named_parameters(recurse=False): param_name = m_name + '.' + p_name if m_name else p_name self.name2param[param_name] = p_var + super().__init__(module, process_group=ColoProcessGroup()) + self._cast_buffers() def _post_forward(self): """This function is only triggered for inference. @@ -637,7 +638,8 @@ def load_fp32_parameter(chunk_slice, data): def _init_chunks(self, param_order, strict_ddp_mode: bool, cpu_offload: bool, pin_memory: bool): ddp_pg = ColoProcessGroup() for p in param_order.generate(): - assert isinstance(p, ColoParameter) + self._preprocess_param(p) + assert type(p) is ColoParameter # gather sharded parameters in the strict ddp mode if strict_ddp_mode: @@ -693,3 +695,19 @@ def _cast_buffers(self): buffer.data = buffer.cuda() if torch.is_floating_point(buffer): buffer.data = buffer.half() + + def _preprocess_param(self, p: Union[nn.Parameter, ColoParameter, LazyTensor]) -> None: + """Convert parameter to ColoParameter in-place. 
+ + Args: + p (Union[nn.Parameter, ColoParameter, LazyTensor]): parameter to be converted + """ + if type(p) is ColoParameter: + # model is initialized with ColoInitContext + return + requires_grad = p.requires_grad + if isinstance(p, LazyTensor): + # model is initialized with LazyInitContext + p.materialize() + p.__class__ = ColoParameter + p.__init__(p, requires_grad=requires_grad) From 76e1dae0f22cda078ea7cc886914bd0773c5fc15 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 31 Mar 2023 16:49:51 +0800 Subject: [PATCH 03/19] [gemini] add init example --- .../language/gpt/gemini/train_gpt_init.py | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 examples/language/gpt/gemini/train_gpt_init.py diff --git a/examples/language/gpt/gemini/train_gpt_init.py b/examples/language/gpt/gemini/train_gpt_init.py new file mode 100644 index 000000000000..08189324e0e4 --- /dev/null +++ b/examples/language/gpt/gemini/train_gpt_init.py @@ -0,0 +1,185 @@ +import os +import resource +from contextlib import nullcontext +from functools import partial +from time import time + +import psutil +import torch +import torch.nn as nn +from commons.model_zoo import model_builder +from commons.utils import get_data, get_profile_context, get_tflops, get_time_stamp +from packaging import version +from torch.nn.parallel import DistributedDataParallel as DDP + +import colossalai +from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.nn.optimizer import HybridAdam +from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper +from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec +from colossalai.utils import get_current_device +from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.utils.model.experimental import LazyInitContext + +CAI_VERSION = colossalai.__version__ + + +def parse_args(): + parser = colossalai.get_default_parser() + parser.add_argument( + "--placement", + type=str, + default='cuda', + help="Placement Policy for Gemini. 
Valid when using colossalai as dist plan.", + ) + parser.add_argument("--init_method", + choices=['naive', 'colo', 'lazy'], + default='naive', + help='Model initialization method.') + parser.add_argument( + "--model_type", + type=str, + default="gpt2_medium", + help="model model scale", + ) + parser.add_argument( + "--use_gemini", + default=False, + action='store_true', + ) + + args = parser.parse_args() + return args + + +class GPTLMLoss(nn.Module): + + def __init__(self): + super().__init__() + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, logits, labels): + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + +def get_cpu_mem(): + return psutil.Process().memory_info().rss / 1024**2 + + +def get_gpu_mem(): + return torch.cuda.memory_allocated() / 1024**2 + + +def get_peak_gpu_mem(): + return torch.cuda.max_memory_allocated() / 1024**2 + + +def get_peak_cpu_mem(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + + +def get_mem_info(prefix=''): + return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB' + + +def get_peak_mem_info(prefix=''): + return f'{prefix}Peak GPU memory usage: {get_peak_gpu_mem():.2f} MB, Peak CPU memory usage: {get_peak_cpu_mem():.2f} MB' + + +def get_model_size(model: nn.Module): + total_numel = 0 + for module in model.modules(): + for p in module.parameters(recurse=False): + total_numel += p.numel() + return total_numel + + +def model_size_formatter(numel: int) -> str: + GB_SIZE = 10**9 + MB_SIZE = 10**6 + KB_SIZE = 10**3 + if numel >= GB_SIZE: + return f'{numel / GB_SIZE:.1f}B' + elif numel >= MB_SIZE: + return f'{numel / MB_SIZE:.1f}M' + elif numel >= KB_SIZE: + return f'{numel / KB_SIZE:.1f}K' + else: + return str(numel) + + +def set_cpu_maximum_parallelism(): + conf_str = torch.__config__.parallel_info() + inter_str = conf_str.split("hardware_concurrency() : ")[1] + max_concurrency = inter_str.split('\n')[0] + os.environ["OMP_NUM_THREADS"] = max_concurrency + print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.") + + +def main(): + # version check + # this example is supposed to work for versions greater than 0.2.0 + assert version.parse(CAI_VERSION) >= version.parse("0.2.0") + + set_cpu_maximum_parallelism() + args = parse_args() + + # batch size per DP degree + disable_existing_loggers() + colossalai.launch_from_torch(config={}) + + logger = get_dist_logger() + logger.info(f"{args.model_type}, {args.init_method}", ranks=[0]) + + # build criterion + + torch.manual_seed(123) + # all param must use the same process group. 
+ world_size = torch.distributed.get_world_size() + + if args.init_method == 'naive': + ctx = nullcontext() + elif args.init_method == 'colo': + shard_pg = ProcessGroup(tp_degree=world_size) + default_dist_spec = ShardSpec([-1], [world_size]) + ctx = ColoInitContext(default_pg=shard_pg, default_dist_spec=default_dist_spec) + else: + ctx = LazyInitContext() + + # build GPT model + with ctx: + model = model_builder(args.model_type)(checkpoint=True) + + logger.info(get_mem_info(prefix='After init model, '), ranks=[0]) + logger.info(get_peak_mem_info(prefix='After init model, '), ranks=[0]) + # asign running configurations + + if args.use_gemini: + gemini_config = dict(strict_ddp_mode=True, + device=get_current_device(), + placement_policy=args.placement, + pin_memory=True, + hidden_dim=model.config.n_embd, + search_range_mb=128) + + # build a highly optimized gpu/cpu optimizer + + # wrap your model and optimizer + model = zero_model_wrapper(model, 3, gemini_config) + + logger.info(get_mem_info(prefix='After init gemini, '), ranks=[0]) + logger.info(get_peak_mem_info(prefix='After init gemini, '), ranks=[0]) + else: + if args.init_method == 'colo': + logger.info('ColoInitContext is coupled with Gemini, ignore', ranks=[0]) + elif args.init_method == 'lazy': + model = LazyInitContext.materialize(model) + logger.info(get_mem_info(prefix='After materialization, '), ranks=[0]) + logger.info(get_peak_mem_info(prefix='After materialization, '), ranks=[0]) + + +if __name__ == '__main__': + main() From d925d98695e27fb2a4661e69c30f5349610c21e7 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 31 Mar 2023 17:25:28 +0800 Subject: [PATCH 04/19] [gemini] add fool model --- .../language/gpt/gemini/train_gpt_init.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/language/gpt/gemini/train_gpt_init.py b/examples/language/gpt/gemini/train_gpt_init.py index 08189324e0e4..5107cf8a55b2 100644 --- a/examples/language/gpt/gemini/train_gpt_init.py +++ b/examples/language/gpt/gemini/train_gpt_init.py @@ -21,6 +21,14 @@ from colossalai.utils.model.colo_init_context import ColoInitContext from colossalai.utils.model.experimental import LazyInitContext + +class Fool(nn.Module): + + def __init__(self) -> None: + super().__init__() + self.p = nn.Parameter(torch.empty(1024, 1024, 1024)) + + CAI_VERSION = colossalai.__version__ @@ -151,18 +159,25 @@ def main(): # build GPT model with ctx: - model = model_builder(args.model_type)(checkpoint=True) + if args.model_type == 'fool': + model = Fool() + else: + model = model_builder(args.model_type)(checkpoint=True) logger.info(get_mem_info(prefix='After init model, '), ranks=[0]) logger.info(get_peak_mem_info(prefix='After init model, '), ranks=[0]) # asign running configurations if args.use_gemini: + if args.model_type == 'fool': + hidden_dim = None + else: + hidden_dim = model.config.n_embd gemini_config = dict(strict_ddp_mode=True, - device=get_current_device(), + device='cpu', placement_policy=args.placement, - pin_memory=True, - hidden_dim=model.config.n_embd, + pin_memory=False, + hidden_dim=hidden_dim, search_range_mb=128) # build a highly optimized gpu/cpu optimizer From 2b34743900f0ba27f3b469de7d1fc270faad71bc Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 4 Apr 2023 16:19:29 +0800 Subject: [PATCH 05/19] [zero] update gemini ddp --- colossalai/zero/gemini/gemini_ddp.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/colossalai/zero/gemini/gemini_ddp.py 
b/colossalai/zero/gemini/gemini_ddp.py index 50f1b1ef1ccc..917899156ea7 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -1,7 +1,7 @@ import itertools from collections import OrderedDict from functools import partial -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union import torch import torch.distributed as dist @@ -14,6 +14,7 @@ from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec from colossalai.tensor.param_op_hook import ColoParamOpHookManager from colossalai.utils import get_current_device, is_ddp_ignored +from colossalai.utils.model.experimental import LazyTensor from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager from .gemini_hook import GeminiZeROHook @@ -55,7 +56,6 @@ def __init__(self, pin_memory: bool = False, force_outputs_fp32: bool = False, strict_ddp_mode: bool = False) -> None: - super().__init__(module, process_group=ColoProcessGroup()) self.gemini_manager = gemini_manager self.chunk_manager: ChunkManager = gemini_manager.chunk_manager self.force_outputs_fp32 = force_outputs_fp32 @@ -67,7 +67,6 @@ def __init__(self, self.param2name: Dict[nn.Parameter, str] = dict() self.name2param: Dict[str, nn.Parameter] = dict() - self._cast_buffers() self._logger = get_dist_logger() if self.gemini_manager._premade_memstats_: @@ -91,6 +90,8 @@ def __init__(self, for p_name, p_var in m_var.named_parameters(recurse=False): param_name = m_name + '.' + p_name if m_name else p_name self.name2param[param_name] = p_var + super().__init__(module, process_group=ColoProcessGroup()) + self._cast_buffers() def _post_forward(self): """This function is only triggered for inference. @@ -478,7 +479,8 @@ def load_fp32_parameter(chunk_slice, data): def _init_chunks(self, param_order, strict_ddp_mode: bool, cpu_offload: bool, pin_memory: bool): ddp_pg = ColoProcessGroup() for p in param_order.generate(): - assert isinstance(p, ColoParameter) + self._preprocess_param(p) + assert type(p) is ColoParameter # gather sharded parameters in the strict ddp mode if strict_ddp_mode: @@ -535,6 +537,21 @@ def _cast_buffers(self): if torch.is_floating_point(buffer): buffer.data = buffer.half() + def _preprocess_param(self, p: Union[nn.Parameter, ColoParameter, LazyTensor]) -> None: + """Convert parameter to ColoParameter in-place. 
+ Args: + p (Union[nn.Parameter, ColoParameter, LazyTensor]): parameter to be converted + """ + if type(p) is ColoParameter: + # model is initialized with ColoInitContext + return + requires_grad = p.requires_grad + if isinstance(p, LazyTensor): + # model is initialized with LazyInitContext + p.materialize() + p.__class__ = ColoParameter + p.__init__(p, requires_grad=requires_grad) + class GeminiDDP(ZeroDDP): From 887ba37087fc6974155ac6bcb86d4660fb8e390b Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 4 Apr 2023 16:21:33 +0800 Subject: [PATCH 06/19] [zero] update init example --- examples/language/gpt/gemini/train_gpt_init.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/examples/language/gpt/gemini/train_gpt_init.py b/examples/language/gpt/gemini/train_gpt_init.py index 5107cf8a55b2..29924d9ced12 100644 --- a/examples/language/gpt/gemini/train_gpt_init.py +++ b/examples/language/gpt/gemini/train_gpt_init.py @@ -1,25 +1,18 @@ import os import resource from contextlib import nullcontext -from functools import partial -from time import time import psutil import torch import torch.nn as nn from commons.model_zoo import model_builder -from commons.utils import get_data, get_profile_context, get_tflops, get_time_stamp from packaging import version -from torch.nn.parallel import DistributedDataParallel as DDP import colossalai from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.optimizer import HybridAdam -from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper -from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec -from colossalai.utils import get_current_device -from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.tensor import ProcessGroup, ShardSpec from colossalai.utils.model.experimental import LazyInitContext +from colossalai.zero import ColoInitContext, zero_model_wrapper class Fool(nn.Module): From e952a22411195ba0a4b7603fc03ac8441924a31f Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 4 Apr 2023 17:56:23 +0800 Subject: [PATCH 07/19] add chunk method --- .../language/gpt/gemini/train_gpt_init.py | 63 ++++++++++++++++--- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/examples/language/gpt/gemini/train_gpt_init.py b/examples/language/gpt/gemini/train_gpt_init.py index 29924d9ced12..16a9a5b38ce4 100644 --- a/examples/language/gpt/gemini/train_gpt_init.py +++ b/examples/language/gpt/gemini/train_gpt_init.py @@ -10,16 +10,57 @@ import colossalai from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.tensor import ProcessGroup, ShardSpec -from colossalai.utils.model.experimental import LazyInitContext +from colossalai.tensor import ColoParameter +from colossalai.tensor import ProcessGroup +from colossalai.tensor import ProcessGroup as ColoProcessGroup +from colossalai.tensor import ShardSpec +from colossalai.utils.model.experimental import LazyInitContext, LazyTensor from colossalai.zero import ColoInitContext, zero_model_wrapper +from colossalai.zero.gemini.chunk import ChunkManager +from colossalai.zero.gemini.chunk.search_utils import search_chunk_configuration + + +def chunk_wrapper(model: nn.Module, + search_range_mb: float, + search_interval_byte: int, + register_chunk: bool = False) -> nn.Module: + + def preprocess_param(p: nn.Parameter): + if type(p) is ColoParameter: + # model is initialized with ColoInitContext + return + requires_grad = p.requires_grad 
+ if isinstance(p, LazyTensor): + # model is initialized with LazyInitContext + p.materialize() + p.__class__ = ColoParameter + p.__init__(p, requires_grad=requires_grad) + + cfg, total, wasted = search_chunk_configuration(model, search_range_mb, search_interval_byte, strict_ddp_flag=True) + print(cfg) + chunk_manager = ChunkManager(cfg, 'cpu') + ddp_pg = ColoProcessGroup() + for p in model.parameters(): + preprocess_param(p) + p.set_process_group(ddp_pg) + dp_world_size = p.process_group.dp_world_size() + if register_chunk: + chunk_manager.register_tensor(tensor=p, + group_type='fp32_param', + config_key=dp_world_size, + cpu_offload=True, + pin_memory=False) + if register_chunk: + chunk_manager.close_all_groups() + print(chunk_manager.total_mem) + return cfg class Fool(nn.Module): def __init__(self) -> None: super().__init__() - self.p = nn.Parameter(torch.empty(1024, 1024, 1024)) + self.p = nn.Parameter(torch.rand(1024, 1024, 1024)) CAI_VERSION = colossalai.__version__ @@ -44,9 +85,9 @@ def parse_args(): help="model model scale", ) parser.add_argument( - "--use_gemini", - default=False, - action='store_true', + '--chunk_method', + default='none', + choices=['gemini', 'chunk', 'none'], ) args = parser.parse_args() @@ -161,7 +202,7 @@ def main(): logger.info(get_peak_mem_info(prefix='After init model, '), ranks=[0]) # asign running configurations - if args.use_gemini: + if args.chunk_method == 'gemini': if args.model_type == 'fool': hidden_dim = None else: @@ -180,6 +221,14 @@ def main(): logger.info(get_mem_info(prefix='After init gemini, '), ranks=[0]) logger.info(get_peak_mem_info(prefix='After init gemini, '), ranks=[0]) + elif args.chunk_method == 'chunk': + if args.model_type == 'fool': + hidden_dim = 1024 + else: + hidden_dim = model.config.n_embd + cfg = chunk_wrapper(model, 128, hidden_dim, register_chunk=False) + logger.info(get_mem_info(prefix='After init chunk, '), ranks=[0]) + logger.info(get_peak_mem_info(prefix='After init chunk, '), ranks=[0]) else: if args.init_method == 'colo': logger.info('ColoInitContext is coupled with Gemini, ignore', ranks=[0]) From 29b2854e8a762b951f65589779851b15f92fddb4 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 4 Apr 2023 18:35:33 +0800 Subject: [PATCH 08/19] add chunk method --- examples/language/gpt/gemini/train_gpt_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language/gpt/gemini/train_gpt_init.py b/examples/language/gpt/gemini/train_gpt_init.py index 16a9a5b38ce4..d572ad6a8430 100644 --- a/examples/language/gpt/gemini/train_gpt_init.py +++ b/examples/language/gpt/gemini/train_gpt_init.py @@ -226,7 +226,7 @@ def main(): hidden_dim = 1024 else: hidden_dim = model.config.n_embd - cfg = chunk_wrapper(model, 128, hidden_dim, register_chunk=False) + chunk_wrapper(model, 128, hidden_dim, register_chunk=True) logger.info(get_mem_info(prefix='After init chunk, '), ranks=[0]) logger.info(get_peak_mem_info(prefix='After init chunk, '), ranks=[0]) else: From 7fff1097f3a7a71d9bbf15d4780c090c49122a33 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 11 Apr 2023 18:00:13 +0800 Subject: [PATCH 09/19] [lazyinit] fix lazy tensor tolist --- colossalai/utils/model/experimental.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/colossalai/utils/model/experimental.py b/colossalai/utils/model/experimental.py index 6427a147a5c0..4c945ee21c8a 100644 --- a/colossalai/utils/model/experimental.py +++ b/colossalai/utils/model/experimental.py @@ -75,6 +75,12 @@ def __torch_function__(cls, func, types, 
args=(), kwargs=None): return super().__torch_function__(func, types, args, kwargs) +def _data_tolist(tensor: torch.Tensor) -> list: + """tolist() method is not allowed for a subclass of tensor. Tensor.data returns a Tensor. + """ + return tensor.data.tolist() + + def _convert_cls(tensor: 'LazyTensor', target: torch.Tensor) -> torch.Tensor: """Convert a lazy tensor's class to target's class, with target's data. @@ -94,7 +100,7 @@ def _convert_cls(tensor: 'LazyTensor', target: torch.Tensor) -> torch.Tensor: tensor.requires_grad = target.requires_grad # subclass of torch.Tensor does not have tolist() method # overwrite this method after materialization or distribution - tensor.tolist = MethodType(torch.Tensor.tolist, target) + tensor.tolist = MethodType(_data_tolist, tensor) return tensor From 816bee05f73a7e5d30f5e73825fe3a4dd92166e5 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 11 Apr 2023 18:00:33 +0800 Subject: [PATCH 10/19] [gemini] fix buffer materialization --- colossalai/zero/gemini/gemini_ddp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 917899156ea7..7a38deff804d 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -533,6 +533,8 @@ def _init_chunks(self, param_order, strict_ddp_mode: bool, cpu_offload: bool, pi def _cast_buffers(self): for buffer in self.module.buffers(): + if isinstance(buffer, LazyTensor): + buffer.materialize() buffer.data = buffer.cuda() if torch.is_floating_point(buffer): buffer.data = buffer.half() From b79a3e667d7ccd6ced08bfeda6ab1f91949e6c55 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 11 Apr 2023 18:00:52 +0800 Subject: [PATCH 11/19] [misc] remove useless file --- .../language/gpt/gemini/train_gpt_init.py | 242 ------------------ 1 file changed, 242 deletions(-) delete mode 100644 examples/language/gpt/gemini/train_gpt_init.py diff --git a/examples/language/gpt/gemini/train_gpt_init.py b/examples/language/gpt/gemini/train_gpt_init.py deleted file mode 100644 index d572ad6a8430..000000000000 --- a/examples/language/gpt/gemini/train_gpt_init.py +++ /dev/null @@ -1,242 +0,0 @@ -import os -import resource -from contextlib import nullcontext - -import psutil -import torch -import torch.nn as nn -from commons.model_zoo import model_builder -from packaging import version - -import colossalai -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.tensor import ColoParameter -from colossalai.tensor import ProcessGroup -from colossalai.tensor import ProcessGroup as ColoProcessGroup -from colossalai.tensor import ShardSpec -from colossalai.utils.model.experimental import LazyInitContext, LazyTensor -from colossalai.zero import ColoInitContext, zero_model_wrapper -from colossalai.zero.gemini.chunk import ChunkManager -from colossalai.zero.gemini.chunk.search_utils import search_chunk_configuration - - -def chunk_wrapper(model: nn.Module, - search_range_mb: float, - search_interval_byte: int, - register_chunk: bool = False) -> nn.Module: - - def preprocess_param(p: nn.Parameter): - if type(p) is ColoParameter: - # model is initialized with ColoInitContext - return - requires_grad = p.requires_grad - if isinstance(p, LazyTensor): - # model is initialized with LazyInitContext - p.materialize() - p.__class__ = ColoParameter - p.__init__(p, requires_grad=requires_grad) - - cfg, total, wasted = search_chunk_configuration(model, search_range_mb, search_interval_byte, strict_ddp_flag=True) - print(cfg) 
- chunk_manager = ChunkManager(cfg, 'cpu') - ddp_pg = ColoProcessGroup() - for p in model.parameters(): - preprocess_param(p) - p.set_process_group(ddp_pg) - dp_world_size = p.process_group.dp_world_size() - if register_chunk: - chunk_manager.register_tensor(tensor=p, - group_type='fp32_param', - config_key=dp_world_size, - cpu_offload=True, - pin_memory=False) - if register_chunk: - chunk_manager.close_all_groups() - print(chunk_manager.total_mem) - return cfg - - -class Fool(nn.Module): - - def __init__(self) -> None: - super().__init__() - self.p = nn.Parameter(torch.rand(1024, 1024, 1024)) - - -CAI_VERSION = colossalai.__version__ - - -def parse_args(): - parser = colossalai.get_default_parser() - parser.add_argument( - "--placement", - type=str, - default='cuda', - help="Placement Policy for Gemini. Valid when using colossalai as dist plan.", - ) - parser.add_argument("--init_method", - choices=['naive', 'colo', 'lazy'], - default='naive', - help='Model initialization method.') - parser.add_argument( - "--model_type", - type=str, - default="gpt2_medium", - help="model model scale", - ) - parser.add_argument( - '--chunk_method', - default='none', - choices=['gemini', 'chunk', 'none'], - ) - - args = parser.parse_args() - return args - - -class GPTLMLoss(nn.Module): - - def __init__(self): - super().__init__() - self.loss_fn = nn.CrossEntropyLoss() - - def forward(self, logits, labels): - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - -def get_cpu_mem(): - return psutil.Process().memory_info().rss / 1024**2 - - -def get_gpu_mem(): - return torch.cuda.memory_allocated() / 1024**2 - - -def get_peak_gpu_mem(): - return torch.cuda.max_memory_allocated() / 1024**2 - - -def get_peak_cpu_mem(): - return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 - - -def get_mem_info(prefix=''): - return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB' - - -def get_peak_mem_info(prefix=''): - return f'{prefix}Peak GPU memory usage: {get_peak_gpu_mem():.2f} MB, Peak CPU memory usage: {get_peak_cpu_mem():.2f} MB' - - -def get_model_size(model: nn.Module): - total_numel = 0 - for module in model.modules(): - for p in module.parameters(recurse=False): - total_numel += p.numel() - return total_numel - - -def model_size_formatter(numel: int) -> str: - GB_SIZE = 10**9 - MB_SIZE = 10**6 - KB_SIZE = 10**3 - if numel >= GB_SIZE: - return f'{numel / GB_SIZE:.1f}B' - elif numel >= MB_SIZE: - return f'{numel / MB_SIZE:.1f}M' - elif numel >= KB_SIZE: - return f'{numel / KB_SIZE:.1f}K' - else: - return str(numel) - - -def set_cpu_maximum_parallelism(): - conf_str = torch.__config__.parallel_info() - inter_str = conf_str.split("hardware_concurrency() : ")[1] - max_concurrency = inter_str.split('\n')[0] - os.environ["OMP_NUM_THREADS"] = max_concurrency - print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.") - - -def main(): - # version check - # this example is supposed to work for versions greater than 0.2.0 - assert version.parse(CAI_VERSION) >= version.parse("0.2.0") - - set_cpu_maximum_parallelism() - args = parse_args() - - # batch size per DP degree - disable_existing_loggers() - colossalai.launch_from_torch(config={}) - - logger = get_dist_logger() - logger.info(f"{args.model_type}, {args.init_method}", ranks=[0]) - - # build criterion - - torch.manual_seed(123) - # all param 
must use the same process group. - world_size = torch.distributed.get_world_size() - - if args.init_method == 'naive': - ctx = nullcontext() - elif args.init_method == 'colo': - shard_pg = ProcessGroup(tp_degree=world_size) - default_dist_spec = ShardSpec([-1], [world_size]) - ctx = ColoInitContext(default_pg=shard_pg, default_dist_spec=default_dist_spec) - else: - ctx = LazyInitContext() - - # build GPT model - with ctx: - if args.model_type == 'fool': - model = Fool() - else: - model = model_builder(args.model_type)(checkpoint=True) - - logger.info(get_mem_info(prefix='After init model, '), ranks=[0]) - logger.info(get_peak_mem_info(prefix='After init model, '), ranks=[0]) - # asign running configurations - - if args.chunk_method == 'gemini': - if args.model_type == 'fool': - hidden_dim = None - else: - hidden_dim = model.config.n_embd - gemini_config = dict(strict_ddp_mode=True, - device='cpu', - placement_policy=args.placement, - pin_memory=False, - hidden_dim=hidden_dim, - search_range_mb=128) - - # build a highly optimized gpu/cpu optimizer - - # wrap your model and optimizer - model = zero_model_wrapper(model, 3, gemini_config) - - logger.info(get_mem_info(prefix='After init gemini, '), ranks=[0]) - logger.info(get_peak_mem_info(prefix='After init gemini, '), ranks=[0]) - elif args.chunk_method == 'chunk': - if args.model_type == 'fool': - hidden_dim = 1024 - else: - hidden_dim = model.config.n_embd - chunk_wrapper(model, 128, hidden_dim, register_chunk=True) - logger.info(get_mem_info(prefix='After init chunk, '), ranks=[0]) - logger.info(get_peak_mem_info(prefix='After init chunk, '), ranks=[0]) - else: - if args.init_method == 'colo': - logger.info('ColoInitContext is coupled with Gemini, ignore', ranks=[0]) - elif args.init_method == 'lazy': - model = LazyInitContext.materialize(model) - logger.info(get_mem_info(prefix='After materialization, '), ranks=[0]) - logger.info(get_peak_mem_info(prefix='After materialization, '), ranks=[0]) - - -if __name__ == '__main__': - main() From 709baed13cd980295fa8f1b04e8a4a5360917d03 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 11 Apr 2023 18:12:51 +0800 Subject: [PATCH 12/19] [booster] update gemini plugin --- colossalai/booster/plugin/gemini_plugin.py | 50 ---------------------- 1 file changed, 50 deletions(-) diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index 6693b1f44d62..659f36c210f4 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -16,10 +16,8 @@ from colossalai.checkpoint_io.utils import save_state_dict from colossalai.cluster import DistCoordinator from colossalai.interface import ModelWrapper, OptimizerWrapper -from colossalai.tensor.colo_parameter import ColoParameter from colossalai.utils import get_current_device from colossalai.zero import GeminiDDP, zero_model_wrapper, zero_optim_wrapper -from colossalai.zero.gemini.colo_init_context import _convert_to_coloparam from colossalai.zero.gemini.memory_tracer import MemStats from .plugin_base import Plugin @@ -27,50 +25,6 @@ __all__ = ['GeminiPlugin'] -def convert_to_colo_param(module: nn.Module) -> None: - """Convert module's paramters to ColoParameter. This is a workaround and will be deprecated when lazy init is compatible with Gemini. - - Args: - module (nn.Module): Module to be converted. 
- """ - converted_modules = set() # handle shared modules - converted_params = dict() # record mapping between (torch.Tensor, ColoTensor) to distinguish the same reference - - def convert_recursively(m: nn.Module): - for child in m.children(): - if child not in converted_modules: - converted_modules.add(child) - convert_recursively(child) - - for name, p in m.named_parameters(recurse=False): - assert not isinstance(p, ColoParameter) - if p in converted_params: - target = converted_params[p] - else: - target = _convert_to_coloparam(p, p.device, p.dtype) - converted_params[p] = target - setattr(m, name, target) - target.shared_param_modules.append(m) - - convert_recursively(module) - - # optimizer should replace params in group as well. This attr should be deleted after replacing to avoid memory leak - module._converted_params = converted_params - - -def replace_param_in_group(optimizer: Optimizer, converted_params: dict) -> None: - """Replace param in optimizer's group with converted ColoParameter. - - Args: - optimizer (Optimizer): Optimizer to be replaced. - converted_params (dict): Mapping between (torch.Tensor, ColoTensor). - """ - for group in optimizer.param_groups: - for i, p in enumerate(group['params']): - if p in converted_params: - group['params'][i] = converted_params[p] - - class GeminiCheckpointIO(GeneralCheckpointIO): def __init__(self) -> None: @@ -113,8 +67,6 @@ class GeminiModel(ModelWrapper): def __init__(self, module: nn.Module, gemini_config: dict) -> None: super().__init__(module) - # TODO(ver217): only support Gemini now - convert_to_colo_param(module) self.module = zero_model_wrapper(module, zero_stage=3, gemini_config=gemini_config) def unwrap(self): @@ -125,8 +77,6 @@ def unwrap(self): class GeminiOptimizer(OptimizerWrapper): def __init__(self, module: GeminiDDP, optimizer: Optimizer, zero_optim_config: dict, optim_kwargs: dict) -> None: - replace_param_in_group(optimizer, module.module._converted_params) - del module.module._converted_params optimizer = zero_optim_wrapper(module, optimizer, optim_config=zero_optim_config, **optim_kwargs) super().__init__(optimizer) From 73e7bf730a27640c9ac5f47a87a3d781a91ad717 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 11 Apr 2023 19:36:52 +0800 Subject: [PATCH 13/19] [test] update gemini plugin test --- .../test_plugin/test_gemini_plugin.py | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index a3c63fd09d26..58e6597f2328 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -1,3 +1,5 @@ +from contextlib import nullcontext + import torch import torch.distributed as dist @@ -6,11 +8,14 @@ from colossalai.booster.plugin import GeminiPlugin from colossalai.nn.optimizer import HybridAdam from colossalai.tensor.colo_parameter import ColoParameter -from colossalai.testing import rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.utils.model.experimental import LazyInitContext +from colossalai.zero import ColoInitContext from tests.kit.model_zoo import model_zoo -def check_gemini_plugin(early_stop: bool = True): +@parameterize('init_method', ['lazy', 'none', 'colo']) +def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True): """check gemini plugin over model zoo Args: @@ -40,10 +45,25 @@ def 
check_gemini_plugin(early_stop: bool = True): ]: continue + if init_method == 'lazy' and name in [ + 'timm_convmixer', 'timm_vision_transformer', 'timm_deit', 'timm_deit3', 'timm_inception_v3', + 'timm_tnt_b_patch16_224', 'timm_rexnet', 'torchvision_densenet121', 'torchvision_efficientnet_b0', + 'torchvision_mobilenet_v2', 'torchvision_mnasnet0_5', 'torchvision_regnet_x_16gf', + 'torchvision_shufflenet_v2_x0_5', 'torchvision_efficientnet_v2_s' + ]: + continue + try: + if init_method == 'colo': + ctx = ColoInitContext() + elif init_method == 'lazy': + ctx = LazyInitContext() + else: + ctx = nullcontext() plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) booster = Booster(plugin=plugin) - model = model_fn() + with ctx: + model = model_fn() optimizer = HybridAdam(model.parameters(), lr=1e-3) criterion = lambda x: x.mean() data = data_gen_fn() @@ -76,6 +96,7 @@ def check_gemini_plugin(early_stop: bool = True): torch.cuda.empty_cache() if dist.get_rank() == 0: + print(f'Init method: {init_method}') print(f'Passed models({len(passed_models)}): {passed_models}\n\n') print(f'Failed models({len(failed_info)}): {list(failed_info.keys())}\n\n') assert len(failed_info) == 0, '\n'.join([f'{k}: {v}' for k, v in failed_info.items()]) From 37440b28ac94f25feabe90ba9be6e3ba61d50824 Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 12 Apr 2023 10:25:22 +0800 Subject: [PATCH 14/19] [test] fix gemini plugin test --- tests/test_booster/test_plugin/test_gemini_plugin.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index 58e6597f2328..d804c727ad3e 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -6,10 +6,10 @@ import colossalai from colossalai.booster import Booster from colossalai.booster.plugin import GeminiPlugin +from colossalai.fx import is_compatible_with_meta from colossalai.nn.optimizer import HybridAdam from colossalai.tensor.colo_parameter import ColoParameter from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from colossalai.utils.model.experimental import LazyInitContext from colossalai.zero import ColoInitContext from tests.kit.model_zoo import model_zoo @@ -21,6 +21,11 @@ def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True): Args: early_stop (bool, optional): Whether to stop when getting the first error. Defaults to True. 
""" + is_support_meta = is_compatible_with_meta() + if not is_support_meta and init_method == 'lazy': + return + + from colossalai.utils.model.experimental import LazyInitContext passed_models = [] failed_info = {} # (model_name, error) pair From d513754cdcdbe9ff0dd4bc6c950fdb9c0639b96a Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 12 Apr 2023 10:42:18 +0800 Subject: [PATCH 15/19] [gemini] fix import --- colossalai/zero/gemini/gemini_ddp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 7a38deff804d..9b1c70381c71 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -7,6 +7,7 @@ import torch.distributed as dist import torch.nn as nn +from colossalai.fx import is_compatible_with_meta from colossalai.logging import get_dist_logger from colossalai.nn.parallel.data_parallel import ColoDDP, _cast_float, free_storage from colossalai.tensor import ProcessGroup as ColoProcessGroup @@ -14,7 +15,6 @@ from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec from colossalai.tensor.param_op_hook import ColoParamOpHookManager from colossalai.utils import get_current_device, is_ddp_ignored -from colossalai.utils.model.experimental import LazyTensor from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager from .gemini_hook import GeminiZeROHook @@ -22,6 +22,9 @@ from .memory_tracer import MemStats, OrderedParamGenerator from .utils import get_temp_total_chunk_on_cuda +if is_compatible_with_meta(): + from colossalai.utils.model.experimental import LazyTensor + try: from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX, _IncompatibleKeys except ImportError: From 17c6616df4528f346e696dfe5da85a12582839c6 Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 12 Apr 2023 11:12:21 +0800 Subject: [PATCH 16/19] [gemini] fix import --- colossalai/zero/gemini/gemini_ddp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 9b1c70381c71..6a93b950c6ed 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -542,7 +542,7 @@ def _cast_buffers(self): if torch.is_floating_point(buffer): buffer.data = buffer.half() - def _preprocess_param(self, p: Union[nn.Parameter, ColoParameter, LazyTensor]) -> None: + def _preprocess_param(self, p: Union[nn.Parameter, ColoParameter, 'LazyTensor']) -> None: """Convert parameter to ColoParameter in-place. 
Args: p (Union[nn.Parameter, ColoParameter, LazyTensor]): parameter to be converted From fdc3b33b21cf3ad0f6e871399e786dc57ebe53a7 Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 12 Apr 2023 11:52:08 +0800 Subject: [PATCH 17/19] [lazyinit] use new metatensor --- colossalai/_analyzer/_subclasses/_monkey_patch.py | 3 +-- colossalai/utils/model/experimental.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/colossalai/_analyzer/_subclasses/_monkey_patch.py b/colossalai/_analyzer/_subclasses/_monkey_patch.py index 7c1c3d3d8cd4..b3ec98f0811f 100644 --- a/colossalai/_analyzer/_subclasses/_monkey_patch.py +++ b/colossalai/_analyzer/_subclasses/_monkey_patch.py @@ -2,8 +2,6 @@ import torch.distributed as dist from packaging import version -aten = torch.ops.aten - __all__ = [ "_TorchFactoryMethod", "_TorchOverrideableFactoryMethod", @@ -51,6 +49,7 @@ ] if version.parse(torch.__version__) >= version.parse('1.12.0'): + aten = torch.ops.aten # TODO: dive deep here # refer to https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp _AliasATen = [ diff --git a/colossalai/utils/model/experimental.py b/colossalai/utils/model/experimental.py index 4c945ee21c8a..b17367eb0245 100644 --- a/colossalai/utils/model/experimental.py +++ b/colossalai/utils/model/experimental.py @@ -7,7 +7,7 @@ from torch import Tensor from torch.utils._pytree import tree_map -from colossalai.fx.profiler.tensor import MetaTensor +from colossalai._analyzer._subclasses import MetaTensor from colossalai.tensor.d_tensor.d_tensor import DTensor from colossalai.tensor.d_tensor.layout import Layout @@ -150,7 +150,7 @@ def __new__(cls, func, *args, meta_data=None, concrete_data=None, **kwargs): if meta_data is None: device = kwargs.get('device', 'cpu') elem = func(*args, **{**kwargs, 'device': 'meta'}) - meta_data = MetaTensor(elem, fake_device=device) + meta_data = MetaTensor(elem, device=device) elem = meta_data._tensor # As a meta tensor cannot be modified __class__ to torch.Tensor, we should use an empty real tensor here r = torch.Tensor._make_subclass(cls, _EMPTY_DATA, require_grad=elem.requires_grad) From 4d4db92a8b2df81e015023ffc3c4ae20a69fb7e2 Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 12 Apr 2023 13:55:24 +0800 Subject: [PATCH 18/19] [lazyinit] use new metatensor --- colossalai/zero/gemini/gemini_ddp.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 6a93b950c6ed..c06239dfac20 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -7,7 +7,6 @@ import torch.distributed as dist import torch.nn as nn -from colossalai.fx import is_compatible_with_meta from colossalai.logging import get_dist_logger from colossalai.nn.parallel.data_parallel import ColoDDP, _cast_float, free_storage from colossalai.tensor import ProcessGroup as ColoProcessGroup @@ -15,6 +14,7 @@ from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec from colossalai.tensor.param_op_hook import ColoParamOpHookManager from colossalai.utils import get_current_device, is_ddp_ignored +from colossalai.utils.model.experimental import LazyTensor from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager from .gemini_hook import GeminiZeROHook @@ -22,9 +22,6 @@ from .memory_tracer import MemStats, OrderedParamGenerator from .utils import get_temp_total_chunk_on_cuda -if is_compatible_with_meta(): - from colossalai.utils.model.experimental 
import LazyTensor
-
 
 try:
     from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX, _IncompatibleKeys
 except ImportError:

From 94fd2b73c38020ca965a668fe06951b3a0f1d718 Mon Sep 17 00:00:00 2001
From: ver217
Date: Wed, 12 Apr 2023 15:10:12 +0800
Subject: [PATCH 19/19] [lazyinit] fix __set__ method

---
 colossalai/utils/model/experimental.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/utils/model/experimental.py b/colossalai/utils/model/experimental.py
index b17367eb0245..c91751f1cb28 100644
--- a/colossalai/utils/model/experimental.py
+++ b/colossalai/utils/model/experimental.py
@@ -37,7 +37,7 @@
 # If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset)
 # without autograd tracking the change, remove the .data / .detach() call and wrap the change in a `with torch.no_grad():` block.
 # These ops cannot be unwrapped using .data
-_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__']
+_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__']
 
 _LEGACY_TENSOR_CONSTRUCTOR = {
     'FloatTensor': torch.float,
@@ -261,7 +261,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
         tree_map(cls._replace_with_materialized, args)
         tree_map(cls._replace_with_materialized, kwargs)
         is_inplace: bool = (func.__name__.endswith('_') and not (func.__name__.endswith('__'))
-                            or func.__name__ == "__setitem__")
+                            or func.__name__ in ('__setitem__', '__set__'))
 
         is_change_meta_op: bool = func.__name__ in _CHANGE_META_OPS
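
The in-place conversion added as `_preprocess_param` in patches 02 and 05 is the mechanism that lets Gemini accept plain `nn.Parameter`s and `LazyTensor`s: the object's class is swapped to `ColoParameter` and `__init__` is re-run on its own data, so every module that already holds a reference to the parameter sees the converted object without any re-wiring (for a `LazyTensor`, `p.materialize()` runs first). Below is a minimal standalone sketch of that pattern; the `convert_inplace` helper and the `Toy` module are illustrative only, and depending on the ColossalAI version, constructing a `ColoParameter` may require the distributed environment to be initialized first (e.g. via `colossalai.launch_from_torch`).

    import torch
    import torch.nn as nn

    from colossalai.tensor.colo_parameter import ColoParameter


    def convert_inplace(p: nn.Parameter) -> None:
        # Already a ColoParameter (e.g. the model was built under ColoInitContext): nothing to do.
        if type(p) is ColoParameter:
            return
        # Capture requires_grad first: re-running __init__ below would otherwise reset it.
        requires_grad = p.requires_grad
        # Swap the class and re-initialize the object with its own data. The object identity
        # is unchanged, so modules referencing this parameter need no further patching.
        p.__class__ = ColoParameter
        p.__init__(p, requires_grad=requires_grad)


    class Toy(nn.Module):

        def __init__(self) -> None:
            super().__init__()
            self.weight = nn.Parameter(torch.randn(4, 4))


    model = Toy()
    convert_inplace(model.weight)
    assert type(model.weight) is ColoParameter    # same object, new class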
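
Taken end to end, the series means a model no longer has to be built under `ColoInitContext` before being handed to Gemini. A rough single-process usage sketch in the spirit of the updated `test_gemini_plugin.py` follows; the toy `nn.Sequential` model, tensor shapes and hyper-parameters are placeholders, and the script assumes it is launched with torchrun so that `launch_from_torch` can read the distributed environment.

    import torch
    import torch.nn as nn

    import colossalai
    from colossalai.booster import Booster
    from colossalai.booster.plugin import GeminiPlugin
    from colossalai.nn.optimizer import HybridAdam
    from colossalai.utils.model.experimental import LazyInitContext

    colossalai.launch_from_torch(config={})

    # Parameters stay unmaterialized here; ZeroDDP materializes and converts them
    # to ColoParameter in _init_chunks via _preprocess_param.
    with LazyInitContext():
        model = nn.Sequential(nn.Linear(32, 32), nn.GELU(), nn.Linear(32, 8))

    optimizer = HybridAdam(model.parameters(), lr=1e-3)
    criterion = lambda x: x.mean()

    plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5)
    booster = Booster(plugin=plugin)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

    data = torch.rand(4, 32, device='cuda')
    loss = criterion(model(data))
    booster.backward(loss, optimizer)
    optimizer.step()

With `init_method='colo'` the same flow works under `ColoInitContext` instead, which is what the parameterized test in patch 13 exercises alongside the lazy and plain initialization paths.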