diff --git a/colossalai/amp/apex_amp/apex_amp.py b/colossalai/amp/apex_amp/apex_amp.py index e6bdbe4520f9..ba603ca0975c 100644 --- a/colossalai/amp/apex_amp/apex_amp.py +++ b/colossalai/amp/apex_amp/apex_amp.py @@ -10,11 +10,11 @@ from torch import Tensor -from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.interface import OptimizerWrapper from colossalai.utils import clip_grad_norm_fp32 -class ApexAMPOptimizer(ColossalaiOptimizer): +class ApexAMPOptimizer(OptimizerWrapper): """ A wrapper class for APEX optimizer and it implements apex-specific backward and clip_grad_norm methods """ diff --git a/colossalai/amp/naive_amp/naive_amp.py b/colossalai/amp/naive_amp/naive_amp.py index 6a39d518d3f4..c09f09f8118b 100644 --- a/colossalai/amp/naive_amp/naive_amp.py +++ b/colossalai/amp/naive_amp/naive_amp.py @@ -13,12 +13,12 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.interface import OptimizerWrapper from ._fp16_optimizer import FP16Optimizer -class NaiveAMPOptimizer(ColossalaiOptimizer): +class NaiveAMPOptimizer(OptimizerWrapper): """A wrapper class for optimizer to cast all parameters to fp16 Args: diff --git a/colossalai/amp/torch_amp/torch_amp.py b/colossalai/amp/torch_amp/torch_amp.py index 65718d77c2e0..452b3d8a00fc 100644 --- a/colossalai/amp/torch_amp/torch_amp.py +++ b/colossalai/amp/torch_amp/torch_amp.py @@ -7,13 +7,13 @@ from torch.nn.modules.loss import _Loss from torch.optim import Optimizer -from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.interface import OptimizerWrapper from colossalai.utils import clip_grad_norm_fp32 from ._grad_scaler import GradScaler -class TorchAMPOptimizer(ColossalaiOptimizer): +class TorchAMPOptimizer(OptimizerWrapper): """A wrapper class which integrate Pytorch AMP with an optimizer Args: diff --git a/colossalai/auto_parallel/offload/amp_optimizer.py b/colossalai/auto_parallel/offload/amp_optimizer.py index 19d85b80dd3d..353133bd6f2d 100644 --- a/colossalai/auto_parallel/offload/amp_optimizer.py +++ b/colossalai/auto_parallel/offload/amp_optimizer.py @@ -5,8 +5,8 @@ from torch.optim import Optimizer from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler +from colossalai.interface import OptimizerWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils import get_current_device from .base_offload_module import BaseOffloadModule @@ -19,7 +19,7 @@ class OptimState(Enum): UNSCALED = 1 -class AMPOptimizer(ColossalaiOptimizer): +class AMPOptimizer(OptimizerWrapper): """ A wrapper for Optimizer. Code reference: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/optimizer/zero_optimizer.py diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 3441eca38ce7..664ac63e45ac 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -13,7 +13,6 @@ from torch.optim import Optimizer from colossalai.interface import ModelWrapper, OptimizerWrapper -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.tensor.d_tensor import ( is_customized_distributed_tensor, is_distributed_tensor, @@ -130,10 +129,7 @@ def unwrap_optimizer(optimizer: OptimizerWrapper): This method should be used before saving/loading it to/from sharded checkpoints. 
''' - # TODO(Baizhou): ColossalaiOptimizer will be replaced with OptimizerWrapper in the future unwrapped_optim = optimizer.optim - if isinstance(unwrapped_optim, ColossalaiOptimizer): - unwrapped_optim = unwrapped_optim.optim return unwrapped_optim diff --git a/colossalai/cli/benchmark/__init__.py b/colossalai/cli/benchmark/__init__.py deleted file mode 100644 index 618ff8c61dd4..000000000000 --- a/colossalai/cli/benchmark/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -import click - -from colossalai.context import Config - -from .benchmark import run_benchmark -from .utils import * - -__all__ = ['benchmark'] - - -@click.command() -@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.") -@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.") -@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.") -@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.") -@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.") -@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.") -@click.option("-l", "--layers", type=int, default=2) -@click.option("-m", - "--model", - type=click.Choice(['mlp'], case_sensitive=False), - default='mlp', - help="Select the model to benchmark, currently only supports MLP") -def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int, - layers: int, model: str): - args_dict = locals() - args = Config(args_dict) - run_benchmark(args) diff --git a/colossalai/cli/benchmark/benchmark.py b/colossalai/cli/benchmark/benchmark.py deleted file mode 100644 index 97a9f45722dd..000000000000 --- a/colossalai/cli/benchmark/benchmark.py +++ /dev/null @@ -1,105 +0,0 @@ -from functools import partial -from typing import Dict, List - -import click -import torch.multiprocessing as mp - -import colossalai -from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model -from colossalai.context import Config -from colossalai.context.random import reset_seeds -from colossalai.core import global_context as gpc -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.testing import free_port -from colossalai.utils import MultiTimer - -from .models import MLP - - -def run_benchmark(args: Config) -> None: - """ - Run benchmarking with torch.multiprocessing. - """ - - # sanity checks - if args.gpus is None: - click.echo("Error: --num_gpus is not given") - exit() - if args.gpus <= 1: - click.echo("Warning: tensor parallel will be activated with at least 2 devices.") - - click.echo("=== Benchmarking Parameters ===") - for k, v in args.items(): - click.echo(f'{k}: {v}') - click.echo('') - - config_list = find_all_configs(args.gpus) - - avail_ports = [free_port() for _ in range(len(config_list))] - run_func = partial(run_dist_profiling, - world_size=args.gpus, - port_list=avail_ports, - config_list=config_list, - hyperparams=args) - mp.spawn(run_func, nprocs=args.gpus) - - -def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict], - hyperparams: Config) -> None: - """ - A function executed for profiling, this function should be spawn by torch.multiprocessing. 
- - Args: - rank (int): rank of the process - world_size (int): the number of processes - port_list (List[int]): a list of free ports for initializing distributed networks - config_list (List[Dict]): a list of configuration - hyperparams (Config): the hyperparameters given by the user - - """ - - # disable logging for clean output - disable_existing_loggers() - logger = get_dist_logger() - logger.set_level('WARNING') - - for config, port in zip(config_list, port_list): - colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') - timer = MultiTimer() - - # 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size. - if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0: - click.echo( - "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size." - ) - continue - - if hyperparams.model == 'mlp': - model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers) - else: - if gpc.get_global_rank() == 0: - click.echo("Error: Invalid argument for --model") - exit() - - data_func = partial(get_batch_data, - dim=hyperparams.dimension, - batch_size=hyperparams.batch_size, - seq_length=hyperparams.seq_len, - mode=config.parallel.tensor.mode) - - fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model, - warmup_steps=hyperparams.warmup_steps, - profile_steps=hyperparams.profile_steps, - data_func=data_func, - timer=timer) - - gpc.destroy() - reset_seeds() - - if gpc.get_global_rank() == 0: - config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()]) - click.echo(f"=== {config_str} ===") - click.echo(f"Average forward time: {fwd_time}") - click.echo(f"Average backward time: {bwd_time}") - click.echo(f"Max allocated GPU memory: {max_allocated}") - click.echo(f"Max cached GPU memory: {max_cached}\n") diff --git a/colossalai/cli/benchmark/models.py b/colossalai/cli/benchmark/models.py deleted file mode 100644 index 385b485b6016..000000000000 --- a/colossalai/cli/benchmark/models.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch - -import colossalai.legacy.nn as col_nn - - -class MLP(torch.nn.Module): - - def __init__(self, dim: int, layers: int): - super().__init__() - self.layers = torch.nn.ModuleList() - - for _ in range(layers): - self.layers.append(col_nn.Linear(dim, dim)) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return x diff --git a/colossalai/cli/benchmark/utils.py b/colossalai/cli/benchmark/utils.py deleted file mode 100644 index ee7d92d6ea6a..000000000000 --- a/colossalai/cli/benchmark/utils.py +++ /dev/null @@ -1,159 +0,0 @@ -import math -import time -from typing import Callable, Dict, List, Tuple - -import torch - -from colossalai.context import Config, ParallelMode -from colossalai.utils import MultiTimer - - -def get_time_stamp() -> int: - """ - Return the time stamp for profiling. - - Returns: - time_stamp (int): the time given by time.time() - """ - - torch.cuda.synchronize() - time_stamp = time.time() - return time_stamp - - -def get_memory_states() -> Tuple[float]: - """ - Return the memory statistics. 
- - Returns: - max_allocated (float): the allocated CUDA memory - max_cached (float): the cached CUDA memory - """ - - max_allocated = torch.cuda.max_memory_allocated() / (1024**3) - max_cached = torch.cuda.max_memory_reserved() / (1024**3) - torch.cuda.reset_peak_memory_stats() - torch.cuda.empty_cache() - return max_allocated, max_cached - - -def find_all_configs(device_cnt: int) -> List[Dict]: - """ - Find all possible configurations for tensor parallelism - - Args: - device_cnt (int): the number of devices - - Returns: - config_list (List[Dict]): a list of configurations - """ - - def _is_square(num): - # 2D parallel should be implemented with at least 2 devices. - if num <= 1: - return False - return math.floor(math.sqrt(num))**2 == num - - def _is_cube(num): - # 3D parallel should be implemented with at least 2 devices. - if num <= 1: - return False - return math.floor(num**(1. / 3.))**3 == num - - config_list = [] - - # add non-parallel config - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None))) - config_list.append(config) - - # add 1D config - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d'))) - config_list.append(config) - - # add 2D config only if device_cnt is a square - if _is_square(device_cnt): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d'))) - config_list.append(config) - - # check for 2.5D - # iterate over depth - for depth in range(1, device_cnt): - if device_cnt % depth == 0 and _is_square(device_cnt // depth): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth))) - config_list.append(config) - - # check for 3D if device_cnt is a cube - if _is_cube(device_cnt): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d'))) - config_list.append(config) - - config_list = [Config(cfg) for cfg in config_list] - return config_list - - -def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable, - timer: MultiTimer) -> Tuple[float]: - """ - Profile the forward and backward of a model - - Args: - model (torch.nn.Module): a PyTorch model - warmup_steps (int): the number of steps for warmup - profile_steps (int): the number of steps for profiling - data_func (Callable): a function to generate random data - timer (colossalai.utils.Multitimer): a timer instance for time recording - - Returns: - fwd_time (float): the average forward time taken by forward pass in second - bwd_time (float): the average backward time taken by forward pass in second - max_allocated (float): the maximum GPU memory allocated in GB - max_cached (float): the maximum GPU memory cached in GB - """ - - def _run_step(data): - timer.start('forward') - out = model(data) - timer.stop('forward', keep_in_history=True) - timer.start('backward') - out.mean().backward() - timer.stop('backward', keep_in_history=True) - - data_list = [data_func() for _ in range(warmup_steps)] - for data in data_list: - _run_step(data) - timer.reset('forward') - timer.reset('backward') - - for _ in range(profile_steps): - data = data_func() - _run_step(data) - - max_allocated, max_cached = get_memory_states() - fwd_time = timer.get_timer('forward').get_history_mean() - bwd_time = timer.get_timer('backward').get_history_mean() - return fwd_time, bwd_time, max_allocated, max_cached - - -def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor: - """ - Return a random data of shape (batch_size, seq_length, dim) for profiling. 
- - Args: - dim (int): hidden size - batch_size (int): the number of data samples - seq_length (int): the number of tokens - mode (ParallelMode): Colossal-AI ParallelMode enum - - Returns: - data (torch.Tensor): random data - """ - - if mode in ['2d', '2.5d']: - batch_size = batch_size // 2 - dim = dim // 2 - elif mode == '3d': - batch_size = batch_size // 4 - dim = dim // 2 - - data = torch.rand(batch_size, seq_length, dim).cuda() - return data diff --git a/colossalai/cli/cli.py b/colossalai/cli/cli.py index a94e1150e49f..0dea7c504957 100644 --- a/colossalai/cli/cli.py +++ b/colossalai/cli/cli.py @@ -1,6 +1,5 @@ import click -from .benchmark import benchmark from .check import check from .launcher import run @@ -19,7 +18,6 @@ def cli(): cli.add_command(run) cli.add_command(check) -cli.add_command(benchmark) if __name__ == '__main__': cli() diff --git a/colossalai/initialize.py b/colossalai/initialize.py index a1694e059fb4..0de04e24091e 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -20,6 +20,7 @@ from colossalai.context import Config, ConfigException, ParallelMode from colossalai.context.moe_context import MOE_CONTEXT from colossalai.core import global_context as gpc +from colossalai.interface import OptimizerWrapper from colossalai.legacy.builder.builder import build_gradient_handler from colossalai.legacy.engine import Engine from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient @@ -30,7 +31,6 @@ get_tensor_shape, ) from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param from colossalai.utils.moe import sync_moe_model_param from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2 @@ -445,9 +445,9 @@ def initialize(model: nn.Module, else: gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg] - # check if optimizer is ColossalaiOptimizer - if not isinstance(optimizer, (ColossalaiOptimizer, ShardedOptimizerV2)): - optimizer = ColossalaiOptimizer(optim=optimizer) + # check if optimizer is OptimizerWrapper + if not isinstance(optimizer, (OptimizerWrapper, ShardedOptimizerV2)): + optimizer = OptimizerWrapper(optim=optimizer) # gradient accumulation grad_accum_size = gpc.config.get('gradient_accumulation', None) diff --git a/colossalai/legacy/engine/_base_engine.py b/colossalai/legacy/engine/_base_engine.py index 9af4469f403f..9a1a2dc325a3 100644 --- a/colossalai/legacy/engine/_base_engine.py +++ b/colossalai/legacy/engine/_base_engine.py @@ -8,6 +8,7 @@ from torch.nn import Module from torch.nn.modules.loss import _Loss +from colossalai.interface import OptimizerWrapper from colossalai.legacy.engine.gradient_handler import BaseGradientHandler from colossalai.legacy.engine.schedule import ( BaseSchedule, @@ -16,7 +17,6 @@ PipelineSchedule, ) from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively @@ -27,7 +27,7 @@ class Engine: Args: model (``torch.nn.Module``): The neural network model. - optimizer (``colossalai.nn.optimizer.ColossalaiOptimizer``): Optimizer for updating the parameters. + optimizer (``colossalai.interface.OptimizerWrapper``): Optimizer for updating the parameters. criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss. 
gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward. clip_grad_norm (float, optional): The norm of gradient clipping. @@ -61,7 +61,7 @@ class Engine: def __init__(self, model: Module, - optimizer: "ColossalaiOptimizer", + optimizer: "OptimizerWrapper", criterion: Optional[_Loss] = None, gradient_handlers: Optional[List[BaseGradientHandler]] = None, clip_grad_norm: float = 0.0, @@ -157,7 +157,7 @@ def step(self): """Execute parameter update """ self._all_reduce_gradients() - self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm) + self.optimizer.clip_grad_by_norm(self._clip_grad_norm) return self.optimizer.step() def backward(self, loss: Tensor): diff --git a/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py index c466f7e2d03b..c2270dc53a50 100644 --- a/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py +++ b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py @@ -10,12 +10,12 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader +from colossalai.interface import OptimizerWrapper from colossalai.legacy.engine import BaseGradientHandler -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils import conditional_context -class GradAccumOptimizer(ColossalaiOptimizer): +class GradAccumOptimizer(OptimizerWrapper): """A wrapper for the optimizer to enable gradient accumulation by skipping the steps before accumulation size is reached. @@ -74,7 +74,7 @@ def clip_grad_norm(self, model: nn.Module, max_norm: float) -> None: if self.accumulate_step < self.accumulate_size: pass else: - self.optim.clip_grad_norm(model, max_norm) + self.optim.clip_grad_by_norm(max_norm) def backward(self, loss: Tensor) -> None: """Execute backward pass. 
diff --git a/colossalai/nn/optimizer/__init__.py b/colossalai/nn/optimizer/__init__.py index 06072648beba..7e310793f515 100644 --- a/colossalai/nn/optimizer/__init__.py +++ b/colossalai/nn/optimizer/__init__.py @@ -1,10 +1,9 @@ -from .colossalai_optimizer import ColossalaiOptimizer +from .cpu_adam import CPUAdam from .fused_adam import FusedAdam from .fused_lamb import FusedLAMB from .fused_sgd import FusedSGD +from .hybrid_adam import HybridAdam from .lamb import Lamb from .lars import Lars -from .cpu_adam import CPUAdam -from .hybrid_adam import HybridAdam -__all__ = ['ColossalaiOptimizer', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam'] +__all__ = ['FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam'] diff --git a/colossalai/nn/optimizer/colossalai_optimizer.py b/colossalai/nn/optimizer/colossalai_optimizer.py deleted file mode 100644 index 34f5a9541975..000000000000 --- a/colossalai/nn/optimizer/colossalai_optimizer.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -from torch import Tensor -from torch.optim import Optimizer -from colossalai.utils import clip_grad_norm_fp32 - - -class ColossalaiOptimizer(Optimizer): - - def __init__(self, optim: Optimizer): - self.optim = optim - - @property - def param_groups(self): - return self.optim.param_groups - - @property - def defaults(self): - return self.optim.defaults - - def add_param_group(self, *args, **kwargs): - return self.optim.add_param_group(*args, **kwargs) - - def step(self, *args, **kwargs): - return self.optim.step(*args, **kwargs) - - def zero_grad(self, *args, **kwargs): - self.optim.zero_grad(*args, **kwargs) - - def load_state_dict(self, *args, **kwargs): - self.optim.load_state_dict(*args, **kwargs) - - def state_dict(self): - return self.optim.state_dict() - - def backward(self, loss: Tensor): - loss.backward() - - def backward_by_grad(self, tensor: Tensor, grad: Tensor): - torch.autograd.backward(tensors=tensor, grad_tensors=grad) - - def clip_grad_norm(self, model: nn.Module, max_norm: float): - if max_norm > 0.0: - clip_grad_norm_fp32(model.parameters(), max_norm) diff --git a/colossalai/utils/checkpoint/module_checkpoint.py b/colossalai/utils/checkpoint/module_checkpoint.py index d390da864cd3..ee8773e5059c 100644 --- a/colossalai/utils/checkpoint/module_checkpoint.py +++ b/colossalai/utils/checkpoint/module_checkpoint.py @@ -1,25 +1,27 @@ +from typing import Dict, Optional + import torch import torch.distributed as dist + +from colossalai.interface import OptimizerWrapper from colossalai.tensor import ColoTensor -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils.checkpoint.utils import gather_tensor, scatter_tensor -from typing import Optional, Dict def save_checkpoint(path: str, epoch: int, model: torch.nn.Module, - optimizer: Optional[ColossalaiOptimizer] = None, + optimizer: Optional[OptimizerWrapper] = None, lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None, *args, **kwargs): - """save_checkpoint + """save_checkpoint save a model, whose parameters are `ColoTensor`s. Args: path (str): directory to save the checkpoint files. epoch (int): the number of epoch model (torch.nn.Module): a torch module initialized by ColoInitContext - optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None. + optimizer (OptimizerWrapper, optional): optimizers. Defaults to None. lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None. 
""" rank = dist.get_rank() @@ -74,17 +76,17 @@ def save_checkpoint(path: str, def load_checkpoint(path: str, epoch: int, model: torch.nn.Module, - optimizer: Optional[ColossalaiOptimizer] = None, + optimizer: Optional[OptimizerWrapper] = None, lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None, torch_load_kwargs: Optional[Dict] = None, load_state_dict_kwargs: Optional[Dict] = None): - """load_checkpoint + """load_checkpoint load a model, whose parameters are `ColoTensor`s. Args: path (str): directory to save the checkpoint files. epoch (int): the number of epoch model (torch.nn.Module): a torch module initialized by ColoInitContext - optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None. + optimizer (OptimizerWrapper, optional): optimizers. Defaults to None. lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None. torch_load_kwargs: (dict, optional): The kwargs of torch.load inside the function load_state_dict_kwargs (dict, optional): The kwargs of load_state_dict inside the function diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py index 41dd174cb65a..7efe25142a27 100644 --- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py +++ b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py @@ -14,8 +14,8 @@ from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.interface import OptimizerWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage @@ -28,7 +28,7 @@ class OptimState(Enum): UNSCALED = 2 -class ShardedOptimizerV2(ColossalaiOptimizer): +class ShardedOptimizerV2(OptimizerWrapper): """A wrapper for optimizer. ``ShardedOptimizerV2`` and ``ShardedModelV2`` implement Zero Redundancy Optimizer (ZeRO). By default the ZeRO optimizer stage 3 offload Optimizer States on CPU. diff --git a/docs/source/en/basics/command_line_tool.md b/docs/source/en/basics/command_line_tool.md index 48b199cf78e9..4c278aaa0c6a 100644 --- a/docs/source/en/basics/command_line_tool.md +++ b/docs/source/en/basics/command_line_tool.md @@ -30,24 +30,4 @@ This command will inform you information regarding the version compatibility and To launch distributed jobs on single or multiple nodes, the command `colossalai run` can be used for process launching. You may refer to [Launch Colossal-AI](./launch_colossalai.md) for more details. -## Tensor Parallel Micro-Benchmarking - -As Colossal-AI provides an array of tensor parallelism methods, it is not intuitive to choose one for your hardware and -model. Therefore, we provide a simple benchmarking to evaluate the performance of various tensor parallelisms on your system. -This benchmarking is run on a simple MLP model where the input data is of the shape `(batch_size, seq_length, hidden_size)`. -Based on the number of GPUs, the CLI will look for all possible tensor parallel configurations and display the benchmarking results. -You can customize the benchmarking configurations by checking out `colossalai benchmark --help`. 
- -```shell -# run on 4 GPUs -colossalai benchmark --gpus 4 - -# run on 8 GPUs -colossalai benchmark --gpus 8 -``` - -:::caution - -Only single-node benchmarking is supported currently. - -::: + diff --git a/docs/source/zh-Hans/basics/command_line_tool.md b/docs/source/zh-Hans/basics/command_line_tool.md index 9b0275a6cedd..5c4c18989c17 100644 --- a/docs/source/zh-Hans/basics/command_line_tool.md +++ b/docs/source/zh-Hans/basics/command_line_tool.md @@ -26,22 +26,4 @@ Colossal-AI给用户提供了命令行工具,目前命令行工具可以用来 在分布式训练时,我们可以使用`colossalai run`来启动单节点或者多节点的多进程,详细的内容可以参考[启动 Colossal-AI](./launch_colossalai.md)。 -## 张量并行基准测试 - -Colossal-AI提供了多种张量并行,想要充分理解这些方法需要一定的学习成本,对于新手来说很难靠经验选择一个并行方式。 -所以我们提供了一个简单的基准测试,能够让用户在自己的机器上测试不同张量并行的性能。这个基准测试跑一个并行的MLP模型, -输入数据的维度为`(批大小,序列长度,隐藏层维度)`。通过指定GPU的数量,Colossal-AI会搜索所有可行的并行配置。用户可以通过查看`colossalai benchmark --help`来自定义相关的测试参数。 - -```shell -# 使用4个GPU -colossalai benchmark --gpus 4 - -# 使用8个GPU -colossalai benchmark --gpus 8 -``` - -:::caution - -目前仅支持单节点的基准测试。 - -::: +
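The net effect of this patch is that `ColossalaiOptimizer` disappears and every wrapper class constructs or subclasses `colossalai.interface.OptimizerWrapper` instead, with `clip_grad_norm(model, max_norm)` call sites rewritten to `clip_grad_by_norm(max_norm)`; the `colossalai benchmark` CLI command and its documentation are removed outright, with no replacement wired into `colossalai/cli/cli.py`. Below is a minimal sketch of what downstream code looks like after the change, assuming `OptimizerWrapper` exposes the same `backward`/`step`/`zero_grad` passthroughs that the deleted `ColossalaiOptimizer` provided and the `optim=` constructor argument and positional `clip_grad_by_norm(max_norm)` call shown in the updated call sites above; the model and optimizer here are illustrative only.

```python
import torch
import torch.nn as nn

from colossalai.interface import OptimizerWrapper

# Hypothetical toy model and optimizer, used only to illustrate the wrapper API.
model = nn.Linear(16, 16)
optimizer = OptimizerWrapper(optim=torch.optim.Adam(model.parameters(), lr=1e-3))

loss = model(torch.randn(4, 16)).mean()
optimizer.backward(loss)      # was ColossalaiOptimizer.backward(loss)

# ColossalaiOptimizer.clip_grad_norm(model, max_norm) took the model as an argument;
# the call sites updated in this diff pass only the max norm to clip_grad_by_norm.
optimizer.clip_grad_by_norm(1.0)

optimizer.step()
optimizer.zero_grad()
```

Checkpointing code no longer needs the extra unwrapping step either: as the `colossalai/checkpoint_io/utils.py` hunk shows, `unwrap_optimizer` now simply returns `optimizer.optim`, since there is no nested `ColossalaiOptimizer` left to peel away.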