diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py
index 1a90c72bde28..730a90d74cf8 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py
@@ -1,5 +1,4 @@
 class Registry:
-    # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here
 
     def __init__(self, name):
         self.name = name
diff --git a/colossalai/logging/logger.py b/colossalai/logging/logger.py
index af7b7de54a8d..f9abe4a2a2b6 100644
--- a/colossalai/logging/logger.py
+++ b/colossalai/logging/logger.py
@@ -6,8 +6,7 @@
 from pathlib import Path
 from typing import List, Union
 
-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+import torch.distributed as dist
 
 
 class DistributedLogger:
@@ -63,6 +62,7 @@ def __init__(self, name):
             self._logger.propagate = False
 
         DistributedLogger.__instances[name] = self
+        self.rank = dist.get_rank() if dist.is_initialized() else 0
 
     @staticmethod
     def __get_call_info():
@@ -109,16 +109,10 @@ def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INF
         # create log directory
         path.mkdir(parents=True, exist_ok=True)
 
-        # set the default file name if path is a directory
-        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
-            rank = 0
-        else:
-            rank = colossalai.core.global_context.get_global_rank()
-
         if suffix is not None:
-            log_file_name = f'rank_{rank}_{suffix}.log'
+            log_file_name = f'rank_{self.rank}_{suffix}.log'
         else:
-            log_file_name = f'rank_{rank}.log'
+            log_file_name = f'rank_{self.rank}.log'
         path = path.joinpath(log_file_name)
 
         # add file handler
@@ -128,19 +122,14 @@ def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INF
         file_handler.setFormatter(formatter)
         self._logger.addHandler(file_handler)
 
-    def _log(self,
-             level,
-             message: str,
-             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
-             ranks: List[int] = None) -> None:
+    def _log(self, level, message: str, ranks: List[int] = None) -> None:
         if ranks is None:
             getattr(self._logger, level)(message)
         else:
-            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
-            if local_rank in ranks:
+            if self.rank in ranks:
                 getattr(self._logger, level)(message)
 
-    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def info(self, message: str, ranks: List[int] = None) -> None:
         """Log an info message.
 
         Args:
@@ -150,10 +139,10 @@ def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL,
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('info', message_prefix, parallel_mode, ranks)
-        self._log('info', message, parallel_mode, ranks)
+        self._log('info', message_prefix, ranks)
+        self._log('info', message, ranks)
 
-    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def warning(self, message: str, ranks: List[int] = None) -> None:
         """Log a warning message.
 
         Args:
@@ -163,10 +152,10 @@ def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBA
             ranks (List[int]): List of parallel ranks.
""" message_prefix = "{}:{} {}".format(*self.__get_call_info()) - self._log('warning', message_prefix, parallel_mode, ranks) - self._log('warning', message, parallel_mode, ranks) + self._log('warning', message_prefix, ranks) + self._log('warning', message, ranks) - def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: + def debug(self, message: str, ranks: List[int] = None) -> None: """Log a debug message. Args: @@ -176,10 +165,10 @@ def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) - self._log('debug', message_prefix, parallel_mode, ranks) - self._log('debug', message, parallel_mode, ranks) + self._log('debug', message_prefix, ranks) + self._log('debug', message, ranks) - def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: + def error(self, message: str, ranks: List[int] = None) -> None: """Log an error message. Args: @@ -189,5 +178,5 @@ def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) - self._log('error', message_prefix, parallel_mode, ranks) - self._log('error', message, parallel_mode, ranks) + self._log('error', message_prefix, ranks) + self._log('error', message, ranks) diff --git a/colossalai/nn/lr_scheduler/cosine.py b/colossalai/nn/lr_scheduler/cosine.py index 0010435c25d5..fb587e1a1341 100644 --- a/colossalai/nn/lr_scheduler/cosine.py +++ b/colossalai/nn/lr_scheduler/cosine.py @@ -1,11 +1,8 @@ from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR -from colossalai.legacy.registry import LR_SCHEDULERS - from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler -@LR_SCHEDULERS.register_module class CosineAnnealingLR(_CosineAnnealingLR): r"""Set the learning rate of each parameter group using a cosine annealing schedule, where :math:`\eta_{max}` is set to the initial lr and @@ -49,7 +46,6 @@ def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: in super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class CosineAnnealingWarmupLR(WarmupScheduler): """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied. @@ -70,7 +66,6 @@ def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: super().__init__(optimizer, warmup_steps, base_scheduler) -@LR_SCHEDULERS.register_module class FlatAnnealingLR(DelayerScheduler): """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay. @@ -91,7 +86,6 @@ def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_ep super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class FlatAnnealingWarmupLR(WarmupDelayerScheduler): """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied, and then the learning rate will be a fixed value before starting decay. 
diff --git a/colossalai/nn/lr_scheduler/linear.py b/colossalai/nn/lr_scheduler/linear.py
index 2517796473f2..21a865e4c12b 100644
--- a/colossalai/nn/lr_scheduler/linear.py
+++ b/colossalai/nn/lr_scheduler/linear.py
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class LinearWarmupLR(_LRScheduler):
     """Linearly warmup learning rate and then linearly decay.
 
diff --git a/colossalai/nn/lr_scheduler/multistep.py b/colossalai/nn/lr_scheduler/multistep.py
index 4f18b49fcc15..c428c911c94d 100644
--- a/colossalai/nn/lr_scheduler/multistep.py
+++ b/colossalai/nn/lr_scheduler/multistep.py
@@ -2,12 +2,9 @@
 
 from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class MultiStepLR(_MultiStepLR):
     """Decays the learning rate of each parameter group by gamma once the
     number of epoch reaches one of the milestones. Notice that such decay can
@@ -33,7 +30,6 @@ def __init__(self,
         super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class MultiStepWarmupLR(WarmupScheduler):
     """Multistep learning rate scheduler with warmup.
 
diff --git a/colossalai/nn/lr_scheduler/onecycle.py b/colossalai/nn/lr_scheduler/onecycle.py
index 20e9aaec60de..6835b3ee1cf2 100644
--- a/colossalai/nn/lr_scheduler/onecycle.py
+++ b/colossalai/nn/lr_scheduler/onecycle.py
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class OneCycleLR(_OneCycleLR):
     r"""Sets the learning rate of each parameter group according to the
     1cycle learning rate policy. The 1cycle policy anneals the learning
diff --git a/colossalai/nn/lr_scheduler/poly.py b/colossalai/nn/lr_scheduler/poly.py
index a985064235e3..4f2249720ef6 100644
--- a/colossalai/nn/lr_scheduler/poly.py
+++ b/colossalai/nn/lr_scheduler/poly.py
@@ -1,11 +1,8 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialLR(_LRScheduler):
     """Polynomial learning rate scheduler.
 
@@ -41,7 +38,6 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialWarmupLR(WarmupScheduler):
     """Polynomial learning rate scheduler with warmup.
 
diff --git a/colossalai/nn/lr_scheduler/torch.py b/colossalai/nn/lr_scheduler/torch.py
index 09f5d4585d47..8846e13c7511 100644
--- a/colossalai/nn/lr_scheduler/torch.py
+++ b/colossalai/nn/lr_scheduler/torch.py
@@ -3,10 +3,7 @@
 from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
 from torch.optim.lr_scheduler import StepLR as _StepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class LambdaLR(_LambdaLR):
     """Sets the learning rate of each parameter group to the initial lr
     times a given function. When last_epoch=-1, sets initial lr as lr.
@@ -24,7 +21,6 @@ def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1)
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class MultiplicativeLR(_MultiplicativeLR):
     """Multiply the learning rate of each parameter group by the factor given
     in the specified function. When last_epoch=-1, sets initial lr as lr.
@@ -42,7 +38,6 @@ def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1)
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class StepLR(_StepLR):
     """Decays the learning rate of each parameter group by gamma every
     step_size epochs. Notice that such decay can happen simultaneously with
@@ -61,7 +56,6 @@ def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.
         super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class ExponentialLR(_ExponentialLR):
     """Decays the learning rate of each parameter group by gamma every epoch.
     When last_epoch=-1, sets initial lr as lr
diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 210400a21c80..9767fcb8b1e2 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -4,12 +4,10 @@
 import torch
 
 from colossalai.kernel.op_builder import CPUAdamBuilder
-from colossalai.legacy.registry import OPTIMIZERS
 
 from .nvme_optimizer import NVMeOptimizer
 
 
-@OPTIMIZERS.register_module
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.
 
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index 0d13873cdba8..3a05a34f52d2 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -8,11 +8,9 @@
 '''
 import torch
 
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 
-@OPTIMIZERS.register_module
 class FusedAdam(torch.optim.Optimizer):
     """Implements Adam algorithm.
 
diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py
index 48cc097c7da6..a2807d70f454 100644
--- a/colossalai/nn/optimizer/fused_lamb.py
+++ b/colossalai/nn/optimizer/fused_lamb.py
@@ -1,11 +1,9 @@
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
 import torch
 
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 
-@OPTIMIZERS.register_module
 class FusedLAMB(torch.optim.Optimizer):
     """Implements LAMB algorithm.
 
diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py
index 0e8d3fc10d64..59a93a8be9c7 100644
--- a/colossalai/nn/optimizer/fused_sgd.py
+++ b/colossalai/nn/optimizer/fused_sgd.py
@@ -2,11 +2,9 @@
 import torch
 from torch.optim.optimizer import Optimizer, required
 
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 
-@OPTIMIZERS.register_module
 class FusedSGD(Optimizer):
     r"""Implements stochastic gradient descent (optionally with momentum).
 
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 7aa0ced18e24..e08df410effe 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -4,13 +4,11 @@
 from torch.optim import Adam
 
 from colossalai.kernel.op_builder import FusedOptimBuilder
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 from .cpu_adam import CPUAdam
 
 
-@OPTIMIZERS.register_module
 class HybridAdam(CPUAdam):
     """Implements Adam algorithm.
 
diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py
index 769c11f6222f..d5de267f73ee 100644
--- a/colossalai/nn/optimizer/lamb.py
+++ b/colossalai/nn/optimizer/lamb.py
@@ -5,10 +5,7 @@
 import torch
 from torch.optim import Optimizer
 
-from colossalai.legacy.registry import OPTIMIZERS
-
 
-@OPTIMIZERS.register_module
 class Lamb(Optimizer):
     r"""Implements Lamb algorithm.
     It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
diff --git a/colossalai/nn/optimizer/lars.py b/colossalai/nn/optimizer/lars.py
index 9dbb83b84280..58393fdae4bf 100644
--- a/colossalai/nn/optimizer/lars.py
+++ b/colossalai/nn/optimizer/lars.py
@@ -5,10 +5,7 @@
 import torch
 from torch.optim import Optimizer
 
-from colossalai.legacy.registry import OPTIMIZERS
-
 
-@OPTIMIZERS.register_module
 class Lars(Optimizer):
     r"""Implements the LARS optimizer from `"Large batch training of convolutional networks"
     <https://arxiv.org/abs/1708.03888>`_.
diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py
index 4ca7bce7bc3f..881ddde78648 100644
--- a/colossalai/utils/data_sampler/data_parallel_sampler.py
+++ b/colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -12,12 +12,10 @@
 
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.legacy.registry import DATA_SAMPLERS
 
 T_co = TypeVar('T_co', covariant=True)
 
 
-@DATA_SAMPLERS.register_module
 class DataParallelSampler(Sampler):
     """A data sampler for distributed data parallelism.
 
diff --git a/tests/test_engine/test_engine.py b/tests/test_legacy/test_engine/test_engine.py
similarity index 100%
rename from tests/test_engine/test_engine.py
rename to tests/test_legacy/test_engine/test_engine.py
diff --git a/tests/test_engine/test_gradient_accumluation.py b/tests/test_legacy/test_engine/test_gradient_accumluation.py
similarity index 100%
rename from tests/test_engine/test_gradient_accumluation.py
rename to tests/test_legacy/test_engine/test_gradient_accumluation.py
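
Note (not part of the patch): a minimal usage sketch of the logger API after this refactor, assuming colossalai.logging.get_dist_logger remains the public entry point (that helper is not shown in the diff). Rank filtering and per-rank log-file naming now depend only on torch.distributed.

    # Usage sketch: the refactored DistributedLogger takes its rank from
    # torch.distributed instead of the legacy global context.
    from colossalai.logging import get_dist_logger

    logger = get_dist_logger()

    # self.rank == torch.distributed.get_rank() when a process group is
    # initialized, otherwise 0, so ranks=[0] keeps the message on rank 0 only.
    logger.info("training started", ranks=[0])
    logger.warning("emitted on every rank")

    # Writes to ./logs/rank_{rank}_demo.log on each rank.
    logger.log_to_file("./logs", suffix="demo")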