@@ -1,5 +1,4 @@
class Registry:
# TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here

def __init__(self, name):
self.name = name
47 changes: 18 additions & 29 deletions colossalai/logging/logger.py
@@ -6,8 +6,7 @@
from pathlib import Path
from typing import List, Union

-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+import torch.distributed as dist


class DistributedLogger:
@@ -63,6 +62,7 @@ def __init__(self, name):
self._logger.propagate = False

DistributedLogger.__instances[name] = self
+self.rank = dist.get_rank() if dist.is_initialized() else 0

@staticmethod
def __get_call_info():
@@ -109,16 +109,10 @@ def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INF
# create log directory
path.mkdir(parents=True, exist_ok=True)

-# set the default file name if path is a directory
-if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
-rank = 0
-else:
-rank = colossalai.core.global_context.get_global_rank()
-
if suffix is not None:
-log_file_name = f'rank_{rank}_{suffix}.log'
+log_file_name = f'rank_{self.rank}_{suffix}.log'
else:
-log_file_name = f'rank_{rank}.log'
+log_file_name = f'rank_{self.rank}.log'
path = path.joinpath(log_file_name)

# add file handler
@@ -128,19 +122,14 @@ def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INF
file_handler.setFormatter(formatter)
self._logger.addHandler(file_handler)

-def _log(self,
-level,
-message: str,
-parallel_mode: ParallelMode = ParallelMode.GLOBAL,
-ranks: List[int] = None) -> None:
+def _log(self, level, message: str, ranks: List[int] = None) -> None:
if ranks is None:
getattr(self._logger, level)(message)
else:
-local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
-if local_rank in ranks:
+if self.rank in ranks:
getattr(self._logger, level)(message)

-def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+def info(self, message: str, ranks: List[int] = None) -> None:
"""Log an info message.

Args:
@@ -150,10 +139,10 @@ def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL,
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
-self._log('info', message_prefix, parallel_mode, ranks)
-self._log('info', message, parallel_mode, ranks)
+self._log('info', message_prefix, ranks)
+self._log('info', message, ranks)

-def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+def warning(self, message: str, ranks: List[int] = None) -> None:
"""Log a warning message.

Args:
@@ -163,10 +152,10 @@ def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBA
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
-self._log('warning', message_prefix, parallel_mode, ranks)
-self._log('warning', message, parallel_mode, ranks)
+self._log('warning', message_prefix, ranks)
+self._log('warning', message, ranks)

-def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+def debug(self, message: str, ranks: List[int] = None) -> None:
"""Log a debug message.

Args:
@@ -176,10 +165,10 @@ def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL,
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
-self._log('debug', message_prefix, parallel_mode, ranks)
-self._log('debug', message, parallel_mode, ranks)
+self._log('debug', message_prefix, ranks)
+self._log('debug', message, ranks)

-def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+def error(self, message: str, ranks: List[int] = None) -> None:
"""Log an error message.

Args:
@@ -189,5 +178,5 @@ def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL,
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
-self._log('error', message_prefix, parallel_mode, ranks)
-self._log('error', message, parallel_mode, ranks)
+self._log('error', message_prefix, ranks)
+self._log('error', message, ranks)
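Note: a minimal usage sketch of the refactored logger (not part of this diff). It assumes the get_dist_logger helper from colossalai.logging and an initialized torch.distributed process group; without one, the cached rank falls back to 0.

from colossalai.logging import get_dist_logger  # helper assumed, not shown in this diff

logger = get_dist_logger()
logger.info("visible on every rank")              # no ranks filter
logger.info("visible on rank 0 only", ranks=[0])  # compared against the rank cached from torch.distributed
logger.log_to_file("./logs", suffix="train")      # creates ./logs and writes rank_{rank}_train.log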
6 changes: 0 additions & 6 deletions colossalai/nn/lr_scheduler/cosine.py
@@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-
from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler


-@LR_SCHEDULERS.register_module
class CosineAnnealingLR(_CosineAnnealingLR):
r"""Set the learning rate of each parameter group using a cosine annealing
schedule, where :math:`\eta_{max}` is set to the initial lr and
@@ -49,7 +46,6 @@ def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: in
super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
class CosineAnnealingWarmupLR(WarmupScheduler):
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.

@@ -70,7 +66,6 @@ def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min:
super().__init__(optimizer, warmup_steps, base_scheduler)


-@LR_SCHEDULERS.register_module
class FlatAnnealingLR(DelayerScheduler):
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.

@@ -91,7 +86,6 @@ def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_ep
super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
applied, and then the learning rate will be a fixed value before starting decay.
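Note: with the @LR_SCHEDULERS.register_module decorators removed, these schedulers are no longer built through the legacy registry. A minimal sketch of direct construction, assuming the classes remain exported from colossalai.nn.lr_scheduler and following the constructor shown above (optimizer, total_steps, warmup_steps, eta_min):

import torch
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR  # import path assumed

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=100, warmup_steps=10)

for _ in range(100):
    optimizer.step()   # the actual forward/backward pass is elided
    scheduler.step()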
3 changes: 0 additions & 3 deletions colossalai/nn/lr_scheduler/linear.py
@@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import _LRScheduler

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
class LinearWarmupLR(_LRScheduler):
"""Linearly warmup learning rate and then linearly decay.

4 changes: 0 additions & 4 deletions colossalai/nn/lr_scheduler/multistep.py
@@ -2,12 +2,9 @@

from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-
from .delayed import WarmupScheduler


-@LR_SCHEDULERS.register_module
class MultiStepLR(_MultiStepLR):
"""Decays the learning rate of each parameter group by gamma once the
number of epoch reaches one of the milestones. Notice that such decay can
@@ -33,7 +30,6 @@ def __init__(self,
super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler):
"""Multistep learning rate scheduler with warmup.

3 changes: 0 additions & 3 deletions colossalai/nn/lr_scheduler/onecycle.py
@@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
class OneCycleLR(_OneCycleLR):
r"""Sets the learning rate of each parameter group according to the
1cycle learning rate policy. The 1cycle policy anneals the learning
4 changes: 0 additions & 4 deletions colossalai/nn/lr_scheduler/poly.py
@@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import _LRScheduler

-from colossalai.legacy.registry import LR_SCHEDULERS
-
from .delayed import WarmupScheduler


-@LR_SCHEDULERS.register_module
class PolynomialLR(_LRScheduler):
"""Polynomial learning rate scheduler.

@@ -41,7 +38,6 @@ def _get_closed_form_lr(self):
for base_lr in self.base_lrs]


-@LR_SCHEDULERS.register_module
class PolynomialWarmupLR(WarmupScheduler):
"""Polynomial learning rate scheduler with warmup.

6 changes: 0 additions & 6 deletions colossalai/nn/lr_scheduler/torch.py
@@ -3,10 +3,7 @@
from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
from torch.optim.lr_scheduler import StepLR as _StepLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
class LambdaLR(_LambdaLR):
"""Sets the learning rate of each parameter group to the initial lr
times a given function. When last_epoch=-1, sets initial lr as lr.
@@ -24,7 +21,6 @@ def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1)
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
class MultiplicativeLR(_MultiplicativeLR):
"""Multiply the learning rate of each parameter group by the factor given
in the specified function. When last_epoch=-1, sets initial lr as lr.
@@ -42,7 +38,6 @@ def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1)
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
class StepLR(_StepLR):
"""Decays the learning rate of each parameter group by gamma every
step_size epochs. Notice that such decay can happen simultaneously with
@@ -61,7 +56,6 @@ def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.
super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr
2 changes: 0 additions & 2 deletions colossalai/nn/optimizer/cpu_adam.py
@@ -4,12 +4,10 @@
import torch

from colossalai.kernel.op_builder import CPUAdamBuilder
-from colossalai.legacy.registry import OPTIMIZERS

from .nvme_optimizer import NVMeOptimizer


-@OPTIMIZERS.register_module
class CPUAdam(NVMeOptimizer):
"""Implements Adam algorithm.

2 changes: 0 additions & 2 deletions colossalai/nn/optimizer/fused_adam.py
@@ -8,11 +8,9 @@
'''
import torch

-from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier


-@OPTIMIZERS.register_module
class FusedAdam(torch.optim.Optimizer):
"""Implements Adam algorithm.

2 changes: 0 additions & 2 deletions colossalai/nn/optimizer/fused_lamb.py
@@ -1,11 +1,9 @@
# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
import torch

-from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier


-@OPTIMIZERS.register_module
class FusedLAMB(torch.optim.Optimizer):
"""Implements LAMB algorithm.

2 changes: 0 additions & 2 deletions colossalai/nn/optimizer/fused_sgd.py
@@ -2,11 +2,9 @@
import torch
from torch.optim.optimizer import Optimizer, required

-from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier


-@OPTIMIZERS.register_module
class FusedSGD(Optimizer):
r"""Implements stochastic gradient descent (optionally with momentum).

2 changes: 0 additions & 2 deletions colossalai/nn/optimizer/hybrid_adam.py
@@ -4,13 +4,11 @@
from torch.optim import Adam

from colossalai.kernel.op_builder import FusedOptimBuilder
-from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier

from .cpu_adam import CPUAdam


-@OPTIMIZERS.register_module
class HybridAdam(CPUAdam):
"""Implements Adam algorithm.

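Note: the optimizers follow the same pattern; with @OPTIMIZERS.register_module gone they are imported and constructed directly. A sketch assuming HybridAdam keeps an Adam-style constructor (params, lr, ...), is exported from colossalai.nn.optimizer, and runs on a CUDA machine with the CPU/fused Adam kernels built:

import torch
from colossalai.nn.optimizer import HybridAdam  # import path assumed

model = torch.nn.Linear(16, 16).cuda()
optimizer = HybridAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(4, 16, device="cuda")).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()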
3 changes: 0 additions & 3 deletions colossalai/nn/optimizer/lamb.py
@@ -5,10 +5,7 @@
import torch
from torch.optim import Optimizer

-from colossalai.legacy.registry import OPTIMIZERS
-

-@OPTIMIZERS.register_module
class Lamb(Optimizer):
r"""Implements Lamb algorithm.
It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
3 changes: 0 additions & 3 deletions colossalai/nn/optimizer/lars.py
@@ -5,10 +5,7 @@
import torch
from torch.optim import Optimizer

-from colossalai.legacy.registry import OPTIMIZERS
-

-@OPTIMIZERS.register_module
class Lars(Optimizer):
r"""Implements the LARS optimizer from `"Large batch training of convolutional networks"
<https://arxiv.org/pdf/1708.03888.pdf>`_.
2 changes: 0 additions & 2 deletions colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -12,12 +12,10 @@

from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.legacy.registry import DATA_SAMPLERS

T_co = TypeVar('T_co', covariant=True)


-@DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler):
"""A data sampler for distributed data parallelism.

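Note: DataParallelSampler likewise keeps working when imported from its module directly. A sketch assuming the usual sampler constructor (dataset, shuffle, ...) and standard DataLoader integration; the exact signature is not shown in this diff, and the class still relies on the colossalai global context being initialized (see the gpc import kept above):

import torch
from torch.utils.data import DataLoader, TensorDataset

from colossalai.utils.data_sampler.data_parallel_sampler import DataParallelSampler  # module path from this diff

dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
sampler = DataParallelSampler(dataset, shuffle=True)          # constructor arguments assumed
loader = DataLoader(dataset, batch_size=16, sampler=sampler)  # each rank then sees its own shard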