From bb0ee0d031dc9b389c3731a4136631bf63cce121 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 18 Oct 2024 14:54:23 +0800 Subject: [PATCH 1/7] [amp] fit torch's new api --- colossalai/accelerator/cuda_accelerator.py | 2 +- colossalai/booster/mixed_precision/fp16_torch.py | 14 +++++++------- .../kernel/triton/llama_act_combine_kernel.py | 6 +++++- colossalai/legacy/amp/torch_amp/_grad_scaler.py | 2 +- .../legacy/nn/layer/parallel_2d/_operation.py | 6 +++++- .../legacy/nn/layer/parallel_2p5d/_operation.py | 6 +++++- .../legacy/nn/layer/parallel_3d/_operation.py | 6 +++++- .../nn/layer/parallel_sequence/_operation.py | 6 +++++- colossalai/legacy/nn/loss/loss_1d.py | 7 ++++++- colossalai/legacy/nn/loss/loss_2d.py | 7 ++++++- colossalai/legacy/nn/loss/loss_2p5d.py | 7 ++++++- colossalai/legacy/nn/loss/loss_3d.py | 7 ++++++- colossalai/moe/_operation.py | 7 ++++++- colossalai/nn/layer/layernorm.py | 7 ++++++- .../mixed_precision_training_with_booster.md | 2 +- .../mixed_precision_training_with_booster.md | 2 +- tests/test_legacy/test_moe/moe_utils.py | 2 +- tests/test_legacy/test_moe/test_moe_hybrid_zero.py | 2 +- .../test_legacy/test_moe/test_moe_load_balance.py | 2 +- 19 files changed, 75 insertions(+), 25 deletions(-) diff --git a/colossalai/accelerator/cuda_accelerator.py b/colossalai/accelerator/cuda_accelerator.py index f1ab487d4f58..a7db10e8daab 100644 --- a/colossalai/accelerator/cuda_accelerator.py +++ b/colossalai/accelerator/cuda_accelerator.py @@ -279,4 +279,4 @@ def autocast( """ Return autocast function """ - return torch.cuda.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled) + return torch.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled) diff --git a/colossalai/booster/mixed_precision/fp16_torch.py b/colossalai/booster/mixed_precision/fp16_torch.py index c757a878d97a..371d4ede5fcf 100644 --- a/colossalai/booster/mixed_precision/fp16_torch.py +++ b/colossalai/booster/mixed_precision/fp16_torch.py @@ -21,12 +21,12 @@ class TorchAMPOptimizer(OptimizerWrapper): optim (Optimizer): Optimizer to wrap. init_scale (float): Initial scale factor. Default: 2**16. growth_factor (float): Factor by which the scale is multiplied during - :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite + :meth:`torch.amp.GradScaler.step` if gradients were found to be finite this iteration. Default: 2.0. backoff_factor (float): Factor by which the scale is multiplied during - :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite + :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite this iteration. Default: 0.5. - growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step` + growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step` calls that may cause the scale to increase. Default: 2000. """ @@ -39,7 +39,7 @@ def __init__( growth_interval: int = 2000, ) -> None: super().__init__(optim) - self.scaler = torch.cuda.amp.GradScaler( + self.scaler = torch.amp.GradScaler( init_scale=init_scale, growth_factor=growth_factor, backoff_factor=backoff_factor, @@ -100,12 +100,12 @@ class FP16TorchMixedPrecision(MixedPrecision): Args: init_scale (float): Initial scale factor. Default: 2**16. growth_factor (float): Factor by which the scale is multiplied during - :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite + :meth:`torch.amp.GradScaler.step` if gradients were found to be finite this iteration. Default: 2.0. backoff_factor (float): Factor by which the scale is multiplied during - :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite + :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite this iteration. Default: 0.5. - growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step` + growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step` calls that may cause the scale to increase. Default: 2000. """ diff --git a/colossalai/kernel/triton/llama_act_combine_kernel.py b/colossalai/kernel/triton/llama_act_combine_kernel.py index 7a2c7e8fbd74..092b75a675e9 100644 --- a/colossalai/kernel/triton/llama_act_combine_kernel.py +++ b/colossalai/kernel/triton/llama_act_combine_kernel.py @@ -3,7 +3,11 @@ import torch from torch import Tensor -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd try: import triton diff --git a/colossalai/legacy/amp/torch_amp/_grad_scaler.py b/colossalai/legacy/amp/torch_amp/_grad_scaler.py index fc1aeec234fd..99cb3380c261 100644 --- a/colossalai/legacy/amp/torch_amp/_grad_scaler.py +++ b/colossalai/legacy/amp/torch_amp/_grad_scaler.py @@ -120,7 +120,7 @@ class GradScaler(object): def __init__(self, init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, enabled=True): if enabled and not torch.cuda.is_available(): - warnings.warn("torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.") + warnings.warn("torch.amp.GradScaler is enabled, but CUDA is not available. Disabling.") self._enabled = False else: self._enabled = enabled diff --git a/colossalai/legacy/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py index f67ee2e60be1..809c41e4341a 100644 --- a/colossalai/legacy/nn/layer/parallel_2d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py @@ -3,7 +3,11 @@ import torch import torch.distributed as dist from torch import Tensor -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.accelerator import get_accelerator from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py index 43328bd033c8..0f0a5fc7ef1e 100644 --- a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py @@ -3,7 +3,11 @@ import torch import torch.distributed as dist from torch import Tensor -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.accelerator import get_accelerator from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter diff --git a/colossalai/legacy/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py index fe42d8e28111..7b4191da4b13 100755 --- a/colossalai/legacy/nn/layer/parallel_3d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py @@ -5,7 +5,11 @@ import torch from torch import Tensor -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D diff --git a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py index 4e9bf364d8eb..e60cb3c78dbc 100644 --- a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py @@ -3,7 +3,11 @@ import torch from torch import distributed as dist -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.accelerator import get_accelerator from colossalai.legacy.communication import ring_forward diff --git a/colossalai/legacy/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py index fae9c929b788..f7cbc91f2cea 100644 --- a/colossalai/legacy/nn/loss/loss_1d.py +++ b/colossalai/legacy/nn/loss/loss_1d.py @@ -1,6 +1,11 @@ import torch import torch.distributed as dist -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd + from torch.nn.modules.loss import _Loss from colossalai.legacy.context import ParallelMode diff --git a/colossalai/legacy/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py index 474fd4a2cb9c..0d8c3f86b7ba 100644 --- a/colossalai/legacy/nn/loss/loss_2d.py +++ b/colossalai/legacy/nn/loss/loss_2d.py @@ -1,6 +1,11 @@ import torch import torch.distributed as dist -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd + from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss diff --git a/colossalai/legacy/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py index b423ab3d8699..d69c999a8541 100644 --- a/colossalai/legacy/nn/loss/loss_2p5d.py +++ b/colossalai/legacy/nn/loss/loss_2p5d.py @@ -1,6 +1,11 @@ import torch import torch.distributed as dist -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd + from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss diff --git a/colossalai/legacy/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py index de6a674d61db..77246b474670 100644 --- a/colossalai/legacy/nn/loss/loss_3d.py +++ b/colossalai/legacy/nn/loss/loss_3d.py @@ -1,6 +1,11 @@ import torch import torch.distributed as dist -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd + from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py index 62904d90eef8..9f923aa13b75 100644 --- a/colossalai/moe/_operation.py +++ b/colossalai/moe/_operation.py @@ -3,7 +3,12 @@ import torch import torch.distributed as dist from torch import Tensor -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd + from torch.distributed import ProcessGroup from colossalai.quantization.fp8 import all_to_all_single_fp8 diff --git a/colossalai/nn/layer/layernorm.py b/colossalai/nn/layer/layernorm.py index 1db48faee213..4f6d691e5b17 100644 --- a/colossalai/nn/layer/layernorm.py +++ b/colossalai/nn/layer/layernorm.py @@ -5,7 +5,12 @@ import numbers import torch -from torch.cuda.amp import custom_bwd, custom_fwd + +try: + from torch.amp import custom_bwd, custom_fwd +except ImportError: + from torch.cuda.amp import custom_bwd, custom_fwd + from torch.nn import init from torch.nn.parameter import Parameter diff --git a/docs/source/en/features/mixed_precision_training_with_booster.md b/docs/source/en/features/mixed_precision_training_with_booster.md index 65304b1f4e65..1e17c2bb584d 100644 --- a/docs/source/en/features/mixed_precision_training_with_booster.md +++ b/docs/source/en/features/mixed_precision_training_with_booster.md @@ -16,7 +16,7 @@ Author: [Mingyan Jiang](https://github.com/jiangmingyan) AMP stands for automatic mixed precision training. In Colossal-AI, we have incorporated different implementations of mixed precision training: -1. torch.cuda.amp +1. torch.amp 2. apex.amp 3. naive amp diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md index da377ceb294b..93a69830cadf 100644 --- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md +++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md @@ -16,7 +16,7 @@ AMP 代表自动混合精度训练。 在 Colossal-AI 中, 我们结合了混合精度训练的不同实现: -1. torch.cuda.amp +1. torch.amp 2. apex.amp 3. naive amp diff --git a/tests/test_legacy/test_moe/moe_utils.py b/tests/test_legacy/test_moe/moe_utils.py index 8c133849b000..96e74e2ae026 100644 --- a/tests/test_legacy/test_moe/moe_utils.py +++ b/tests/test_legacy/test_moe/moe_utils.py @@ -87,7 +87,7 @@ def assert_not_equal_in_group(tensor, process_group=None): def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False): model.train() - with torch.cuda.amp.autocast(enabled=enable_autocast): + with torch.amp.autocast(enabled=enable_autocast): if criterion: y = model(data) loss = criterion(y, label) diff --git a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py index fdd6d956ef83..6e5719c28232 100644 --- a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py +++ b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py @@ -14,7 +14,7 @@ def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False): model.train() - with torch.cuda.amp.autocast(enabled=enable_autocast): + with torch.amp.autocast(enabled=enable_autocast): if criterion: y = model(data) loss = criterion(y, label) diff --git a/tests/test_legacy/test_moe/test_moe_load_balance.py b/tests/test_legacy/test_moe/test_moe_load_balance.py index adf2dbc1ccf3..9f1bf318ad13 100644 --- a/tests/test_legacy/test_moe/test_moe_load_balance.py +++ b/tests/test_legacy/test_moe/test_moe_load_balance.py @@ -26,7 +26,7 @@ def split_ddp_grad(grad, world_size): def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False): model.train() - with torch.cuda.amp.autocast(enabled=enable_autocast): + with torch.amp.autocast(enabled=enable_autocast): if criterion: y = model(data) loss = criterion(y, label) From d28bbbee19165e110a88b982e5ae4ab59cc3678f Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 18 Oct 2024 14:57:00 +0800 Subject: [PATCH 2/7] [amp] fix api call --- colossalai/accelerator/cuda_accelerator.py | 2 +- tests/test_legacy/test_moe/moe_utils.py | 2 +- tests/test_legacy/test_moe/test_moe_hybrid_zero.py | 2 +- tests/test_legacy/test_moe/test_moe_load_balance.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/colossalai/accelerator/cuda_accelerator.py b/colossalai/accelerator/cuda_accelerator.py index a7db10e8daab..32e62b33f86b 100644 --- a/colossalai/accelerator/cuda_accelerator.py +++ b/colossalai/accelerator/cuda_accelerator.py @@ -279,4 +279,4 @@ def autocast( """ Return autocast function """ - return torch.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled) + return torch.amp.autocast(device_type="cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled) diff --git a/tests/test_legacy/test_moe/moe_utils.py b/tests/test_legacy/test_moe/moe_utils.py index 96e74e2ae026..8c133849b000 100644 --- a/tests/test_legacy/test_moe/moe_utils.py +++ b/tests/test_legacy/test_moe/moe_utils.py @@ -87,7 +87,7 @@ def assert_not_equal_in_group(tensor, process_group=None): def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False): model.train() - with torch.amp.autocast(enabled=enable_autocast): + with torch.cuda.amp.autocast(enabled=enable_autocast): if criterion: y = model(data) loss = criterion(y, label) diff --git a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py index 6e5719c28232..fdd6d956ef83 100644 --- a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py +++ b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py @@ -14,7 +14,7 @@ def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False): model.train() - with torch.amp.autocast(enabled=enable_autocast): + with torch.cuda.amp.autocast(enabled=enable_autocast): if criterion: y = model(data) loss = criterion(y, label) diff --git a/tests/test_legacy/test_moe/test_moe_load_balance.py b/tests/test_legacy/test_moe/test_moe_load_balance.py index 9f1bf318ad13..adf2dbc1ccf3 100644 --- a/tests/test_legacy/test_moe/test_moe_load_balance.py +++ b/tests/test_legacy/test_moe/test_moe_load_balance.py @@ -26,7 +26,7 @@ def split_ddp_grad(grad, world_size): def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False): model.train() - with torch.amp.autocast(enabled=enable_autocast): + with torch.cuda.amp.autocast(enabled=enable_autocast): if criterion: y = model(data) loss = criterion(y, label) From df7f139569dee11035405486d9696a6abe34fab1 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 18 Oct 2024 15:09:44 +0800 Subject: [PATCH 3/7] [amp] fix api call --- colossalai/legacy/amp/torch_amp/_grad_scaler.py | 2 +- colossalai/legacy/nn/layer/parallel_2d/_operation.py | 2 +- colossalai/legacy/nn/layer/parallel_2p5d/_operation.py | 2 +- colossalai/legacy/nn/layer/parallel_3d/_operation.py | 2 +- colossalai/legacy/nn/layer/parallel_sequence/_operation.py | 2 +- colossalai/legacy/nn/loss/loss_1d.py | 2 +- colossalai/legacy/nn/loss/loss_2d.py | 2 +- colossalai/legacy/nn/loss/loss_2p5d.py | 2 +- colossalai/legacy/nn/loss/loss_3d.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/colossalai/legacy/amp/torch_amp/_grad_scaler.py b/colossalai/legacy/amp/torch_amp/_grad_scaler.py index 99cb3380c261..fc1aeec234fd 100644 --- a/colossalai/legacy/amp/torch_amp/_grad_scaler.py +++ b/colossalai/legacy/amp/torch_amp/_grad_scaler.py @@ -120,7 +120,7 @@ class GradScaler(object): def __init__(self, init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, enabled=True): if enabled and not torch.cuda.is_available(): - warnings.warn("torch.amp.GradScaler is enabled, but CUDA is not available. Disabling.") + warnings.warn("torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.") self._enabled = False else: self._enabled = enabled diff --git a/colossalai/legacy/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py index 809c41e4341a..3fff3fcb4093 100644 --- a/colossalai/legacy/nn/layer/parallel_2d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py @@ -5,7 +5,7 @@ from torch import Tensor try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py index 0f0a5fc7ef1e..356b20f76fd4 100644 --- a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py @@ -5,7 +5,7 @@ from torch import Tensor try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd diff --git a/colossalai/legacy/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py index 7b4191da4b13..47bed0eb1a11 100755 --- a/colossalai/legacy/nn/layer/parallel_3d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py @@ -7,7 +7,7 @@ from torch import Tensor try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd diff --git a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py index e60cb3c78dbc..5e10e4bcd1d0 100644 --- a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py @@ -5,7 +5,7 @@ from torch import distributed as dist try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd diff --git a/colossalai/legacy/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py index f7cbc91f2cea..4637ffdcaa9b 100644 --- a/colossalai/legacy/nn/loss/loss_1d.py +++ b/colossalai/legacy/nn/loss/loss_1d.py @@ -2,7 +2,7 @@ import torch.distributed as dist try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd diff --git a/colossalai/legacy/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py index 0d8c3f86b7ba..183e18231236 100644 --- a/colossalai/legacy/nn/loss/loss_2d.py +++ b/colossalai/legacy/nn/loss/loss_2d.py @@ -2,7 +2,7 @@ import torch.distributed as dist try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd diff --git a/colossalai/legacy/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py index d69c999a8541..e263191b6f44 100644 --- a/colossalai/legacy/nn/loss/loss_2p5d.py +++ b/colossalai/legacy/nn/loss/loss_2p5d.py @@ -2,7 +2,7 @@ import torch.distributed as dist try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd diff --git a/colossalai/legacy/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py index 77246b474670..43590dea2213 100644 --- a/colossalai/legacy/nn/loss/loss_3d.py +++ b/colossalai/legacy/nn/loss/loss_3d.py @@ -2,7 +2,7 @@ import torch.distributed as dist try: - from torch.amp import custom_bwd, custom_fwd + from torch.cuda.amp import custom_bwd, custom_fwd except ImportError: from torch.cuda.amp import custom_bwd, custom_fwd From c85e25209885ece9c20d466edf688bf2f7ced146 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 18 Oct 2024 15:13:00 +0800 Subject: [PATCH 4/7] [misc] fit torch pytree api upgrade --- colossalai/pipeline/schedule/_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/colossalai/pipeline/schedule/_utils.py b/colossalai/pipeline/schedule/_utils.py index 271b3238f5c4..8f42a9014e85 100644 --- a/colossalai/pipeline/schedule/_utils.py +++ b/colossalai/pipeline/schedule/_utils.py @@ -3,8 +3,9 @@ import torch import torch.cuda +from packaging.version import Version from torch.nn import Module -from torch.utils._pytree import SUPPORTED_NODES, TreeSpec, _register_pytree_node, tree_flatten, tree_map, tree_unflatten +from torch.utils._pytree import SUPPORTED_NODES, TreeSpec, tree_flatten, tree_map, tree_unflatten # this register are for torch under version 1.13.1, maybe removed in the future @@ -16,7 +17,12 @@ def _odict_unflatten(values: List[Any], context: Any) -> "OrderedDict[Any, Any]" return OrderedDict((key, value) for key, value in zip(context, values)) -_register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten) +if Version(torch.__version__) <= Version("1.13.1"): + try: + from torch.utils._pytree import register_pytree_node as _register_pytree_node + except ImportError: + from torch.utils._pytree import _register_pytree_node + _register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten) def tree_map_hf(fn: Any, pytree: Any): From d6402c79d5cea5c3e6bf11c7f045ef91634aeae8 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 18 Oct 2024 15:13:27 +0800 Subject: [PATCH 5/7] [misc] remove legacy import --- colossalai/kernel/jit/option.py | 2 +- .../zero/gemini/memory_tracer/runtime_mem_tracer.py | 11 ++++++----- colossalai/zero/gemini/placement_policy.py | 3 ++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py index d392649a62f2..1ee93e4e0d9f 100644 --- a/colossalai/kernel/jit/option.py +++ b/colossalai/kernel/jit/option.py @@ -1,7 +1,6 @@ import torch from colossalai.accelerator import get_accelerator -from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear from .bias_dropout_add import bias_dropout_add_fused_train from .bias_gelu import bias_gelu_impl @@ -45,6 +44,7 @@ def warmup_jit_fusion( dtype: torch.dtype = torch.float32, ): """Compile JIT functions before the main training steps""" + from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear embed = Embedding(vocab_size, hidden_size).to(get_accelerator().get_current_device()) linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_accelerator().get_current_device()) diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py index b0d258824d2b..81520326f4cb 100644 --- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py +++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py @@ -1,10 +1,5 @@ import torch.nn -from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import ( - GradMemStats, - GradMemTracerHook, - ParamMemTracerHook, -) from colossalai.tensor.param_op_hook import ColoParamOpHookManager from colossalai.utils import _cast_float @@ -27,6 +22,12 @@ class RuntimeMemTracer: def __init__(self, module: torch.nn.Module, dtype: torch.dtype = torch.half): super().__init__() + from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import ( + GradMemStats, + GradMemTracerHook, + ParamMemTracerHook, + ) + self.module = module self.dtype = dtype self._gradstat = GradMemStats() diff --git a/colossalai/zero/gemini/placement_policy.py b/colossalai/zero/gemini/placement_policy.py index 178755d03107..2aa8dc3f6cdd 100644 --- a/colossalai/zero/gemini/placement_policy.py +++ b/colossalai/zero/gemini/placement_policy.py @@ -8,7 +8,6 @@ import torch.distributed as dist from colossalai.accelerator import get_accelerator -from colossalai.legacy.utils.memory import colo_device_memory_capacity from colossalai.zero.gemini.chunk import Chunk from .chunk import Chunk, ChunkManager @@ -172,6 +171,8 @@ def evict_tensors( Returns: int: the volume of memory that is evicted """ + from colossalai.legacy.utils.memory import colo_device_memory_capacity + start = time() cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device()) used_cuda_model_data = self.chunk_manager.total_mem["cuda"] From a7cca4df5ed46fcf3be622bbb7a240ba83c4b9b7 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 18 Oct 2024 15:20:17 +0800 Subject: [PATCH 6/7] [misc] fit torch amp api --- colossalai/kernel/triton/llama_act_combine_kernel.py | 6 +----- colossalai/legacy/nn/layer/parallel_2d/_operation.py | 6 +----- colossalai/legacy/nn/layer/parallel_2p5d/_operation.py | 6 +----- colossalai/legacy/nn/layer/parallel_3d/_operation.py | 6 +----- colossalai/legacy/nn/layer/parallel_sequence/_operation.py | 6 +----- colossalai/legacy/nn/loss/loss_1d.py | 7 +------ colossalai/legacy/nn/loss/loss_2d.py | 7 +------ colossalai/legacy/nn/loss/loss_2p5d.py | 7 +------ colossalai/legacy/nn/loss/loss_3d.py | 7 +------ colossalai/moe/_operation.py | 7 +------ colossalai/nn/layer/layernorm.py | 7 +------ 11 files changed, 11 insertions(+), 61 deletions(-) diff --git a/colossalai/kernel/triton/llama_act_combine_kernel.py b/colossalai/kernel/triton/llama_act_combine_kernel.py index 092b75a675e9..7a2c7e8fbd74 100644 --- a/colossalai/kernel/triton/llama_act_combine_kernel.py +++ b/colossalai/kernel/triton/llama_act_combine_kernel.py @@ -3,11 +3,7 @@ import torch from torch import Tensor - -try: - from torch.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd +from torch.cuda.amp import custom_bwd, custom_fwd try: import triton diff --git a/colossalai/legacy/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py index 3fff3fcb4093..f67ee2e60be1 100644 --- a/colossalai/legacy/nn/layer/parallel_2d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py @@ -3,11 +3,7 @@ import torch import torch.distributed as dist from torch import Tensor - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd +from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.accelerator import get_accelerator from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py index 356b20f76fd4..43328bd033c8 100644 --- a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py @@ -3,11 +3,7 @@ import torch import torch.distributed as dist from torch import Tensor - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd +from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.accelerator import get_accelerator from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter diff --git a/colossalai/legacy/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py index 47bed0eb1a11..fe42d8e28111 100755 --- a/colossalai/legacy/nn/layer/parallel_3d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py @@ -5,11 +5,7 @@ import torch from torch import Tensor - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd +from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D diff --git a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py index 5e10e4bcd1d0..4e9bf364d8eb 100644 --- a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py @@ -3,11 +3,7 @@ import torch from torch import distributed as dist - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd +from torch.cuda.amp import custom_bwd, custom_fwd from colossalai.accelerator import get_accelerator from colossalai.legacy.communication import ring_forward diff --git a/colossalai/legacy/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py index 4637ffdcaa9b..fae9c929b788 100644 --- a/colossalai/legacy/nn/loss/loss_1d.py +++ b/colossalai/legacy/nn/loss/loss_1d.py @@ -1,11 +1,6 @@ import torch import torch.distributed as dist - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd - +from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.modules.loss import _Loss from colossalai.legacy.context import ParallelMode diff --git a/colossalai/legacy/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py index 183e18231236..474fd4a2cb9c 100644 --- a/colossalai/legacy/nn/loss/loss_2d.py +++ b/colossalai/legacy/nn/loss/loss_2d.py @@ -1,11 +1,6 @@ import torch import torch.distributed as dist - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd - +from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss diff --git a/colossalai/legacy/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py index e263191b6f44..b423ab3d8699 100644 --- a/colossalai/legacy/nn/loss/loss_2p5d.py +++ b/colossalai/legacy/nn/loss/loss_2p5d.py @@ -1,11 +1,6 @@ import torch import torch.distributed as dist - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd - +from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss diff --git a/colossalai/legacy/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py index 43590dea2213..de6a674d61db 100644 --- a/colossalai/legacy/nn/loss/loss_3d.py +++ b/colossalai/legacy/nn/loss/loss_3d.py @@ -1,11 +1,6 @@ import torch import torch.distributed as dist - -try: - from torch.cuda.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd - +from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py index 9f923aa13b75..62904d90eef8 100644 --- a/colossalai/moe/_operation.py +++ b/colossalai/moe/_operation.py @@ -3,12 +3,7 @@ import torch import torch.distributed as dist from torch import Tensor - -try: - from torch.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd - +from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup from colossalai.quantization.fp8 import all_to_all_single_fp8 diff --git a/colossalai/nn/layer/layernorm.py b/colossalai/nn/layer/layernorm.py index 4f6d691e5b17..1db48faee213 100644 --- a/colossalai/nn/layer/layernorm.py +++ b/colossalai/nn/layer/layernorm.py @@ -5,12 +5,7 @@ import numbers import torch - -try: - from torch.amp import custom_bwd, custom_fwd -except ImportError: - from torch.cuda.amp import custom_bwd, custom_fwd - +from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn import init from torch.nn.parameter import Parameter From befe9e5c888f565b87f24e413c2f6d5cec50c2bd Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 18 Oct 2024 15:22:55 +0800 Subject: [PATCH 7/7] [misc] fit torch amp api --- colossalai/booster/mixed_precision/fp16_torch.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_torch.py b/colossalai/booster/mixed_precision/fp16_torch.py index 371d4ede5fcf..c757a878d97a 100644 --- a/colossalai/booster/mixed_precision/fp16_torch.py +++ b/colossalai/booster/mixed_precision/fp16_torch.py @@ -21,12 +21,12 @@ class TorchAMPOptimizer(OptimizerWrapper): optim (Optimizer): Optimizer to wrap. init_scale (float): Initial scale factor. Default: 2**16. growth_factor (float): Factor by which the scale is multiplied during - :meth:`torch.amp.GradScaler.step` if gradients were found to be finite + :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite this iteration. Default: 2.0. backoff_factor (float): Factor by which the scale is multiplied during - :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite + :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite this iteration. Default: 0.5. - growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step` + growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step` calls that may cause the scale to increase. Default: 2000. """ @@ -39,7 +39,7 @@ def __init__( growth_interval: int = 2000, ) -> None: super().__init__(optim) - self.scaler = torch.amp.GradScaler( + self.scaler = torch.cuda.amp.GradScaler( init_scale=init_scale, growth_factor=growth_factor, backoff_factor=backoff_factor, @@ -100,12 +100,12 @@ class FP16TorchMixedPrecision(MixedPrecision): Args: init_scale (float): Initial scale factor. Default: 2**16. growth_factor (float): Factor by which the scale is multiplied during - :meth:`torch.amp.GradScaler.step` if gradients were found to be finite + :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite this iteration. Default: 2.0. backoff_factor (float): Factor by which the scale is multiplied during - :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite + :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite this iteration. Default: 0.5. - growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step` + growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step` calls that may cause the scale to increase. Default: 2000. """