From bb0ee0d031dc9b389c3731a4136631bf63cce121 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 14:54:23 +0800
Subject: [PATCH 1/7] [amp] fit torch's new api

---
 colossalai/accelerator/cuda_accelerator.py         |  2 +-
 colossalai/booster/mixed_precision/fp16_torch.py   | 14 +++++++-------
 .../kernel/triton/llama_act_combine_kernel.py      |  6 +++++-
 colossalai/legacy/amp/torch_amp/_grad_scaler.py    |  2 +-
 .../legacy/nn/layer/parallel_2d/_operation.py      |  6 +++++-
 .../legacy/nn/layer/parallel_2p5d/_operation.py    |  6 +++++-
 .../legacy/nn/layer/parallel_3d/_operation.py      |  6 +++++-
 .../nn/layer/parallel_sequence/_operation.py       |  6 +++++-
 colossalai/legacy/nn/loss/loss_1d.py               |  7 ++++++-
 colossalai/legacy/nn/loss/loss_2d.py               |  7 ++++++-
 colossalai/legacy/nn/loss/loss_2p5d.py             |  7 ++++++-
 colossalai/legacy/nn/loss/loss_3d.py               |  7 ++++++-
 colossalai/moe/_operation.py                       |  7 ++++++-
 colossalai/nn/layer/layernorm.py                   |  7 ++++++-
 .../mixed_precision_training_with_booster.md       |  2 +-
 .../mixed_precision_training_with_booster.md       |  2 +-
 tests/test_legacy/test_moe/moe_utils.py            |  2 +-
 tests/test_legacy/test_moe/test_moe_hybrid_zero.py |  2 +-
 .../test_legacy/test_moe/test_moe_load_balance.py  |  2 +-
 19 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/colossalai/accelerator/cuda_accelerator.py b/colossalai/accelerator/cuda_accelerator.py
index f1ab487d4f58..a7db10e8daab 100644
--- a/colossalai/accelerator/cuda_accelerator.py
+++ b/colossalai/accelerator/cuda_accelerator.py
@@ -279,4 +279,4 @@ def autocast(
         """
         Return autocast function
         """
-        return torch.cuda.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
+        return torch.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
diff --git a/colossalai/booster/mixed_precision/fp16_torch.py b/colossalai/booster/mixed_precision/fp16_torch.py
index c757a878d97a..371d4ede5fcf 100644
--- a/colossalai/booster/mixed_precision/fp16_torch.py
+++ b/colossalai/booster/mixed_precision/fp16_torch.py
@@ -21,12 +21,12 @@ class TorchAMPOptimizer(OptimizerWrapper):
         optim (Optimizer): Optimizer to wrap.
         init_scale (float): Initial scale factor. Default: 2**16.
         growth_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
+            :meth:`torch.amp.GradScaler.step` if gradients were found to be finite
             this iteration. Default: 2.0.
         backoff_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
+            :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite
             this iteration. Default: 0.5.
-        growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
+        growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step`
             calls that may cause the scale to increase. Default: 2000.
     """
 
@@ -39,7 +39,7 @@ def __init__(
         growth_interval: int = 2000,
     ) -> None:
         super().__init__(optim)
-        self.scaler = torch.cuda.amp.GradScaler(
+        self.scaler = torch.amp.GradScaler(
             init_scale=init_scale,
             growth_factor=growth_factor,
             backoff_factor=backoff_factor,
@@ -100,12 +100,12 @@ class FP16TorchMixedPrecision(MixedPrecision):
     Args:
         init_scale (float): Initial scale factor. Default: 2**16.
         growth_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
+            :meth:`torch.amp.GradScaler.step` if gradients were found to be finite
             this iteration. Default: 2.0.
         backoff_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
+            :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite
             this iteration. Default: 0.5.
-        growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
+        growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step`
             calls that may cause the scale to increase. Default: 2000.
     """
 
diff --git a/colossalai/kernel/triton/llama_act_combine_kernel.py b/colossalai/kernel/triton/llama_act_combine_kernel.py
index 7a2c7e8fbd74..092b75a675e9 100644
--- a/colossalai/kernel/triton/llama_act_combine_kernel.py
+++ b/colossalai/kernel/triton/llama_act_combine_kernel.py
@@ -3,7 +3,11 @@
 
 import torch
 from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
 
 try:
     import triton
diff --git a/colossalai/legacy/amp/torch_amp/_grad_scaler.py b/colossalai/legacy/amp/torch_amp/_grad_scaler.py
index fc1aeec234fd..99cb3380c261 100644
--- a/colossalai/legacy/amp/torch_amp/_grad_scaler.py
+++ b/colossalai/legacy/amp/torch_amp/_grad_scaler.py
@@ -120,7 +120,7 @@ class GradScaler(object):
 
     def __init__(self, init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, enabled=True):
         if enabled and not torch.cuda.is_available():
-            warnings.warn("torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
+            warnings.warn("torch.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
             self._enabled = False
         else:
             self._enabled = enabled
diff --git a/colossalai/legacy/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
index f67ee2e60be1..809c41e4341a 100644
--- a/colossalai/legacy/nn/layer/parallel_2d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
@@ -3,7 +3,11 @@
 import torch
 import torch.distributed as dist
 from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.accelerator import get_accelerator
 from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
index 43328bd033c8..0f0a5fc7ef1e 100644
--- a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
@@ -3,7 +3,11 @@
 import torch
 import torch.distributed as dist
 from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.accelerator import get_accelerator
 from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
diff --git a/colossalai/legacy/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
index fe42d8e28111..7b4191da4b13 100755
--- a/colossalai/legacy/nn/layer/parallel_3d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
@@ -5,7 +5,11 @@
 
 import torch
 from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
 from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
diff --git a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
index 4e9bf364d8eb..e60cb3c78dbc 100644
--- a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
@@ -3,7 +3,11 @@
 
 import torch
 from torch import distributed as dist
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.accelerator import get_accelerator
 from colossalai.legacy.communication import ring_forward
diff --git a/colossalai/legacy/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py
index fae9c929b788..f7cbc91f2cea 100644
--- a/colossalai/legacy/nn/loss/loss_1d.py
+++ b/colossalai/legacy/nn/loss/loss_1d.py
@@ -1,6 +1,11 @@
 import torch
 import torch.distributed as dist
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
+
 from torch.nn.modules.loss import _Loss
 
 from colossalai.legacy.context import ParallelMode
diff --git a/colossalai/legacy/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py
index 474fd4a2cb9c..0d8c3f86b7ba 100644
--- a/colossalai/legacy/nn/loss/loss_2d.py
+++ b/colossalai/legacy/nn/loss/loss_2d.py
@@ -1,6 +1,11 @@
 import torch
 import torch.distributed as dist
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
+
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
 
diff --git a/colossalai/legacy/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py
index b423ab3d8699..d69c999a8541 100644
--- a/colossalai/legacy/nn/loss/loss_2p5d.py
+++ b/colossalai/legacy/nn/loss/loss_2p5d.py
@@ -1,6 +1,11 @@
 import torch
 import torch.distributed as dist
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
+
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
 
diff --git a/colossalai/legacy/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py
index de6a674d61db..77246b474670 100644
--- a/colossalai/legacy/nn/loss/loss_3d.py
+++ b/colossalai/legacy/nn/loss/loss_3d.py
@@ -1,6 +1,11 @@
 import torch
 import torch.distributed as dist
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
+
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
 
diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index 62904d90eef8..9f923aa13b75 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -3,7 +3,12 @@
 import torch
 import torch.distributed as dist
 from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
+
 from torch.distributed import ProcessGroup
 
 from colossalai.quantization.fp8 import all_to_all_single_fp8
diff --git a/colossalai/nn/layer/layernorm.py b/colossalai/nn/layer/layernorm.py
index 1db48faee213..4f6d691e5b17 100644
--- a/colossalai/nn/layer/layernorm.py
+++ b/colossalai/nn/layer/layernorm.py
@@ -5,7 +5,12 @@
 import numbers
 
 import torch
-from torch.cuda.amp import custom_bwd, custom_fwd
+
+try:
+    from torch.amp import custom_bwd, custom_fwd
+except ImportError:
+    from torch.cuda.amp import custom_bwd, custom_fwd
+
 from torch.nn import init
 from torch.nn.parameter import Parameter
 
diff --git a/docs/source/en/features/mixed_precision_training_with_booster.md b/docs/source/en/features/mixed_precision_training_with_booster.md
index 65304b1f4e65..1e17c2bb584d 100644
--- a/docs/source/en/features/mixed_precision_training_with_booster.md
+++ b/docs/source/en/features/mixed_precision_training_with_booster.md
@@ -16,7 +16,7 @@ Author: [Mingyan Jiang](https://github.com/jiangmingyan)
 AMP stands for automatic mixed precision training.
 In Colossal-AI, we have incorporated different implementations of mixed precision training:
 
-1. torch.cuda.amp
+1. torch.amp
 2. apex.amp
 3. naive amp
 
diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
index da377ceb294b..93a69830cadf 100644
--- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
+++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
@@ -16,7 +16,7 @@
 AMP 代表自动混合精度训练。
 在 Colossal-AI 中, 我们结合了混合精度训练的不同实现:
 
-1. torch.cuda.amp
+1. torch.amp
 2. apex.amp
 3. naive amp
 
diff --git a/tests/test_legacy/test_moe/moe_utils.py b/tests/test_legacy/test_moe/moe_utils.py
index 8c133849b000..96e74e2ae026 100644
--- a/tests/test_legacy/test_moe/moe_utils.py
+++ b/tests/test_legacy/test_moe/moe_utils.py
@@ -87,7 +87,7 @@ def assert_not_equal_in_group(tensor, process_group=None):
 
 def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
     model.train()
-    with torch.cuda.amp.autocast(enabled=enable_autocast):
+    with torch.amp.autocast(enabled=enable_autocast):
         if criterion:
             y = model(data)
             loss = criterion(y, label)
diff --git a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
index fdd6d956ef83..6e5719c28232 100644
--- a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
+++ b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
@@ -14,7 +14,7 @@
 
 def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
     model.train()
-    with torch.cuda.amp.autocast(enabled=enable_autocast):
+    with torch.amp.autocast(enabled=enable_autocast):
         if criterion:
             y = model(data)
             loss = criterion(y, label)
diff --git a/tests/test_legacy/test_moe/test_moe_load_balance.py b/tests/test_legacy/test_moe/test_moe_load_balance.py
index adf2dbc1ccf3..9f1bf318ad13 100644
--- a/tests/test_legacy/test_moe/test_moe_load_balance.py
+++ b/tests/test_legacy/test_moe/test_moe_load_balance.py
@@ -26,7 +26,7 @@ def split_ddp_grad(grad, world_size):
 
 def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
     model.train()
-    with torch.cuda.amp.autocast(enabled=enable_autocast):
+    with torch.amp.autocast(enabled=enable_autocast):
         if criterion:
             y = model(data)
             loss = criterion(y, label)

From d28bbbee19165e110a88b982e5ae4ab59cc3678f Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 14:57:00 +0800
Subject: [PATCH 2/7] [amp] pass device_type to autocast; keep torch.cuda.amp in moe tests

---
 colossalai/accelerator/cuda_accelerator.py          | 2 +-
 tests/test_legacy/test_moe/moe_utils.py             | 2 +-
 tests/test_legacy/test_moe/test_moe_hybrid_zero.py  | 2 +-
 tests/test_legacy/test_moe/test_moe_load_balance.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/colossalai/accelerator/cuda_accelerator.py b/colossalai/accelerator/cuda_accelerator.py
index a7db10e8daab..32e62b33f86b 100644
--- a/colossalai/accelerator/cuda_accelerator.py
+++ b/colossalai/accelerator/cuda_accelerator.py
@@ -279,4 +279,4 @@ def autocast(
         """
         Return autocast function
         """
-        return torch.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
+        return torch.amp.autocast(device_type="cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
diff --git a/tests/test_legacy/test_moe/moe_utils.py b/tests/test_legacy/test_moe/moe_utils.py
index 96e74e2ae026..8c133849b000 100644
--- a/tests/test_legacy/test_moe/moe_utils.py
+++ b/tests/test_legacy/test_moe/moe_utils.py
@@ -87,7 +87,7 @@ def assert_not_equal_in_group(tensor, process_group=None):
 
 def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
     model.train()
-    with torch.amp.autocast(enabled=enable_autocast):
+    with torch.cuda.amp.autocast(enabled=enable_autocast):
         if criterion:
             y = model(data)
             loss = criterion(y, label)
diff --git a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
index 6e5719c28232..fdd6d956ef83 100644
--- a/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
+++ b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
@@ -14,7 +14,7 @@
 
 def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
     model.train()
-    with torch.amp.autocast(enabled=enable_autocast):
+    with torch.cuda.amp.autocast(enabled=enable_autocast):
         if criterion:
             y = model(data)
             loss = criterion(y, label)
diff --git a/tests/test_legacy/test_moe/test_moe_load_balance.py b/tests/test_legacy/test_moe/test_moe_load_balance.py
index 9f1bf318ad13..adf2dbc1ccf3 100644
--- a/tests/test_legacy/test_moe/test_moe_load_balance.py
+++ b/tests/test_legacy/test_moe/test_moe_load_balance.py
@@ -26,7 +26,7 @@ def split_ddp_grad(grad, world_size):
 
 def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
     model.train()
-    with torch.amp.autocast(enabled=enable_autocast):
+    with torch.cuda.amp.autocast(enabled=enable_autocast):
         if criterion:
             y = model(data)
             loss = criterion(y, label)

From df7f139569dee11035405486d9696a6abe34fab1 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 15:09:44 +0800
Subject: [PATCH 3/7] [amp] keep torch.cuda.amp api in legacy modules

---
 colossalai/legacy/amp/torch_amp/_grad_scaler.py            | 2 +-
 colossalai/legacy/nn/layer/parallel_2d/_operation.py       | 2 +-
 colossalai/legacy/nn/layer/parallel_2p5d/_operation.py     | 2 +-
 colossalai/legacy/nn/layer/parallel_3d/_operation.py       | 2 +-
 colossalai/legacy/nn/layer/parallel_sequence/_operation.py | 2 +-
 colossalai/legacy/nn/loss/loss_1d.py                       | 2 +-
 colossalai/legacy/nn/loss/loss_2d.py                       | 2 +-
 colossalai/legacy/nn/loss/loss_2p5d.py                     | 2 +-
 colossalai/legacy/nn/loss/loss_3d.py                       | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/colossalai/legacy/amp/torch_amp/_grad_scaler.py b/colossalai/legacy/amp/torch_amp/_grad_scaler.py
index 99cb3380c261..fc1aeec234fd 100644
--- a/colossalai/legacy/amp/torch_amp/_grad_scaler.py
+++ b/colossalai/legacy/amp/torch_amp/_grad_scaler.py
@@ -120,7 +120,7 @@ class GradScaler(object):
 
     def __init__(self, init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, enabled=True):
         if enabled and not torch.cuda.is_available():
-            warnings.warn("torch.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
+            warnings.warn("torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
             self._enabled = False
         else:
             self._enabled = enabled
diff --git a/colossalai/legacy/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
index 809c41e4341a..3fff3fcb4093 100644
--- a/colossalai/legacy/nn/layer/parallel_2d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
@@ -5,7 +5,7 @@
 from torch import Tensor
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 
diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
index 0f0a5fc7ef1e..356b20f76fd4 100644
--- a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
@@ -5,7 +5,7 @@
 from torch import Tensor
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 
diff --git a/colossalai/legacy/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
index 7b4191da4b13..47bed0eb1a11 100755
--- a/colossalai/legacy/nn/layer/parallel_3d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
@@ -7,7 +7,7 @@
 from torch import Tensor
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 
diff --git a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
index e60cb3c78dbc..5e10e4bcd1d0 100644
--- a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
@@ -5,7 +5,7 @@
 from torch import distributed as dist
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 
diff --git a/colossalai/legacy/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py
index f7cbc91f2cea..4637ffdcaa9b 100644
--- a/colossalai/legacy/nn/loss/loss_1d.py
+++ b/colossalai/legacy/nn/loss/loss_1d.py
@@ -2,7 +2,7 @@
 import torch.distributed as dist
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 
diff --git a/colossalai/legacy/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py
index 0d8c3f86b7ba..183e18231236 100644
--- a/colossalai/legacy/nn/loss/loss_2d.py
+++ b/colossalai/legacy/nn/loss/loss_2d.py
@@ -2,7 +2,7 @@
 import torch.distributed as dist
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 
diff --git a/colossalai/legacy/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py
index d69c999a8541..e263191b6f44 100644
--- a/colossalai/legacy/nn/loss/loss_2p5d.py
+++ b/colossalai/legacy/nn/loss/loss_2p5d.py
@@ -2,7 +2,7 @@
 import torch.distributed as dist
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 
diff --git a/colossalai/legacy/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py
index 77246b474670..43590dea2213 100644
--- a/colossalai/legacy/nn/loss/loss_3d.py
+++ b/colossalai/legacy/nn/loss/loss_3d.py
@@ -2,7 +2,7 @@
 import torch.distributed as dist
 
 try:
-    from torch.amp import custom_bwd, custom_fwd
+    from torch.cuda.amp import custom_bwd, custom_fwd
 except ImportError:
     from torch.cuda.amp import custom_bwd, custom_fwd
 

From c85e25209885ece9c20d466edf688bf2f7ced146 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 15:13:00 +0800
Subject: [PATCH 4/7] [misc] fit torch pytree api upgrade

---
 colossalai/pipeline/schedule/_utils.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/colossalai/pipeline/schedule/_utils.py b/colossalai/pipeline/schedule/_utils.py
index 271b3238f5c4..8f42a9014e85 100644
--- a/colossalai/pipeline/schedule/_utils.py
+++ b/colossalai/pipeline/schedule/_utils.py
@@ -3,8 +3,9 @@
 
 import torch
 import torch.cuda
+from packaging.version import Version
 from torch.nn import Module
-from torch.utils._pytree import SUPPORTED_NODES, TreeSpec, _register_pytree_node, tree_flatten, tree_map, tree_unflatten
+from torch.utils._pytree import SUPPORTED_NODES, TreeSpec, tree_flatten, tree_map, tree_unflatten
 
 
 # this register are for torch under version 1.13.1, maybe removed in the future
@@ -16,7 +17,12 @@ def _odict_unflatten(values: List[Any], context: Any) -> "OrderedDict[Any, Any]"
     return OrderedDict((key, value) for key, value in zip(context, values))
 
 
-_register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)
+if Version(torch.__version__).release <= (1, 13, 1):  # compare release only: "1.13.1+cu117" sorts above "1.13.1"
+    try:
+        from torch.utils._pytree import register_pytree_node as _register_pytree_node
+    except ImportError:
+        from torch.utils._pytree import _register_pytree_node
+    _register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)
 
 
 def tree_map_hf(fn: Any, pytree: Any):

From d6402c79d5cea5c3e6bf11c7f045ef91634aeae8 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 15:13:27 +0800
Subject: [PATCH 5/7] [misc] remove legacy import

---
 colossalai/kernel/jit/option.py                       |  2 +-
 .../zero/gemini/memory_tracer/runtime_mem_tracer.py   | 11 ++++++-----
 colossalai/zero/gemini/placement_policy.py            |  3 ++-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py
index d392649a62f2..1ee93e4e0d9f 100644
--- a/colossalai/kernel/jit/option.py
+++ b/colossalai/kernel/jit/option.py
@@ -1,7 +1,6 @@
 import torch
 
 from colossalai.accelerator import get_accelerator
-from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
 
 from .bias_dropout_add import bias_dropout_add_fused_train
 from .bias_gelu import bias_gelu_impl
@@ -45,6 +44,7 @@ def warmup_jit_fusion(
     dtype: torch.dtype = torch.float32,
 ):
     """Compile JIT functions before the main training steps"""
+    from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
 
     embed = Embedding(vocab_size, hidden_size).to(get_accelerator().get_current_device())
     linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_accelerator().get_current_device())
diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
index b0d258824d2b..81520326f4cb 100644
--- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
+++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
@@ -1,10 +1,5 @@
 import torch.nn
 
-from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
-    GradMemStats,
-    GradMemTracerHook,
-    ParamMemTracerHook,
-)
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.utils import _cast_float
 
@@ -27,6 +22,12 @@ class RuntimeMemTracer:
 
     def __init__(self, module: torch.nn.Module, dtype: torch.dtype = torch.half):
         super().__init__()
+        from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
+            GradMemStats,
+            GradMemTracerHook,
+            ParamMemTracerHook,
+        )
+
         self.module = module
         self.dtype = dtype
         self._gradstat = GradMemStats()
diff --git a/colossalai/zero/gemini/placement_policy.py b/colossalai/zero/gemini/placement_policy.py
index 178755d03107..2aa8dc3f6cdd 100644
--- a/colossalai/zero/gemini/placement_policy.py
+++ b/colossalai/zero/gemini/placement_policy.py
@@ -8,7 +8,6 @@
 import torch.distributed as dist
 
 from colossalai.accelerator import get_accelerator
-from colossalai.legacy.utils.memory import colo_device_memory_capacity
 from colossalai.zero.gemini.chunk import Chunk
 
 from .chunk import Chunk, ChunkManager
@@ -172,6 +171,8 @@ def evict_tensors(
         Returns:
             int: the volume of memory that is evicted
         """
+        from colossalai.legacy.utils.memory import colo_device_memory_capacity
+
         start = time()
         cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
         used_cuda_model_data = self.chunk_manager.total_mem["cuda"]

From a7cca4df5ed46fcf3be622bbb7a240ba83c4b9b7 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 15:20:17 +0800
Subject: [PATCH 6/7] [misc] fit torch amp api

---
 colossalai/kernel/triton/llama_act_combine_kernel.py       | 6 +-----
 colossalai/legacy/nn/layer/parallel_2d/_operation.py       | 6 +-----
 colossalai/legacy/nn/layer/parallel_2p5d/_operation.py     | 6 +-----
 colossalai/legacy/nn/layer/parallel_3d/_operation.py       | 6 +-----
 colossalai/legacy/nn/layer/parallel_sequence/_operation.py | 6 +-----
 colossalai/legacy/nn/loss/loss_1d.py                       | 7 +------
 colossalai/legacy/nn/loss/loss_2d.py                       | 7 +------
 colossalai/legacy/nn/loss/loss_2p5d.py                     | 7 +------
 colossalai/legacy/nn/loss/loss_3d.py                       | 7 +------
 colossalai/moe/_operation.py                               | 7 +------
 colossalai/nn/layer/layernorm.py                           | 7 +------
 11 files changed, 11 insertions(+), 61 deletions(-)

diff --git a/colossalai/kernel/triton/llama_act_combine_kernel.py b/colossalai/kernel/triton/llama_act_combine_kernel.py
index 092b75a675e9..7a2c7e8fbd74 100644
--- a/colossalai/kernel/triton/llama_act_combine_kernel.py
+++ b/colossalai/kernel/triton/llama_act_combine_kernel.py
@@ -3,11 +3,7 @@
 
 import torch
 from torch import Tensor
-
-try:
-    from torch.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.cuda.amp import custom_bwd, custom_fwd
 
 try:
     import triton
diff --git a/colossalai/legacy/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
index 3fff3fcb4093..f67ee2e60be1 100644
--- a/colossalai/legacy/nn/layer/parallel_2d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
@@ -3,11 +3,7 @@
 import torch
 import torch.distributed as dist
 from torch import Tensor
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.accelerator import get_accelerator
 from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
index 356b20f76fd4..43328bd033c8 100644
--- a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
@@ -3,11 +3,7 @@
 import torch
 import torch.distributed as dist
 from torch import Tensor
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.accelerator import get_accelerator
 from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
diff --git a/colossalai/legacy/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
index 47bed0eb1a11..fe42d8e28111 100755
--- a/colossalai/legacy/nn/layer/parallel_3d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
@@ -5,11 +5,7 @@
 
 import torch
 from torch import Tensor
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
 from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
diff --git a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
index 5e10e4bcd1d0..4e9bf364d8eb 100644
--- a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
@@ -3,11 +3,7 @@
 
 import torch
 from torch import distributed as dist
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.cuda.amp import custom_bwd, custom_fwd
 
 from colossalai.accelerator import get_accelerator
 from colossalai.legacy.communication import ring_forward
diff --git a/colossalai/legacy/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py
index 4637ffdcaa9b..fae9c929b788 100644
--- a/colossalai/legacy/nn/loss/loss_1d.py
+++ b/colossalai/legacy/nn/loss/loss_1d.py
@@ -1,11 +1,6 @@
 import torch
 import torch.distributed as dist
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-
+from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn.modules.loss import _Loss
 
 from colossalai.legacy.context import ParallelMode
diff --git a/colossalai/legacy/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py
index 183e18231236..474fd4a2cb9c 100644
--- a/colossalai/legacy/nn/loss/loss_2d.py
+++ b/colossalai/legacy/nn/loss/loss_2d.py
@@ -1,11 +1,6 @@
 import torch
 import torch.distributed as dist
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-
+from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
 
diff --git a/colossalai/legacy/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py
index e263191b6f44..b423ab3d8699 100644
--- a/colossalai/legacy/nn/loss/loss_2p5d.py
+++ b/colossalai/legacy/nn/loss/loss_2p5d.py
@@ -1,11 +1,6 @@
 import torch
 import torch.distributed as dist
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-
+from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
 
diff --git a/colossalai/legacy/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py
index 43590dea2213..de6a674d61db 100644
--- a/colossalai/legacy/nn/loss/loss_3d.py
+++ b/colossalai/legacy/nn/loss/loss_3d.py
@@ -1,11 +1,6 @@
 import torch
 import torch.distributed as dist
-
-try:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-
+from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
 
diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index 9f923aa13b75..62904d90eef8 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -3,12 +3,7 @@
 import torch
 import torch.distributed as dist
 from torch import Tensor
-
-try:
-    from torch.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-
+from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.distributed import ProcessGroup
 
 from colossalai.quantization.fp8 import all_to_all_single_fp8
diff --git a/colossalai/nn/layer/layernorm.py b/colossalai/nn/layer/layernorm.py
index 4f6d691e5b17..1db48faee213 100644
--- a/colossalai/nn/layer/layernorm.py
+++ b/colossalai/nn/layer/layernorm.py
@@ -5,12 +5,7 @@
 import numbers
 
 import torch
-
-try:
-    from torch.amp import custom_bwd, custom_fwd
-except ImportError:
-    from torch.cuda.amp import custom_bwd, custom_fwd
-
+from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn import init
 from torch.nn.parameter import Parameter
 

From befe9e5c888f565b87f24e413c2f6d5cec50c2bd Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 15:22:55 +0800
Subject: [PATCH 7/7] [misc] fit torch amp api

---
 colossalai/booster/mixed_precision/fp16_torch.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/colossalai/booster/mixed_precision/fp16_torch.py b/colossalai/booster/mixed_precision/fp16_torch.py
index 371d4ede5fcf..c757a878d97a 100644
--- a/colossalai/booster/mixed_precision/fp16_torch.py
+++ b/colossalai/booster/mixed_precision/fp16_torch.py
@@ -21,12 +21,12 @@ class TorchAMPOptimizer(OptimizerWrapper):
         optim (Optimizer): Optimizer to wrap.
         init_scale (float): Initial scale factor. Default: 2**16.
         growth_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.amp.GradScaler.step` if gradients were found to be finite
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
             this iteration. Default: 2.0.
         backoff_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
             this iteration. Default: 0.5.
-        growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step`
+        growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
             calls that may cause the scale to increase. Default: 2000.
     """
 
@@ -39,7 +39,7 @@ def __init__(
         growth_interval: int = 2000,
     ) -> None:
         super().__init__(optim)
-        self.scaler = torch.amp.GradScaler(
+        self.scaler = torch.cuda.amp.GradScaler(
             init_scale=init_scale,
             growth_factor=growth_factor,
             backoff_factor=backoff_factor,
@@ -100,12 +100,12 @@ class FP16TorchMixedPrecision(MixedPrecision):
     Args:
         init_scale (float): Initial scale factor. Default: 2**16.
         growth_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.amp.GradScaler.step` if gradients were found to be finite
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
             this iteration. Default: 2.0.
         backoff_factor (float): Factor by which the scale is multiplied during
-            :meth:`torch.amp.GradScaler.step` if gradients were found to be infinite
+            :meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
             this iteration. Default: 0.5.
-        growth_interval (int): Number of iterations between :meth:`torch.amp.GradScaler.step`
+        growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
             calls that may cause the scale to increase. Default: 2000.
     """