From d1940c2b3617ebfe655c7441b4ba649c12d26a7d Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 16:20:42 +0800 Subject: [PATCH 01/11] [mixed_precison] add naive amp demo --- colossalai/booster/mixed_precision/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/colossalai/booster/mixed_precision/__init__.py b/colossalai/booster/mixed_precision/__init__.py index 3cf0ad28cdbe..0df9d84159f9 100644 --- a/colossalai/booster/mixed_precision/__init__.py +++ b/colossalai/booster/mixed_precision/__init__.py @@ -1,17 +1,19 @@ from .bf16 import BF16MixedPrecision from .fp8 import FP8MixedPrecision from .fp16_apex import FP16ApexMixedPrecision +from .fp16_naive import FP16NaiveMixedPrecision from .fp16_torch import FP16TorchMixedPrecision from .mixed_precision_base import MixedPrecision __all__ = [ 'MixedPrecision', 'mixed_precision_factory', 'FP16_Apex_MixedPrecision', 'FP16_Torch_MixedPrecision', - 'FP32_MixedPrecision', 'BF16_MixedPrecision', 'FP8_MixedPrecision' + 'FP32_MixedPrecision', 'BF16_MixedPrecision', 'FP8_MixedPrecision', 'FP16NaiveMixedPrecision' ] _mixed_precision_mapping = { 'fp16': FP16TorchMixedPrecision, 'fp16_apex': FP16ApexMixedPrecision, + 'fp16_naive': FP16NaiveMixedPrecision, 'bf16': BF16MixedPrecision, 'fp8': FP8MixedPrecision } From e8b7029662a586a774ee7ee578f06c8476f9da3c Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 16:21:03 +0800 Subject: [PATCH 02/11] [mixed_precison] add naive amp demo --- colossalai/booster/mixed_precision/fp16_naive.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 colossalai/booster/mixed_precision/fp16_naive.py diff --git a/colossalai/booster/mixed_precision/fp16_naive.py b/colossalai/booster/mixed_precision/fp16_naive.py new file mode 100644 index 000000000000..ef1ec1f42d70 --- /dev/null +++ b/colossalai/booster/mixed_precision/fp16_naive.py @@ -0,0 +1,5 @@ +from .mixed_precision_base import MixedPrecision + + +class FP16NaiveMixedPrecision(MixedPrecision): + pass From 62fbb0747805fcd3eec9afec6bfcf70792a29690 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Fri, 19 May 2023 16:52:51 +0800 Subject: [PATCH 03/11] [api] add docstrings and initialization to apex amp, naive amp --- .../booster/mixed_precision/fp16_apex.py | 36 ++++++++++++++++++- .../booster/mixed_precision/fp16_naive.py | 23 +++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py index 266a750734b1..3ec9617374f3 100644 --- a/colossalai/booster/mixed_precision/fp16_apex.py +++ b/colossalai/booster/mixed_precision/fp16_apex.py @@ -1,5 +1,39 @@ +from typing import Optional + from .mixed_precision_base import MixedPrecision class FP16ApexMixedPrecision(MixedPrecision): - pass + """ + Precision for mixed precision training in FP16 using PyTorch AMP. + + Args: + opt_level(str, optional, default="O1" ): Pure or mixed precision optimization level. +Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail above Apex AMP Documentation. + num_losses(int, optional, default=1): Option to tell AMP in advance how many losses/backward passes you plan to use. +When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per +loss/backward pass, which can improve stability. 
If num_losses is left to 1, Amp will still support multiple +losses/backward passes, but use a single global loss scale for all of them. + verbosity(int, default=1): Set to 0 to suppress Amp-related output. + min_loss_scale(float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. +The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored. + max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss +scaling. If dynamic loss scaling is not used, max_loss_scale is ignored. + **kwargs:Currently, the under-the-hood properties that govern pure or mixed precision training are the following: + cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale. + cast_model_type: Casts your model’s parameters and buffers to the desired type.patch_torch_functions: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. + keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm (which improves performance), it’s often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16. + master_weights: Maintain FP32 master weights to accompany any FP16 model weights. FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients. + loss_scale: If loss_scale is a float value, use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic", adaptively adjust the loss scale over time. Dynamic loss scale adjustments are performed by Amp automatically. + """ + + def __init__( + self, + opt_level: Optional[str] = "O1", + num_losses: Optional[int] = 1, + verbosity: int = 1, + min_loss_scale: float = None, + max_loss_scale: float = 2.**24, + **kwargs, + ) -> None: + pass diff --git a/colossalai/booster/mixed_precision/fp16_naive.py b/colossalai/booster/mixed_precision/fp16_naive.py index ef1ec1f42d70..b848d4f04def 100644 --- a/colossalai/booster/mixed_precision/fp16_naive.py +++ b/colossalai/booster/mixed_precision/fp16_naive.py @@ -2,4 +2,25 @@ class FP16NaiveMixedPrecision(MixedPrecision): - pass + """ + Precision for mixed precision training in FP16 using PyTorch AMP. + + Args: + log_num_zeros_in_grad(bool): return number of zeros in the gradients. 
+ initial_scale(int): initial scale of gradient scaler + growth_factor(int): the growth rate of loss scale + backoff_factor(float): the decrease rate of loss scale + hysteresis(int): delay shift in dynamic loss scaling + max_scale(int): maximum loss scale allowed + verbose(bool): if set to `True`, will print debug info + """ + + def __init__(self, + log_num_zeros_in_grad: bool, + initial_scale: int, + growth_factor: int, + backoff_factor: float, + hysteresis: int, + max_scale: int, + verbose: bool = None) -> None: + pass From 08de5dad15ebb145a6528e6870834d8e1aabad61 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Fri, 19 May 2023 18:15:12 +0800 Subject: [PATCH 04/11] [api] add docstring to apex amp/ naive amp --- .../booster/mixed_precision/fp16_apex.py | 16 ++++++++-------- .../booster/mixed_precision/fp16_naive.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py index 3ec9617374f3..35dd83908270 100644 --- a/colossalai/booster/mixed_precision/fp16_apex.py +++ b/colossalai/booster/mixed_precision/fp16_apex.py @@ -9,18 +9,18 @@ class FP16ApexMixedPrecision(MixedPrecision): Args: opt_level(str, optional, default="O1" ): Pure or mixed precision optimization level. -Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail above Apex AMP Documentation. + Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail above Apex AMP Documentation. num_losses(int, optional, default=1): Option to tell AMP in advance how many losses/backward passes you plan to use. -When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per -loss/backward pass, which can improve stability. If num_losses is left to 1, Amp will still support multiple -losses/backward passes, but use a single global loss scale for all of them. + When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per loss/backward pass, + which can improve stability. If num_losses is left to 1, Amp will still support multiple losses/backward passes, + but use a single global loss scale for all of them. verbosity(int, default=1): Set to 0 to suppress Amp-related output. min_loss_scale(float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. -The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored. - max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss -scaling. If dynamic loss scaling is not used, max_loss_scale is ignored. + The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored. + max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling. + If dynamic loss scaling is not used, max_loss_scale is ignored. **kwargs:Currently, the under-the-hood properties that govern pure or mixed precision training are the following: - cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale. + cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale. 
cast_model_type: Casts your model’s parameters and buffers to the desired type.patch_torch_functions: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm (which improves performance), it’s often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16. master_weights: Maintain FP32 master weights to accompany any FP16 model weights. FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients. diff --git a/colossalai/booster/mixed_precision/fp16_naive.py b/colossalai/booster/mixed_precision/fp16_naive.py index b848d4f04def..0241bd095ad3 100644 --- a/colossalai/booster/mixed_precision/fp16_naive.py +++ b/colossalai/booster/mixed_precision/fp16_naive.py @@ -3,16 +3,16 @@ class FP16NaiveMixedPrecision(MixedPrecision): """ - Precision for mixed precision training in FP16 using PyTorch AMP. + Precision for mixed precision training in FP16 using PyTorch AMP. - Args: - log_num_zeros_in_grad(bool): return number of zeros in the gradients. - initial_scale(int): initial scale of gradient scaler - growth_factor(int): the growth rate of loss scale - backoff_factor(float): the decrease rate of loss scale - hysteresis(int): delay shift in dynamic loss scaling - max_scale(int): maximum loss scale allowed - verbose(bool): if set to `True`, will print debug info + Args: + log_num_zeros_in_grad(bool): return number of zeros in the gradients. + initial_scale(int): initial scale of gradient scaler + growth_factor(int): the growth rate of loss scale + backoff_factor(float): the decrease rate of loss scale + hysteresis(int): delay shift in dynamic loss scaling + max_scale(int): maximum loss scale allowed + verbose(bool): if set to `True`, will print debug info """ def __init__(self, From 8094841a7f4f9a006d93766f4ed6aa8f481b6193 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Fri, 19 May 2023 18:17:15 +0800 Subject: [PATCH 05/11] [api] add docstring to apex amp/ naive amp --- colossalai/booster/mixed_precision/fp16_apex.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py index 35dd83908270..750ccdb24d86 100644 --- a/colossalai/booster/mixed_precision/fp16_apex.py +++ b/colossalai/booster/mixed_precision/fp16_apex.py @@ -8,17 +8,11 @@ class FP16ApexMixedPrecision(MixedPrecision): Precision for mixed precision training in FP16 using PyTorch AMP. Args: - opt_level(str, optional, default="O1" ): Pure or mixed precision optimization level. - Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail above Apex AMP Documentation. - num_losses(int, optional, default=1): Option to tell AMP in advance how many losses/backward passes you plan to use. - When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per loss/backward pass, - which can improve stability. If num_losses is left to 1, Amp will still support multiple losses/backward passes, - but use a single global loss scale for all of them. + opt_level(str, optional, default="O1" ): Pure or mixed precision optimization level. Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail above Apex AMP Documentation. 
+ num_losses(int, optional, default=1): Option to tell AMP in advance how many losses/backward passes you plan to use. When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per loss/backward pass, which can improve stability. If num_losses is left to 1, Amp will still support multiple losses/backward passes, but use a single global loss scale for all of them. verbosity(int, default=1): Set to 0 to suppress Amp-related output. - min_loss_scale(float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. - The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored. - max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling. - If dynamic loss scaling is not used, max_loss_scale is ignored. + min_loss_scale(float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored. + max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling. If dynamic loss scaling is not used, max_loss_scale is ignored. **kwargs:Currently, the under-the-hood properties that govern pure or mixed precision training are the following: cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale. cast_model_type: Casts your model’s parameters and buffers to the desired type.patch_torch_functions: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. From 39a69d261ee95969132986d4030f0e42da16cb5d Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Fri, 19 May 2023 18:49:09 +0800 Subject: [PATCH 06/11] [api] add docstring to apex amp/ naive amp --- colossalai/booster/mixed_precision/fp16_apex.py | 2 +- colossalai/booster/mixed_precision/fp16_naive.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py index 750ccdb24d86..62bac0e13a72 100644 --- a/colossalai/booster/mixed_precision/fp16_apex.py +++ b/colossalai/booster/mixed_precision/fp16_apex.py @@ -5,7 +5,7 @@ class FP16ApexMixedPrecision(MixedPrecision): """ - Precision for mixed precision training in FP16 using PyTorch AMP. + Precision for mixed precision training in FP16 using apex AMP. Args: opt_level(str, optional, default="O1" ): Pure or mixed precision optimization level. Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail above Apex AMP Documentation. diff --git a/colossalai/booster/mixed_precision/fp16_naive.py b/colossalai/booster/mixed_precision/fp16_naive.py index 0241bd095ad3..b8c256b2c71b 100644 --- a/colossalai/booster/mixed_precision/fp16_naive.py +++ b/colossalai/booster/mixed_precision/fp16_naive.py @@ -3,7 +3,7 @@ class FP16NaiveMixedPrecision(MixedPrecision): """ - Precision for mixed precision training in FP16 using PyTorch AMP. + Precision for mixed precision training in FP16 using naive AMP. Args: log_num_zeros_in_grad(bool): return number of zeros in the gradients. 
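At this point in the series, 'fp16_naive' is registered in `_mixed_precision_mapping` and both `FP16ApexMixedPrecision` and `FP16NaiveMixedPrecision` exist as documented stubs whose `__init__` bodies are still `pass`. Below is a minimal usage sketch, not part of the patches: it assumes `mixed_precision_factory` simply instantiates the class mapped to the given string with default arguments (as the pre-existing 'fp16' and 'bf16' keys allow), and it constructs the naive class directly because its stub signature has no defaults; the argument values are illustrative only.

from colossalai.booster.mixed_precision import (
    FP16NaiveMixedPrecision,
    mixed_precision_factory,
)

# existing string keys still resolve through the factory
fp16_torch = mixed_precision_factory('fp16')

# the new naive mode, built directly since its stub __init__ takes required
# gradient-scaler arguments; the values below are illustrative, not defaults
fp16_naive = FP16NaiveMixedPrecision(
    log_num_zeros_in_grad=False,
    initial_scale=2**16,
    growth_factor=2,
    backoff_factor=0.5,
    hysteresis=2,
    max_scale=2**32,
)
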
From 95c541fb7134f87759892db24a27e62d36a5d4ce Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Mon, 22 May 2023 19:31:18 +0800 Subject: [PATCH 07/11] [api] add docstring to apex amp/ naive amp --- .../booster/mixed_precision/fp16_apex.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py index 62bac0e13a72..c222b3c156be 100644 --- a/colossalai/booster/mixed_precision/fp16_apex.py +++ b/colossalai/booster/mixed_precision/fp16_apex.py @@ -1,4 +1,6 @@ -from typing import Optional +from typing import Any, Optional, Union + +import torch from .mixed_precision_base import MixedPrecision @@ -9,13 +11,22 @@ class FP16ApexMixedPrecision(MixedPrecision): Args: opt_level(str, optional, default="O1" ): Pure or mixed precision optimization level. Accepted values are “O0”, “O1”, “O2”, and “O3”, explained in detail above Apex AMP Documentation. + cast_model_type (torch.dtype, optional, default=None): Casts your model’s parameters and buffers to the desired type. + patch_torch_functions (bool, optional, default=None): Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. + keep_batchnorm_fp32 (bool or str, optional, default=None): To enhance precision and enable cudnn batchnorm (which improves performance), it’s often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16. + master_weights (bool, optional, default=None): Maintain FP32 master weights to accompany any FP16 model weights. FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients. + loss_scale (float or str, optional, default=None): If loss_scale is a float value, use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic", adaptively adjust the loss scale over time. Dynamic loss scale adjustments are performed by Amp automatically. + cast_model_outputs (torch.dpython:type, optional, default=None): Option to ensure that the outputs of your model(s) are always cast to a particular type regardless of opt_level. num_losses(int, optional, default=1): Option to tell AMP in advance how many losses/backward passes you plan to use. When used in conjunction with the loss_id argument to `amp.scale_loss`, enables Amp to use a different loss scale per loss/backward pass, which can improve stability. If num_losses is left to 1, Amp will still support multiple losses/backward passes, but use a single global loss scale for all of them. verbosity(int, default=1): Set to 0 to suppress Amp-related output. min_loss_scale(float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored. max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling. If dynamic loss scaling is not used, max_loss_scale is ignored. + + **kwargs:Currently, the under-the-hood properties that govern pure or mixed precision training are the following: cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale. 
- cast_model_type: Casts your model’s parameters and buffers to the desired type.patch_torch_functions: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. + cast_model_type: Casts your model’s parameters and buffers to the desired type. + patch_torch_functions: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm (which improves performance), it’s often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16. master_weights: Maintain FP32 master weights to accompany any FP16 model weights. FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients. loss_scale: If loss_scale is a float value, use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic", adaptively adjust the loss scale over time. Dynamic loss scale adjustments are performed by Amp automatically. @@ -24,6 +35,12 @@ class FP16ApexMixedPrecision(MixedPrecision): def __init__( self, opt_level: Optional[str] = "O1", + cast_model_type: torch.dtype = None, + patch_torch_functions: bool = None, + keep_batchnorm_fp32: Union[bool, str] = None, + master_weights: bool = None, + loss_scale: Union[float, str] = None, + cast_model_outputs: Any = None, num_losses: Optional[int] = 1, verbosity: int = 1, min_loss_scale: float = None, From 33b7533a77a0903cf2cfe3aa395edd5473d6e5a8 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Mon, 22 May 2023 19:34:15 +0800 Subject: [PATCH 08/11] [api] add docstring to apex amp/ naive amp --- .../booster/mixed_precision/fp16_apex.py | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_apex.py b/colossalai/booster/mixed_precision/fp16_apex.py index c222b3c156be..e184271e932a 100644 --- a/colossalai/booster/mixed_precision/fp16_apex.py +++ b/colossalai/booster/mixed_precision/fp16_apex.py @@ -21,30 +21,18 @@ class FP16ApexMixedPrecision(MixedPrecision): verbosity(int, default=1): Set to 0 to suppress Amp-related output. min_loss_scale(float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic loss scaling. The default value of None means that no floor is imposed. If dynamic loss scaling is not used, min_loss_scale is ignored. max_loss_scale(float, default=2.**24 ): Sets a ceiling for the loss scale values that can be chosen by dynamic loss scaling. If dynamic loss scaling is not used, max_loss_scale is ignored. - - - **kwargs:Currently, the under-the-hood properties that govern pure or mixed precision training are the following: - cast_model_type, patch_torch_functions, keep_batchnorm_fp32, master_weights, loss_scale. - cast_model_type: Casts your model’s parameters and buffers to the desired type. - patch_torch_functions: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32. - keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm (which improves performance), it’s often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16. - master_weights: Maintain FP32 master weights to accompany any FP16 model weights. 
FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients. - loss_scale: If loss_scale is a float value, use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic", adaptively adjust the loss scale over time. Dynamic loss scale adjustments are performed by Amp automatically. """ - def __init__( - self, - opt_level: Optional[str] = "O1", - cast_model_type: torch.dtype = None, - patch_torch_functions: bool = None, - keep_batchnorm_fp32: Union[bool, str] = None, - master_weights: bool = None, - loss_scale: Union[float, str] = None, - cast_model_outputs: Any = None, - num_losses: Optional[int] = 1, - verbosity: int = 1, - min_loss_scale: float = None, - max_loss_scale: float = 2.**24, - **kwargs, - ) -> None: + def __init__(self, + opt_level: Optional[str] = "O1", + cast_model_type: torch.dtype = None, + patch_torch_functions: bool = None, + keep_batchnorm_fp32: Union[bool, str] = None, + master_weights: bool = None, + loss_scale: Union[float, str] = None, + cast_model_outputs: Any = None, + num_losses: Optional[int] = 1, + verbosity: int = 1, + min_loss_scale: float = None, + max_loss_scale: float = 2.**24) -> None: pass From 410cf73f282b3f95c4ae7653ac6e332e9f3e8211 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 23 May 2023 10:24:32 +0800 Subject: [PATCH 09/11] [api] add docstring to apex amp/ naive amp --- colossalai/booster/mixed_precision/fp16_naive.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/colossalai/booster/mixed_precision/fp16_naive.py b/colossalai/booster/mixed_precision/fp16_naive.py index b8c256b2c71b..5d0d815257f3 100644 --- a/colossalai/booster/mixed_precision/fp16_naive.py +++ b/colossalai/booster/mixed_precision/fp16_naive.py @@ -6,13 +6,13 @@ class FP16NaiveMixedPrecision(MixedPrecision): Precision for mixed precision training in FP16 using naive AMP. Args: - log_num_zeros_in_grad(bool): return number of zeros in the gradients. - initial_scale(int): initial scale of gradient scaler - growth_factor(int): the growth rate of loss scale - backoff_factor(float): the decrease rate of loss scale - hysteresis(int): delay shift in dynamic loss scaling - max_scale(int): maximum loss scale allowed - verbose(bool): if set to `True`, will print debug info + log_num_zeros_in_grad(bool): return number of zeros in the gradients. + initial_scale(int): initial scale of gradient scaler. + growth_factor(int): the growth rate of loss scale. + backoff_factor(float): the decrease rate of loss scale. + hysteresis(int): delay shift in dynamic loss scaling. + max_scale(int): maximum loss scale allowed. + verbose(bool): if set to `True`, will print debug info. 
""" def __init__(self, From 101c6215a65f285f2c8c96b4d77934e418eb5db8 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 23 May 2023 10:41:08 +0800 Subject: [PATCH 10/11] [api] fix --- tests/test_utils/test_lazy_init/test_distribute.py | 4 ++-- tests/test_utils/test_lazy_init/test_models.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_utils/test_lazy_init/test_distribute.py b/tests/test_utils/test_lazy_init/test_distribute.py index c15b055e8361..63bcea9cca69 100644 --- a/tests/test_utils/test_lazy_init/test_distribute.py +++ b/tests/test_utils/test_lazy_init/test_distribute.py @@ -15,10 +15,10 @@ from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor except: pass -from utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed - from tests.kit.model_zoo import model_zoo +from .utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed + def find_shard_dim(shape: torch.Size) -> Optional[int]: for dim, size in enumerate(shape): diff --git a/tests/test_utils/test_lazy_init/test_models.py b/tests/test_utils/test_lazy_init/test_models.py index 4a0217b31a97..9834f4b8646f 100644 --- a/tests/test_utils/test_lazy_init/test_models.py +++ b/tests/test_utils/test_lazy_init/test_models.py @@ -1,8 +1,9 @@ import pytest -from utils import SUPPORT_LAZY, check_lazy_init from tests.kit.model_zoo import model_zoo +from .utils import SUPPORT_LAZY, check_lazy_init + @pytest.mark.skipif(not SUPPORT_LAZY, reason='requires torch >= 1.12.0') @pytest.mark.parametrize('subset', ['torchvision', 'diffusers', 'timm', 'transformers', 'torchaudio', 'deepfm', 'dlrm']) From 217b40069f7c1c73b43745acf08eef1dadb938a0 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 23 May 2023 10:45:27 +0800 Subject: [PATCH 11/11] [api] fix --- tests/test_utils/test_lazy_init/test_distribute.py | 4 ++-- tests/test_utils/test_lazy_init/test_models.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_utils/test_lazy_init/test_distribute.py b/tests/test_utils/test_lazy_init/test_distribute.py index 63bcea9cca69..c15b055e8361 100644 --- a/tests/test_utils/test_lazy_init/test_distribute.py +++ b/tests/test_utils/test_lazy_init/test_distribute.py @@ -15,9 +15,9 @@ from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor except: pass -from tests.kit.model_zoo import model_zoo +from utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed -from .utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed +from tests.kit.model_zoo import model_zoo def find_shard_dim(shape: torch.Size) -> Optional[int]: diff --git a/tests/test_utils/test_lazy_init/test_models.py b/tests/test_utils/test_lazy_init/test_models.py index 9834f4b8646f..4a0217b31a97 100644 --- a/tests/test_utils/test_lazy_init/test_models.py +++ b/tests/test_utils/test_lazy_init/test_models.py @@ -1,9 +1,8 @@ import pytest +from utils import SUPPORT_LAZY, check_lazy_init from tests.kit.model_zoo import model_zoo -from .utils import SUPPORT_LAZY, check_lazy_init - @pytest.mark.skipif(not SUPPORT_LAZY, reason='requires torch >= 1.12.0') @pytest.mark.parametrize('subset', ['torchvision', 'diffusers', 'timm', 'transformers', 'torchaudio', 'deepfm', 'dlrm'])