From 31d4c6b40c6cdbfc41649d8a58f44df54b77a3ac Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Mon, 8 May 2023 17:24:35 +0800 Subject: [PATCH 01/10] [booster] update tests for booster --- .../test_gemini_checkpoint_io.py | 118 ++++++++++++++++++ .../test_low_level_zero_checkpoint_io.py | 0 .../test_torch_ddp_checkpoint_io.py | 0 3 files changed, 118 insertions(+) create mode 100644 tests/test_checkpoint_io/test_gemini_checkpoint_io.py create mode 100644 tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py create mode 100644 tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py new file mode 100644 index 000000000000..c5ed70f9933a --- /dev/null +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -0,0 +1,118 @@ +import tempfile + +import pytest +import torch + +import colossalai +from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.utils.cuda import get_current_device +from colossalai.zero import ColoInitContext, ZeroDDP +from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration +from colossalai.zero.gemini.gemini_mgr import GeminiManager +from tests.components_to_test.registry import non_distributed_component_funcs + + +@parameterize('placement_policy', ['cuda', 'cpu']) +@parameterize('model_name', ['bert']) +@parameterize('use_safetensors', [True, False]) +def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: bool): + from transformers import BertForSequenceClassification + model_ckpt_dir = tempfile.TemporaryDirectory() + get_components_func = non_distributed_component_funcs.get_callable(model_name) + model_builder, *_ = get_components_func() + with ColoInitContext(device=(get_current_device())): + bert_model = model_builder() + 
bert_model.config.save_pretrained(save_directory=(model_ckpt_dir.name)) + + config_dict, *_ = search_chunk_configuration(bert_model, search_range_mb=1, search_interval_byte=100) + chunk_manager = ChunkManager(config_dict) + gemini_manager = GeminiManager(placement_policy, chunk_manager) + bert_model = ZeroDDP(bert_model, gemini_manager) + bert_model.train() + + ckpt_io = GeminiCheckpointIO() + if ckpt_io.coordinator.is_master(): + model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2 + ckpt_io.save_model(bert_model, (model_ckpt_dir.name), + True, + True, + '', (model_size / 3), + use_safetensors=use_safetensors) + new_bert_model = BertForSequenceClassification.from_pretrained(model_ckpt_dir.name) + recursive_check(bert_model.state_dict(only_rank_0=True, dtype=(torch.float32)), new_bert_model.state_dict()) + model_ckpt_dir.cleanup() + + +@parameterize('placement_policy', ['cuda', 'cpu']) +@parameterize('model_name', ['gpt2', 'bert']) +@parameterize('use_safetensors', [True, False]) +def exam_state_dict(placement_policy, model_name: str, use_safetensors: bool): + get_components_func = non_distributed_component_funcs.get_callable(model_name) + model_builder, *_ = get_components_func() + with ColoInitContext(device=(get_current_device())): + model = model_builder() + new_model = model_builder() + config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100) + chunk_manager = ChunkManager(config_dict) + gemini_manager = GeminiManager(placement_policy, chunk_manager) + model = ZeroDDP(model, gemini_manager) + + model.train() + #new model + new_config_dict, *_ = search_chunk_configuration(new_model, search_range_mb=1, search_interval_byte=100) + new_chunk_manager = ChunkManager(new_config_dict) + new_gemini_manager = GeminiManager(placement_policy, new_chunk_manager) + new_model = ZeroDDP(new_model, new_gemini_manager) + + model_ckpt_dir = tempfile.TemporaryDirectory() + ckpt_io = GeminiCheckpointIO() 
+ model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2 + ckpt_io.save_model(model, (model_ckpt_dir.name), + True, + True, + 'epoch', (model_size / 3), + use_safetensors=use_safetensors) + + if ckpt_io.coordinator.is_master(): + ckpt_io.load_model(new_model, (model_ckpt_dir.name), strict=True) + model_dict = model.state_dict(only_rank_0=True) + new_model_dict = new_model.state_dict(only_rank_0=True) + recursive_check(model_dict, new_model_dict) + model_ckpt_dir.cleanup() + + +def run_dist(rank, world_size, port): + config = {} + colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + exam_state_dict() + exam_state_dict_with_origin() + + +@pytest.mark.dist +@pytest.mark.parametrize('world_size', [4, 4]) +@rerun_if_address_is_in_use() +def test_gemini_ckpIO(world_size): + spawn(run_dist, world_size) + + +def recursive_check(d1, d2): + for k, v in d1.items(): + if isinstance(v, dict): + recursive_check(v, d2[k]) + elif isinstance(v, list): + for i in range(len(v)): + if isinstance(v[i], torch.Tensor): + v[i] = v[i].to('cpu') + d2[k][i] = d2[k][i].to('cpu') + if not torch.equal(v[i], d2[k][i]): + raise AssertionError + elif not v[i] == d2[k][i]: + raise AssertionError + + elif isinstance(v, torch.Tensor): + v = v.to('cpu') + d2[k] = d2[k].to('cpu') + assert torch.equal(v, d2[k]) + elif not v == d2[k]: + raise AssertionError diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py new file mode 100644 index 000000000000..e69de29bb2d1 From d8bee3ff5b16cb8695cec53e5e71554aff481fe1 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 9 May 2023 16:07:23 +0800 Subject: [PATCH 02/10] [booster] update 
tests for booster --- colossalai/booster/plugin/torch_ddp_plugin.py | 4 + docs/source/en/basics/colossalai-booster.md | 124 ++++++++++++++++++ .../zh-Hans/basics/colossalai_booster.md | 0 .../test_gemini_checkpoint_io.py | 1 + .../test_general_checkpoint_io.py | 98 +------------- .../test_low_level_zero_checkpoint_io.py | 74 +++++++++++ .../test_torch_ddp_checkpoint_io.py | 86 ++++++++++++ 7 files changed, 294 insertions(+), 93 deletions(-) create mode 100644 docs/source/en/basics/colossalai-booster.md create mode 100644 docs/source/zh-Hans/basics/colossalai_booster.md diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py index 76906d844ef1..dfef384567ae 100644 --- a/colossalai/booster/plugin/torch_ddp_plugin.py +++ b/colossalai/booster/plugin/torch_ddp_plugin.py @@ -1,3 +1,4 @@ +from contextlib import contextmanager from typing import Callable, List, Tuple, Union import torch.nn as nn @@ -115,6 +116,9 @@ def control_device(self) -> bool: def supported_devices(self) -> List[str]: return ['cuda'] + def no_sync(self, model) -> contextmanager: + return model.no_sync() + def configure( self, model: nn.Module, diff --git a/docs/source/en/basics/colossalai-booster.md b/docs/source/en/basics/colossalai-booster.md new file mode 100644 index 000000000000..fc33e8cbe039 --- /dev/null +++ b/docs/source/en/basics/colossalai-booster.md @@ -0,0 +1,124 @@ +# colossal-ai booster + +**Prerequisite:** +- [Distributed Training](../concepts/distributed_training.md) +- [Colossal-AI Overview](../concepts/colossalai_overview.md) + +## Introduction +In our new design, `colossalai.booster` replaces the role of `colossalai.initialize` to inject features into your training components (e.g. model, optimizer, dataloader) seamlessly. With these new APIs, user can integrate their model with our parallelism features more friendly. Also calling `colossalai.booster` is the standard procedure before you run into your training loops. 
In the sections below, I will cover how `colossalai.booster` works and what we should take note of. + +### Plugin +
Plugin is an important component that manages parallel configuration (eg: The gemini plugin encapsulates the gemini acceleration solution). Currently supported plugins are as follows:
+ +***GeminiPlugin:*** This plugin wraps the Gemini acceleration solution, that is, ZeRO with chunk-based memory management.
+ +***TorchDDPPlugin:*** This plugin wraps the DDP acceleration solution; it implements data parallelism at the module level, which can run across multiple machines.
+ +***LowLevelZeroPlugin:*** This plugin wraps stage 1/2 of the Zero Redundancy Optimizer. Stage 1: Shards optimizer states across data parallel workers/GPUs. Stage 2: Shards optimizer states + gradients across data parallel workers/GPUs.
+ +### API of booster +Booster.__init__(...): +* Args: + * device (str or torch.device): The device to run the training. Default: 'cuda'. + * mixed_precision (str or MixedPrecision): The mixed precision to run the training. Default: None.If the argument is a string, it can be 'fp16', 'fp16_apex', 'bf16', or 'fp8'.'fp16' would use PyTorch AMP while 'fp16_apex' would use Nvidia Apex. + * plugin (Plugin): The plugin to run the training. Default: None. +* Return: + * booster (Booster) + + +booster.boost(...): This function is called to boost objects. (e.g. model, optimizer, criterion). +* Args: + * model (nn.Module): The model to be boosted. + * optimizer (Optimizer): The optimizer to be boosted. + * criterion (Callable): The criterion to be boosted. + * dataloader (DataLoader): The dataloader to be boosted. + * lr_scheduler (LRScheduler): The lr_scheduler to be boosted. +* Return: + * model, optimizer, criterion, dataloader, lr_scheduler + +booster.backward(loss, optimizer): This function run the backward operation +* Args: + * loss (torch.Tensor) + * optimizer (Optimizer) + +booster.no_sync(model) :A context manager to disable gradient synchronizations across processes. 
+ +booster.save_model(...): This function is called to save model checkpoints +* Args: + * model: nn.Module, + * checkpoint: str, + * prefix: str = None, + * shard: bool = False, # if saved as shards + * size_per_shard: int = 1024 # the max length of shard + +booster.load_model(...): +* Args: + * model: nn.Module, + * checkpoint: str, + * strict: bool = True + +booster.save_optimizer(...): This function is called to save optimizer checkpoints +* Args: + * optimizer: Optimizer, + * checkpoint: str, + * shard: bool = False, # if saved as shards + * size_per_shard: int = 1024 # the max length of shard + +booster.load_optimizer(...): +* Args: + * optimizer: Optimizer, + * checkpoint: str, + +booster.save_lr_scheduler(...): This function is called to save lr scheduler checkpoints +* Args: + * lr_scheduler: LRScheduler, + * checkpoint: str, + +booster.load_lr_scheduler(...): +* Args: + * lr_scheduler: LRScheduler, + * checkpoint: str, + +## usage +In a typical workflow, you need to launch distributed environment at the beginning of training script and create objects needed (such as models, optimizers, loss function, data loaders etc.) firstly, then call `colossalai.booster` to inject features into these objects, After that, you can use our booster API and these returned objects to continue the rest of your training processes. + +A pseudo-code example is like below:
+ +```python +import torch +from torch.optim import SGD +from torchvision.models import resnet18 + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin + +def train(): + colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') + plugin = TorchDDPPlugin() + booster = Booster(plugin=plugin) + model = resnet18() + criterion = lambda x: x.mean() + optimizer = SGD((model.parameters()), lr=0.001) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) + model, optimizer, criterion, _, scheduler = booster.boost(model, optimizer, criterion, lr_scheduler=scheduler) + + x = torch.randn(4, 3, 224, 224) + x = x.to('cuda') + output = model(x) + loss = criterion(output) + booster.backward(loss, optimizer) + optimizer.clip_grad_by_norm(1.0) + optimizer.step() + scheduler.step() + + save_path = "./model" + booster.save_model(model, save_path, True, True, "", 10, use_safetensors=use_safetensors) + + new_model = resnet18() + booster.load_model(new_model, save_path) +``` + +if you want to run a example, [click here](../../../../examples/tutorial/new_api/cifar_resnet/README.md) + +[more design detailers](https://github.com/hpcaitech/ColossalAI/discussions/3046) diff --git a/docs/source/zh-Hans/basics/colossalai_booster.md b/docs/source/zh-Hans/basics/colossalai_booster.md new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index c5ed70f9933a..837714c232a5 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -18,6 +18,7 @@ @parameterize('use_safetensors', [True, False]) def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: bool): from transformers import BertForSequenceClassification + model_ckpt_dir = tempfile.TemporaryDirectory() 
get_components_func = non_distributed_component_funcs.get_callable(model_name) model_builder, *_ = get_components_func() diff --git a/tests/test_checkpoint_io/test_general_checkpoint_io.py b/tests/test_checkpoint_io/test_general_checkpoint_io.py index 752ca706bfd4..6fc6048995ab 100644 --- a/tests/test_checkpoint_io/test_general_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_general_checkpoint_io.py @@ -1,21 +1,14 @@ import tempfile + import pytest import torch from torch.optim import Adam from torchvision.models import resnet18 -from colossalai.checkpoint_io import GeneralCheckpointIO from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO +from colossalai.checkpoint_io import GeneralCheckpointIO from colossalai.testing import clear_cache_before_run, parameterize -import colossalai -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from colossalai.utils.cuda import get_current_device -from colossalai.zero import ColoInitContext, ZeroDDP -from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration -from colossalai.zero.gemini.gemini_mgr import GeminiManager -from tests.components_to_test.registry import non_distributed_component_funcs - # ======== # Note: # 1. 
due to checkpoint IO can be quite slow if tested with all models, we will only test on resnet for now @@ -61,11 +54,11 @@ def test_unsharded_checkpoint(use_safetensors: bool): ckpt_io.load_model(new_model, model_ckpt_tempfile.name) ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) - # check for model and optimizer state dict recursively recursive_check(model.state_dict(), new_model.state_dict()) recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) + @pytest.mark.parametrize('use_safetensors', [True, False]) def test_sharded_checkpoint(use_safetensors: bool): # create a model and optimizer @@ -87,7 +80,7 @@ def test_sharded_checkpoint(use_safetensors: bool): else: suffix = ".bin" WEIGHTS_INDEX_NAME = "model.bin.index.json" - + model_ckpt_dir = tempfile.TemporaryDirectory() optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile() @@ -96,7 +89,7 @@ def test_sharded_checkpoint(use_safetensors: bool): ckpt_io.save_model(model, model_ckpt_dir.name, True, True, "", 10, use_safetensors=use_safetensors) ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name, shard=False) - + # create new model new_model = resnet18() new_optimizer = Adam(new_model.parameters(), lr=0.001) @@ -108,87 +101,6 @@ def test_sharded_checkpoint(use_safetensors: bool): recursive_check(model.state_dict(), new_model.state_dict()) recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) -@parameterize('placement_policy', ['cuda', 'cpu']) -@parameterize('model_name', ['bert']) -@parameterize('use_safetensors', [True, False]) -def hf_load_colossalai_checkpoint(placement_policy, model_name, use_safetensors: bool): - from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, BertForSequenceClassification - - model_ckpt_dir = tempfile.TemporaryDirectory() - get_components_func = non_distributed_component_funcs.get_callable(model_name) - model_builder, *_ = get_components_func() - - with ColoInitContext(device=get_current_device()): - 
bert_model = model_builder() - bert_model.config.save_pretrained(save_directory=model_ckpt_dir.name) - config_dict, *_ = search_chunk_configuration(bert_model, search_range_mb=1, search_interval_byte=100) - chunk_manager = ChunkManager(config_dict) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - bert_model = ZeroDDP(bert_model, gemini_manager) - bert_model.train() - - ckpt_io = GeminiCheckpointIO() - if ckpt_io.coordinator.is_master(): - model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2 - ckpt_io.save_model(bert_model, model_ckpt_dir.name, True, True, "", (model_size / 3), use_safetensors=use_safetensors) - new_bert_model = BertForSequenceClassification.from_pretrained(model_ckpt_dir.name) - recursive_check(bert_model.state_dict(only_rank_0=True, dtype=torch.float32), new_bert_model.state_dict()) - - model_ckpt_dir.cleanup() - - - -@parameterize('placement_policy', ['cuda', 'cpu']) -@parameterize('model_name', ['gpt2', 'bert']) -@parameterize('use_safetensors', [True, False]) -def exam_state_dict(placement_policy, model_name: str, use_safetensors: bool): - get_components_func = non_distributed_component_funcs.get_callable(model_name) - model_builder, *_ = get_components_func() - - with ColoInitContext(device=get_current_device()): - model = model_builder() - new_model = model_builder() - - config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100) - chunk_manager = ChunkManager(config_dict) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - model = ZeroDDP(model, gemini_manager) - model.train() - - new_config_dict, *_ = search_chunk_configuration(new_model, search_range_mb=1, search_interval_byte=100) - new_chunk_manager = ChunkManager(new_config_dict) - new_gemini_manager = GeminiManager(placement_policy, new_chunk_manager) - new_model = ZeroDDP(new_model, new_gemini_manager) - - model_ckpt_dir = tempfile.TemporaryDirectory() - - ckpt_io = 
GeminiCheckpointIO() - model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2 - ckpt_io.save_model(model, model_ckpt_dir.name, True, True, "epoch", (model_size / 3), use_safetensors=use_safetensors) - - # load model - if ckpt_io.coordinator.is_master(): - ckpt_io.load_model(new_model, model_ckpt_dir.name, strict=True) - model_dict = model.state_dict(only_rank_0=True) - new_model_dict = new_model.state_dict(only_rank_0=True) - recursive_check(model_dict, new_model_dict) - - model_ckpt_dir.cleanup() - - -def run_dist(rank, world_size, port): - config = {} - colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') - exam_state_dict() - hf_load_colossalai_checkpoint() - - -@pytest.mark.dist -@pytest.mark.parametrize('world_size', [4, 4]) -@rerun_if_address_is_in_use() -def test_gemini_ckpIO(world_size): - spawn(run_dist, world_size) - # do recursive check for the optimizer state dict # if the value is a dict, compare its values diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index e69de29bb2d1..46074507a52e 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -0,0 +1,74 @@ +import tempfile + +import pytest +import torch +from torchvision.models import resnet18 + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import LowLevelZeroPlugin +from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroCheckpointIO +from colossalai.nn.optimizer import HybridAdam +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn + + +@clear_cache_before_run() +@parameterize('stage', [2]) +def check_low_level_zero_checkpointIO(stage: int): + plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32) + booster = 
Booster(plugin=plugin) + model = resnet18() + criterion = lambda x: x.mean() + optimizer = HybridAdam((model.parameters()), lr=0.001) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + + x = torch.randn(4, 3, 224, 224) + x = x.to('cuda') + output = model(x) + loss = criterion(output) + booster.backward(loss, optimizer) + optimizer.step() + + optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile() + ckpt_io = LowLevelZeroCheckpointIO() + ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name) + + if ckpt_io.coordinator.is_master(): + new_model = resnet18() + new_optimizer = HybridAdam((new_model.parameters()), lr=0.001) + _, new_optimizer, _, _, _ = booster.boost(new_model, new_optimizer) + ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) + recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) + + +def run_dist(rank, world_size, port): + colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost') + check_low_level_zero_checkpointIO() + + +@rerun_if_address_is_in_use() +def test_low_level_zero_checkpointIO(): + spawn(run_dist, 2) + + +def recursive_check(d1, d2): + for k, v in d1.items(): + if isinstance(v, dict): + recursive_check(v, d2[k]) + elif isinstance(v, list): + for i in range(len(v)): + if isinstance(v[i], torch.Tensor): + v[i] = v[i].to('cpu') + d2[k][i] = d2[k][i].to('cpu') + if not torch.equal(v[i], d2[k][i]): + raise AssertionError + elif v[i] != d2[k][i]: + assert v[i] == d2[k][i] + + elif isinstance(v, torch.Tensor): + v = v.to('cpu') + d2[k] = d2[k].to('cpu') + if not torch.equal(v, d2[k]): + raise AssertionError + elif not v == d2[k]: + raise AssertionError diff --git a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py index e69de29bb2d1..38d0a381ab5c 100644 --- a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py +++ 
b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py @@ -0,0 +1,86 @@ +import tempfile + +import torch +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.optim import SGD +from torchvision.models import resnet18 + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPCheckpointIO +from colossalai.interface import OptimizerWrapper +from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn + + +def check_torch_ddp_checkpointIO(): + plugin = TorchDDPPlugin() + booster = Booster(plugin=plugin) + model = resnet18() + criterion = lambda x: x.mean() + optimizer = SGD((model.parameters()), lr=0.001) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion, lr_scheduler=scheduler) + + assert isinstance(model.module, DDP) + assert isinstance(optimizer, OptimizerWrapper) + + x = torch.randn(4, 3, 224, 224) + x = x.to('cuda') + output = model(x) + loss = criterion(output) + booster.backward(loss, optimizer) + optimizer.clip_grad_by_norm(1.0) + optimizer.step() + scheduler.step() + + optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile() + lr_scheduler_ckpt_tempfile = tempfile.NamedTemporaryFile() + ckpt_io = TorchDDPCheckpointIO() + ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name) + ckpt_io.save_lr_scheduler(scheduler, lr_scheduler_ckpt_tempfile.name) + + if ckpt_io.coordinator.is_master(): + new_model = resnet18() + new_optimizer = SGD((new_model.parameters()), lr=0.001) + new_scheduler = torch.optim.lr_scheduler.StepLR(new_optimizer, step_size=1, gamma=0.1) + _, new_optimizer, _, _, new_scheduler = booster.boost(new_model, new_optimizer, lr_scheduler=new_scheduler) + + ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) + recursive_check(optimizer.state_dict(), 
new_optimizer.state_dict()) + + ckpt_io.load_lr_scheduler(new_scheduler, lr_scheduler_ckpt_tempfile.name) + recursive_check(scheduler.state_dict(), new_scheduler.state_dict()) + + +def run_dist(rank, world_size, port): + colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost') + check_torch_ddp_checkpointIO() + + +@rerun_if_address_is_in_use() +def test_torch_ddp_checkpointIO(): + spawn(run_dist, 2) + + +def recursive_check(d1, d2): + for k, v in d1.items(): + if isinstance(v, dict): + recursive_check(v, d2[k]) + elif isinstance(v, list): + for i in range(len(v)): + if isinstance(v[i], torch.Tensor): + v[i] = v[i].to('cpu') + d2[k][i] = d2[k][i].to('cpu') + if not torch.equal(v[i], d2[k][i]): + raise AssertionError + elif v[i] != d2[k][i]: + assert v[i] == d2[k][i] + + elif isinstance(v, torch.Tensor): + v = v.to('cpu') + d2[k] = d2[k].to('cpu') + if not torch.equal(v, d2[k]): + raise AssertionError + elif not v == d2[k]: + raise AssertionError From 852d0d6c63aa16b9fd12a0bd1fa5dd938ab22856 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 9 May 2023 17:50:09 +0800 Subject: [PATCH 03/10] [booster] update tests for booster --- colossalai/testing/__init__.py | 11 +- colossalai/testing/comparison.py | 20 +++ docs/source/en/basics/colossalai-booster.md | 124 ------------------ .../zh-Hans/basics/colossalai_booster.md | 0 .../test_gemini_checkpoint_io.py | 24 +--- .../test_general_checkpoint_io.py | 27 +--- .../test_low_level_zero_checkpoint_io.py | 25 +--- .../test_torch_ddp_checkpoint_io.py | 25 +--- 8 files changed, 33 insertions(+), 223 deletions(-) delete mode 100644 docs/source/en/basics/colossalai-booster.md delete mode 100644 docs/source/zh-Hans/basics/colossalai_booster.md diff --git a/colossalai/testing/__init__.py b/colossalai/testing/__init__.py index c53e0f44c7e0..7af7dfa56099 100644 --- a/colossalai/testing/__init__.py +++ b/colossalai/testing/__init__.py @@ -1,4 +1,11 @@ -from .comparison 
import assert_close, assert_close_loose, assert_equal, assert_equal_in_group, assert_not_equal +from .comparison import ( + assert_close, + assert_close_loose, + assert_equal, + assert_equal_in_group, + assert_not_equal, + recursive_check, +) from .pytest_wrapper import run_on_environment_flag from .utils import ( clear_cache_before_run, @@ -13,5 +20,5 @@ __all__ = [ 'assert_equal', 'assert_not_equal', 'assert_close', 'assert_close_loose', 'assert_equal_in_group', 'parameterize', 'rerun_on_exception', 'rerun_if_address_is_in_use', 'skip_if_not_enough_gpus', 'free_port', 'spawn', - 'clear_cache_before_run', 'run_on_environment_flag' + 'clear_cache_before_run', 'run_on_environment_flag', 'recursive_check' ] diff --git a/colossalai/testing/comparison.py b/colossalai/testing/comparison.py index e00d0da168c7..94bb15b082a5 100644 --- a/colossalai/testing/comparison.py +++ b/colossalai/testing/comparison.py @@ -28,3 +28,23 @@ def assert_equal_in_group(tensor: Tensor, process_group: ProcessGroup = None): a = tensor_list[i] b = tensor_list[i + 1] assert torch.all(a == b), f'expected tensors on rank {i} and {i + 1} to be equal but they are not, {a} vs {b}' + + +def recursive_check(d1, d2): + for k, v in d1.items(): + if isinstance(v, dict): + recursive_check(v, d2[k]) + elif isinstance(v, list): + for i in range(len(v)): + if isinstance(v[i], torch.Tensor): + v[i] = v[i].to("cpu") + d2[k][i] = d2[k][i].to("cpu") + assert torch.equal(v[i], d2[k][i]) + else: + assert v[i] == d2[k][i] + elif isinstance(v, torch.Tensor): + v = v.to("cpu") + d2[k] = d2[k].to("cpu") + assert torch.equal(v, d2[k]) + else: + assert v == d2[k] diff --git a/docs/source/en/basics/colossalai-booster.md b/docs/source/en/basics/colossalai-booster.md deleted file mode 100644 index fc33e8cbe039..000000000000 --- a/docs/source/en/basics/colossalai-booster.md +++ /dev/null @@ -1,124 +0,0 @@ -# colossal-ai booster - -**Prerequisite:** -- [Distributed Training](../concepts/distributed_training.md) -- 
[Colossal-AI Overview](../concepts/colossalai_overview.md) - -## Introduction -In our new design, `colossalai.booster` replaces the role of `colossalai.initialize` to inject features into your training components (e.g. model, optimizer, dataloader) seamlessly. With these new APIs, user can integrate their model with our parallelism features more friendly. Also calling `colossalai.booster` is the standard procedure before you run into your training loops. In the sections below, I will cover how `colossalai.booster` works and what we should take note of. - -### Plugin -Plugin is an important component that manages parallel configuration (eg: The gemini plugin encapsulates the gemini acceleration solution). Currently supported plugins are as follows:
- -***GeminiPlugin:*** This plugin wraps the Gemini acceleration solution, that is, ZeRO with chunk-based memory management.
- -***TorchDDPPlugin:*** This plugin wraps the DDP acceleration solution; it implements data parallelism at the module level, which can run across multiple machines.
- -***LowLevelZeroPlugin:*** This plugin wraps stage 1/2 of the Zero Redundancy Optimizer. Stage 1: Shards optimizer states across data parallel workers/GPUs. Stage 2: Shards optimizer states + gradients across data parallel workers/GPUs.
- -### API of booster -Booster.__init__(...): -* Args: - * device (str or torch.device): The device to run the training. Default: 'cuda'. - * mixed_precision (str or MixedPrecision): The mixed precision to run the training. Default: None.If the argument is a string, it can be 'fp16', 'fp16_apex', 'bf16', or 'fp8'.'fp16' would use PyTorch AMP while 'fp16_apex' would use Nvidia Apex. - * plugin (Plugin): The plugin to run the training. Default: None. -* Return: - * booster (Booster) - - -booster.boost(...): This function is called to boost objects. (e.g. model, optimizer, criterion). -* Args: - * model (nn.Module): The model to be boosted. - * optimizer (Optimizer): The optimizer to be boosted. - * criterion (Callable): The criterion to be boosted. - * dataloader (DataLoader): The dataloader to be boosted. - * lr_scheduler (LRScheduler): The lr_scheduler to be boosted. -* Return: - * model, optimizer, criterion, dataloader, lr_scheduler - -booster.backward(loss, optimizer): This function run the backward operation -* Args: - * loss (torch.Tensor) - * optimizer (Optimizer) - -booster.no_sync(model) :A context manager to disable gradient synchronizations across processes. 
- -booster.save_model(...): This function is called to save model checkpoints -* Args: - * model: nn.Module, - * checkpoint: str, - * prefix: str = None, - * shard: bool = False, # if saved as shards - * size_per_shard: int = 1024 # the max length of shard - -booster.load_model(...): -* Args: - * model: nn.Module, - * checkpoint: str, - * strict: bool = True - -booster.save_optimizer(...): This function is called to save optimizer checkpoints -* Args: - * optimizer: Optimizer, - * checkpoint: str, - * shard: bool = False, # if saved as shards - * size_per_shard: int = 1024 # the max length of shard - -booster.load_optimizer(...): -* Args: - * optimizer: Optimizer, - * checkpoint: str, - -booster.save_lr_scheduler(...): This function is called to save lr scheduler checkpoints -* Args: - * lr_scheduler: LRScheduler, - * checkpoint: str, - -booster.load_lr_scheduler(...): -* Args: - * lr_scheduler: LRScheduler, - * checkpoint: str, - -## usage -In a typical workflow, you need to launch distributed environment at the beginning of training script and create objects needed (such as models, optimizers, loss function, data loaders etc.) firstly, then call `colossalai.booster` to inject features into these objects, After that, you can use our booster API and these returned objects to continue the rest of your training processes. - -A pseudo-code example is like below:
- -```python -import torch -from torch.optim import SGD -from torchvision.models import resnet18 - -import colossalai -from colossalai.booster import Booster -from colossalai.booster.plugin import TorchDDPPlugin - -def train(): - colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') - plugin = TorchDDPPlugin() - booster = Booster(plugin=plugin) - model = resnet18() - criterion = lambda x: x.mean() - optimizer = SGD((model.parameters()), lr=0.001) - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) - model, optimizer, criterion, _, scheduler = booster.boost(model, optimizer, criterion, lr_scheduler=scheduler) - - x = torch.randn(4, 3, 224, 224) - x = x.to('cuda') - output = model(x) - loss = criterion(output) - booster.backward(loss, optimizer) - optimizer.clip_grad_by_norm(1.0) - optimizer.step() - scheduler.step() - - save_path = "./model" - booster.save_model(model, save_path, True, True, "", 10, use_safetensors=use_safetensors) - - new_model = resnet18() - booster.load_model(new_model, save_path) -``` - -if you want to run a example, [click here](../../../../examples/tutorial/new_api/cifar_resnet/README.md) - -[more design detailers](https://github.com/hpcaitech/ColossalAI/discussions/3046) diff --git a/docs/source/zh-Hans/basics/colossalai_booster.md b/docs/source/zh-Hans/basics/colossalai_booster.md deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index 837714c232a5..5dbca63f6c5e 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -5,7 +5,7 @@ import colossalai from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, recursive_check, 
rerun_if_address_is_in_use, spawn from colossalai.utils.cuda import get_current_device from colossalai.zero import ColoInitContext, ZeroDDP from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration @@ -95,25 +95,3 @@ def run_dist(rank, world_size, port): @rerun_if_address_is_in_use() def test_gemini_ckpIO(world_size): spawn(run_dist, world_size) - - -def recursive_check(d1, d2): - for k, v in d1.items(): - if isinstance(v, dict): - recursive_check(v, d2[k]) - elif isinstance(v, list): - for i in range(len(v)): - if isinstance(v[i], torch.Tensor): - v[i] = v[i].to('cpu') - d2[k][i] = d2[k][i].to('cpu') - if not torch.equal(v[i], d2[k][i]): - raise AssertionError - elif not v[i] == d2[k][i]: - raise AssertionError - - elif isinstance(v, torch.Tensor): - v = v.to('cpu') - d2[k] = d2[k].to('cpu') - assert torch.equal(v, d2[k]) - elif not v == d2[k]: - raise AssertionError diff --git a/tests/test_checkpoint_io/test_general_checkpoint_io.py b/tests/test_checkpoint_io/test_general_checkpoint_io.py index 6fc6048995ab..b4063a672e87 100644 --- a/tests/test_checkpoint_io/test_general_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_general_checkpoint_io.py @@ -7,7 +7,7 @@ from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO from colossalai.checkpoint_io import GeneralCheckpointIO -from colossalai.testing import clear_cache_before_run, parameterize +from colossalai.testing import clear_cache_before_run, parameterize, recursive_check # ======== # Note: @@ -100,28 +100,3 @@ def test_sharded_checkpoint(use_safetensors: bool): # check for model and optimizer state dict recursively recursive_check(model.state_dict(), new_model.state_dict()) recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) - - -# do recursive check for the optimizer state dict -# if the value is a dict, compare its values -# if the value is a list, comapre all elements one-by-one -# if the value is a torch.Tensor, use torch.equal -# otherwise use 
assertEqual -def recursive_check(d1, d2): - for k, v in d1.items(): - if isinstance(v, dict): - recursive_check(v, d2[k]) - elif isinstance(v, list): - for i in range(len(v)): - if isinstance(v[i], torch.Tensor): - v[i] = v[i].to("cpu") - d2[k][i] = d2[k][i].to("cpu") - assert torch.equal(v[i], d2[k][i]) - else: - assert v[i] == d2[k][i] - elif isinstance(v, torch.Tensor): - v = v.to("cpu") - d2[k] = d2[k].to("cpu") - assert torch.equal(v, d2[k]) - else: - assert v == d2[k] diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index 46074507a52e..163cdea2e5cd 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -9,7 +9,7 @@ from colossalai.booster.plugin import LowLevelZeroPlugin from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroCheckpointIO from colossalai.nn.optimizer import HybridAdam -from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import clear_cache_before_run, parameterize, recursive_check, rerun_if_address_is_in_use, spawn @clear_cache_before_run() @@ -49,26 +49,3 @@ def run_dist(rank, world_size, port): @rerun_if_address_is_in_use() def test_low_level_zero_checkpointIO(): spawn(run_dist, 2) - - -def recursive_check(d1, d2): - for k, v in d1.items(): - if isinstance(v, dict): - recursive_check(v, d2[k]) - elif isinstance(v, list): - for i in range(len(v)): - if isinstance(v[i], torch.Tensor): - v[i] = v[i].to('cpu') - d2[k][i] = d2[k][i].to('cpu') - if not torch.equal(v[i], d2[k][i]): - raise AssertionError - elif v[i] != d2[k][i]: - assert v[i] == d2[k][i] - - elif isinstance(v, torch.Tensor): - v = v.to('cpu') - d2[k] = d2[k].to('cpu') - if not torch.equal(v, d2[k]): - raise AssertionError - elif not v == d2[k]: - raise AssertionError diff --git 
a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py index 38d0a381ab5c..cf890deb7400 100644 --- a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py @@ -10,7 +10,7 @@ from colossalai.booster.plugin import TorchDDPPlugin from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPCheckpointIO from colossalai.interface import OptimizerWrapper -from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn +from colossalai.testing import recursive_check, rerun_if_address_is_in_use, spawn def check_torch_ddp_checkpointIO(): @@ -61,26 +61,3 @@ def run_dist(rank, world_size, port): @rerun_if_address_is_in_use() def test_torch_ddp_checkpointIO(): spawn(run_dist, 2) - - -def recursive_check(d1, d2): - for k, v in d1.items(): - if isinstance(v, dict): - recursive_check(v, d2[k]) - elif isinstance(v, list): - for i in range(len(v)): - if isinstance(v[i], torch.Tensor): - v[i] = v[i].to('cpu') - d2[k][i] = d2[k][i].to('cpu') - if not torch.equal(v[i], d2[k][i]): - raise AssertionError - elif v[i] != d2[k][i]: - assert v[i] == d2[k][i] - - elif isinstance(v, torch.Tensor): - v = v.to('cpu') - d2[k] = d2[k].to('cpu') - if not torch.equal(v, d2[k]): - raise AssertionError - elif not v == d2[k]: - raise AssertionError From f18e667dc44447c1f207d46e4d91aed4b33b5a82 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 9 May 2023 17:52:29 +0800 Subject: [PATCH 04/10] [booster] update tests for booster --- colossalai/booster/plugin/torch_ddp_plugin.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py index dfef384567ae..bbe8624f8ed2 100644 --- a/colossalai/booster/plugin/torch_ddp_plugin.py +++ b/colossalai/booster/plugin/torch_ddp_plugin.py @@ -116,9 +116,6 @@ def control_device(self) -> bool: def 
supported_devices(self) -> List[str]: return ['cuda'] - def no_sync(self, model) -> contextmanager: - return model.no_sync() - def configure( self, model: nn.Module, From ac981420859b8608d84560e1437f55bc05e9f9cb Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 9 May 2023 17:53:12 +0800 Subject: [PATCH 05/10] [booster] update tests for booster --- colossalai/booster/plugin/torch_ddp_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py index bbe8624f8ed2..76906d844ef1 100644 --- a/colossalai/booster/plugin/torch_ddp_plugin.py +++ b/colossalai/booster/plugin/torch_ddp_plugin.py @@ -1,4 +1,3 @@ -from contextlib import contextmanager from typing import Callable, List, Tuple, Union import torch.nn as nn From ecfbbf2ca88607c8af46bfc0f858c6b9b41b29ea Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 9 May 2023 18:27:52 +0800 Subject: [PATCH 06/10] [booster] update booster tutorials#3717, fix recursive check --- colossalai/testing/__init__.py | 4 ++-- colossalai/testing/comparison.py | 16 ++++++++++------ .../test_gemini_checkpoint_io.py | 7 ++++--- .../test_general_checkpoint_io.py | 10 +++++----- .../test_low_level_zero_checkpoint_io.py | 10 ++++++++-- .../test_torch_ddp_checkpoint_io.py | 6 +++--- 6 files changed, 32 insertions(+), 21 deletions(-) diff --git a/colossalai/testing/__init__.py b/colossalai/testing/__init__.py index 7af7dfa56099..9d0475ed064c 100644 --- a/colossalai/testing/__init__.py +++ b/colossalai/testing/__init__.py @@ -4,7 +4,7 @@ assert_equal, assert_equal_in_group, assert_not_equal, - recursive_check, + check_state_dict_equal, ) from .pytest_wrapper import run_on_environment_flag from .utils import ( @@ -20,5 +20,5 @@ __all__ = [ 'assert_equal', 'assert_not_equal', 'assert_close', 'assert_close_loose', 'assert_equal_in_group', 'parameterize', 'rerun_on_exception', 'rerun_if_address_is_in_use', 
'skip_if_not_enough_gpus', 'free_port', 'spawn', - 'clear_cache_before_run', 'run_on_environment_flag', 'recursive_check' + 'clear_cache_before_run', 'run_on_environment_flag', 'check_state_dict_equal' ] diff --git a/colossalai/testing/comparison.py b/colossalai/testing/comparison.py index 94bb15b082a5..faf61638d8bb 100644 --- a/colossalai/testing/comparison.py +++ b/colossalai/testing/comparison.py @@ -1,3 +1,5 @@ +from typing import OrderedDict + import torch import torch.distributed as dist from torch import Tensor @@ -30,21 +32,23 @@ def assert_equal_in_group(tensor: Tensor, process_group: ProcessGroup = None): assert torch.all(a == b), f'expected tensors on rank {i} and {i + 1} to be equal but they are not, {a} vs {b}' -def recursive_check(d1, d2): +def check_state_dict_equal(d1: OrderedDict, d2: OrderedDict, ignore_device: bool = True): for k, v in d1.items(): if isinstance(v, dict): - recursive_check(v, d2[k]) + check_state_dict_equal(v, d2[k]) elif isinstance(v, list): for i in range(len(v)): if isinstance(v[i], torch.Tensor): - v[i] = v[i].to("cpu") - d2[k][i] = d2[k][i].to("cpu") + if not ignore_device: + v[i] = v[i].to("cpu") + d2[k][i] = d2[k][i].to("cpu") assert torch.equal(v[i], d2[k][i]) else: assert v[i] == d2[k][i] elif isinstance(v, torch.Tensor): - v = v.to("cpu") - d2[k] = d2[k].to("cpu") + if not ignore_device: + v = v.to("cpu") + d2[k] = d2[k].to("cpu") assert torch.equal(v, d2[k]) else: assert v == d2[k] diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index 5dbca63f6c5e..1e5a2e1c4b44 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -5,7 +5,7 @@ import colossalai from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO -from colossalai.testing import parameterize, recursive_check, rerun_if_address_is_in_use, spawn +from colossalai.testing import check_state_dict_equal, 
parameterize, rerun_if_address_is_in_use, spawn from colossalai.utils.cuda import get_current_device from colossalai.zero import ColoInitContext, ZeroDDP from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration @@ -41,7 +41,8 @@ def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: b '', (model_size / 3), use_safetensors=use_safetensors) new_bert_model = BertForSequenceClassification.from_pretrained(model_ckpt_dir.name) - recursive_check(bert_model.state_dict(only_rank_0=True, dtype=(torch.float32)), new_bert_model.state_dict()) + check_state_dict_equal(bert_model.state_dict(only_rank_0=True, dtype=(torch.float32)), + new_bert_model.state_dict(), False) model_ckpt_dir.cleanup() @@ -79,7 +80,7 @@ def exam_state_dict(placement_policy, model_name: str, use_safetensors: bool): ckpt_io.load_model(new_model, (model_ckpt_dir.name), strict=True) model_dict = model.state_dict(only_rank_0=True) new_model_dict = new_model.state_dict(only_rank_0=True) - recursive_check(model_dict, new_model_dict) + check_state_dict_equal(model_dict, new_model_dict, False) model_ckpt_dir.cleanup() diff --git a/tests/test_checkpoint_io/test_general_checkpoint_io.py b/tests/test_checkpoint_io/test_general_checkpoint_io.py index b4063a672e87..9e973bb23e0b 100644 --- a/tests/test_checkpoint_io/test_general_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_general_checkpoint_io.py @@ -7,7 +7,7 @@ from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO from colossalai.checkpoint_io import GeneralCheckpointIO -from colossalai.testing import clear_cache_before_run, parameterize, recursive_check +from colossalai.testing import check_state_dict_equal, clear_cache_before_run, parameterize # ======== # Note: @@ -55,8 +55,8 @@ def test_unsharded_checkpoint(use_safetensors: bool): ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) # check for model and optimizer state dict recursively - recursive_check(model.state_dict(), 
new_model.state_dict()) - recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) + check_state_dict_equal(model.state_dict(), new_model.state_dict()) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict()) @pytest.mark.parametrize('use_safetensors', [True, False]) @@ -98,5 +98,5 @@ def test_sharded_checkpoint(use_safetensors: bool): ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) # check for model and optimizer state dict recursively - recursive_check(model.state_dict(), new_model.state_dict()) - recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) + check_state_dict_equal(model.state_dict(), new_model.state_dict()) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict()) diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py index 163cdea2e5cd..217a950d8155 100644 --- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py @@ -9,7 +9,13 @@ from colossalai.booster.plugin import LowLevelZeroPlugin from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroCheckpointIO from colossalai.nn.optimizer import HybridAdam -from colossalai.testing import clear_cache_before_run, parameterize, recursive_check, rerun_if_address_is_in_use, spawn +from colossalai.testing import ( + check_state_dict_equal, + clear_cache_before_run, + parameterize, + rerun_if_address_is_in_use, + spawn, +) @clear_cache_before_run() @@ -38,7 +44,7 @@ def check_low_level_zero_checkpointIO(stage: int): new_optimizer = HybridAdam((new_model.parameters()), lr=0.001) _, new_optimizer, _, _, _ = booster.boost(new_model, new_optimizer) ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) - recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), 
False) def run_dist(rank, world_size, port): diff --git a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py index cf890deb7400..9128f8c0fe9e 100644 --- a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py @@ -10,7 +10,7 @@ from colossalai.booster.plugin import TorchDDPPlugin from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPCheckpointIO from colossalai.interface import OptimizerWrapper -from colossalai.testing import recursive_check, rerun_if_address_is_in_use, spawn +from colossalai.testing import check_state_dict_equal, rerun_if_address_is_in_use, spawn def check_torch_ddp_checkpointIO(): @@ -47,10 +47,10 @@ def check_torch_ddp_checkpointIO(): _, new_optimizer, _, _, new_scheduler = booster.boost(new_model, new_optimizer, lr_scheduler=new_scheduler) ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name) - recursive_check(optimizer.state_dict(), new_optimizer.state_dict()) + check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), False) ckpt_io.load_lr_scheduler(new_scheduler, lr_scheduler_ckpt_tempfile.name) - recursive_check(scheduler.state_dict(), new_scheduler.state_dict()) + check_state_dict_equal(scheduler.state_dict(), new_scheduler.state_dict(), False) def run_dist(rank, world_size, port): From 351e344da69f6a05ed9861e468c4a4b37b0d51e6 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 11 May 2023 14:40:20 +0800 Subject: [PATCH 07/10] [booster] update booster tutorials#3717, update setup doc --- colossalai/zero/gemini/chunk/chunk.py | 4 ++++ docs/source/en/get_started/installation.md | 6 +++--- docs/source/zh-Hans/basics/launch_colossalai.md | 16 ++++++++++++---- docs/source/zh-Hans/get_started/installation.md | 6 +++--- 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/colossalai/zero/gemini/chunk/chunk.py 
b/colossalai/zero/gemini/chunk/chunk.py index a7682eaf62e9..6b960076f8e8 100644 --- a/colossalai/zero/gemini/chunk/chunk.py +++ b/colossalai/zero/gemini/chunk/chunk.py @@ -77,6 +77,7 @@ def __init__(self, keep_gathered (bool): optional, if True, this chunk is always gathered in CUDA memory pin_memory (bool): optional, if True, this chunk always has a shard copied in pinned CPU memory """ + # chunk的个数 self.count_id = Chunk._total_number Chunk._total_number += 1 @@ -214,6 +215,7 @@ def can_move(self) -> bool: @property def can_release(self) -> bool: + # 如果是gathered的状态,或者chunk所包含的tensor全部都是hold状态则,不可释放 if self.keep_gathered: return False else: @@ -222,10 +224,12 @@ def can_release(self) -> bool: @property def can_reduce(self): + # chunk中所有tensor都是bwd return self.tensor_state_cnter[TensorState.READY_FOR_REDUCE] == self.num_tensors @property def has_inf_or_nan(self) -> bool: + # 判断是否有inf或者nan的值 """Check if the chunk has inf or nan values on CUDA. """ if self.is_gathered: diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md index 290879219074..93f9d074ead4 100644 --- a/docs/source/en/get_started/installation.md +++ b/docs/source/en/get_started/installation.md @@ -39,13 +39,13 @@ cd ColossalAI pip install -r requirements/requirements.txt # install colossalai -pip install . +CUDA_EXT=1 pip install . ``` -If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer): +If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer), just don't specify the `CUDA_EXT`: ```shell -CUDA_EXT=1 pip install . +pip install . 
``` diff --git a/docs/source/zh-Hans/basics/launch_colossalai.md b/docs/source/zh-Hans/basics/launch_colossalai.md index ca927de578d5..e90ec88df68e 100644 --- a/docs/source/zh-Hans/basics/launch_colossalai.md +++ b/docs/source/zh-Hans/basics/launch_colossalai.md @@ -74,17 +74,15 @@ import colossalai args = colossalai.get_default_parser().parse_args() # launch distributed environment -colossalai.launch(config=
+
+