From 9942fd5bfa5a15f7d72a6ea00704588af8772cc6 Mon Sep 17 00:00:00 2001 From: ver217 Date: Mon, 15 Nov 2021 16:43:28 +0800 Subject: [PATCH 01/10] remove redundancy func in setup (#19) (#20) --- setup.py | 98 +++++++++++++++++--------------------------------------- 1 file changed, 29 insertions(+), 69 deletions(-) diff --git a/setup.py b/setup.py index e68430a210d8..d71876bb9938 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ import os import subprocess import sys -import warnings import torch from setuptools import setup, find_packages @@ -23,13 +22,36 @@ def get_cuda_bare_metal_version(cuda_dir): return raw_output, bare_metal_major, bare_metal_minor +def check_cuda_torch_binary_vs_bare_metal(cuda_dir): + raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version( + cuda_dir) + torch_binary_major = torch.version.cuda.split(".")[0] + torch_binary_minor = torch.version.cuda.split(".")[1] + + print("\nCompiling cuda extensions with") + print(raw_output + "from " + cuda_dir + "/bin\n") + + if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): + raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " + + "not match the version used to compile Pytorch binaries. " + + "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) + + "In some cases, a minor-version mismatch will not cause later errors: " + + "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " + "You can try commenting out this check (at your own risk).") + + +def fetch_requirements(path): + with open(path, 'r') as fd: + return [r.strip() for r in fd.readlines()] + + if not torch.cuda.is_available(): # https://github.com/NVIDIA/apex/issues/486 # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(), # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command). print('\nWarning: Torch did not find available GPUs on this system.\n', 'If your intention is to cross-compile, this is not an error.\n' - 'By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n' + 'By default, Colossal-AI will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n' 'Volta (compute capability 7.0), Turing (compute capability 7.5),\n' 'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n' 'If you wish to cross-compile for a single specific architecture,\n' @@ -46,66 +68,12 @@ def get_cuda_bare_metal_version(cuda_dir): TORCH_MINOR = int(torch.__version__.split('.')[1]) if TORCH_MAJOR == 0 and TORCH_MINOR < 4: - raise RuntimeError("Apex requires Pytorch 0.4 or newer.\n" + + raise RuntimeError("Colossal-AI requires Pytorch 0.4 or newer.\n" + "The latest stable release can be obtained from https://pytorch.org/") cmdclass = {} ext_modules = [] -extras = {} -if "--pyprof" in sys.argv: - string = "\n\nPyprof has been moved to its own dedicated repository and will " + \ - "soon be removed from Apex. Please visit\n" + \ - "https://github.com/NVIDIA/PyProf\n" + \ - "for the latest version." - warnings.warn(string, DeprecationWarning) - with open('requirements.txt') as f: - required_packages = f.read().splitlines() - extras['pyprof'] = required_packages - try: - sys.argv.remove("--pyprof") - except: - pass -else: - warnings.warn( - "Option --pyprof not specified. 
Not installing PyProf dependencies!") - -if "--cuda_ext" in sys.argv: - if TORCH_MAJOR == 0: - raise RuntimeError("--cuda_ext requires Pytorch 1.0 or later, " - "found torch.__version__ = {}".format(torch.__version__)) - - -def get_cuda_bare_metal_version(cuda_dir): - raw_output = subprocess.check_output( - [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) - output = raw_output.split() - release_idx = output.index("release") + 1 - release = output[release_idx].split(".") - bare_metal_major = release[0] - bare_metal_minor = release[1][0] - - return raw_output, bare_metal_major, bare_metal_minor - - -def check_cuda_torch_binary_vs_bare_metal(cuda_dir): - raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version( - cuda_dir) - torch_binary_major = torch.version.cuda.split(".")[0] - torch_binary_minor = torch.version.cuda.split(".")[1] - - print("\nCompiling cuda extensions with") - print(raw_output + "from " + cuda_dir + "/bin\n") - - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " + - "not match the version used to compile Pytorch binaries. " + - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) + - "In some cases, a minor-version mismatch will not cause later errors: " + - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk).") - - # Set up macros for forward/backward compatibility hack around # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e # and @@ -123,6 +91,10 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir): version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 if "--cuda_ext" in sys.argv: + if TORCH_MAJOR == 0: + raise RuntimeError("--cuda_ext requires Pytorch 1.0 or later, " + "found torch.__version__ = {}".format(torch.__version__)) + sys.argv.remove("--cuda_ext") if CUDA_HOME is None: @@ -145,17 +117,6 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir): # '--resource-usage', '--use_fast_math'] + version_dependent_macros})) -# Check, if ATen/CUDAGenerator.h is found, otherwise use the new ATen/CUDAGeneratorImpl.h, due to breaking change in https://github.com/pytorch/pytorch/pull/36026 -generator_flag = [] -torch_dir = torch.__path__[0] -if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')): - generator_flag = ['-DOLD_GENERATOR'] - - -def fetch_requirements(path): - with open(path, 'r') as fd: - return [r.strip() for r in fd.readlines()] - install_requires = fetch_requirements('requirements/requirements.txt') @@ -170,6 +131,5 @@ def fetch_requirements(path): description='An integrated large-scale model training system with efficient parallelization techniques', ext_modules=ext_modules, cmdclass={'build_ext': BuildExtension} if ext_modules else {}, - extras_require=extras, install_requires=install_requires, ) From 2b05de4c645ae3ae73252ac4dace184d6afc7e09 Mon Sep 17 00:00:00 2001 From: ver217 Date: Mon, 15 Nov 2021 16:53:56 +0800 Subject: [PATCH 02/10] use env to control the language of doc (#24) (#25) --- docs/conf.py | 2 ++ docs/index.rst | 26 +++++++++++++------------- docs/{index_en.rst => index_zh.rst} | 26 +++++++++++++------------- 3 files changed, 28 insertions(+), 26 deletions(-) rename docs/{index_en.rst => index_zh.rst} (62%) diff --git a/docs/conf.py b/docs/conf.py index b0a57bdbc08b..695477e35fbe 100644 --- a/docs/conf.py +++ 
b/docs/conf.py @@ -24,6 +24,8 @@ # The full version, including alpha/beta/rc tags release = '0.0.1' +if 'SPHINX_LANG' in os.environ: + root_doc = f'index_{os.environ["SPHINX_LANG"]}' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be diff --git a/docs/index.rst b/docs/index.rst index f9a6ce444a79..16141b5ead8e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,27 +3,27 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -夸父AI系统(Colossal-AI)开发文档 +Colossal-AI documentation ====================================== .. toctree:: :maxdepth: 1 - :caption: 快速上手指南 + :caption: GETTING STARTED - installation_zh.md - run_demo_zh.md + installation.md + run_demo.md .. toctree:: :maxdepth: 1 - :caption: 个性化您的训练 - - parallelization_zh.md - model_zh.md - trainer_engine_zh.md - amp_zh.md - zero_zh.md - add_your_parallel_zh.md - config_zh.md + :caption: CUSTOMIZE YOUR TRAINING + + parallelization.md + model.md + trainer_engine.md + amp.md + zero.md + add_your_parallel.md + config.md diff --git a/docs/index_en.rst b/docs/index_zh.rst similarity index 62% rename from docs/index_en.rst rename to docs/index_zh.rst index 16141b5ead8e..f9a6ce444a79 100644 --- a/docs/index_en.rst +++ b/docs/index_zh.rst @@ -3,27 +3,27 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Colossal-AI documentation +夸父AI系统(Colossal-AI)开发文档 ====================================== .. toctree:: :maxdepth: 1 - :caption: GETTING STARTED + :caption: 快速上手指南 - installation.md - run_demo.md + installation_zh.md + run_demo_zh.md .. toctree:: :maxdepth: 1 - :caption: CUSTOMIZE YOUR TRAINING - - parallelization.md - model.md - trainer_engine.md - amp.md - zero.md - add_your_parallel.md - config.md + :caption: 个性化您的训练 + + parallelization_zh.md + model_zh.md + trainer_engine_zh.md + amp_zh.md + zero_zh.md + add_your_parallel_zh.md + config_zh.md From 3defa32aee5c8cac42b0625df258254d11cfaad7 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 18 Nov 2021 19:45:06 +0800 Subject: [PATCH 03/10] Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA Co-authored-by: 1SAA Co-authored-by: ver217 --- README.md | 14 +- colossalai/builder/__init__.py | 10 +- colossalai/builder/builder.py | 33 +- colossalai/engine/__init__.py | 2 +- colossalai/engine/_base_engine.py | 216 +++--- colossalai/engine/amp/__init__.py | 2 + colossalai/engine/{ => amp}/amp_type.py | 0 colossalai/engine/amp/grad_scaler.py | 577 ++++++++++++++ colossalai/engine/schedule/_base_schedule.py | 146 ++-- colossalai/engine/schedule/_no_pipeline.py | 133 ++-- colossalai/engine/schedule/_pipeline.py | 91 +-- colossalai/engine/schedule/_utils.py | 11 + colossalai/initialize.py | 56 +- colossalai/nn/layer/parallel_2d/_operation.py | 156 ++-- colossalai/nn/lr_scheduler/__init__.py | 2 +- colossalai/nn/lr_scheduler/cosine.py | 7 +- colossalai/nn/lr_scheduler/delayed.py | 15 +- colossalai/nn/lr_scheduler/linear.py | 15 - colossalai/nn/lr_scheduler/multistep.py | 16 +- colossalai/nn/lr_scheduler/torch.py | 42 +- colossalai/nn/optimizer/_utils.py | 2 +- .../zero_redundancy_optimizer_level_2.py | 15 +- .../zero_redundancy_optimizer_level_3.py | 2 +- colossalai/registry/__init__.py | 1 + colossalai/trainer/__init__.py | 4 +- colossalai/trainer/_trainer.py | 350 +++++---- colossalai/trainer/hooks/__init__.py | 2 + colossalai/trainer/hooks/_checkpoint_hook.py | 70 +- colossalai/trainer/hooks/_log_hook.py | 84 ++- .../trainer/hooks/_lr_scheduler_hook.py | 58 ++ colossalai/trainer/hooks/_metric_hook.py | 64 +- colossalai/trainer/metric.py | 27 + colossalai/{ => utils}/checkpointing.py | 6 +- colossalai/utils/common.py | 2 +- configs/resnet/resnet50.py | 3 +- configs/sample_config.py | 7 +- configs/vit/vit_2d.py | 23 +- configs/vit/vit_3d.py | 16 +- .../colossalai.engine.amp.amp_type.rst | 5 + .../colossalai.engine.amp.grad_scaler.rst | 5 + docs/colossalai/colossalai.engine.amp.rst | 12 + .../colossalai/colossalai.engine.amp_type.rst | 5 - docs/colossalai/colossalai.engine.rst | 7 +- docs/colossalai/colossalai.rst | 1 - .../colossalai.utils.checkpointing.rst | 5 + docs/colossalai/colossalai.utils.rst | 1 + docs/parallelization.md | 55 +- docs/run_demo.md | 30 +- docs/run_demo_zh.md | 28 +- docs/trainer_engine.md | 69 +- docs/trainer_engine_zh.md | 19 +- examples/colossal_cifar_demo.ipynb | 704 +++++++++--------- examples/run_trainer.py | 17 +- requirements/requirements.txt | 2 +- setup.py | 2 +- .../configs/vit_2d.py | 37 +- .../configs/vit_2p5d.py | 17 +- .../test_vit_2d/test_vit_2d.py | 36 +- .../test_vit_2p5d/test_vit_2p5d.py | 41 +- .../configs/non_pipeline_resnet.py | 2 - .../configs/non_pipeline_resnet_apex_amp.py | 3 - .../configs/non_pipeline_resnet_torch_amp.py | 3 - .../configs/pipeline_vanilla_resnet.py | 10 +- .../test_engine_apex_amp.py | 12 +- .../test_engine_no_amp.py | 12 +- .../test_engine_torch_amp.py | 13 +- .../test_pipeline/test_schedule.py | 23 +- .../test_pipeline_engine/test_engine.py | 13 +- tests/test_fp16_optimizer/configs/vit_2d.py | 7 +- .../test_vit_2d/test_vit_2d.py | 39 +- .../test_vision_transformer/configs/vit_2d.py | 4 +- .../configs/vit_2p5d.py | 11 +- .../test_vision_transformer/configs/vit_3d.py | 19 +- .../test_vit_2d/test_vit_2d.py | 39 +- .../test_vit_2p5d/test_vit_2p5d.py | 42 +- .../test_vit_3d/test_vit_3d.py | 27 +- .../configs/test_trainer_resnet.py | 21 +- .../configs/test_trainer_vit_2d.py | 26 +- tests/test_trainer/test_trainer.py | 14 +- .../test_vit_2d/test_vit_2d.py | 40 +- 80 files changed, 2184 insertions(+), 1574 deletions(-) create mode 100644 
colossalai/engine/amp/__init__.py rename colossalai/engine/{ => amp}/amp_type.py (100%) create mode 100644 colossalai/engine/amp/grad_scaler.py create mode 100644 colossalai/trainer/hooks/_lr_scheduler_hook.py rename colossalai/{ => utils}/checkpointing.py (98%) create mode 100644 docs/colossalai/colossalai.engine.amp.amp_type.rst create mode 100644 docs/colossalai/colossalai.engine.amp.grad_scaler.rst create mode 100644 docs/colossalai/colossalai.engine.amp.rst delete mode 100644 docs/colossalai/colossalai.engine.amp_type.rst create mode 100644 docs/colossalai/colossalai.utils.checkpointing.rst diff --git a/README.md b/README.md index 6e6c8de8144a..f5f16a725181 100644 --- a/README.md +++ b/README.md @@ -42,26 +42,18 @@ pip install -v --no-cache-dir --global-option="--cuda_ext" . ```python import colossalai -from colossalai.engine import Engine from colossalai.trainer import Trainer from colossalai.core import global_context as gpc -model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize() -engine = Engine( - model=model, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule -) +engine, train_dataloader, test_dataloader = colossalai.initialize() trainer = Trainer(engine=engine, - hooks_cfg=gpc.config.hooks, verbose=True) trainer.fit( train_dataloader=train_dataloader, test_dataloader=test_dataloader, - max_epochs=gpc.config.num_epochs, + epochs=gpc.config.num_epochs, + hooks_cfg=gpc.config.hooks, display_progress=True, test_interval=5 ) diff --git a/colossalai/builder/__init__.py b/colossalai/builder/__init__.py index 17d643285956..2ae19413269a 100644 --- a/colossalai/builder/__init__.py +++ b/colossalai/builder/__init__.py @@ -1,2 +1,10 @@ -from .builder import * +from .builder import (build_schedule, build_lr_scheduler, build_model, build_optimizer, build_optimizer_wrapper, + build_layer, build_loss, build_hooks, build_dataset, build_transform, build_data_sampler, + build_gradient_handler) from .pipeline import ModelInitializer + +__all__ = [ + 'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer', 'build_optimizer_wrapper', + 'build_layer', 'build_loss', 'build_hooks', 'build_dataset', 'build_transform', 'build_data_sampler', + 'build_gradient_handler', 'ModelInitializer' +] diff --git a/colossalai/builder/builder.py b/colossalai/builder/builder.py index f88dc1cbff74..c32ad3b39927 100644 --- a/colossalai/builder/builder.py +++ b/colossalai/builder/builder.py @@ -181,18 +181,6 @@ def build_transform(config): return build_from_registry(config, TRANSFORMS) -def build_pipe_alloc_policy(config): - """Returns a pipeline allocation policy object constructed from `config`. - - :param config: A python dict or a :class:`colossalai.context.Config` object - containing information used in the construction of the return object - :type config: dict or :class:`colossalai.context.Config` - :return: A pipeline allocation policy object - :rtype: - """ - return build_from_registry(config, PIPE_ALLOC_POLICY) - - def build_data_sampler(config, dataset): """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler` constructed from `config`. 
@@ -235,7 +223,7 @@ def build_optimizer_wrapper(config, optimizer, model=None): return OPTIMIZER_WRAPPERS.get_module(mod_type)(optimizer, **config_) -def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch): +def build_lr_scheduler(config, optimizer): """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler` constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`. @@ -254,9 +242,16 @@ def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch): """ config_ = config.copy() mod_type = config_.pop('type') - # warmup epochs will overwrite warmup steps - if 'warmup_epochs' in config_: - warmup_epochs = config_.pop('warmup_epochs') - config_['warmup_steps'] = int(num_steps_per_epoch * warmup_epochs) - return LR_SCHEDULERS.get_module(mod_type)(optimizer, total_steps, num_steps_per_epoch=num_steps_per_epoch, - **config_) + return LR_SCHEDULERS.get_module(mod_type)(optimizer, **config_) + + +def build_schedule(config): + """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`. + + :param config: A python dict or a :class:`colossalai.context.Config` object + containing information used in the construction of the return object + :type config: dict or :class:`colossalai.context.Config` + :return: An object of :class:`colossalai.engine.schedule.BaseSchedule` + :rtype: :class:`colossalai.engine.schedule.BaseSchedule` + """ + return build_from_registry(config, SCHEDULE) diff --git a/colossalai/engine/__init__.py b/colossalai/engine/__init__.py index c00be7df6e7b..7e55922363d8 100644 --- a/colossalai/engine/__init__.py +++ b/colossalai/engine/__init__.py @@ -1,7 +1,7 @@ -from .amp_type import AMP_TYPE from ._base_engine import Engine from .gradient_handler import * from .schedule import * +from .amp import * __all__ = ['Engine'] diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py index 843ef1d4f046..a99aa91e73c3 100644 --- a/colossalai/engine/_base_engine.py +++ b/colossalai/engine/_base_engine.py @@ -1,7 +1,9 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional +from torch.nn import Module +from torch.nn.modules.loss import _Loss +from torch.optim import Optimizer from colossalai.builder import build_gradient_handler from colossalai.context import ParallelMode @@ -9,162 +11,166 @@ from colossalai.logging import get_global_dist_logger from colossalai.nn import (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3) -from torch.nn import Module -from torch.nn.modules.loss import _Loss -from torch.optim import Optimizer -from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data import DataLoader - -from .schedule import BaseSchedule, NoPipelineSchedule +from .schedule import BaseSchedule class Engine: """Basic engine class for training and evaluation. It runs a specific process method :meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset. + It controls a iteration in training. 
- :param train_dataloader: Dataloader in training - :param test_dataloader: Dataloader in evaluation :param model: The neural network model - :param criterion: Criterion for calculating loss :param optimizer: Optimizer for updating the parameters - :param lr_scheduler: Learning rate scheduler ajusting learning rate during the training or evaluation - :param schedule: Running schedule in :meth:`step` - :type train_dataloader: DataLoader, optional - :type test_dataloader: DataLoader, optional + :param step_schedule: Running schedule in :meth:`step` + :param gradient_accumulation: Steps of gradient accumulation + :param gradient_clipping: The norm of gradient clipping :type model: Module - :type criterion: _Loss, optional - :type optimizer: Optimizer, optional - :type lr_scheduler: _LRScheduler, optional - :type schedule: BaseSchedule, optional + :type optimizer: Optimizer + :type step_schedule: BaseSchedule, optional + :type gradient_accumulation: int, optional + :type gradient_clipping: float, optional """ + def __init__(self, - train_dataloader: Optional[DataLoader] = None, - test_dataloader: Optional[DataLoader] = None, - model: Module = None, - criterion: _Loss = None, - optimizer: Optimizer = None, - lr_scheduler: Optional[_LRScheduler] = None, - schedule: BaseSchedule = None): - self.train_dataloader = train_dataloader - self.test_dataloader = test_dataloader - assert model is not None, "Engine requires a model" - self.model = model - self.criterion = criterion - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - self.schedule = schedule if schedule is not None \ - else NoPipelineSchedule() + model: Module, + optimizer: Optimizer, + criterion: _Loss, + step_schedule: BaseSchedule, + gradient_handlers: list = None, + gradient_accumulation: int = 1, + gradient_clipping: float = 0.0, + ): + self._model = model + self._optimizer = optimizer + self._criterion = criterion + self._schedule = step_schedule + + # schedule initialize + self._schedule.initialize(model, optimizer) + + # state + self.training = True # default + + # gradient accumulation + assert gradient_accumulation > 0, 'gradient accumulation size must be larger than 0' + self._grad_accum_size = gradient_accumulation + self._grad_clip = gradient_clipping self._logger = get_global_dist_logger() # build gradient handler self._gradient_handlers = [] - gradient_handler_cfg = [] - if hasattr(gpc.config, 'gradient_handler'): - assert isinstance(gpc.config.gradient_handler, list), \ + if gradient_handlers is not None: + assert isinstance(gradient_handlers, list), \ f'argument gradient_handler_cfg expected type list, ' \ - f'but got type {type(gpc.config.gradient_handler)}' - gradient_handler_cfg = gpc.config.gradient_handler - elif isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3)): - gradient_handler_cfg = [dict(type='ZeROGradientHandler')] + f'but got type {type(gradient_handlers)}' + elif isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, + ZeroRedundancyOptimizer_Level_3)): + gradient_handlers = [dict(type='ZeROGradientHandler')] self._logger.info( "Training with zero is detected, ZeROGradientHandler is automatically " "added even though not specified in the configuration", ranks=[0]) elif gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size( ParallelMode.DATA) > 1: - gradient_handler_cfg = [dict(type='DataParallelGradientHandler')] + gradient_handlers = [dict(type='DataParallelGradientHandler')] self._logger.info( "Data parallel training is detected, 
DataParallelGradientHandler is automatically " "added even though not specified in the configuration", ranks=[0]) - if len(gradient_handler_cfg) == 0: + + if gradient_handlers is None: self._logger.warning( "No gradient handler is set up, please make sure you do not need " "to all-reduce the gradients after a training step.", ranks=[0]) - for cfg in gradient_handler_cfg: - handler = build_gradient_handler(cfg, self.model, self.optimizer) - self._gradient_handlers.append(handler) + else: + for cfg in gradient_handlers: + handler = build_gradient_handler(cfg, model, optimizer) + self._gradient_handlers.append(handler) - self.schedule.initialize(self.train_dataloader, self.model, - self.criterion, self.optimizer, - self.lr_scheduler) - self.forward_only = False + @property + def model(self): + return self._model - def handle_gradient(self): - """Handles all-reduce operations of gradients across different parallel groups. - """ - for handler in self._gradient_handlers: - handler.handle_gradient() + @property + def optimizer(self): + return self._optimizer - def set_dataloader(self, data: DataLoader, train: bool = True): - """Sets dataloader in training or evaluation. + @property + def criterion(self): + return self._criterion - :param data: Dataloader to be set - :param train: Set training dataloader if True, otherwise evaluation dataloader - :type data: DataLoader - :type train: bool - """ - if train: - self.train_dataloader = data - else: - self.test_dataloader = data + @property + def schedule(self): + return self._schedule - def get_model(self): - """Returns the neural network model in the engine. - """ - return self.model - def get_optimizer(self): - """Returns optimizier in the engine. - """ - return self.optimizer + @property + def gradient_accumulation(self): + return self._grad_accum_size - def get_lr_scheduler(self): - """Returns the learning rate scheduler in the engine. + def handle_gradient(self): + """Handles all-reduce operations of gradients across different parallel groups. """ - return self.lr_scheduler + for handler in self._gradient_handlers: + handler.handle_gradient() def train(self): """Sets the model to training mode. """ - self.forward_only = False - self.schedule.train(dataloader=self.train_dataloader, mode=True) + self.training = True + self._model.train() def eval(self): """Sets the model to evaluation mode. """ - self.forward_only = True - self.schedule.train(dataloader=self.test_dataloader, mode=False) + self.training = False + self._model.eval() - def is_train(self): - """Returns True if it is in training, otherwise False. - """ - return not self.forward_only - - def get_lr(self): - """Gets current learning rate. - """ - return self.schedule.get_lr() - - def step(self, return_loss=True): + def step(self, + data_iter, + is_last_iteration: bool = False, + return_loss=True): """A running step based on the schedule. Usually, it runs a training or evaluation over a batch of dataset. 
+ :param data_iter: Data iterator of the dataset + :param is_last_iteration: If True, this iteration is the last iteration in the epoch :param return_loss: loss will be returned if True - :type return_loss: bool + :type data_iter: Iterator + :type is_last_iteration: bool, optional + :type return_loss: bool, optional :return: (output, lablel, loss) """ - self.schedule.zero_grad(forward_only=self.forward_only) - - output, label, loss = self.schedule.forward_backward_step( - forward_only=self.forward_only, return_loss=return_loss) - - if not self.forward_only: - # all reduce gradients - self.handle_gradient() - - self.schedule.step() + if self.training: + self._optimizer.zero_grad() + + # differentiate training and eval with grad accum + if self.training: + for i in range(self._grad_accum_size): + output, label, loss = self._schedule.forward_backward_step( + data_iter, self._model, self._criterion, self._optimizer, + forward_only=False, + grad_accum_size=self._grad_accum_size, + return_loss=return_loss) + + if i == self._grad_accum_size - 1: + # all reduce gradients + self.handle_gradient() + self._schedule.optimizer_step(self._model, self._optimizer, self._grad_clip) + else: + output, label, loss = self._schedule.forward_backward_step( + data_iter, self._model, self._criterion, self._optimizer, + forward_only=True, + grad_accum_size=1, + return_loss=return_loss) + + # consume the remaining dataset left out due to gradient accumulation + if is_last_iteration: + while True: + try: + _ = next(data_iter) + except StopIteration: + break return output, label, loss diff --git a/colossalai/engine/amp/__init__.py b/colossalai/engine/amp/__init__.py new file mode 100644 index 000000000000..927d5cf09d1a --- /dev/null +++ b/colossalai/engine/amp/__init__.py @@ -0,0 +1,2 @@ +from .grad_scaler import GradScaler +from .amp_type import AMP_TYPE diff --git a/colossalai/engine/amp_type.py b/colossalai/engine/amp/amp_type.py similarity index 100% rename from colossalai/engine/amp_type.py rename to colossalai/engine/amp/amp_type.py diff --git a/colossalai/engine/amp/grad_scaler.py b/colossalai/engine/amp/grad_scaler.py new file mode 100644 index 000000000000..7859d132db17 --- /dev/null +++ b/colossalai/engine/amp/grad_scaler.py @@ -0,0 +1,577 @@ +# modified from https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.p +import torch +from collections import defaultdict, abc +import warnings +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple +from colossalai.context import ParallelMode +import torch.distributed as dist +from colossalai.core import global_context as gpc + + +class _MultiDeviceReplicator(object): + """ + Lazily serves copies of a tensor to requested devices. Copies are cached per-device. + """ + + def __init__(self, master_tensor: torch.Tensor) -> None: + assert master_tensor.is_cuda or master_tensor.device.type == 'xla' + self.master = master_tensor + self._per_device_tensors: Dict[torch.device, torch.Tensor] = {} + + def get(self, device) -> torch.Tensor: + retval = self._per_device_tensors.get(device, None) + if retval is None: + retval = self.master.to( + device=device, non_blocking=True, copy=True) + self._per_device_tensors[device] = retval + return retval + + +# Defines default_factory for GradScaler's _per_optimizer_states defaultdict, +# as well as associated "enum" values. Prefers defining these at top level because +# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory. 
+# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler +# causes a circular reference, which we'd rather avoid. +class OptState(Enum): + READY = 0 + UNSCALED = 1 + STEPPED = 2 + + +def _refresh_per_optimizer_state(): + return {"stage": OptState.READY, "found_inf_per_device": {}} + + +class GradScaler(object): + _scale: Optional[torch.Tensor] + _grows_tracker: Optional[torch.Tensor] + _per_optimizer_states: Dict[int, Dict[str, Any]] + """ + An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling + conveniently. + + * ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor. + * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``. + * ``scaler.update()`` updates ``scaler``'s scale factor. + + Example:: + + # Creates a GradScaler once at the beginning of training. + scaler = GradScaler() + + for epoch in epochs: + for input, target in data: + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + + # Scales loss. Calls backward() on scaled loss to create scaled gradients. + scaler.scale(loss).backward() + + # scaler.step() first unscales gradients of the optimizer's params. + # If gradients don't contain infs/NaNs, optimizer.step() is then called, + # otherwise, optimizer.step() is skipped. + scaler.step(optimizer) + + # Updates the scale for next iteration. + scaler.update() + + See the :ref:`Automatic Mixed Precision examples` for usage + (along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty, + and multiple losses/optimizers. + + ``scaler`` dynamically estimates the scale factor each iteration. To minimize gradient underflow, + a large scale factor should be used. However, ``float16`` values can "overflow" (become inf or NaN) if + the scale factor is too large. Therefore, the optimal scale factor is the largest factor that can be used + without incurring inf or NaN gradient values. + ``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every + ``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`). + + * If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params + themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``. + + * If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual. + If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by + ``growth_factor``. + + The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its + value calibrates. ``scaler.step`` will skip the underlying ``optimizer.step()`` for these + iterations. After that, step skipping should occur rarely (once every few hundred or thousand iterations). + + Args: + init_scale (float, optional, default=2.**16): Initial scale factor. + growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during + :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations. + backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during + :meth:`update` if inf/NaN gradients occur in an iteration. 
+ growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients + that must occur for the scale to be multiplied by ``growth_factor``. + enabled (bool, optional, default=True): If ``False``, disables gradient scaling. :meth:`step` simply + invokes the underlying ``optimizer.step()``, and other methods become no-ops. + """ + + def __init__(self, + init_scale=2.**16, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=2000, + enabled=True): + if enabled and not torch.cuda.is_available(): + warnings.warn( + "torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.") + self._enabled = False + else: + self._enabled = enabled + + if self._enabled: + assert growth_factor > 1.0, "The growth factor must be > 1.0." + assert backoff_factor < 1.0, "The backoff factor must be < 1.0." + + self._init_scale = init_scale + # self._scale will be lazily initialized during the first call to scale() + self._scale = None + self._growth_factor = growth_factor + self._backoff_factor = backoff_factor + self._growth_interval = growth_interval + self._init_growth_tracker = 0 + # self._growth_tracker will be lazily initialized during the first call to scale() + self._growth_tracker = None + self._per_optimizer_states = defaultdict( + _refresh_per_optimizer_state) + + def _check_scale_growth_tracker(self, funcname) -> Tuple[torch.Tensor, torch.Tensor]: + fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration." + assert self._scale is not None, "Attempted {} but _scale is None. ".format( + funcname) + fix + assert self._growth_tracker is not None, "Attempted {} but _growth_tracker is None. ".format( + funcname) + fix + return (self._scale, self._growth_tracker) + + def _lazy_init_scale_growth_tracker(self, dev): + assert self._growth_tracker is None, "_growth_tracker initialized before _scale" + self._scale = torch.full( + (1,), self._init_scale, dtype=torch.float32, device=dev) + self._growth_tracker = torch.full( + (1,), self._init_growth_tracker, dtype=torch.int32, device=dev) + + def scale(self, outputs): + """ + Multiplies ('scales') a tensor or list of tensors by the scale factor. + + Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned + unmodified. + + Args: + outputs (Tensor or iterable of Tensors): Outputs to scale. + """ + if not self._enabled: + return outputs + + # Short-circuit for the common case. + if isinstance(outputs, torch.Tensor): + assert outputs.is_cuda or outputs.device.type == 'xla' + if self._scale is None: + self._lazy_init_scale_growth_tracker(outputs.device) + assert self._scale is not None + return outputs * self._scale.to(device=outputs.device, non_blocking=True) + + # Invoke the more complex machinery only if we're treating multiple outputs. 
+ # holds a reference that can be overwritten by apply_scale + stash: List[_MultiDeviceReplicator] = [] + + def apply_scale(val): + if isinstance(val, torch.Tensor): + assert val.is_cuda or val.device.type == 'xla' + if len(stash) == 0: + if self._scale is None: + self._lazy_init_scale_growth_tracker(val.device) + assert self._scale is not None + stash.append(_MultiDeviceReplicator(self._scale)) + return val * stash[0].get(val.device) + elif isinstance(val, abc.Iterable): + iterable = map(apply_scale, val) + if isinstance(val, list) or isinstance(val, tuple): + return type(val)(iterable) + else: + return iterable + else: + raise ValueError( + "outputs must be a Tensor or an iterable of Tensors") + + return apply_scale(outputs) + + def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): + per_device_inv_scale = _MultiDeviceReplicator(inv_scale) + per_device_found_inf = _MultiDeviceReplicator(found_inf) + + # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. + # There could be hundreds of grads, so we'd like to iterate through them just once. + # However, we don't know their devices or dtypes in advance. + + # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict + # Google says mypy struggles with defaultdicts type annotations. + per_device_and_dtype_grads = defaultdict( + lambda: defaultdict(list)) # type: ignore[var-annotated] + with torch.no_grad(): + for group in optimizer.param_groups: + for param in group["params"]: + if param.grad is None: + continue + if (not allow_fp16) and param.grad.dtype == torch.float16: + raise ValueError( + "Attempting to unscale FP16 gradients.") + if param.grad.is_sparse: + # is_coalesced() == False means the sparse grad has values with duplicate indices. + # coalesce() deduplicates indices and adds all values that have the same index. + # For scaled fp16 values, there's a good chance coalescing will cause overflow, + # so we should check the coalesced _values(). + if param.grad.dtype is torch.float16: + param.grad = param.grad.coalesce() + to_unscale = param.grad._values() + else: + to_unscale = param.grad + + # TODO: is there a way to split by device and dtype without appending in the inner loop? + per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append( + to_unscale) + + for device, per_dtype_grads in per_device_and_dtype_grads.items(): + for grads in per_dtype_grads.values(): + torch._amp_foreach_non_finite_check_and_unscale_(grads, + per_device_found_inf.get( + device), + per_device_inv_scale.get(device)) + # For tensor parallel paramters it should be all-reduced over tensor parallel process group + if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(ParallelMode.TENSOR) > 1: + for tensor in per_device_found_inf._per_device_tensors.values(): + dist.all_reduce(tensor, op=dist.ReduceOp.MAX, + group=gpc.get_group(ParallelMode.TENSOR)) + return per_device_found_inf._per_device_tensors + + def unscale_(self, optimizer): + """ + Divides ("unscales") the optimizer's gradient tensors by the scale factor. + + :meth:`unscale_` is optional, serving cases where you need to + :ref:`modify or inspect gradients` + between the backward pass(es) and :meth:`step`. + If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. + + Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: + + ... 
+ scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + scaler.step(optimizer) + scaler.update() + + Args: + optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. + + .. note:: + :meth:`unscale_` does not incur a CPU-GPU sync. + + .. warning:: + :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, + and only after all gradients for that optimizer's assigned parameters have been accumulated. + Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. + + .. warning:: + :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. + """ + if not self._enabled: + return + + self._check_scale_growth_tracker("unscale_") + + optimizer_state = self._per_optimizer_states[id(optimizer)] + + if optimizer_state["stage"] is OptState.UNSCALED: + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update().") + elif optimizer_state["stage"] is OptState.STEPPED: + raise RuntimeError("unscale_() is being called after step().") + + # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. + assert self._scale is not None + inv_scale = self._scale.double().reciprocal().float() + found_inf = torch.full( + (1,), 0.0, dtype=torch.float32, device=self._scale.device) + + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + optimizer, inv_scale, found_inf, False) + optimizer_state["stage"] = OptState.UNSCALED + + def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs): + retval = None + if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()): + retval = optimizer.step(*args, **kwargs) + return retval + + def step(self, optimizer, *args, **kwargs): + """ + :meth:`step` carries out the following two operations: + + 1. Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer`` + earlier in the iteration). As part of the :meth:`unscale_`, gradients are checked for infs/NaNs. + 2. If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled + gradients. Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params. + + ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``. + + Returns the return value of ``optimizer.step(*args, **kwargs)``. + + Args: + optimizer (torch.optim.Optimizer): Optimizer that applies the gradients. + args: Any arguments. + kwargs: Any keyword arguments. + + .. warning:: + Closure use is not currently supported. + """ + if (not self._enabled): + return optimizer.step(*args, **kwargs) + + if "closure" in kwargs: + raise RuntimeError( + "Closure use is not currently supported if GradScaler is enabled.") + + self._check_scale_growth_tracker("step") + + optimizer_state = self._per_optimizer_states[id(optimizer)] + + if optimizer_state["stage"] is OptState.STEPPED: + raise RuntimeError( + "step() has already been called since the last update().") + + retval = None + + if (hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling): + # This optimizer has customized scale-handling logic, so we can call optimizer.step() directly. + # The contract with custom optimizers is that their step() should accept an additional, + # optional grad_scaler kwarg. 
We append self to the kwargs so the custom optimizer has full information: + # it can query its own state, invoke unscale_ on itself, etc + retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self)) + optimizer_state["stage"] = OptState.STEPPED + return retval + + if optimizer_state["stage"] is OptState.READY: + self.unscale_(optimizer) + + assert len(optimizer_state["found_inf_per_device"] + ) > 0, "No inf checks were recorded for this optimizer." + + retval = self._maybe_opt_step( + optimizer, optimizer_state, *args, **kwargs) + + optimizer_state["stage"] = OptState.STEPPED + + return retval + + def update(self, new_scale=None): + """ + Updates the scale factor. + + If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` + to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, + the scale is multiplied by ``growth_factor`` to increase it. + + Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not + used directly, it's used to fill GradScaler's internal scale tensor. So if + ``new_scale`` was a tensor, later in-place changes to that tensor will not further + affect the scale GradScaler uses internally.) + + Args: + new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None): New scale factor. + + .. warning:: + :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has + been invoked for all optimizers used this iteration. + """ + if not self._enabled: + return + + _scale, _growth_tracker = self._check_scale_growth_tracker("update") + + if new_scale is not None: + # Accept a new user-defined scale. + if isinstance(new_scale, float): + self._scale.fill_(new_scale) # type: ignore[union-attr] + else: + reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor with requires_grad=False." + # type: ignore[attr-defined] + assert isinstance(new_scale, torch.cuda.FloatTensor), reason + assert new_scale.numel() == 1, reason + assert new_scale.requires_grad is False, reason + self._scale.copy_(new_scale) # type: ignore[union-attr] + else: + # Consume shared inf/nan data collected from optimizers to update the scale. + # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. + found_infs = [found_inf.to(device=_scale.device, non_blocking=True) + for state in self._per_optimizer_states.values() + for found_inf in state["found_inf_per_device"].values()] + + assert len( + found_infs) > 0, "No inf checks were recorded prior to update." + + found_inf_combined = found_infs[0] + if len(found_infs) > 1: + for i in range(1, len(found_infs)): + found_inf_combined += found_infs[i] + + torch._amp_update_scale_(_scale, + _growth_tracker, + found_inf_combined, + self._growth_factor, + self._backoff_factor, + self._growth_interval) + + # To prepare for next iteration, clear the data collected from optimizers this iteration. + self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) + + def _get_scale_async(self): + return self._scale + + def get_scale(self): + """ + Returns a Python float containing the current scale, or 1.0 if scaling is disabled. + + .. warning:: + :meth:`get_scale` incurs a CPU-GPU sync. + """ + if self._enabled: + return self._init_scale if self._scale is None else self._get_scale_async().item() + else: + return 1.0 + + def get_growth_factor(self): + r""" + Returns a Python float containing the scale growth factor. 
+ """ + return self._growth_factor + + def set_growth_factor(self, new_factor): + r""" + Args: + new_scale (float): Value to use as the new scale growth factor. + """ + self._growth_factor = new_factor + + def get_backoff_factor(self): + r""" + Returns a Python float containing the scale backoff factor. + """ + return self._backoff_factor + + def set_backoff_factor(self, new_factor): + r""" + Args: + new_scale (float): Value to use as the new scale backoff factor. + """ + self._backoff_factor = new_factor + + def get_growth_interval(self): + r""" + Returns a Python int containing the growth interval. + """ + return self._growth_interval + + def set_growth_interval(self, new_interval): + r""" + Args: + new_interval (int): Value to use as the new growth interval. + """ + self._growth_interval = new_interval + + def _get_growth_tracker(self): + if self._enabled: + return self._init_growth_tracker if self._growth_tracker is None else self._growth_tracker.item() + else: + return 0 + + def is_enabled(self): + r""" + Returns a bool indicating whether this instance is enabled. + """ + return self._enabled + + def state_dict(self): + r""" + Returns the state of the scaler as a :class:`dict`. It contains five entries: + + * ``"scale"`` - a Python float containing the current scale + * ``"growth_factor"`` - a Python float containing the current growth factor + * ``"backoff_factor"`` - a Python float containing the current backoff factor + * ``"growth_interval"`` - a Python int containing the current growth interval + * ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps. + + If this instance is not enabled, returns an empty dict. + + .. note:: + If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict` + should be called after :meth:`update`. + """ + return {"scale": self.get_scale(), + "growth_factor": self._growth_factor, + "backoff_factor": self._backoff_factor, + "growth_interval": self._growth_interval, + "_growth_tracker": self._get_growth_tracker()} if self._enabled else {} + + def load_state_dict(self, state_dict): + r""" + Loads the scaler state. If this instance is disabled, :meth:`load_state_dict` is a no-op. + + Args: + state_dict(dict): scaler state. Should be an object returned from a call to :meth:`state_dict`. + """ + if not self._enabled: + return + + if len(state_dict) == 0: + raise RuntimeError("The source state dict is empty, possibly because it was saved " + "from a disabled instance of GradScaler.") + + self._init_scale = state_dict["scale"] + if self._scale is not None: + self._scale.fill_(state_dict["scale"]) + self._growth_factor = state_dict["growth_factor"] + self._backoff_factor = state_dict["backoff_factor"] + self._growth_interval = state_dict["growth_interval"] + self._init_growth_tracker = state_dict["_growth_tracker"] + if self._growth_tracker is not None: + self._growth_tracker.fill_(state_dict["_growth_tracker"]) + + def __getstate__(self): + state = self.__dict__.copy() + if self._enabled: + assert len(self._per_optimizer_states) == 0, "A GradScaler instance may only be pickled at the beginning "\ + "of an iteration, or at the end after scaler.update()." + # Pickling _scale and _growth_tracker Tensors directly triggers + # "warnings.warn("pickle support for Storage will be removed in 1.5..." + # so instead, we set the unpickled instance up to reinitialize them lazily. 
+        state['_init_scale'] = self.get_scale()
+        state['_init_growth_tracker'] = self._get_growth_tracker()
+        state['_scale'] = None
+        state['_growth_tracker'] = None
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+    def _check_inf_per_device(self, optimizer):
+        _scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")
+
+        dummy_inv_scale = torch.full(
+            (1,), 1.0, dtype=torch.float32, device=_scale.device)
+        found_inf = torch.full(
+            (1,), 0.0, dtype=torch.float32, device=_scale.device)
+
+        self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = \
+            self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
+
+        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
+
+    def _found_inf_per_device(self, optimizer):
+        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
diff --git a/colossalai/engine/schedule/_base_schedule.py b/colossalai/engine/schedule/_base_schedule.py
index c64031c09409..0583ccbf3d14 100644
--- a/colossalai/engine/schedule/_base_schedule.py
+++ b/colossalai/engine/schedule/_base_schedule.py
@@ -5,125 +5,85 @@
 
 import torch
 
+from colossalai.core import global_context as gpc
 from colossalai.logging import get_global_dist_logger
 from colossalai.utils import get_current_device
 
 
 class BaseSchedule(ABC):
     """A basic helper class to control the process of training or evaluation.
+    It mainly consists of forward_backward_step, which computes gradients in
+    the backward pass, and optimizer_step, which updates the parameters.
+    For convenience in enabling FP16, we aggregate all code that controls FP16
+    in the schedule classes.
     """
+
     def __init__(self):
-        self.initialized = False
         self.logger = get_global_dist_logger()
 
-    @property
-    @abstractmethod
-    def num_steps(self):
-        """The number of batches in training or evaluation.
-        """
-        pass
+    @staticmethod
+    def _move_tensor(element):
+        if torch.is_tensor(element):
+            if not element.is_cuda:
+                return element.to(get_current_device()).detach()
+        return element
 
-    def initialize(self,
-                   dataloader=None,
-                   model=None,
-                   criterion=None,
-                   optimizer=None,
-                   lr_scheduler=None):
-        """Initializes the schedule and set parameters before running.
-
-        :param dataloader: DataLoader in training or evaluation
-        :param model: The neural network model
-        :param criterion: Criterion for calculating loss
-        :param optimizer: Optimizer for updating the parameters
-        :param lr_scheduler: Learning rate scheduler in the process
-        """
-        self.dataloader = dataloader
-        assert model is not None, "Schedule requires a model"
-        self.model = model
-        assert criterion is not None, "Schedule requires a criterion"
-        self.criterion = criterion
-        assert optimizer is not None, "Schedule requires an optimizer"
-        self.optimizer = optimizer
-        self.lr_scheduler = lr_scheduler
-        self.initialized = True
-
-    def check_initialized(self):
-        """Checks whether the schedule is initialized.
-        """
-        assert self.initialized, \
-            'Schedule is not initialized. Call schedule.initialize(...) before using it.'
+    def _move_to_device(self, data):
+        if isinstance(data, (tuple, list)):
+            data = tuple([self._move_tensor(d) for d in data])
+        elif torch.is_tensor(data):
+            data = data.to(get_current_device()).detach()
+        return data
 
-    def load_batch(self):
-        """Loads a batch of dataset. It returns the data and labels which are
+    def load_batch(self, data_iter):
+        """Loads a batch from the data iterator. It returns the data and labels, which are
         already on the same GPU as the model.
:return: (data, label) - :rtype: (Tensor, Tensor) + :rtype: (Tensor, Tensor) """ - self.check_initialized() - if self.data_iter is None: + if data_iter is None: raise RuntimeError('Dataloader is not defined.') - data, label = next(self.data_iter) + data, label = next(data_iter) return self._move_to_device(data), self._move_to_device(label) - def _move_to_device(self, data): - if isinstance(data, ( - tuple, - list, - )): - data = tuple([ - d.to(get_current_device()).detach() for d in data - if torch.is_tensor(d) - ]) - elif torch.is_tensor(data): - data = data.to(get_current_device()).detach() - return data - - def train(self, dataloader=None, mode=True): - """Sets the dataloader to be used and turn the model to - training or evaluation mode. + def initialize(self, model, optimizer): + """Initializes the model and the optimizer before training. + This is often used in FP16 training. - :param dataloader: Dataloader to be used - :param mode: If True, the model will set as training mode. Otherwise, evaluation mode. - """ - self.check_initialized() - if mode: - self.model.train() - else: - self.model.eval() - if dataloader is not None: - self.dataloader = dataloader - self.data_iter = iter(dataloader) - - def zero_grad(self, forward_only=False): - """Cleans gradients with the optimizer. + :param model: The neural network model + :param optimizer: Optimizer for updating the parameters """ - if not forward_only: - self.check_initialized() - self.optimizer.zero_grad() + return model, optimizer - def get_lr(self): - """Returns the current learning rate. - """ - if self.lr_scheduler is not None: - return self.lr_scheduler.get_lr()[0] - else: - return self.optimizer.param_groups[0]['lr'] + @abstractmethod + def forward_backward_step(self, + data_iter, + model, + criterion, + optimizer=None, + forward_only=False, + grad_accum_size: int = 1, + return_loss=True): + """The process function over a batch of dataset for training or evaluation. - def step(self): - """Updates the parameters and learning rate with the optimizer. + :param data_iter: Data iterator of the dataset + :param model: Model used in training or evaluation + :param optimizer: Optimizer used in training or evaluation + :param criterion: Loss function + :param forward_only: If True, the process won't include backward + :param grad_accum_size: Steps of gradient accumulation + :param return_loss: If False, the loss won't be returned """ - self.check_initialized() - self.optimizer.step() - # update lr scheduler - if self.lr_scheduler is not None: - self.lr_scheduler.step() + pass @abstractmethod - def forward_backward_step(self, forward_only=False, return_loss=True): - """The process function over a batch of dataset for training or evaluation. + def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0): + """Updates the parameters with the optimizer. - :param forward_only: If True, the process won't include backward. - :param return_loss: If False, the loss won't be returned. 
+ :param model: The neural network model + :param optimizer: Optimizer for updating the parameters + :param grad_clipping: The norm of gradient clipping + :type grad_clipping: float, optional """ pass diff --git a/colossalai/engine/schedule/_no_pipeline.py b/colossalai/engine/schedule/_no_pipeline.py index 3ab1fa2d3ce4..4f38e6cda493 100644 --- a/colossalai/engine/schedule/_no_pipeline.py +++ b/colossalai/engine/schedule/_no_pipeline.py @@ -4,19 +4,24 @@ try: import apex.amp as apex_amp except: - print('apex is required for mixed precision training') + pass + try: import torch.cuda.amp as torch_amp except: - print('PyTorch amp is not supported with the current PyTorch version') + pass + +from typing import Iterable + +import torch.nn as nn +from torch.optim import Optimizer -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.engine.amp_type import AMP_TYPE from colossalai.nn import (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3) -from ._utils import convert_to_fp16 +from colossalai.nn.optimizer._utils import clip_grad_norm_fp32 from ._base_schedule import BaseSchedule +from ._utils import convert_to_fp16, convert_to_fp32 +from ..amp import AMP_TYPE, GradScaler class NoPipelineSchedule(BaseSchedule): @@ -30,6 +35,7 @@ class NoPipelineSchedule(BaseSchedule): :type amp_type: AMP_TYPE :type amp_config: dict """ + def __init__( self, amp_type: AMP_TYPE = None, @@ -41,12 +47,6 @@ def __init__( assert amp_type is None or isinstance(amp_type, AMP_TYPE), \ 'unrecognised value for argument fp16, it can only be None, torch or apex' - # LSG: check compatibility - # LSG: torch.cuda.amp and apex.amp cannot be used for tensor parallel - if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size( - ParallelMode.TENSOR) > 1: - assert amp_type != AMP_TYPE.TORCH and amp_type != AMP_TYPE.APEX, \ - 'You can only AMP_TYPE.PARALLEL for tensor parallel training' self.use_zero_level_2_3 = False if amp_type is not None: @@ -79,107 +79,110 @@ def __init__( self.fp16 = False self.amp_type = None - @property - def num_steps(self): - return len(self.dataloader) - - def initialize(self, - dataloader, - model, - criterion, - optimizer, - lr_scheduler=None): - super().initialize(dataloader, - model, - criterion, - optimizer, - lr_scheduler=lr_scheduler) - if isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3)): + def initialize(self, model: nn.Module, optimizer: Optimizer): + if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, + ZeroRedundancyOptimizer_Level_3)): self.use_zero_level_2_3 = True - assert self.amp_type != AMP_TYPE.PARALLEL, 'ZeRO Level 2 and 3 are mutually exclusive with AMP_TYPE.PARALLEL' + assert self.amp_type != AMP_TYPE.PARALLEL, \ + 'ZeRO Level 2 and 3 are mutually exclusive with AMP_TYPE.PARALLEL' if self.fp16: if self.amp_type == AMP_TYPE.TORCH: - self._torch_amp_scaler = torch_amp.GradScaler(**self.amp_cfg) + self._torch_amp_scaler = GradScaler(**self.amp_cfg) elif self.amp_type == AMP_TYPE.APEX: - self.model, self.optimizer = apex_amp.initialize( - self.model, self.optimizer, **self.amp_cfg) - - def forward_backward_step(self, forward_only=False, return_loss=True): + model, optimizer = apex_amp.initialize(model, optimizer, **self.amp_cfg) + + return model, optimizer + + def forward_backward_step(self, + data_iter: Iterable, + model: nn.Module, + criterion: nn.modules.loss._Loss, + optimizer: Optimizer = None, + forward_only: bool = False, + grad_accum_size: int = 
1,
+                              return_loss: bool = True):
         """The process function that loads a batch of dataset and feeds it to the model.
         The returned labels and loss will be None if :attr:`return_loss` is False.

+        :param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
+        :param model: Model for training and inference
+        :param criterion: Loss function for training
+        :param optimizer: Optimizer used for training
+        :param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
+        :param grad_accum_size: The number of iterations for gradient accumulation
+        :param return_loss: Loss will be returned if True
+        :type data_iter: Iterator
+        :type model: torch.nn.Module
+        :type criterion: torch.nn.modules.loss._Loss
+        :type optimizer: torch.optim.Optimizer
+        :type forward_only: bool, optional
+        :type grad_accum_size: int
+        :type return_loss: bool, optional
         :return: (output, label, loss)
         """
         assert forward_only or return_loss, \
             'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
-        data, label = self.load_batch()
+        data, label = self.load_batch(data_iter)
         loss = None

-        # LSG: leave for debug, make sure dataloader is deterministic
-        # if forward_only:
-        #     img = data[0]
-        #     rank = gpc.get_local_rank(ParallelMode.DATA)
-        #     world_size = gpc.get_world_size(ParallelMode.DATA)
-        #     group = gpc.get_group(ParallelMode.DATA)
-        #     input_list = [img.clone() for _ in range(world_size)]
-        #     output_list = [torch.empty_like(img) for _ in range(world_size)]
-        #     output_list[rank] = img.clone()
-        #     dist.all_to_all(output_tensor_list=output_list, input_tensor_list=input_list, group=group)
-        #     assert torch.equal(output_list[0], output_list[1]) # and torch.equal(output_list[1], output_list[2])
-
         # forward
         if self.fp16 and self.amp_type == AMP_TYPE.TORCH:
             with torch_amp.autocast():
-                output = self.model(*data)
+                output = model(*data)
                 if not isinstance(output, (tuple, list)):
                     output = (output,)
                 if return_loss:
-                    loss = self.criterion(*output, *label)
+                    loss = criterion(*output, *label)
         else:
             if self.use_zero_level_2_3 or self.amp_type == AMP_TYPE.PARALLEL:
                 data = convert_to_fp16(data)
-            output = self.model(*data)
+            output = model(*data)
+
+            if self.use_zero_level_2_3 or self.amp_type == AMP_TYPE.PARALLEL:
+                output = convert_to_fp32(output)
+
             if not isinstance(output, (tuple, list)):
                 output = (output,)
             if return_loss:
-                loss = self.criterion(*output, *label)
+                loss = criterion(*output, *label)
+
+        loss /= grad_accum_size

         if not forward_only:
             # backward
             if self.use_zero_level_2_3:
-                self.optimizer.backward(loss)
+                optimizer.backward(loss)
             elif self.fp16:
                 if self.amp_type == AMP_TYPE.APEX:
-                    with apex_amp.scale_loss(loss,
-                                             self.optimizer) as scaled_loss:
+                    with apex_amp.scale_loss(loss, optimizer) as scaled_loss:
                         scaled_loss.backward()
                 elif self.amp_type == AMP_TYPE.TORCH:
                     self._torch_amp_scaler.scale(loss).backward()
                 elif self.amp_type == AMP_TYPE.PARALLEL:
-                    loss = self.optimizer.scale_loss(loss)
+                    loss = optimizer.scale_loss(loss)
                     loss.backward()
                     # scale back to display the original value in logs
-                    loss.div_(self.optimizer.grad_scaler.scale)
+                    loss.div_(optimizer.grad_scaler.scale)
             else:
                 loss.backward()

         if return_loss:
-            return output, label, loss
+            return output, label, loss * grad_accum_size
         else:
             return output, None, None

-    def step(self):
+    def optimizer_step(self, model: nn.Module, optimizer: Optimizer, grad_clipping: float = 0.0):
         # step optimizer
         if self.fp16 and self.amp_type == AMP_TYPE.TORCH:
-            
self._torch_amp_scaler.step(self.optimizer) + if grad_clipping > 0.0: + self._torch_amp_scaler.unscale_(optimizer) + clip_grad_norm_fp32(model.parameters(), grad_clipping) + self._torch_amp_scaler.step(optimizer) self._torch_amp_scaler.update() else: - self.optimizer.step() - - # update lr scheduler - if self.lr_scheduler is not None: - self.lr_scheduler.step() + if not self.fp16 and not self.use_zero_level_2_3 and grad_clipping > 0.0: + clip_grad_norm_fp32(model.parameters(), grad_clipping) + optimizer.step() diff --git a/colossalai/engine/schedule/_pipeline.py b/colossalai/engine/schedule/_pipeline.py index 0b477c0d5361..6defea93d57a 100644 --- a/colossalai/engine/schedule/_pipeline.py +++ b/colossalai/engine/schedule/_pipeline.py @@ -15,7 +15,7 @@ from colossalai.utils import get_current_device from ._base_schedule import BaseSchedule from ._utils import convert_to_fp16 -from ..amp_type import AMP_TYPE +from ..amp import AMP_TYPE def squeeze(x: Union[Tensor, tuple, list]): @@ -93,12 +93,11 @@ def _sync_data(self): ) # Pipeline schedule just puts data in memory - def load_batch(self): - self.check_initialized() - if self.data_iter is None: + def load_batch(self, data_iter): + if data_iter is None: raise RuntimeError('Dataloader is not defined.') self.batch_pos = 0 - data, label = next(self.data_iter) + data, label = next(data_iter) self.batch_data, self.batch_label = \ self._move_to_device(data), self._move_to_device(label) batch_size = self.batch_data.shape[0] @@ -117,23 +116,8 @@ def load_micro_batch(self): self.batch_pos += self.microbatch_size return (data,), (label,) - @property - def num_steps(self): - return len(self.dataloader) - - def initialize(self, - dataloader, - model, - criterion, - optimizer, - lr_scheduler=None): - super().initialize(dataloader, - model, - criterion, - optimizer, - lr_scheduler=lr_scheduler) - if isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3)): + def initialize(self, model, optimizer): + if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)): raise TypeError( "Pipeline schedule is currently not compatible with ZeRO Level 2 and Level 3" ) @@ -145,7 +129,8 @@ def initialize(self, 'default tensor dtype is set to torch.half for fp16 training', ranks=[0]) - def forward_step(self, input_tensor, return_tensors, return_loss=True): + def forward_step(self, model, criterion, input_tensor, return_tensors, + grad_accum_size, return_loss=True): """Forward step for passed-in model. If it is the first stage, the input tensor is obtained from data_iterator, otherwise the passed-in input_tensor is used. Returns output tensor. This is a helper function and can be ignored by users. 
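Both schedules now share a stateless driver contract: the caller owns the data iterator, model, criterion and optimizer, calls forward_backward_step() once per (micro-)step and optimizer_step() once per accumulation window. A minimal usage sketch, not part of this patch, assuming hypothetical names (train_dataloader, criterion, num_steps) and that gradient zeroing stays with the caller:

schedule = NoPipelineSchedule()
model, optimizer = schedule.initialize(model, optimizer)
data_iter = iter(train_dataloader)
accum = 4  # must match grad_accum_size passed below
for step in range(num_steps):
    output, label, loss = schedule.forward_backward_step(
        data_iter, model, criterion, optimizer,
        forward_only=False, grad_accum_size=accum, return_loss=True)
    if (step + 1) % accum == 0:
        # clip_grad_norm_fp32 is applied inside when grad_clipping > 0
        schedule.optimizer_step(model, optimizer, grad_clipping=1.0)
        optimizer.zero_grad()  # assumed to be the caller's responsibility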
@@ -156,14 +141,14 @@ def forward_step(self, input_tensor, return_tensors, return_loss=True): if self.amp_type == AMP_TYPE.PARALLEL: input_tensor = convert_to_fp16(input_tensor) input_tensor = squeeze(input_tensor) - output_tensor = self.model(input_tensor) + output_tensor = model(input_tensor) output_tensor = squeeze(output_tensor) if gpc.is_last_rank(ParallelMode.PIPELINE): if return_loss: input_tensor, label = self.load_micro_batch() - loss_reduced = self.criterion(output_tensor, * - label) / self.num_microbatches + loss_reduced = criterion(output_tensor, *label) \ + / (self.num_microbatches * grad_accum_size) return_tensors.append( tuple((output_tensor, label[0], loss_reduced))) return loss_reduced @@ -174,7 +159,7 @@ def forward_step(self, input_tensor, return_tensors, return_loss=True): else: return output_tensor - def backward_step(self, input_tensor, output_tensor, output_tensor_grad): + def backward_step(self, optimizer, input_tensor, output_tensor, output_tensor_grad): """Backward step through the passed-in output tensor. If it is the last stage, the output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor. Returns the gradients with respect to the input tensor (None if first stage). @@ -187,7 +172,7 @@ def backward_step(self, input_tensor, output_tensor, output_tensor_grad): # Backward pass. if output_tensor_grad is None and self.amp_type == AMP_TYPE.PARALLEL: - output_tensor = self.optimizer.scale_loss(output_tensor) + output_tensor = optimizer.scale_loss(output_tensor) torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad) # Collect the grad of the input_tensor. @@ -197,17 +182,24 @@ def backward_step(self, input_tensor, output_tensor, output_tensor_grad): return input_tensor_grad - def forward_backward_step(self, forward_only=True, return_loss=True): + def forward_backward_step(self, + data_iter, + model, + criterion, + optimizer=None, + forward_only=False, + grad_accum_size: int = 1, + return_loss=True): """Runs non-interleaved 1F1B schedule, with communication between pipeline stages. Returns a tuple with losses if the last stage, an empty tuple otherwise. - + :return: (output, label, loss) """ assert forward_only or return_loss, \ 'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.' 
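+        # With p pipeline stages, stage r first runs (p - r - 1) forward-only
+        # warm-up microbatches so that downstream stages are fed before the
+        # steady one-forward-one-backward (1F1B) alternation begins; the last
+        # stage therefore enters 1F1B immediately.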
- self.load_batch() + self.load_batch(data_iter) num_warmup_microbatches = \ (gpc.get_world_size(ParallelMode.PIPELINE) - gpc.get_local_rank(ParallelMode.PIPELINE) - 1) @@ -233,9 +225,11 @@ def forward_backward_step(self, forward_only=True, return_loss=True): if not gpc.is_first_rank(ParallelMode.PIPELINE): ft_shape = recv_tensor_meta(ft_shape) input_tensor = recv_forward(ft_shape) - output_tensor = self.forward_step(input_tensor, - return_tensors, - return_loss=return_loss) + output_tensor = self.forward_step( + model, criterion, + input_tensor, return_tensors, + grad_accum_size, return_loss=return_loss + ) if not gpc.is_last_rank(ParallelMode.PIPELINE): bt_shape = output_tensor.shape fs_checker = send_tensor_meta(output_tensor, fs_checker) @@ -257,9 +251,11 @@ def forward_backward_step(self, forward_only=True, return_loss=True): for i in range(num_microbatches_remaining): last_iteration = (i == (num_microbatches_remaining - 1)) - output_tensor = self.forward_step(input_tensor, - return_tensors, - return_loss=return_loss) + output_tensor = self.forward_step( + model, criterion, + input_tensor, return_tensors, + grad_accum_size, return_loss=return_loss + ) if forward_only: send_forward(output_tensor) @@ -279,9 +275,11 @@ def forward_backward_step(self, forward_only=True, return_loss=True): input_tensor = input_tensors.pop(0) output_tensor = output_tensors.pop(0) - input_tensor_grad = self.backward_step(input_tensor, - output_tensor, - output_tensor_grad) + input_tensor_grad = self.backward_step( + optimizer, + input_tensor, output_tensor, + output_tensor_grad + ) if last_iteration: input_tensor = None @@ -298,9 +296,11 @@ def forward_backward_step(self, forward_only=True, return_loss=True): output_tensor_grad = recv_backward(bt_shape) - input_tensor_grad = self.backward_step(input_tensor, - output_tensor, - output_tensor_grad) + input_tensor_grad = self.backward_step( + optimizer, + input_tensor, output_tensor, + output_tensor_grad + ) send_backward(input_tensor_grad) @@ -309,8 +309,11 @@ def forward_backward_step(self, forward_only=True, return_loss=True): output, label, loss = tuple(map(list, zip(*return_tensors))) return (torch.cat(output, dim=0), torch.cat(label, dim=0), - sum(loss)) + sum(loss) * grad_accum_size) else: return tuple((torch.cat(return_tensors, dim=0), None, None)) else: return tuple((None, None, None)) + + def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0): + optimizer.step() diff --git a/colossalai/engine/schedule/_utils.py b/colossalai/engine/schedule/_utils.py index 9c4a2a19b912..cdfd0246c12d 100644 --- a/colossalai/engine/schedule/_utils.py +++ b/colossalai/engine/schedule/_utils.py @@ -14,3 +14,14 @@ def convert_to_fp16(data: Union[Tensor, List[Tensor]]): else: raise TypeError(f"Expected argument 'data' to be a Tensor or a list/tuple of Tensor, but got {type(data)}") return ret + + +def convert_to_fp32(data: Union[Tensor, List[Tensor]]): + if isinstance(data, Tensor): + ret = data.float() + elif isinstance(data, (list, tuple)): + ret = [val.float() for val in data] + else: + raise TypeError(f"Expected argument 'data' to be a Tensor or a list/tuple of Tensor, but got {type(data)}") + return ret + diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 35e8095b6285..6806d86eb61c 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -6,18 +6,20 @@ import random from pathlib import Path from typing import Callable, Iterable, Optional, Union +from typing import Tuple import numpy as np import torch from 
torch.utils.data import DataLoader

 from colossalai.engine import AMP_TYPE, NoPipelineSchedule, PipelineSchedule
+from colossalai.engine import Engine
 from colossalai.logging import get_global_dist_logger, init_global_dist_logger
 from colossalai.nn import DataParallelSampler
 from colossalai.nn.model.base_model import BaseModel
 from .builder import (ModelInitializer, build_dataset, build_loss,
-                      build_lr_scheduler, build_model, build_optimizer,
-                      build_optimizer_wrapper)
+                      build_model, build_optimizer,
+                      build_optimizer_wrapper, build_schedule)
 from .context import Config, ParallelMode
 from .core import global_context as gpc
 from .utils import get_current_device, sync_model_param_in_dp
@@ -182,7 +184,7 @@ def initialize(config: Union[str, dict] = None,
                backend: str = None,
                train_dataloader: Optional[Union[Iterable, Callable]] = None,
                test_dataloader: Optional[Union[Iterable, Callable]] = None,
-               ):
+               ) -> Tuple[Engine, DataLoader, DataLoader]:
     '''Core function that initializes distributed environment, logger, cudnn, data, model, loss function, optimizer, and lr_scheduler (their configs are in gpc.config).

     :param config: config file or config file path are both acceptable
@@ -201,7 +203,7 @@ def initialize(config: Union[str, dict] = None,
     :type train_dataloader: Optional[Union[Iterable, Callable]], optional
     :param test_dataloader: If None, the config is used to build a dataloader; Else, it should be a dataloader object or a function with no arguments which can build a dataloader, defaults to None
     :type test_dataloader: Optional[Union[Iterable, Callable]], optional
-    :return: (model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler)
+    :return: (engine, train_dataloader, test_dataloader)
     :rtype: tuple
     '''
     # initialize distributed environment
@@ -337,21 +339,7 @@ def initialize(config: Union[str, dict] = None,
         optimizer = build_optimizer_wrapper(fp16_cfg, optimizer)
         logger.info('Optimizer is created', ranks=[0])

-    lr_scheduler = None
-    if hasattr(gpc.config, 'lr_scheduler'):
-        if hasattr(gpc.config, 'num_steps'):
-            total_steps = gpc.config.num_steps
-        elif hasattr(gpc.config, 'num_epochs'):
-            total_steps = int(gpc.config.num_epochs * len(train_dataloader))
-        else:
-            raise Exception(
-                'Please specify training stopping criterion num_steps or num_epochs in your configuration.' 
- ) - lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, optimizer, - total_steps, len(train_dataloader)) - logger.info('Learning rate scheduler is created', ranks=[0]) - - # pipeline or no pipeline schedule + # build schedule and engine if hasattr(gpc.config, 'fp16'): amp_type = gpc.config.fp16.mode amp_cfg = gpc.config.fp16.copy() @@ -360,12 +348,32 @@ def initialize(config: Union[str, dict] = None, amp_type = None amp_cfg = None - if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: - assert hasattr(gpc.config, - 'schedule'), "Config 'schedule' not found in your configuration file for pipeline parallel training" + engine_cfg = gpc.config.get('engine', dict()) + schedule_cfg = engine_cfg.pop('schedule', None) + + schedule_type = None + if schedule_cfg is not None: + schedule_type = schedule_cfg.get('type', None) + + if schedule_type is not None: + # run customized schedule + schedule_cfg['amp_type'] = amp_type + schedule_cfg['amp_config'] = amp_cfg + schedule = build_schedule(schedule_cfg) + elif gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: + assert schedule_cfg is not None, \ + "Config 'engine.schedule' not found in your configuration file for pipeline parallel training" schedule = PipelineSchedule( - amp_type=amp_type, amp_config=amp_cfg, **gpc.config.schedule.copy()) + amp_type=amp_type, amp_config=amp_cfg, **schedule_cfg.copy()) else: schedule = NoPipelineSchedule(amp_type=amp_type, amp_config=amp_cfg) - return model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler + engine = Engine( + model=model, + optimizer=optimizer, + criterion=criterion, + step_schedule=schedule, + **gpc.config.get('engine', dict()) + ) + + return engine, train_dataloader, test_dataloader diff --git a/colossalai/nn/layer/parallel_2d/_operation.py b/colossalai/nn/layer/parallel_2d/_operation.py index 2c7eb8ac6050..d9ecf2fad140 100644 --- a/colossalai/nn/layer/parallel_2d/_operation.py +++ b/colossalai/nn/layer/parallel_2d/_operation.py @@ -7,6 +7,7 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.utils import get_current_device +from torch.cuda.amp import custom_bwd, custom_fwd def matmul_2d(a, @@ -60,6 +61,7 @@ class Matmul_AB_2D(torch.autograd.Function): """Matrix multiplication for :math:`C = AB` """ @staticmethod + @custom_fwd(cast_inputs=torch.float16) def forward(ctx: Any, A: Tensor, B: Tensor, @@ -120,32 +122,32 @@ def forward(ctx: Any, return out @staticmethod + @custom_bwd def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]: A, B = ctx.saved_tensors - A_grad = Matmul_ABT_2D.forward( - None, - output_grad, B, - ctx.summa_dim, ctx.A_shape, - ctx.row_rank, ctx.col_rank, - ctx.row_parallel_mode, - ctx.col_parallel_mode, - ctx.data_parallel_rank, - ctx.pipeline_parallel_rank, - ctx.pipeline_parallel_size, - ctx.tensor_parallel_size - ) - B_grad = Matmul_ATB_2D.forward( - None, - A, output_grad, - ctx.summa_dim, ctx.B_shape, - ctx.row_rank, ctx.col_rank, - ctx.row_parallel_mode, - ctx.col_parallel_mode, - ctx.data_parallel_rank, - ctx.pipeline_parallel_rank, - ctx.pipeline_parallel_size, - ctx.tensor_parallel_size - ) + with torch.no_grad(): + A_grad = Matmul_ABT_2D.apply( + output_grad, B, + ctx.summa_dim, ctx.A_shape, + ctx.row_rank, ctx.col_rank, + ctx.row_parallel_mode, + ctx.col_parallel_mode, + ctx.data_parallel_rank, + ctx.pipeline_parallel_rank, + ctx.pipeline_parallel_size, + 
ctx.tensor_parallel_size + ) + B_grad = Matmul_ATB_2D.apply( + A, output_grad, + ctx.summa_dim, ctx.B_shape, + ctx.row_rank, ctx.col_rank, + ctx.row_parallel_mode, + ctx.col_parallel_mode, + ctx.data_parallel_rank, + ctx.pipeline_parallel_rank, + ctx.pipeline_parallel_size, + ctx.tensor_parallel_size + ) return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None @@ -153,6 +155,7 @@ class Matmul_ABT_2D(torch.autograd.Function): """Matrix multiplication for :math:`C = AB^T` """ @staticmethod + @custom_fwd(cast_inputs=torch.float16) def forward(ctx: Any, A: Tensor, B: Tensor, @@ -214,32 +217,33 @@ def forward(ctx: Any, return out @staticmethod + @custom_bwd def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]: A, B = ctx.saved_tensors - A_grad = Matmul_AB_2D.forward( - None, - output_grad, B, - ctx.summa_dim, ctx.A_shape, - ctx.row_rank, ctx.col_rank, - ctx.row_parallel_mode, - ctx.col_parallel_mode, - ctx.data_parallel_rank, - ctx.pipeline_parallel_rank, - ctx.pipeline_parallel_size, - ctx.tensor_parallel_size - ) - B_grad = Matmul_ATB_2D.forward( - None, - output_grad, A, - ctx.summa_dim, ctx.B_shape, - ctx.row_rank, ctx.col_rank, - ctx.row_parallel_mode, - ctx.col_parallel_mode, - ctx.data_parallel_rank, - ctx.pipeline_parallel_rank, - ctx.pipeline_parallel_size, - ctx.tensor_parallel_size - ) + + with torch.no_grad(): + A_grad = Matmul_AB_2D.apply( + output_grad, B, + ctx.summa_dim, ctx.A_shape, + ctx.row_rank, ctx.col_rank, + ctx.row_parallel_mode, + ctx.col_parallel_mode, + ctx.data_parallel_rank, + ctx.pipeline_parallel_rank, + ctx.pipeline_parallel_size, + ctx.tensor_parallel_size + ) + B_grad = Matmul_ATB_2D.apply( + output_grad, A, + ctx.summa_dim, ctx.B_shape, + ctx.row_rank, ctx.col_rank, + ctx.row_parallel_mode, + ctx.col_parallel_mode, + ctx.data_parallel_rank, + ctx.pipeline_parallel_rank, + ctx.pipeline_parallel_size, + ctx.tensor_parallel_size + ) return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None @@ -247,6 +251,7 @@ class Matmul_ATB_2D(torch.autograd.Function): """Matrix multiplication for :math:`C = A^TB` """ @staticmethod + @custom_fwd(cast_inputs=torch.float16) def forward(ctx: Any, A: Tensor, B: Tensor, @@ -308,32 +313,33 @@ def forward(ctx: Any, return out @staticmethod + @custom_bwd def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]: A, B = ctx.saved_tensors - A_grad = Matmul_ABT_2D.forward( - None, - B, output_grad, - ctx.summa_dim, ctx.A_shape, - ctx.row_rank, ctx.col_rank, - ctx.row_parallel_mode, - ctx.col_parallel_mode, - ctx.data_parallel_rank, - ctx.pipeline_parallel_rank, - ctx.pipeline_parallel_size, - ctx.tensor_parallel_size - ) - B_grad = Matmul_AB_2D.forward( - None, - A, output_grad, - ctx.summa_dim, ctx.B_shape, - ctx.row_rank, ctx.col_rank, - ctx.row_parallel_mode, - ctx.col_parallel_mode, - ctx.data_parallel_rank, - ctx.pipeline_parallel_rank, - ctx.pipeline_parallel_size, - ctx.tensor_parallel_size - ) + + with torch.no_grad(): + A_grad = Matmul_ABT_2D.apply( + B, output_grad, + ctx.summa_dim, ctx.A_shape, + ctx.row_rank, ctx.col_rank, + ctx.row_parallel_mode, + ctx.col_parallel_mode, + ctx.data_parallel_rank, + ctx.pipeline_parallel_rank, + ctx.pipeline_parallel_size, + ctx.tensor_parallel_size + ) + B_grad = Matmul_AB_2D.apply( + A, output_grad, + ctx.summa_dim, ctx.B_shape, + ctx.row_rank, ctx.col_rank, + ctx.row_parallel_mode, + ctx.col_parallel_mode, + ctx.data_parallel_rank, + ctx.pipeline_parallel_rank, + ctx.pipeline_parallel_size, + ctx.tensor_parallel_size + ) 
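+        # The nested Matmul_*_2D.apply calls reuse the forward kernels to
+        # form both gradients; torch.no_grad() keeps autograd from recording
+        # these helper matmuls, and @custom_bwd runs this backward under the
+        # same autocast state as the matching @custom_fwd forward.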
return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None
@@ -341,6 +347,7 @@ class Add_Bias_2D(torch.autograd.Function):
     """Matrix add bias: :math:`C = A + b`
     """
     @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
     def forward(ctx: Any,
                 input: Tensor,
                 bias: Tensor,
@@ -384,6 +391,7 @@ def forward(ctx: Any,
         return output

     @staticmethod
+    @custom_bwd
     def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
         row_rank = ctx.row_rank
         col_rank = ctx.col_rank
@@ -423,6 +431,7 @@ def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
 class _LayerNorm_2D(torch.autograd.Function):

     @staticmethod
+    @custom_fwd(cast_inputs=torch.float32)
     def forward(ctx: Any,
                 input: Tensor,
                 E_x: Tensor,
@@ -440,6 +449,7 @@ def forward(ctx: Any,
         return output

     @staticmethod
+    @custom_bwd
     def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
         row_parallel_mode = ctx.row_parallel_mode
         col_parallel_mode = ctx.col_parallel_mode
@@ -492,6 +502,7 @@ def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
 class _ViT_Split_Input_2D(torch.autograd.Function):

     @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
     def forward(ctx: Any,
                 inputs: Tensor,
                 batch_size: int,
@@ -509,6 +520,7 @@ def forward(ctx: Any,
         return output

     @staticmethod
+    @custom_bwd
     def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
         # output_grad: [b/q, s, h/q]
         # grads: [b, s, h/q]
diff --git a/colossalai/nn/lr_scheduler/__init__.py b/colossalai/nn/lr_scheduler/__init__.py
index 82e28ff88a62..fd44686f0e37 100644
--- a/colossalai/nn/lr_scheduler/__init__.py
+++ b/colossalai/nn/lr_scheduler/__init__.py
@@ -1,5 +1,5 @@
 from .cosine import CosineAnnealingLR, CosineAnnealingWarmupLR, FlatAnnealingLR, FlatAnnealingWarmupLR
-from .linear import LinearWarmupLR, LinearWarmupDecay
+from .linear import LinearWarmupLR
 from .multistep import MultiStepLR, MultiStepWarmupLR
 from .onecycle import OneCycleLR
 from .poly import PolynomialLR, PolynomialWarmupLR
diff --git a/colossalai/nn/lr_scheduler/cosine.py b/colossalai/nn/lr_scheduler/cosine.py
index 067636a3df25..0df30baab916 100644
--- a/colossalai/nn/lr_scheduler/cosine.py
+++ b/colossalai/nn/lr_scheduler/cosine.py
@@ -66,11 +66,10 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
     :type last_epoch: int, optional
     """

-    def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: int = 0, last_epoch: int = -1,
-                 **kwargs):
+    def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: int = 0, last_epoch: int = -1):
         base_scheduler = _CosineAnnealingLR(
-            optimizer, total_steps - warmup_steps, eta_min=eta_min)
-        super().__init__(optimizer, warmup_steps, base_scheduler, last_epoch=last_epoch)
+            optimizer, total_steps - warmup_steps, eta_min=eta_min, last_epoch=last_epoch)
+        super().__init__(optimizer, warmup_steps, base_scheduler)


 @LR_SCHEDULERS.register_module
diff --git a/colossalai/nn/lr_scheduler/delayed.py b/colossalai/nn/lr_scheduler/delayed.py
index c8972c92294f..173d2f52c7a7 100644
--- a/colossalai/nn/lr_scheduler/delayed.py
+++ b/colossalai/nn/lr_scheduler/delayed.py
@@ -55,7 +55,7 @@ def step(self, epoch=None):


 class WarmupScheduler(_LRScheduler):
-    """ Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler
+    """ Starts with a linear warmup lr schedule until it reaches N epochs then applies a scheduler

     :param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer @@ -66,11 +66,8 @@ class WarmupScheduler(_LRScheduler): :param last_epoch: The index of last epoch, defaults to -1 :type last_epoch: int, optional """ - def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1): - if warmup_epochs < 0: - raise ValueError(f'warmup_epochs must >= 0, got {warmup_epochs}') - self.warmup_epochs = warmup_epochs + self.warmup_epochs = int(warmup_epochs) self.after_scheduler = after_scheduler self.finished = False super().__init__(optimizer, last_epoch) @@ -79,14 +76,10 @@ def get_lr(self): if self.last_epoch >= self.warmup_epochs: if not self.finished: self.after_scheduler.base_lrs = self.base_lrs - # reset lr to base_lr - for group, base_lr in zip(self.optimizer.param_groups, self.base_lrs): - group['lr'] = base_lr self.finished = True - with _enable_get_lr_call(self.after_scheduler): - return self.after_scheduler.get_lr() + return self.after_scheduler.get_lr() - return [(self.last_epoch + 1) / (self.warmup_epochs + 1) * lr for lr in self.base_lrs] + return [(self.last_epoch + 1) / self.warmup_epochs * lr for lr in self.base_lrs] def step(self, epoch=None): if self.finished: diff --git a/colossalai/nn/lr_scheduler/linear.py b/colossalai/nn/lr_scheduler/linear.py index afc68c5a719f..b9498baf0d19 100644 --- a/colossalai/nn/lr_scheduler/linear.py +++ b/colossalai/nn/lr_scheduler/linear.py @@ -28,18 +28,3 @@ def get_lr(self): else: return [(self.total_steps - self.last_epoch) / (self.total_steps - self.warmup_steps) * lr for lr in self.base_lrs] - - -@LR_SCHEDULERS.register_module -class LinearWarmupDecay(_LRScheduler): - def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs): - self.warmup_steps = int(warmup_steps) - self.total_steps = total_steps - super().__init__(optimizer, last_epoch=last_epoch) - - def get_lr(self): - if self.last_epoch < self.warmup_steps: - return [(self.last_epoch + 1) / self.warmup_steps * lr for lr in self.base_lrs] - else: - return [(self.total_steps - self.last_epoch - 1) / (self.total_steps - self.warmup_steps) * lr for lr in - self.base_lrs] diff --git a/colossalai/nn/lr_scheduler/multistep.py b/colossalai/nn/lr_scheduler/multistep.py index 46420765c8d7..5def4a1fac92 100644 --- a/colossalai/nn/lr_scheduler/multistep.py +++ b/colossalai/nn/lr_scheduler/multistep.py @@ -27,12 +27,7 @@ class MultiStepLR(_MultiStepLR): :type last_epoch: int, optional """ - def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, - num_steps_per_epoch: int = -1, last_epoch: int = -1, **kwargs): - if num_steps_per_epoch <= 0: - raise ValueError( - f'num_steps_per_epoch must > 0, got {num_steps_per_epoch}') - milestones = [v * num_steps_per_epoch for v in milestones] + def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs): super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch) @@ -57,14 +52,11 @@ class MultiStepWarmupLR(WarmupScheduler): """ def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None, - gamma: float = 0.1, num_steps_per_epoch: int = -1, last_epoch: int = -1, **kwargs): + gamma: float = 0.1, last_epoch: int = -1, **kwargs): if len(milestones) == 0: raise ValueError('milestones cannot be empty') - if num_steps_per_epoch <= 0: - raise ValueError( - f'num_steps_per_epoch must > 0, got {num_steps_per_epoch}') - milestones = [v * num_steps_per_epoch - warmup_steps for 
v in milestones if v * - num_steps_per_epoch >= warmup_steps] + milestones = [ + v - warmup_steps for v in milestones if v >= warmup_steps] base_scheduler = _MultiStepLR(optimizer, milestones=milestones, gamma=gamma) super().__init__(optimizer, warmup_steps, base_scheduler, last_epoch=last_epoch) diff --git a/colossalai/nn/lr_scheduler/torch.py b/colossalai/nn/lr_scheduler/torch.py index 3ac0121ffabc..e739084b6fbb 100644 --- a/colossalai/nn/lr_scheduler/torch.py +++ b/colossalai/nn/lr_scheduler/torch.py @@ -1,7 +1,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR from torch.optim.lr_scheduler import StepLR as _StepLR -from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR from colossalai.registry import LR_SCHEDULERS @@ -25,11 +25,8 @@ class LambdaLR(_LambdaLR): :type last_epoch: int, optional """ - def __init__(self, optimizer, total_steps, lr_lambda=None, num_steps_per_epoch: int = -1, - last_epoch: int = -1) -> None: - def func(step): return lr_lambda(step // num_steps_per_epoch) - - super().__init__(optimizer, func, last_epoch=last_epoch) + def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None: + super().__init__(optimizer, lr_lambda, last_epoch=last_epoch) @LR_SCHEDULERS.register_module @@ -51,11 +48,8 @@ class MultiplicativeLR(_MultiplicativeLR): :type last_epoch: int, optional """ - def __init__(self, optimizer, total_steps, lr_lambda=None, num_steps_per_epoch: int = -1, - last_epoch: int = -1) -> None: - def func(step): return lr_lambda(step // num_steps_per_epoch) - - super().__init__(optimizer, func, last_epoch=last_epoch) + def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None: + super().__init__(optimizer, lr_lambda, last_epoch=last_epoch) @LR_SCHEDULERS.register_module @@ -79,14 +73,13 @@ class StepLR(_StepLR): :type last_epoch: int, optional """ - def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, num_steps_per_epoch: int = -1, - last_epoch: int = -1) -> None: - super().__init__(optimizer, step_size * num_steps_per_epoch, + def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None: + super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch) @LR_SCHEDULERS.register_module -class ExponentialLR(_LRScheduler): +class ExponentialLR(_ExponentialLR): """Decays the learning rate of each parameter group by gamma every epoch. 
When last_epoch=-1, sets initial lr as lr @@ -102,21 +95,6 @@ class ExponentialLR(_LRScheduler): :type last_epoch: int, optional """ - def __init__(self, optimizer, total_steps, gamma: float = 1.0, num_steps_per_epoch: int = -1, + def __init__(self, optimizer, total_steps, gamma: float = 1.0, last_epoch: int = -1) -> None: - self.gamma = gamma - self.num_steps_per_epoch = num_steps_per_epoch - super().__init__(optimizer, last_epoch=last_epoch) - - def get_lr(self): - if self.last_epoch == 0: - return self.base_lrs - elif (self.last_epoch + 1) % self.num_steps_per_epoch == 0: - return [group['lr'] * self.gamma - for group in self.optimizer.param_groups] - return [group['lr'] - for group in self.optimizer.param_groups] - - def _get_closed_form_lr(self): - return [base_lr * self.gamma ** (self.last_epoch // self.num_steps_per_epoch) - for base_lr in self.base_lrs] + super().__init__(optimizer, gamma, last_epoch=last_epoch) diff --git a/colossalai/nn/optimizer/_utils.py b/colossalai/nn/optimizer/_utils.py index 1be8ffc1b2c7..6cd92bb38c34 100644 --- a/colossalai/nn/optimizer/_utils.py +++ b/colossalai/nn/optimizer/_utils.py @@ -106,7 +106,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): tensor_parallel_norm = _calc_lp(tensor_parallel_grads, norm_type) no_tensor_parallel_grads = _calc_lp( no_tensor_parallel_grads, norm_type) - if gpc.is_initialized(ParallelMode.TENSOR): + if gpc.is_initialized(ParallelMode.TENSOR) and len(tensor_parallel_grads) > 0: # Sum across all model-parallel GPUs. torch.distributed.all_reduce(tensor_parallel_norm, op=torch.distributed.ReduceOp.SUM, diff --git a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_2.py b/colossalai/nn/optimizer/zero_redundancy_optimizer_level_2.py index 17e277843c98..1a57c5876f01 100644 --- a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_2.py +++ b/colossalai/nn/optimizer/zero_redundancy_optimizer_level_2.py @@ -6,6 +6,7 @@ import torch import torch.distributed as dist + try: from deepspeed.git_version_info import version from deepspeed.moe.utils import is_moe_param @@ -13,7 +14,7 @@ from deepspeed.ops.op_builder import UtilsBuilder from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS except ImportError: - print('DeepSpeed is required if you want to use ZeRO.') + pass from packaging import version as pkg_version from torch._six import inf from torch.distributed.distributed_c10d import _get_global_rank @@ -251,7 +252,7 @@ def __init__(self, self.nccl_start_alignment_factor = 2 assert ( - allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " + allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " self.all_reduce_print = False self.dtype = self.optimizer.param_groups[0]['params'][0].dtype @@ -759,7 +760,7 @@ def increment_value(dictionary, key): elif start_index > current_index and start_index < (current_index + param_size): assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index set_key_value_list(self.param_to_partition_ids[i], @@ -803,7 +804,7 @@ def get_param_id(self, param): def report_ipg_memory_usage(self, tag, param_elems): 
elem_count = self.elements_in_ipg_bucket + param_elems percent_of_bucket_size = ( - 100.0 * elem_count) // self.reduce_bucket_size + 100.0 * elem_count) // self.reduce_bucket_size if self.verbose: report_memory_usage( f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" @@ -1491,7 +1492,7 @@ def get_partition_info(self, tensor_list, partition_size, partition_id): params_in_partition.append(tensor) assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index else: @@ -1799,7 +1800,7 @@ def step(self, closure=None): num_elements = shard_size assert shard_size * \ - num_shards <= partitioned_params[partition_id].numel() + num_shards <= partitioned_params[partition_id].numel() for shard_id in range(num_shards): @@ -2248,7 +2249,7 @@ def estimate_zero2_model_states_mem_needs(total_params, if cpu_offload: gpu_mem = 2 * total_params cpu_mem = total_params * \ - max(4 * total_gpus, 16) * additional_buffer_factor + max(4 * total_gpus, 16) * additional_buffer_factor else: gpu_mem = 4 * total_params + int(16 * total_params / total_gpus) cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor diff --git a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_3.py b/colossalai/nn/optimizer/zero_redundancy_optimizer_level_3.py index 6f5d7969c460..4e54f3cd3e62 100644 --- a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_3.py +++ b/colossalai/nn/optimizer/zero_redundancy_optimizer_level_3.py @@ -21,7 +21,7 @@ from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.partition_parameters import _init_external_params except ImportError: - print('DeepSpeed is required if you want to use ZeRO.') + pass from torch._six import inf from torch.distributed.distributed_c10d import _get_global_rank diff --git a/colossalai/registry/__init__.py b/colossalai/registry/__init__.py index 99aedc495899..1de1c98aea1e 100644 --- a/colossalai/registry/__init__.py +++ b/colossalai/registry/__init__.py @@ -20,3 +20,4 @@ PIPE_ALLOC_POLICY = Registry('pipeline_allocation_policy') SAMPLERS = Registry('samplers') LR_SCHEDULERS = Registry('lr_schedulers') +SCHEDULE = Registry('schedules') diff --git a/colossalai/trainer/__init__.py b/colossalai/trainer/__init__.py index 34e38d54a1fa..57f7b7495325 100644 --- a/colossalai/trainer/__init__.py +++ b/colossalai/trainer/__init__.py @@ -1,5 +1,5 @@ from ._trainer import Trainer from .hooks import * -from .metric import Loss, Accuracy2D, Accuracy3D, Accuracy2p5D +from .metric import Loss, Accuracy2D, Accuracy3D, Accuracy2p5D, LearningRate -__all__ = ['Trainer', 'Loss', 'Accuracy3D', 'Accuracy2D', 'Accuracy2p5D'] +__all__ = ['Trainer', 'Loss', 'Accuracy3D', 'Accuracy2D', 'Accuracy2p5D', 'LearningRate'] diff --git a/colossalai/trainer/_trainer.py b/colossalai/trainer/_trainer.py index 67334964040f..96a82d995817 100644 --- a/colossalai/trainer/_trainer.py +++ b/colossalai/trainer/_trainer.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional from typing import Union, List import torch @@ -10,12 +9,11 @@ from tqdm import tqdm from colossalai.builder import build_hooks -from colossalai.checkpointing import save_checkpoint, load_checkpoint, get_checkpoint_path -from colossalai.context import Config from colossalai.engine 
import Engine
 from colossalai.logging import get_global_dist_logger
-from colossalai.utils import get_global_multitimer, is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage
 from colossalai.nn.data import DataParallelSampler
+from colossalai.utils import MultiTimer
+from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage


 class Trainer:
@@ -30,43 +28,31 @@ class Trainer:
     :type hooks_cfg: Config, optional
     :type verbose: bool, optional
     """
+
     def __init__(self,
                  engine: Engine,
-                 hooks_cfg: Optional[Config] = None,
-                 verbose: bool = False):
+                 verbose: bool = False,
+                 timer: MultiTimer = None):
         # training-related params
         self._engine = engine
-        self._max_epochs = float('inf')
-        self._max_steps = float('inf')
+        self._max_epochs = 0
         self._cur_epoch = 0
+        self._max_steps = 0
         self._cur_step = 0
-
-        # data-related params
-        self._train_dataloader = None
-        self._test_dataloader = None
+        self._steps_per_epoch = 0

         # misc params
-        self._display_progress = False
         self._logger = get_global_dist_logger()
         self._verbose = verbose

         # hooks can store states in this dict, and could be consumed by other hooks
-        self.states = {}
+        self.states = dict()

         # build hooks
         self.hooks = list()
-        if hooks_cfg is not None:
-            for cfg in hooks_cfg:
-                hook = build_hooks(cfg, self)
-                self.hooks.append(hook)
-        self.hooks.sort(key=lambda hook: hook.priority)
-        if self._verbose:
-            for hook in self.hooks:
-                self._logger.info(
-                    f'build {hook.__class__.__name__} for train, priority = {hook.priority}', ranks=[0])

-        # timer
-        self._timer = get_global_multitimer()
+        # multi-timer for time benchmarking
+        self._timer = timer

     @property
     def cur_epoch(self):
@@ -74,13 +60,65 @@ def cur_epoch(self):
         """
         return self._cur_epoch

+    @cur_epoch.setter
+    def cur_epoch(self, epoch: int):
+        """Set how many epochs have been processed.
+        """
+        # allow setter for training resumption
+        self._cur_epoch = epoch
+
     @property
     def cur_step(self):
         """Returns how many iteration steps have been processed.
         """
         return self._cur_step

-    def call_hooks(self, func, output=None):
+    @property
+    def max_epochs(self):
+        return self._max_epochs
+
+    @property
+    def max_steps(self):
+        return self._max_steps
+
+    @property
+    def steps_per_epoch(self):
+        return self._steps_per_epoch
+
+    @property
+    def engine(self):
+        return self._engine
+
+    @engine.setter
+    def engine(self, engine_: Engine):
+        self._engine = engine_
+
+    def _set_current_step(self, epoch: int):
+        """Sets current step number.
+
+        :param epoch: Epoch number from which the current step is computed
+        :type epoch: int
+        """
+        self._cur_step = epoch * self._steps_per_epoch
+
+    def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
+        """Call timer function with a given timer name.
+
+        :param action: Function to be called on timer
+        :type action: str
+        :param item: Name of the timer
+        :type item: str
+        """
+
+        if self._timer is not None:
+            getattr(self._timer, action)(item, *args, **kwargs)
+
+    def _reset_states(self) -> None:
+        """Clear trainer states
+        """
+        self.states = dict()
+
+    def _call_hooks(self, func, output=None):
         """Calls specific hooks in the current time point.

         :param func: A string represents the time point
@@ -95,161 +133,186 @@ def call_hooks(self, func, output=None):
         else:
             getattr(hook, func)(*output)

-    def exceed_max_step(self):
-        """Checks whether the trainer exceeds the maximum number of running iterations.
-        """
-        return self._cur_step >= self._max_steps
-
-    def set_epoch(self, epoch):
-        """Sets current epoch number. 
- - :param epoch: Epoch number to be set - :type epoch: int + @staticmethod + def _should_display_progress(display_progress: bool): + """ Only display progress on DP rank 0, TP rank 0 and PP last rank """ - self._cur_epoch = epoch - - def _recover_steps(self): - step = self.cur_step * self._engine.schedule.num_steps - self._cur_step = step + return display_progress and is_dp_rank_0() and is_tp_rank_0() and is_no_pp_or_last_stage() - def _set_display_progress(self, display_progress: bool): - self._display_progress = display_progress and is_dp_rank_0( - ) and is_tp_rank_0() and is_no_pp_or_last_stage() - - def _train_epoch(self, epoch: int = None): + def _train_epoch(self, + train_dataloader: DataLoader, + epoch: int = None, + display_progress: bool = False): # set sampler epoch if epoch is not None and \ - hasattr(self._engine.train_dataloader, 'sampler') and \ - isinstance(self._engine.train_dataloader.sampler, DataParallelSampler): - self._engine.train_dataloader.sampler.set_epoch(epoch) + hasattr(train_dataloader, 'sampler') and \ + isinstance(train_dataloader.sampler, DataParallelSampler): + train_dataloader.sampler.set_epoch(epoch) + # set training state self._engine.train() - - progress = range(self._engine.schedule.num_steps) - if self._display_progress: + data_iter = iter(train_dataloader) + progress = range(self._steps_per_epoch) + if display_progress: if epoch is None: progress = tqdm(progress, desc='[Train]') else: progress = tqdm(progress, desc=f'[Epoch {epoch} train]') # train 1 epoch - self.call_hooks('before_train_epoch') - self._timer.start('train-epoch') - for _ in progress: - self._cur_step += 1 + self._call_hooks('before_train_epoch') + self._call_timer(action='start', item='train-epoch') + for i in progress: + self._call_hooks('before_train_iter') + self._call_timer(action='start', item='train-step') + + if i == self._steps_per_epoch - 1: + is_last_iteration = True + else: + is_last_iteration = False - self.call_hooks('before_train_iter') - self._timer.start('train-step') - logits, label, loss = self._engine.step() - self._timer.stop('train-step', keep_in_history=True) - self.call_hooks('after_train_iter', output=(logits, label, loss)) + # run 1 training step + logits, label, loss = self._engine.step(data_iter, is_last_iteration) + self._call_timer(action='stop', item='train-step', keep_in_history=True) + self._call_hooks('after_train_iter', output=(logits, label, loss)) - if self.exceed_max_step(): - # stop when max iter is reached + self._cur_step += 1 + + # stop when max iter is reached + if self._exceed_max_step(): break - self._timer.stop('train-epoch', keep_in_history=True) - self.call_hooks('after_train_epoch') - self._timer.reset('train-step') + + self._call_timer(action='stop', item='train-epoch', keep_in_history=True) + self._call_hooks('after_train_epoch') + self._call_timer(action='reset', item='train-step') def _eval(self, + test_dataloader: DataLoader, epoch: int = None, - return_loss: bool = True): + display_progress: bool = False): # switch engine status self._engine.eval() - self.call_hooks('before_test') + data_iter = iter(test_dataloader) + num_steps = len(test_dataloader) + + self._call_hooks('before_test') with torch.no_grad(): # prepare progress bar - progress = range(self._engine.schedule.num_steps) - if self._display_progress: + progress = range(num_steps) + if display_progress: desc = 'Evaluation' if epoch is not None: desc = '[Epoch %d val]' % epoch progress = tqdm(progress, desc=desc) - self.call_hooks('before_test_epoch') - 
self._timer.start('test-epoch')
+            self._call_hooks('before_test_epoch')
+            self._call_timer(action='start', item='test-epoch')
             for _ in progress:
-                self.call_hooks('before_test_iter')
-                self._timer.start('test-step')
-                logits, label, loss = self._engine.step(
-                    return_loss=return_loss)
-                self._timer.stop('test-step', keep_in_history=True)
-                self.call_hooks('after_test_iter',
-                                output=(logits, label, loss))
-            self._timer.stop('test-epoch', keep_in_history=True)
-            self.call_hooks('after_test_epoch')
-        self.call_hooks('after_test')
-        self._timer.reset('test-step')
-        self._timer.reset('test-epoch')
+                self._call_hooks('before_test_iter')
+                self._call_timer(action='start', item='test-step')
+                logits, label, loss = self._engine.step(data_iter, return_loss=True)
+                self._call_timer(action='stop', item='test-step', keep_in_history=True)
+                self._call_hooks('after_test_iter',
+                                 output=(logits, label, loss))
+            self._call_timer(action='stop', item='test-epoch', keep_in_history=True)
+            self._call_hooks('after_test_epoch')
+        self._call_hooks('after_test')
+        self._call_timer(action='reset', item='test-step')
+        self._call_timer(action='reset', item='test-epoch')
+
+    def _exceed_max_step(self):
+        return self._max_steps is not None and self._cur_step > self._max_steps

     def fit(self,
             train_dataloader: DataLoader,
-            test_dataloader: DataLoader = None,
-            max_epochs: int = None,
+            epochs: int,
             max_steps: int = None,
+            test_dataloader: DataLoader = None,
             test_interval: int = 1,
-            display_progress: bool = False):
+            hooks_cfg: dict = None,
+            display_progress: bool = False,
+            ):
         """Trains the model to fit training data.

         :param train_dataloader: DataLoader in training
-        :param test_dataloader: DataLoader in testing
-        :param max_epochs: Maximum number of epoches
+        :param epochs: Maximum number of epochs
         :param max_steps: Maximum number of running iterations
+        :param test_dataloader: DataLoader in testing
         :param test_interval: Interval of testing
+        :param hooks_cfg: A list of hook configurations
         :param display_progress: If True, the training progress will be printed
         :type train_dataloader: DataLoader
-        :type test_dataloader: DataLoader
-        :type max_epochs: int
+        :type epochs: int
         :type max_steps: int
+        :type test_dataloader: DataLoader
         :type test_interval: int
+        :type hooks_cfg: dict
         :type display_progress: bool
         """

-        # prepare dataloaders
-        self._train_dataloader = train_dataloader
-        self._engine.set_dataloader(self._train_dataloader, train=True)
-        self._engine.train()
+        # set epochs and steps, consider gradient accumulation
+        self._steps_per_epoch = len(train_dataloader) // self._engine.gradient_accumulation
+        self._max_steps = max_steps
+        self._max_epochs = epochs

+        # check if testing is required
         should_test = False
         if test_dataloader is not None:
-            self._test_dataloader = test_dataloader
-            self._engine.set_dataloader(self._test_dataloader, train=False)
             should_test = True

-        # decide the
-        if max_epochs is not None:
-            self._max_epochs = max_epochs
-        if max_steps is not None:
-            self._max_steps = max_steps
+        display_progress = self._should_display_progress(display_progress)

-        self._set_display_progress(display_progress)
+        # reset hooks
+        self._reset_states()
+        self.hooks = list()
+
+        # build hooks
+        if hooks_cfg is not None:
+            for cfg in hooks_cfg:
+                hook = build_hooks(cfg, self)
+                self.hooks.append(hook)
+        self.hooks.sort(key=lambda hook: hook.priority)
+        if self._verbose:
+            for hook in self.hooks:
+                self._logger.info(
+                    f'build {hook.__class__.__name__} for training, priority = 
{hook.priority}', ranks=[0]) + self._logger.info("Lower value means higher priority for calling hook function") # start train - self.call_hooks('before_train') + self._engine.train() + self._call_hooks('before_train') # recover step value if resuming training - if self.cur_epoch != 0: - self._recover_steps() - last_epoch = self._cur_epoch + if self.cur_epoch != 0: + self._set_current_step(last_epoch) - for epoch in range(last_epoch, self._max_epochs): - self._cur_epoch += 1 - + for epoch in range(last_epoch, epochs): # train for one epoch - self._train_epoch(epoch) + self._train_epoch( + train_dataloader=train_dataloader, + epoch=epoch, + display_progress=display_progress + ) # start eval if should_test and epoch % test_interval == 0: - self._eval(epoch, return_loss=True) + self._eval(test_dataloader=test_dataloader, + display_progress=display_progress, + epoch=epoch, + ) + + self._cur_epoch += 1 # check for termination - if self.exceed_max_step(): + if self._exceed_max_step(): self._logger.info( - f"Max number of steps {self._max_steps} has been reached, training is stopped automatically") + f"Max number of steps {max_steps} has been reached, training is stopped automatically") break - self.call_hooks('after_train') - self._timer.reset('train-epoch') + self._call_hooks('after_train') + self._call_timer('reset', 'train-epoch') def evaluate(self, test_dataloader: DataLoader, @@ -261,15 +324,13 @@ def evaluate(self, :type test_dataloader: DataLoader :type display_progress: bool, optional """ - # set dataloader - self._test_dataloader = test_dataloader - self._engine.set_dataloader(self._test_dataloader, train=True) - - # set - self._set_display_progress(display_progress) + # set display + display_progress = self._should_display_progress(display_progress) # eval - self._eval(return_loss=True) + self._eval(test_dataloader=test_dataloader, + display_progress=display_progress, + ) def predict(self, data: Union[Tensor, List[Tensor]]): """Uses trained model to make a prediction for a tensor or a tensor list. @@ -289,45 +350,6 @@ def predict(self, data: Union[Tensor, List[Tensor]]): # prepare a list of (data, label) to make it iterable # for compatibility with schedule simple_dataloader = [(data, None)] - self._engine.set_dataloader(simple_dataloader) - output, _, _ = self._engine.step(return_loss=False) + data_iter = iter(simple_dataloader) + output, _, _ = self._engine.step(data_iter, return_loss=False) return output - - def save(self, path: str, suffix: str = ''): - """Saves the model to a file. - - :param path: Relative path of the file - :param suffix: Suffix of the file - :type path: str - :type suffix: str, optional - """ - save_path = get_checkpoint_path(path, - self._cur_epoch, - suffix=suffix) - save_checkpoint(save_path, self._cur_epoch, self._engine.get_model(), - self._engine.get_optimizer(), - self._engine.get_lr_scheduler()) - - def load(self, - path: str, - finetune: bool = False, - strict: bool = False): - """Loads parameters to the model from a file. 
- - :param path: Relative path of the file - :param finetune: Whether allows to load a part of the model - :param strict: Whether loads a model that has the same shape of parameters - :type path: str - :type finetune: bool, optional - :type strict: bool, optional - """ - last_epoch, _ = load_checkpoint(path, - self._engine.get_model(), - self._engine.get_optimizer(), - self._engine.get_lr_scheduler(), - finetune=finetune, - strict=strict) - if finetune: - self.set_epoch(0) - else: - self.set_epoch(last_epoch) diff --git a/colossalai/trainer/hooks/__init__.py b/colossalai/trainer/hooks/__init__.py index 2cc3c78b76ef..952bef8b99fc 100644 --- a/colossalai/trainer/hooks/__init__.py +++ b/colossalai/trainer/hooks/__init__.py @@ -2,10 +2,12 @@ from ._checkpoint_hook import SaveCheckpointHook, LoadCheckpointHook from ._metric_hook import LossHook, Accuracy2DHook, AccuracyHook, MetricHook from ._log_hook import LogMetricByEpochHook, TensorboardHook, LogTimingByEpochHook, LogMemoryByEpochHook +from ._lr_scheduler_hook import LRSchedulerHook __all__ = [ 'BaseHook', 'MetricHook', 'LoadCheckpointHook', 'SaveCheckpointHook', 'LossHook', 'AccuracyHook', 'Accuracy2DHook', 'LogMetricByEpochHook', 'TensorboardHook', 'LogTimingByEpochHook', 'LogMemoryByEpochHook', + 'LRSchedulerHook' ] diff --git a/colossalai/trainer/hooks/_checkpoint_hook.py b/colossalai/trainer/hooks/_checkpoint_hook.py index 49fd289480ff..e1d9d4714277 100644 --- a/colossalai/trainer/hooks/_checkpoint_hook.py +++ b/colossalai/trainer/hooks/_checkpoint_hook.py @@ -3,13 +3,13 @@ import os.path as osp -import torch.distributed as dist - -from colossalai.checkpointing import get_latest_checkpoint_path, get_checkpoint_path from colossalai.registry import HOOKS -from colossalai.trainer.hooks import BaseHook from colossalai.trainer import Trainer +from colossalai.trainer.hooks import BaseHook from colossalai.utils import is_dp_rank_0 +from colossalai.utils.checkpointing import get_latest_checkpoint_path, get_checkpoint_path +from colossalai.utils.checkpointing import save_checkpoint, load_checkpoint +from ._lr_scheduler_hook import LRSchedulerHook @HOOKS.register_module @@ -33,7 +33,7 @@ def __init__(self, interval: int = 1, checkpoint_dir: str = None, suffix: str = '', - priority: int = 0): + priority: int = 10): super().__init__(trainer=trainer, priority=priority) assert isinstance(trainer, Trainer), \ f'SaveCheckpointHook expects a Trainer, got {type(trainer)}' @@ -41,6 +41,16 @@ def __init__(self, self.checkpoint_dir = checkpoint_dir self.suffix = suffix + # get lr scheduler from the LRSchedulerHook before train + self._lr_scheduler = None + + def before_train(self): + # check if lr scheduler is present in LRSchedulerHook + for hook in self.trainer.hooks: + if isinstance(hook, LRSchedulerHook): + self._lr_scheduler = hook.lr_scheduler + break + def after_train_epoch(self): """Saves the model after a training epoch. 
""" @@ -48,14 +58,18 @@ def after_train_epoch(self): if self.trainer.cur_epoch % self.interval == 0: # only gpus with data parallel rank equals to 0 write to the disk if is_dp_rank_0(): - self.trainer.save(path=self.checkpoint_dir, suffix=self.suffix) + save_path = get_checkpoint_path(self.checkpoint_dir, + self.trainer.cur_epoch, + suffix=self.suffix) + + save_checkpoint(save_path, + self.trainer.cur_epoch, + self.trainer.engine.model, + self.trainer.engine.optimizer, + self._lr_scheduler) self.logger.info( f'checkpoint for epoch {self.trainer.cur_epoch} is saved to {self.checkpoint_dir}') - # wait until everyone is done - if dist.is_initialized(): - dist.barrier() - @HOOKS.register_module class LoadCheckpointHook(BaseHook): @@ -81,30 +95,46 @@ def __init__(self, epoch: int = -1, finetune: bool = False, strict: bool = False, - priority: int = 10) -> None: + suffix: str = '', + priority: int = 0) -> None: + super().__init__(trainer=trainer, priority=priority) assert isinstance(trainer, Trainer), \ f'LoadLatestCheckpointHook excepts a Trainer, got {type(trainer)}' self.epoch = epoch self.checkpoint_dir = checkpoint_dir self.finetune = finetune + self.suffix = suffix self.strict = strict - super().__init__(trainer=trainer, priority=priority) def before_train(self): """Loads parameters to the model before training. """ + # check if lr scheduler is present in LRSchedulerHook + lr_scheduler = None + for hook in self.trainer.hooks: + if isinstance(hook, LRSchedulerHook): + lr_scheduler = hook.lr_scheduler + break + + # use latest checkpoint if epoch = -1 if self.epoch == -1: - path = get_latest_checkpoint_path(self.checkpoint_dir) + path = get_latest_checkpoint_path(self.checkpoint_dir, suffix=self.suffix) else: - path = get_checkpoint_path(self.checkpoint_dir, epoch=self.epoch) + path = get_checkpoint_path(self.checkpoint_dir, epoch=self.epoch, suffix=self.suffix) + if osp.exists(path): - self.trainer.load( - path, finetune=self.finetune, strict=self.strict) + last_epoch, _ = load_checkpoint(path, + self.trainer.engine.model, + self.trainer.engine.optimizer, + lr_scheduler, + finetune=self.finetune, + strict=self.strict) + if self.finetune: + self.trainer.cur_epoch = 0 + else: + self.trainer.cur_epoch = last_epoch + self.logger.info( f'loaded checkpoint from {path}') else: raise FileNotFoundError(f'checkpoint is not found at {path}') - - # Some utilities want to load a checkpoint without distributed being initialized - if dist.is_initialized(): - dist.barrier() diff --git a/colossalai/trainer/hooks/_log_hook.py b/colossalai/trainer/hooks/_log_hook.py index d7ed4bf56b2e..3c3fdfc43ef8 100644 --- a/colossalai/trainer/hooks/_log_hook.py +++ b/colossalai/trainer/hooks/_log_hook.py @@ -5,7 +5,7 @@ import os.path as osp import torch -from tensorboardX import SummaryWriter +from torch.utils.tensorboard import SummaryWriter from colossalai.context import ParallelMode from colossalai.core import global_context as gpc @@ -13,7 +13,7 @@ from colossalai.trainer._trainer import Trainer from colossalai.utils import get_global_multitimer, set_global_multitimer_status, report_memory_usage, is_dp_rank_0, \ is_tp_rank_0, is_no_pp_or_last_stage -from ._metric_hook import MetricHook +from ._base_hook import BaseHook def _format_number(val): @@ -24,7 +24,7 @@ def _format_number(val): return val -class EpochIntervalHook(MetricHook): +class EpochIntervalHook(BaseHook): def __init__(self, trainer: Trainer, interval: int = 1, priority: int = 1): super().__init__(trainer, priority) self._interval = interval @@ -45,7 
+45,7 @@ class LogMetricByEpochHook(EpochIntervalHook): :type priority: int, optional """ - def __init__(self, trainer: Trainer, interval: int = 1, priority: int = 1) -> None: + def __init__(self, trainer: Trainer, interval: int = 1, priority: int = 10) -> None: super().__init__(trainer=trainer, interval=interval, priority=priority) self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0() and is_no_pp_or_last_stage() @@ -74,7 +74,7 @@ def after_test_epoch(self): @HOOKS.register_module -class TensorboardHook(MetricHook): +class TensorboardHook(BaseHook): """Specialized Hook to record the metric to Tensorboard. :param trainer: Trainer attached with current hook @@ -85,59 +85,71 @@ class TensorboardHook(MetricHook): :type priority: int, optional """ - def __init__(self, trainer: Trainer, log_dir: str, priority: int = 1) -> None: + def __init__(self, + trainer: Trainer, + log_dir: str, + dp_rank_0_only: bool = True, + tp_rank_0_only: bool = True, + priority: int = 10, + ) -> None: super().__init__(trainer=trainer, priority=priority) - self._is_rank_to_log = is_no_pp_or_last_stage() - if self._is_rank_to_log: + # create log dir + if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0: + os.makedirs(log_dir, exist_ok=True) + + # determine the ranks to generate tensorboard logs + self._is_valid_rank_to_log = is_no_pp_or_last_stage() + + if dp_rank_0_only: + self._is_valid_rank_to_log = self._is_valid_rank_to_log and is_dp_rank_0() + + if tp_rank_0_only: + self._is_valid_rank_to_log = self._is_valid_rank_to_log and is_tp_rank_0() + + if self._is_valid_rank_to_log: # create workspace on only one rank if gpc.is_initialized(ParallelMode.GLOBAL): rank = gpc.get_global_rank() else: rank = 0 - log_dir = osp.join(log_dir, f'rank_{rank}') - # create workspace - if not osp.exists(log_dir): - os.makedirs(log_dir) + log_dir = osp.join(log_dir, f'rank_{rank}') + os.makedirs(log_dir, exist_ok=True) self.writer = SummaryWriter( log_dir=log_dir, filename_suffix=f'_rank_{rank}') - def after_train_iter(self, *args): - for metric_name, metric_calculator in self.trainer.states['metrics']['train'].items(): + def _log_by_iter(self, mode: str): + for metric_name, metric_calculator in self.trainer.states['metrics'][mode].items(): if metric_calculator.epoch_only: continue val = metric_calculator.get_last_step_value() - if self._is_rank_to_log: - self.writer.add_scalar( - f'{metric_name}/train', val, self.trainer.cur_step) - def after_test_iter(self, *args): - for metric_name, metric_calculator in self.trainer.states['metrics']['test'].items(): - if metric_calculator.epoch_only: - continue - val = metric_calculator.get_last_step_value() - if self._is_rank_to_log: - self.writer.add_scalar(f'{metric_name}/test', val, + if self._is_valid_rank_to_log: + self.writer.add_scalar(f'{metric_name}/{mode}', val, self.trainer.cur_step) - def after_test_epoch(self): - for metric_name, metric_calculator in self.trainer.states['metrics']['test'].items(): + def _log_by_epoch(self, mode: str): + for metric_name, metric_calculator in self.trainer.states['metrics'][mode].items(): if metric_calculator.epoch_only: val = metric_calculator.get_accumulated_value() - if self._is_rank_to_log: - self.writer.add_scalar(f'{metric_name}/test', val, + if self._is_valid_rank_to_log: + self.writer.add_scalar(f'{metric_name}/{mode}', val, self.trainer.cur_step) + def after_test_iter(self, *args): + self._log_by_iter(mode='test') + + def after_test_epoch(self): + self._log_by_epoch(mode='test') + + def after_train_iter(self, *args): + 
self._log_by_iter(mode='train') + def after_train_epoch(self): - for metric_name, metric_calculator in self.trainer.states['metrics']['train'].items(): - if metric_calculator.epoch_only: - val = metric_calculator.get_accumulated_value() - if self._is_rank_to_log: - self.writer.add_scalar(f'{metric_name}/train', val, - self.trainer.cur_step) + self._log_by_epoch(mode='train') @HOOKS.register_module @@ -157,7 +169,7 @@ class LogTimingByEpochHook(EpochIntervalHook): def __init__(self, trainer: Trainer, interval: int = 1, - priority: int = 1, + priority: int = 10, log_eval: bool = True ) -> None: super().__init__(trainer=trainer, interval=interval, priority=priority) @@ -217,7 +229,7 @@ class LogMemoryByEpochHook(EpochIntervalHook): def __init__(self, trainer: Trainer, interval: int = 1, - priority: int = 1, + priority: int = 10, log_eval: bool = True ) -> None: super().__init__(trainer=trainer, interval=interval, priority=priority) diff --git a/colossalai/trainer/hooks/_lr_scheduler_hook.py b/colossalai/trainer/hooks/_lr_scheduler_hook.py new file mode 100644 index 000000000000..ca483aebe14b --- /dev/null +++ b/colossalai/trainer/hooks/_lr_scheduler_hook.py @@ -0,0 +1,58 @@ +from torch import Tensor + +from colossalai.builder import build_lr_scheduler +from colossalai.registry import HOOKS +from ._metric_hook import MetricHook +from .._trainer import Trainer +from ..metric import LearningRate + + +@HOOKS.register_module +class LRSchedulerHook(MetricHook): + """Build LR scheduler + + :param trainer: Trainer attached with current hook + :type trainer: Trainer + :param lr_scheduler_cfg: The config of LR scheduler + :type lr_scheduler_cfg: dict + :param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch. Defaults to `True`. 
+    :type by_epoch: bool
+    :param store_lr_in_state: Whether the current learning rate is recorded in the trainer states, defaults to `True`
+    :type store_lr_in_state: bool, optional
+    :param priority: Priority in the hook execution order; hooks with a smaller priority value are executed first
+    :type priority: int, optional
+    """
+
+    def __init__(self,
+                 trainer: Trainer,
+                 lr_scheduler_cfg: dict,
+                 by_epoch: bool = True,
+                 store_lr_in_state: bool = True,
+                 priority: int = 1,
+                 ):
+        super().__init__(trainer=trainer, priority=priority)
+        self.by_epoch = by_epoch
+
+        if by_epoch:
+            total_steps = trainer.max_epochs
+        else:
+            total_steps = trainer.max_epochs * trainer.steps_per_epoch
+            if trainer.max_steps is not None:
+                total_steps = min(total_steps, trainer.max_steps)
+
+        lr_scheduler_cfg['total_steps'] = total_steps
+
+        self.lr_scheduler = build_lr_scheduler(
+            lr_scheduler_cfg, trainer.engine.optimizer)
+
+        if store_lr_in_state:
+            self.trainer.states['metrics']['train']['lr'] = LearningRate(epoch_only=by_epoch,
+                                                                         initial_lr=self.lr_scheduler.get_lr()[0])
+
+    def after_train_epoch(self):
+        if self.by_epoch:
+            self.lr_scheduler.step()
+            self.trainer.states['metrics']['train']['lr'].update(self.lr_scheduler.get_lr()[0])
+
+    def after_train_iter(self, output: Tensor, label: Tensor, loss: Tensor):
+        if not self.by_epoch:
+            self.lr_scheduler.step()
+            self.trainer.states['metrics']['train']['lr'].update(self.lr_scheduler.get_lr()[0])
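For reference, the sketch below restates the `total_steps` arithmetic from `LRSchedulerHook.__init__` as a standalone
snippet; the numbers are made up, and the plain variables stand in for the corresponding `trainer` attributes.

```python
# Mirrors the by_epoch branch of LRSchedulerHook.__init__ (illustration only).
max_epochs = 60        # stand-in for trainer.max_epochs
steps_per_epoch = 391  # stand-in for trainer.steps_per_epoch
max_steps = None       # stand-in for trainer.max_steps

by_epoch = False
if by_epoch:
    # the scheduler is stepped once per epoch
    total_steps = max_epochs
else:
    # the scheduler is stepped once per training iteration
    total_steps = max_epochs * steps_per_epoch
    if max_steps is not None:
        total_steps = min(total_steps, max_steps)

print(total_steps)  # 23460 when stepping per iteration
```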
diff --git a/colossalai/trainer/hooks/_metric_hook.py b/colossalai/trainer/hooks/_metric_hook.py
index 241ec63d3d73..8c3478c71336 100644
--- a/colossalai/trainer/hooks/_metric_hook.py
+++ b/colossalai/trainer/hooks/_metric_hook.py
@@ -21,9 +21,12 @@ class MetricHook(BaseHook):
     :type priority: int
     """

-    def __init__(self, trainer: Trainer, priority: int):
+    def __init__(self,
+                 trainer: Trainer,
+                 priority: int,
+                 ):
         super().__init__(trainer, priority)
-        self._is_stage_to_log = is_no_pp_or_last_stage()
+        self._is_stage_to_compute = is_no_pp_or_last_stage()
         self._check_metric_states_initialization()

     def _check_metric_states_initialization(self):
@@ -41,33 +44,34 @@ class LossHook(MetricHook):
     :type priority: int, optional
     """

-    def __init__(self, trainer: Trainer, priority: int = 10):
+    def __init__(self, trainer: Trainer, priority: int = 0):
         super().__init__(trainer, priority)

-        if self._is_stage_to_log:
-            self.metric = Loss(epoch_only=False)
+        if self._is_stage_to_compute:
+            self.train_loss = Loss(epoch_only=False)
+            self.test_loss = Loss(epoch_only=True)

             # register the metric calculator
             self.trainer.states['metrics']['train'][
-                self.metric.__class__.__name__] = self.metric
+                self.train_loss.__class__.__name__] = self.train_loss
             self.trainer.states['metrics']['test'][
-                self.metric.__class__.__name__] = self.metric
+                self.test_loss.__class__.__name__] = self.test_loss

     def before_train_epoch(self):
-        if self._is_stage_to_log:
-            self.metric.reset()
+        if self._is_stage_to_compute:
+            self.train_loss.reset()

     def after_train_iter(self, logits, label, loss):
-        if self._is_stage_to_log:
-            self.metric.update(loss)
+        if self._is_stage_to_compute:
+            self.train_loss.update(loss)

     def before_test_epoch(self):
-        if self._is_stage_to_log:
-            self.metric.reset()
+        if self._is_stage_to_compute:
+            self.test_loss.reset()

     def after_test_iter(self, logits, label, loss):
-        if self._is_stage_to_log:
-            self.metric.update(loss)
+        if self._is_stage_to_compute:
+            self.test_loss.update(loss)


 @HOOKS.register_module
@@ -81,10 +85,10 @@ class Accuracy2DHook(MetricHook):
     :type priority: int, optional
     """

-    def __init__(self, trainer: Trainer, priority: int = 10):
+    def __init__(self, trainer: Trainer, priority: int = 0):
         super().__init__(trainer, priority)

-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric = Accuracy2D(epoch_only=True)

             # register the metric
@@ -92,20 +96,20 @@ def __init__(self, trainer: Trainer, priority: int = 10):
                 self.metric.__class__.__name__] = self.metric

     def before_test(self):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.reset()

     def after_test_iter(self, logits, label, *args):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.update(logits, label)


 @HOOKS.register_module
 class Accuracy2p5DHook(MetricHook):
-    def __init__(self, trainer: Trainer, priority: int = 10):
+    def __init__(self, trainer: Trainer, priority: int = 0):
         super().__init__(trainer, priority)

-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric = Accuracy2p5D(epoch_only=True)

             # register the metric
@@ -113,11 +117,11 @@ def __init__(self, trainer: Trainer, priority: int = 10):
                 self.metric.__class__.__name__] = self.metric

     def before_test(self):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.reset()

     def after_test_iter(self, logits, label, *args):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.update(logits, label)


@@ -138,7 +142,7 @@ def __init__(self,
                  priority: int = 10):
         super().__init__(trainer, priority)

-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric = Accuracy3D(epoch_only=True,
                                      input_parallel_mode=input_parallel_mode,
                                      weight_parallel_mode=weight_parallel_mode)
@@ -148,11 +152,11 @@ def __init__(self,
                 self.metric.__class__.__name__] = self.metric

     def before_test(self):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.reset()

     def after_test_iter(self, logits, label, *args):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.update(logits, label)


@@ -166,10 +170,10 @@ class AccuracyHook(MetricHook):
     :type priority: int
     """

-    def __init__(self, trainer: Trainer, priority: int = 10):
+    def __init__(self, trainer: Trainer, priority: int = 0):
         super().__init__(trainer, priority)

-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric = Accuracy(epoch_only=True)

             # register the metric
@@ -177,9 +181,9 @@ def __init__(self, trainer: Trainer, priority: int = 10):
                 self.metric.__class__.__name__] = self.metric

     def before_test(self):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.reset()

     def after_test_iter(self, logits, label, *args):
-        if self._is_stage_to_log:
+        if self._is_stage_to_compute:
             self.metric.update(logits, label)
diff --git a/colossalai/trainer/metric.py b/colossalai/trainer/metric.py
index 744e0e03a6ac..b595d37b823c 100644
--- a/colossalai/trainer/metric.py
+++ b/colossalai/trainer/metric.py
@@ -126,6 +126,33 @@ def is_better(a, b):
         return a < b


+class LearningRate(Metric):
+    """A metric collector for the learning rate.
+
+    :param epoch_only: Whether the metric is only read once per epoch
+    :type epoch_only: bool
+    :param initial_lr: Initial learning rate reported before the first update, defaults to 0.0
+    :type initial_lr: float, optional
+    """
+
+    def __init__(self, epoch_only: bool, initial_lr: float = 0.):
+        super().__init__(epoch_only=epoch_only)
+        self.lr = initial_lr
+
+    def reset(self) -> None:
+        pass
+
+    def update(self, lr) -> None:
+        self.lr = lr
+
+    def get_last_step_value(self):
+        return self.lr
+
+    def get_accumulated_value(self):
+        return self.lr
+
+    def is_better(a, b) -> bool:
+        # a learning rate has no notion of "better", so no comparison is made
+        pass
+
+
 class Accuracy(Metric):
     """A metric collector for accuracy. It only works for
     classification tasks.
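To see how the new `LearningRate` metric behaves in isolation, here is a minimal, self-contained sketch; the
stripped-down `Metric` base class and the bare `states` dict are stand-ins for the real `colossalai.trainer`
machinery and are not part of the patch.

```python
class Metric:
    """Stripped-down stand-in for colossalai.trainer.metric.Metric."""

    def __init__(self, epoch_only: bool):
        self.epoch_only = epoch_only


class LearningRate(Metric):
    """Records the latest learning rate, mirroring the class added above."""

    def __init__(self, epoch_only: bool, initial_lr: float = 0.):
        super().__init__(epoch_only=epoch_only)
        self.lr = initial_lr

    def update(self, lr) -> None:
        self.lr = lr

    def get_last_step_value(self):
        return self.lr


# stand-in for trainer.states['metrics'], the dict read by the logging hooks
states = {'metrics': {'train': {}}}
states['metrics']['train']['lr'] = LearningRate(epoch_only=False, initial_lr=0.1)

# an LR scheduler hook calls update() after each scheduler step
states['metrics']['train']['lr'].update(0.05)
print(states['metrics']['train']['lr'].get_last_step_value())  # 0.05
```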
diff --git a/colossalai/checkpointing.py b/colossalai/utils/checkpointing.py similarity index 98% rename from colossalai/checkpointing.py rename to colossalai/utils/checkpointing.py index 17db1a1a5316..d2cf050cca9c 100644 --- a/colossalai/checkpointing.py +++ b/colossalai/utils/checkpointing.py @@ -5,9 +5,9 @@ import torch -from .context import Config -from .context.parallel_mode import ParallelMode -from .core import global_context as gpc +from colossalai.context import Config +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc __all__ = [ 'get_checkpoint_path', diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 1496e77ac78f..d8c6663ba626 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -27,7 +27,7 @@ def sync_model_param_in_dp(model): :param model: A pyTorch nn.model on whose parameters you check the consistency ''' - if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 2: + if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1: for param in model.parameters(): ranks = gpc.get_ranks_in_group(ParallelMode.DATA) dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.DATA)) diff --git a/configs/resnet/resnet50.py b/configs/resnet/resnet50.py index 57b8b83047ef..d5ecbdfef2ca 100644 --- a/configs/resnet/resnet50.py +++ b/configs/resnet/resnet50.py @@ -4,6 +4,7 @@ IMG_SIZE = 224 BATCH_SIZE = 256 +NUM_EPOCHS = 100 model = dict( type='VanillaResNet', @@ -67,8 +68,6 @@ type='CrossEntropyLoss' ) -max_epochs = 100 - from colossalai.engine import AMP_TYPE fp16 = dict( diff --git a/configs/sample_config.py b/configs/sample_config.py index bfc2d68e277c..b9768d2c1258 100644 --- a/configs/sample_config.py +++ b/configs/sample_config.py @@ -1,21 +1,20 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +NUM_EPOCH = int + model = dict() train_data = dict() test_data = dict() optimizer = dict() loss = dict() -lr_scheduler = dict() fp16 = dict() zero = dict() gradient_handler = [] parallel = dict() - -num_epochs = int -num_steps = int +hooks = [] cudnn_benchmark = True cudnn_deterministic = False diff --git a/configs/vit/vit_2d.py b/configs/vit/vit_2d.py index 9d09eda2c016..f36a03accb80 100644 --- a/configs/vit/vit_2d.py +++ b/configs/vit/vit_2d.py @@ -8,10 +8,11 @@ IMG_SIZE = 32 PATCH_SIZE = 4 DIM = 512 -NUM_ATTENTION_HEADS = 8 +NUM_ATTENTION_HEADS = 2 SUMMA_DIM = 2 NUM_CLASSES = 10 -DEPTH = 6 +DEPTH = 1 +NUM_EPOCHS = 60 train_data = dict( dataset=dict( @@ -127,14 +128,22 @@ dict(type='LogMetricByEpochHook'), dict(type='Accuracy2DHook'), dict(type='LossHook'), - dict(type='TensorboardHook', log_dir='./tfb_logs'), + dict( + type='LRSchedulerHook', + by_epoch=True, + lr_scheduler_cfg=dict( + type='LinearWarmupLR', + warmup_steps=5 + ) + ), + dict(type='TensorboardHook', log_dir='./tb_logs'), # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') ] parallel = dict( pipeline=dict(size=1), - tensor=dict(size=4, mode='2d'), + tensor=dict(size=1, mode='2d'), ) # for fp16 training @@ -144,17 +153,11 @@ # initial_scale=2 ** 8 # ) -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 -) - # only needed when pipeline parallel is used # schedule = dict( # num_microbatches=8 # ) -num_epochs = 60 logging = dict( root_path='./logs' diff --git a/configs/vit/vit_3d.py b/configs/vit/vit_3d.py index 037e2c15ebb4..ea605dac8b84 100644 --- 
a/configs/vit/vit_3d.py +++ b/configs/vit/vit_3d.py @@ -14,6 +14,7 @@ BATCH_SIZE = 512 IMG_SIZE = 32 +NUM_EPOCHS = 60 train_data = dict( dataset=dict( @@ -83,6 +84,14 @@ ), dict(type='LossHook'), dict(type='TensorboardHook', log_dir='./tfb_logs'), + dict( + type='LRSchedulerHook', + by_epoch=True, + lr_scheduler_cfg=dict( + type='LinearWarmupLR', + warmup_steps=5 + ) + ), # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') ] @@ -97,13 +106,6 @@ initial_scale=2 ** 8 ) -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 -) - -num_epochs = 60 - logging = dict( root_path='./logs' ) diff --git a/docs/colossalai/colossalai.engine.amp.amp_type.rst b/docs/colossalai/colossalai.engine.amp.amp_type.rst new file mode 100644 index 000000000000..ec1afdfa69e2 --- /dev/null +++ b/docs/colossalai/colossalai.engine.amp.amp_type.rst @@ -0,0 +1,5 @@ +colossalai.engine.amp.amp\_type +=============================== + +.. automodule:: colossalai.engine.amp.amp_type + :members: diff --git a/docs/colossalai/colossalai.engine.amp.grad_scaler.rst b/docs/colossalai/colossalai.engine.amp.grad_scaler.rst new file mode 100644 index 000000000000..752079eabc97 --- /dev/null +++ b/docs/colossalai/colossalai.engine.amp.grad_scaler.rst @@ -0,0 +1,5 @@ +colossalai.engine.amp.grad\_scaler +================================== + +.. automodule:: colossalai.engine.amp.grad_scaler + :members: diff --git a/docs/colossalai/colossalai.engine.amp.rst b/docs/colossalai/colossalai.engine.amp.rst new file mode 100644 index 000000000000..987f27f6a76f --- /dev/null +++ b/docs/colossalai/colossalai.engine.amp.rst @@ -0,0 +1,12 @@ +colossalai.engine.amp +===================== + +.. automodule:: colossalai.engine.amp + :members: + + +.. toctree:: + :maxdepth: 2 + + colossalai.engine.amp.amp_type + colossalai.engine.amp.grad_scaler diff --git a/docs/colossalai/colossalai.engine.amp_type.rst b/docs/colossalai/colossalai.engine.amp_type.rst deleted file mode 100644 index 8121b993363f..000000000000 --- a/docs/colossalai/colossalai.engine.amp_type.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.engine.amp\_type -=========================== - -.. automodule:: colossalai.engine.amp_type - :members: diff --git a/docs/colossalai/colossalai.engine.rst b/docs/colossalai/colossalai.engine.rst index 1cd4733b840c..915be4c98b7e 100644 --- a/docs/colossalai/colossalai.engine.rst +++ b/docs/colossalai/colossalai.engine.rst @@ -7,11 +7,6 @@ colossalai.engine .. toctree:: :maxdepth: 2 + colossalai.engine.amp colossalai.engine.gradient_handler colossalai.engine.schedule - - -.. toctree:: - :maxdepth: 2 - - colossalai.engine.amp_type diff --git a/docs/colossalai/colossalai.rst b/docs/colossalai/colossalai.rst index 414ee8120849..a4d4656fdf70 100644 --- a/docs/colossalai/colossalai.rst +++ b/docs/colossalai/colossalai.rst @@ -21,7 +21,6 @@ colossalai .. toctree:: :maxdepth: 2 - colossalai.checkpointing colossalai.constants colossalai.core colossalai.initialize diff --git a/docs/colossalai/colossalai.utils.checkpointing.rst b/docs/colossalai/colossalai.utils.checkpointing.rst new file mode 100644 index 000000000000..534a581d5364 --- /dev/null +++ b/docs/colossalai/colossalai.utils.checkpointing.rst @@ -0,0 +1,5 @@ +colossalai.utils.checkpointing +============================== + +.. 
automodule:: colossalai.utils.checkpointing
+   :members:
diff --git a/docs/colossalai/colossalai.utils.rst b/docs/colossalai/colossalai.utils.rst
index bfe62172f233..7f712e31379f 100644
--- a/docs/colossalai/colossalai.utils.rst
+++ b/docs/colossalai/colossalai.utils.rst
@@ -9,6 +9,7 @@ colossalai.utils
    :maxdepth: 2

    colossalai.utils.activation_checkpoint
+   colossalai.utils.checkpointing
    colossalai.utils.common
    colossalai.utils.cuda
    colossalai.utils.memory
diff --git a/docs/parallelization.md b/docs/parallelization.md
index ca98d542b708..0c1e70bfee13 100644
--- a/docs/parallelization.md
+++ b/docs/parallelization.md
@@ -17,38 +17,40 @@ parallel = dict(
 )
 ```

-The name of the dictionary variable should be **parallel**. All the arguments even **parallel** itself are optional and data,
-pipeline, tensor parallel size will be set to defaulted value 1. The value of data, pipeline and tensor can be a int
-representing the size of specific parallel dimension or a dictionary with a key called "size". The key "mode"
+The name of the dictionary variable should be **parallel**. All the arguments, even **parallel** itself, are optional,
+and the data, pipeline and tensor parallel sizes default to 1. The value of data, pipeline and tensor can be an int
+representing the size of the specific parallel dimension, or a dictionary with a key called "size". The key "mode"
 represents the way of tensor parallelism.

 ## Data Parallel

-Data parallel is the most common way to distribute your training task by splitting data into several shards and train
-on a single shard on each device. The configuration for data parallel is detected automatically and set for you. You do
-not have to explicitly set them in your configurations. When data parallel size is larger than 1, Colossal-AI automatically
+Data parallelism is the most common way to distribute your training task: the data is split into several shards and
+each device trains on a single shard. The configuration for data parallelism is detected automatically and set for
+you, so you do not have to set it explicitly. When the data parallel size is larger than 1, Colossal-AI automatically
 adds the distributed data sampler to the dataloader to shard the dataset.

 ## 1D, 2D, 2.5D and 3D Parallel

-To enable hybrid parallelism, we provide an array of tensor parallelism. We provide the list of papers which match each
+To enable hybrid parallelism, we provide an array of tensor parallelism methods. Below we list the papers behind each
 tensor parallel method. These parallel modes need to work with the distributed layers provided by Colossal-AI.
-- 1D: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
+
+- 1D: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)

 - 2D: [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343)
-2D parallel relies on the SUMMA matrix multiplication algorithm and splits the input data,
-model weights and layer outputs along two different dimensions. The tensor chunks are distributed over a 2D mesh of $P = N^2$
-devices where $N$ is the number of tensor chunks in a single dimension.
+  2D parallel relies on the SUMMA matrix multiplication algorithm and splits the input data, model weights and layer
+  outputs along two different dimensions. The tensor chunks are distributed over a 2D mesh of $P = N^2$ devices where
+  $N$ is the number of tensor chunks in a single dimension.

 - 2.5D: [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500)
-Inspired by the 2.5D matrix multiplication algorithm, 2.5D parallel introduces a novel tensor parallelism which further
-parallelizes 2D tensor parallelism. An amount of $P = N^2 ∗ d$ processors are arranged into $d$ layers,
-where each layer performs matrix multiplication operations independently with a dimension $N$.
+  Inspired by the 2.5D matrix multiplication algorithm, 2.5D parallel introduces a novel tensor parallelism which
+  further parallelizes 2D tensor parallelism. A total of $P = N^2 ∗ d$ processors are arranged into $d$ layers, where
+  each layer performs matrix multiplication operations independently with a dimension $N$.

 - 3D: [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450)
-We also introduce a 3D tensor parallelism that parallelizes neural networks on a 3D processor cube. This method achieves
-the optimal, $O(P^{1/3})$ communication overhead on $P$ processors, while both computation and memory usage are evenly distributed
-through optimized load balancing of parameters as well as activations.
+  We also introduce a 3D tensor parallelism that parallelizes neural networks on a 3D processor cube. This method
+  achieves the optimal, $O(P^{1/3})$ communication overhead on $P$ processors, while both computation and memory usage
+  are evenly distributed through optimized load balancing of parameters as well as activations.

 ```python
 # 1D parallel
@@ -78,12 +80,12 @@ parallel = dict(

 ## Pipeline Parallel (experimental)

-Pipeline parallelism is to split the model into several partitions by layer. For example, let's assume we have a simple
-model which consists of two linear layer. We have two GPUs, and we can allocate the first linear layer to the first GPU
+Pipeline parallelism splits the model into several partitions by layer. For example, let's assume we have a simple
+model which consists of two linear layers. We have two GPUs, and we can allocate the first linear layer to the first GPU
 and the second layer to the second GPU. This example of course wastes the computing resources and is only to demonstrate
-the idea of pipeline parallelism.
+the idea of pipeline parallelism.

-As PyTorch is based on dynamic computation graph, the computation flow is not known until execution. To support pipeline
+As PyTorch is based on a dynamic computation graph, the computation flow is not known until execution. To support pipeline
 parallelism in PyTorch, you may need to add one more attribute, `layers_cfg` in your model class which tells Colossal-AI
 the sequence of execution. One example you can refer to is `colossalai.nn.model.VanillaResNet`.
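+
+For a flavour of what this looks like, below is a simplified, hypothetical sketch of the two-linear-layer model from
+the paragraph above, annotated with a `layers_cfg` attribute. The exact schema Colossal-AI expects is the one used in
+`colossalai.nn.model.VanillaResNet`, which may differ from this illustration.
+
+```python
+import torch.nn as nn
+
+
+class TwoLayerNet(nn.Module):
+    def __init__(self, dim_in=32, dim_hidden=64, dim_out=10):
+        super().__init__()
+        self.layer1 = nn.Linear(dim_in, dim_hidden)
+        self.layer2 = nn.Linear(dim_hidden, dim_out)
+        # hypothetical layers_cfg: one entry per layer, in execution order,
+        # so that the pipeline partitioner knows where the model can be split
+        self.layers_cfg = [
+            dict(type='Linear', in_features=dim_in, out_features=dim_hidden),
+            dict(type='Linear', in_features=dim_hidden, out_features=dim_out),
+        ]
+
+    def forward(self, x):
+        return self.layer2(self.layer1(x))
+```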
@@ -192,9 +194,9 @@ class VanillaResNet(BaseModel):
     ]
 ```

-You can set the number of pipeline stages in your configuration file. When pipeline size is larger than 1, Colossal-AI
-will automatically creates the pipeline schedule which defines the forward and backward step. You can specify how many microbatches
-to run in each step in the `schedule` configuration.
+You can set the number of pipeline stages in your configuration file. When the pipeline size is larger than 1,
+Colossal-AI will automatically create the pipeline schedule which defines the forward and backward steps. You can
+specify how many microbatches to run in each step in the `schedule` configuration.

 ```python
 parallel = dict(
@@ -206,10 +208,11 @@ schedule = dict(
     num_microbatches = 4 # set the number of microbatches per step
 )
 ```
+
 This feature is still in development and is only experimental for now.

 ## Sequence Parallel (experimental)

-Sequence parallel is to support long-sequence modelling such as document-level text understanding and medical imaging.
-This method is proposed in [Sequence Parallelism: Making 4D Parallelism Possible](https://arxiv.org/abs/2105.13120).
+Sequence parallelism is designed to support long-sequence modelling such as document-level text understanding and
+medical imaging. This method is proposed in [Sequence Parallelism: Making 4D Parallelism Possible](https://arxiv.org/abs/2105.13120).
 This feature is still in development and is only experimental for now.
diff --git a/docs/run_demo.md b/docs/run_demo.md
index 48f0590d33ac..6d8c5b49a192 100644
--- a/docs/run_demo.md
+++ b/docs/run_demo.md
@@ -1,8 +1,8 @@
 # Quick demo

-Colossal-AI is an integrated large-scale deep learning system with efficient parallelization techniques. The system
-can accelerate model training on distributed systems with multiple GPUs by applying parallelization techniques. The
-system can also run on systems with only one GPU. Quick demos showing how to use Colossal-AI are given below.
+Colossal-AI is an integrated large-scale deep learning system with efficient parallelization techniques. The system can
+accelerate model training on distributed systems with multiple GPUs by applying parallelization techniques. The system
+can also run on systems with only one GPU. Quick demos showing how to use Colossal-AI are given below.

 ## Single GPU

@@ -32,25 +32,17 @@ realizes the training process.
 ```python
 import colossalai
 from colossalai.core import global_context as gpc
-from colossalai.engine import Engine
 from colossalai.logging import get_global_dist_logger
 from colossalai.trainer import Trainer

+
 def run_trainer():
-    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
+    engine, train_dataloader, test_dataloader = colossalai.initialize()
     logger = get_global_dist_logger()
-    schedule.data_sync = False
-    engine = Engine(
-        model=model,
-        criterion=criterion,
-        optimizer=optimizer,
-        lr_scheduler=lr_scheduler,
-        schedule=schedule
-    )
+
     logger.info("engine is built", ranks=[0])

     trainer = Trainer(engine=engine,
-                      hooks_cfg=gpc.config.hooks,
                       verbose=True)
     logger.info("trainer is built", ranks=[0])

@@ -58,11 +50,13 @@ def run_trainer():
     trainer.fit(
         train_dataloader=train_dataloader,
         test_dataloader=test_dataloader,
-        max_epochs=gpc.config.num_epochs,
+        epochs=gpc.config.num_epochs,
+        hooks_cfg=gpc.config.hooks,
         display_progress=True,
         test_interval=2
     )

+
 if __name__ == '__main__':
     run_trainer()
 ```
@@ -72,9 +66,9 @@ Zoo. The detailed substitution process is elaborated [here](model.md).

 ## Features

-Colossal-AI provides a collection of parallel training components for you. We aim to support you with your development of
-distributed deep learning models just like how you write single-GPU deep learning models. We provide friendly tools to
-kickstart distributed training in a few lines.
+Colossal-AI provides a collection of parallel training components for you. We aim to support you with your development
+of distributed deep learning models just like how you write single-GPU deep learning models. We provide friendly tools
+to kickstart distributed training in a few lines.
- [Data Parallelism](parallelization.md) - [Pipeline Parallelism](parallelization.md) diff --git a/docs/run_demo_zh.md b/docs/run_demo_zh.md index a52fcfd794d5..54839760d430 100644 --- a/docs/run_demo_zh.md +++ b/docs/run_demo_zh.md @@ -4,40 +4,36 @@ Colossal-AI是一个大规模深度学习系统,其中包含高效的并行技 ## 单GPU系统 -在带有GPU的非分布式系统上进行模型训练时,Colossal-AI可以达到当前的基线效率。[这里](https://colab.research.google.com/drive/1fJnqqFzPuzZ_kn1lwCpG2nh3l2ths0KE?usp=sharing#scrollTo=cQ_y7lBG09LS)我们给出一个Google Colab示例展现如何使用Colossal-AI与CIFAR10数据集在非分布式系统上训练一个LeNet模型。 +在带有GPU的非分布式系统上进行模型训练时,Colossal-AI可以达到当前的基线效率。[这里](https://colab.research.google.com/drive/1fJnqqFzPuzZ_kn1lwCpG2nh3l2ths0KE?usp=sharing#scrollTo=cQ_y7lBG09LS)我们给出一个Google +Colab示例展现如何使用Colossal-AI与CIFAR10数据集在非分布式系统上训练一个LeNet模型。 ## 多GPU系统 -在多GPU的分布式系统上训练深度学习模型时,Colossal-AI可以使用高效的并行技术来显著地加速训练过程,这些技术将在下面的[并行技术](parallelization.md)章节中被详述。下面的代码将在拥有四个GPU的分布式系统上训练一个ViT模型,其中`HOST`变量为您分布式系统的IP地址。请注意下面的代码使用了[Slurm](https://slurm.schedmd.com/documentation.html)作业调度系统。 +在多GPU的分布式系统上训练深度学习模型时,Colossal-AI可以使用高效的并行技术来显著地加速训练过程,这些技术将在下面的[并行技术](parallelization.md) +章节中被详述。下面的代码将在拥有四个GPU的分布式系统上训练一个ViT模型,其中`HOST` +变量为您分布式系统的IP地址。请注意下面的代码使用了[Slurm](https://slurm.schedmd.com/documentation.html)作业调度系统。 ```bash HOST=xxx.xxx.xxx.xxx srun ./scripts/slurm_dist_train.sh ./examples/run_trainer.py ./configs/vit/vit_2d.py ``` -`./configs/vit/vit_2d.py`是一个[配置文件](config.md),Colossal-AI使用配置文件来定义训练过程中需要用到的参数,比如模型类型、数据集、以及优化器、学习率调度器等。您可以通过编写配置文件的方式来训练不同的模型。`./examples/run_trainer.py`是一个标准的训练脚本,具体代码已经附在下面。该脚本可以读入配置文件中的训练参数并训练模型。 +`./configs/vit/vit_2d.py`是一个[配置文件](config.md) +,Colossal-AI使用配置文件来定义训练过程中需要用到的参数,比如模型类型、数据集、以及优化器、学习率调度器等。您可以通过编写配置文件的方式来训练不同的模型。`./examples/run_trainer.py` +是一个标准的训练脚本,具体代码已经附在下面。该脚本可以读入配置文件中的训练参数并训练模型。 ```python import colossalai from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.trainer import Trainer + def run_trainer(): - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize() + engine, train_dataloader, test_dataloader = colossalai.initialize() logger = get_global_dist_logger() - schedule.data_sync = False - engine = Engine( - model=model, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule - ) logger.info("engine is built", ranks=[0]) trainer = Trainer(engine=engine, - hooks_cfg=gpc.config.hooks, verbose=True) logger.info("trainer is built", ranks=[0]) @@ -45,11 +41,13 @@ def run_trainer(): trainer.fit( train_dataloader=train_dataloader, test_dataloader=test_dataloader, - max_epochs=gpc.config.num_epochs, + epochs=gpc.config.num_epochs, + hooks_cfg=gpc.config.hooks, display_progress=True, test_interval=2 ) + if __name__ == '__main__': run_trainer() ``` diff --git a/docs/trainer_engine.md b/docs/trainer_engine.md index 276134021180..88b872826aca 100644 --- a/docs/trainer_engine.md +++ b/docs/trainer_engine.md @@ -2,9 +2,9 @@ ## Build your engine -To better understand how `Engine` class works, let's start from the conception of the process function in common engines. The process function -usually controls the behavior over a batch of a dataset, `Engine` class just controls the process function. Here we give a standard process -function in the following code block. +To better understand how `Engine` class works, let's start from the conception of the process function in common +engines. 
The process function usually controls the behavior over a batch of a dataset, and the `Engine` class simply controls this
+process function. Here we give a standard process function in the following code block.

 ```python
 def process_function(dataloader, model, criterion, optim):
@@ -16,32 +16,33 @@ def process_function(dataloader, model, criterion, optim):
     optim.step()
 ```

-In `ignite.engine` or `keras.engine`, the process function is always provided by users. However, it is tricky for users to write their own process
-functions for pipeline parallelism. Aiming at offering accessible hybrid parallelism for users, we provide the powerful `Engine` class. This class
-enables pipeline parallelism and offers one-forward-one-backward non-interleaving strategy. Also, you can use pre-defined learning rate scheduler
-in the `Engine` class to adjust learning rate during training.
+In `ignite.engine` or `keras.engine`, the process function is always provided by users. However, it is tricky for users
+to write their own process functions for pipeline parallelism. Aiming at offering accessible hybrid parallelism for
+users, we provide the powerful `Engine` class. This class enables pipeline parallelism and offers a
+one-forward-one-backward non-interleaving strategy. Also, you can use a pre-defined learning rate scheduler in
+the `Engine` class to adjust the learning rate during training.

-In order to build your engine, just set variables `model`, `criterion`, `optimizer`, `lr_scheduler` and `schedule`. The following code block provides
-an example.
+In order to build your engine, just set the variables `model`, `criterion`, `optimizer` and `schedule`. The
+following code block provides an example. **The engine is automatically created from the config file for you if you
+start with `colossalai.initialize`.**

 ```python
 import torch
 import torch.nn as nn
 import torchvision.models as models
 import colossalai
+from colossalai.engine import Engine

 model = models.resnet18()
 criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model)
-lr_scheduler = colossalai.nn.lr_scheduler.CosineAnnealingLR(optimizer, 1000)
-schedule = colossalai.engine.schedule.NoPipelineSchedule()
+optimizer = torch.optim.Adam(model.parameters())
+schedule = colossalai.engine.NoPipelineSchedule()

 MyEngine = Engine(
     model=model,
     criterion=criterion,
     optimizer=optimizer,
-    lr_scheduler=lr_scheduler,
-    schedule=schedule
+    step_schedule=schedule
 )
 ```

@@ -51,21 +52,24 @@ More information regarding the class can be found in the API references.

 ### Overview

-To learn how to customize a trainer which meets your needs, let's first give a look at the `Trainer` class. We highly recommend that you read *Get Started*
+To learn how to customize a trainer which meets your needs, let's first take a look at the `Trainer` class. We highly
+recommend that you read the *Get Started*
 section and *Build your engine* first.

-The `Trainer` class enables researchers and engineers to use our system more conveniently. Instead of having to write your own scripts, you can simply
-construct your own trainer by calling the `Trainer` class, just like what we did in the following code block.
+The `Trainer` class enables researchers and engineers to use our system more conveniently. Instead of having to write
+your own scripts, you can simply construct your own trainer by calling the `Trainer` class, just like what we did in the
+following code block.

 ```python
-MyTrainer = Trainer(MyEngine)
+MyTrainer = Trainer(my_engine)
 ```

-After that, you can use the `fit` method to train or evaluate your model. In order to make our `Trainer` class even more powerful, we incorporate a set of
-handy tools to the class. For example, you can monitor or record the running states and metrics which indicate the current performance of the model. These
-functions are realized by hooks. The `BasicHook` class allows you to execute your hook functions at specified time. We have already created some practical
-hooks for you, as listed below. What you need to do is just picking the right ones which suit your needs. Detailed descriptions of the class can be found
-in the API references.
+After that, you can use the `fit` method to train or evaluate your model. In order to make our `Trainer` class even more
+powerful, we incorporate a set of handy tools into the class. For example, you can monitor or record the running states
+and metrics which indicate the current performance of the model. These functions are realized by hooks. The `BaseHook`
+class allows you to execute your hook functions at specified points in time. We have already created some practical
+hooks for you, as listed below. What you need to do is just pick the ones which suit your needs. Detailed descriptions
+of the class can be found in the API references.

 ```python
 hooks = [
@@ -80,18 +84,21 @@ hooks = [
 ]
 ```

-These hook functions will record metrics, elapsed time and memory usage and write them to log after each epoch. Besides, they print the current loss and
-accuracy to let users monitor the performance of the model.
+These hook functions will record metrics, elapsed time and memory usage, and write them to the log after each epoch.
+Besides, they print the current loss and accuracy to let users monitor the performance of the model.

 ### Hook

-If you have your specific needs, feel free to extend our `BaseHook` class to add your own functions, or our `MetricHook` class to write a metric collector.
-These hook functions can be called at twelve timing in the trainer's life cycle. Besides, you can define the priorities of all hooks to arrange the execution order of them.
-More information can be found in the API references.
+If you have specific needs, feel free to extend our `BaseHook` class to add your own functions, or our `MetricHook`
+class to write a metric collector. These hook functions can be called at twelve points in the trainer's life cycle.
+Besides, you can define the priorities of all hooks to arrange their execution order. More information can be
+found in the API references.
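+
+As a simplified, illustrative sketch (this hook is made up for demonstration and is not shipped with Colossal-AI),
+a custom hook that announces each finished epoch could look like this:
+
+```python
+from colossalai.registry import HOOKS
+from colossalai.trainer import Trainer
+from colossalai.trainer.hooks import BaseHook
+
+
+@HOOKS.register_module
+class EpochReportHook(BaseHook):
+    """Toy hook: announces each finished epoch (illustration only)."""
+
+    def __init__(self, trainer: Trainer, priority: int = 10):
+        super().__init__(trainer, priority)
+
+    def after_train_epoch(self):
+        # cur_epoch is maintained by the Trainer
+        print(f'epoch {self.trainer.cur_epoch} finished')
+```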

 ### Metric

-You can write your own metrics by extending our `Metric` class. It should be used with the `MetricHook` class. When your write your own metric hooks, please set
-the priority carefully and make sure the hook is called before other hooks which might require the results of the metric hook.
+You can write your own metrics by extending our `Metric` class. It should be used with the `MetricHook` class. When you
+write your own metric hooks, please set the priority carefully and make sure the hook is called before other hooks which
+might require the results of the metric hook.

-We've already provided some metric hooks and we store metric objects in `runner.states['metrics']`. It is a dictionary and metrics can be accessed by their names.
+We've already provided some metric hooks, and we store metric objects in `trainer.states['metrics']`. 
It is a dictionary +and metrics can be accessed by their names. diff --git a/docs/trainer_engine_zh.md b/docs/trainer_engine_zh.md index 0e2df3fdd1a5..737d6745bb58 100644 --- a/docs/trainer_engine_zh.md +++ b/docs/trainer_engine_zh.md @@ -14,28 +14,30 @@ def process_function(dataloader, model, criterion, optim): optim.setp() ``` -在`ignite.engine`与`keras.engine`中,进程函数需要由用户提供,然而,用户很难为流水线并行编写进程函数。为了向用户提供方便的混合并行,我们提供了具备强大功能的`Engine`类,该类支持流水线并行,并提供前向传播后向传播不交织的策略。同时,您可以在`Engine`类中使用您事先定义好的学习率调度器来在训练过程中调整学习率。 +在`ignite.engine`与`keras.engine`中,进程函数需要由用户提供,然而,用户很难为流水线并行编写进程函数。为了向用户提供方便的混合并行,我们提供了具备强大功能的`Engine` +类,该类支持流水线并行,并提供前向传播后向传播不交织的策略。同时,您可以在`Engine`类中使用您事先定义好的学习率调度器来在训练过程中调整学习率。 您在构造引擎时只需要定义`model`、`criterion`、`optimizer`、`lr_scheduler`与`schedule`等变量即可,下面的代码块给出了一个这样的例子。 +**如果你使用`colossalai.initialize`的话,engine会从config文件里自动构建。** ```python import torch import torch.nn as nn import torchvision.models as models import colossalai +from colossalai.engine import Engine model = models.resnet18() criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model) lr_scheduler = colossalai.nn.lr_scheduler.CosineAnnealingLR(optimizer, 1000) -schedule = colossalai.engine.schedule.NoPipelineSchedule() +schedule = colossalai.engine.NoPipelineSchedule() MyEngine = Engine( model=model, criterion=criterion, optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule + step_schedule=schedule ) ``` @@ -48,10 +50,12 @@ MyEngine = Engine( `Trainer`类旨在让科研工作者和工程师更加方便地使用我们的系统,您不需要自己写脚本,只需要调用`Trainer`类来构造您的训练器即可,就像下面的代码块中所做的。 ```python -MyTrainer = Trainer(MyEngine) +MyTrainer = Trainer(my_trainer) ``` -在此之后,您可以使用`fit`方法来训练或调用您的模型。除此之外,为了让我们的`Trainer`类拥有更强大的功能,我们加入了一系列方便您使用的工具。例如,您可以在训练过程中持续监测并记录模型目前的运行状态和表现,这些功能都是通过钩子函数来实现的。我们提供的`BasicHook`类让您可以在指定时间执行您的钩子函数。如下方的代码块所示,我们事先为您定义好了一些实用的钩子函数,您需要做的就是找到符合您需求的钩子函数。更多该类的相关信息可以在API信息中找到。 +在此之后,您可以使用`fit`方法来训练或调用您的模型。除此之外,为了让我们的`Trainer` +类拥有更强大的功能,我们加入了一系列方便您使用的工具。例如,您可以在训练过程中持续监测并记录模型目前的运行状态和表现,这些功能都是通过钩子函数来实现的。我们提供的`BasicHook` +类让您可以在指定时间执行您的钩子函数。如下方的代码块所示,我们事先为您定义好了一些实用的钩子函数,您需要做的就是找到符合您需求的钩子函数。更多该类的相关信息可以在API信息中找到。 ```python hooks = [ @@ -70,7 +74,8 @@ hooks = [ ### 钩子函数 -如果您有个性化需求,您可以继承我们的`BaseHook`类并添加您的钩子函数,或者继承我们的`MetricHook`来编写您需要的度量标准。这些钩子函数可以在`Trainer`生命周期的12个时间点被执行。更多该类的相关信息可以在API信息中找到。 +如果您有个性化需求,您可以继承我们的`BaseHook`类并添加您的钩子函数,或者继承我们的`MetricHook`来编写您需要的度量标准。这些钩子函数可以在`Trainer` +生命周期的12个时间点被执行。更多该类的相关信息可以在API信息中找到。 ### 度量标准 diff --git a/examples/colossal_cifar_demo.ipynb b/examples/colossal_cifar_demo.ipynb index 2ad9022c9151..221707bbbdc8 100644 --- a/examples/colossal_cifar_demo.ipynb +++ b/examples/colossal_cifar_demo.ipynb @@ -1,370 +1,370 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "colossal_cifar_demo.ipynb", - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "colossal_cifar_demo.ipynb", + "provenance": [] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "uhrbvVEh2iJd" - }, - "source": [ - "# Train an image classifier\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vP7LvCpG23a2", - "outputId": "b37f7203-8a02-4736-c527-603f2bb34d7d" - }, - "source": [ - "!pip install ColossalAI deepspeed" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already 
satisfied: ColossalAI in /usr/local/lib/python3.7/dist-packages (0.1)\n", - "Requirement already satisfied: deepspeed in /usr/local/lib/python3.7/dist-packages (0.5.4)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from deepspeed) (21.0)\n", - "Requirement already satisfied: triton in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.1.1)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deepspeed) (4.62.3)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.19.5)\n", - "Requirement already satisfied: tensorboardX==1.8 in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.8)\n", - "Requirement already satisfied: ninja in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.10.2.2)\n", - "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.9.0+cu111)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from deepspeed) (5.4.8)\n", - "Requirement already satisfied: protobuf>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (3.17.3)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (1.15.0)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->deepspeed) (2.4.7)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->deepspeed) (3.7.4.3)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from triton->deepspeed) (3.3.0)\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UVKEurtS4SFS", - "outputId": "99fb6050-5da7-4f27-b4eb-9b3ccf830efb" - }, - "source": [ - "import colossalai\n", - "from colossalai.engine import Engine, NoPipelineSchedule\n", - "from colossalai.trainer import Trainer\n", - "from colossalai.context import Config\n", - "import torch" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Please install apex to use FP16 Optimizer\n", - "Apex should be installed to use the FP16 optimizer\n", - "apex is required for mixed precision training\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PpFfhNBD7NSn" - }, - "source": [ - "First, we should initialize distributed environment. Though we just use single GPU in this example, we still need initialize distributed environment for compatibility. We just consider the simplest case here, so we just set the number of parallel processes to 1." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8yF7Lc-K7NAS", - "outputId": "01312349-a8b0-4de4-9103-7d1b48e6cc36" - }, - "source": [ - "parallel_cfg = Config(dict(parallel=dict(\n", - " data=dict(size=1),\n", - " pipeline=dict(size=1),\n", - " tensor=dict(size=1, mode=None),\n", - ")))\n", - "colossalai.init_dist(config=parallel_cfg,\n", - " local_rank=0,\n", - " world_size=1,\n", - " host='127.0.0.1',\n", - " port=8888,\n", - " backend='nccl')" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,598 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,602 INFO: Added key: store_based_barrier_key:2 to store for rank: 0\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,605 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,608 INFO: Added key: store_based_barrier_key:3 to store for rank: 0\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,610 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "process rank 0 is bound to device 0\n", - "initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1124,the default parallel seed is ParallelMode.DATA.\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ppjmMxc_81TK" - }, - "source": [ - "Load and normalize the CIFAR10 training and test datasets using `colossalai.nn.data`. Note that we have wrapped `torchvision.transforms`, so that we can simply use the config dict to use them." 
- ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "uhrbvVEh2iJd" + }, + "source": [ + "# Train an image classifier\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "vP7LvCpG23a2", + "outputId": "b37f7203-8a02-4736-c527-603f2bb34d7d" + }, + "source": [ + "!pip install ColossalAI deepspeed" + ], + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "ZyGhyD47-dUY", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "98bbf2d1-a1c4-4bb4-b6df-600777b1e8f5" - }, - "source": [ - "transform_cfg = [\n", - " dict(type='ToTensor'),\n", - " dict(type='Normalize',\n", - " mean=[0.4914, 0.4822, 0.4465],\n", - " std=[0.2023, 0.1994, 0.2010]),\n", - "]\n", - "\n", - "batch_size = 128\n", - "\n", - "trainset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=True)\n", - "trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)\n", - "\n", - "testset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=False)\n", - "testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Files already downloaded and verified\n", - "Files already downloaded and verified\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: ColossalAI in /usr/local/lib/python3.7/dist-packages (0.1)\n", + "Requirement already satisfied: deepspeed in /usr/local/lib/python3.7/dist-packages (0.5.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from deepspeed) (21.0)\n", + "Requirement already satisfied: triton in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.1.1)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deepspeed) (4.62.3)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.19.5)\n", + "Requirement already satisfied: tensorboardX==1.8 in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.8)\n", + "Requirement already satisfied: ninja in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.10.2.2)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.9.0+cu111)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from deepspeed) (5.4.8)\n", + "Requirement already satisfied: protobuf>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (3.17.3)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (1.15.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->deepspeed) (2.4.7)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->deepspeed) (3.7.4.3)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from triton->deepspeed) (3.3.0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + 
"id": "UVKEurtS4SFS", + "outputId": "99fb6050-5da7-4f27-b4eb-9b3ccf830efb" + }, + "source": [ + "import colossalai\n", + "from colossalai.engine import Engine, NoPipelineSchedule\n", + "from colossalai.trainer import Trainer\n", + "from colossalai.context import Config\n", + "import torch" + ], + "execution_count": null, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "NvPbfLLR9NzC" - }, - "source": [ - "We just define a simple Convolutional Neural Network here." - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Please install apex to use FP16 Optimizer\n", + "Apex should be installed to use the FP16 optimizer\n", + "apex is required for mixed precision training\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PpFfhNBD7NSn" + }, + "source": [ + "First, we should initialize distributed environment. Though we just use single GPU in this example, we still need initialize distributed environment for compatibility. We just consider the simplest case here, so we just set the number of parallel processes to 1." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "8yF7Lc-K7NAS", + "outputId": "01312349-a8b0-4de4-9103-7d1b48e6cc36" + }, + "source": [ + "parallel_cfg = Config(dict(parallel=dict(\n", + " data=dict(size=1),\n", + " pipeline=dict(size=1),\n", + " tensor=dict(size=1, mode=None),\n", + ")))\n", + "colossalai.init_dist(config=parallel_cfg,\n", + " local_rank=0,\n", + " world_size=1,\n", + " host='127.0.0.1',\n", + " port=8888,\n", + " backend='nccl')" + ], + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "cQ_y7lBG09LS" - }, - "source": [ - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super().__init__()\n", - " self.conv1 = nn.Conv2d(3, 6, 5)\n", - " self.pool = nn.MaxPool2d(2, 2)\n", - " self.conv2 = nn.Conv2d(6, 16, 5)\n", - " self.fc1 = nn.Linear(16 * 5 * 5, 120)\n", - " self.fc2 = nn.Linear(120, 84)\n", - " self.fc3 = nn.Linear(84, 10)\n", - "\n", - " def forward(self, x):\n", - " x = self.pool(F.relu(self.conv1(x)))\n", - " x = self.pool(F.relu(self.conv2(x)))\n", - " x = torch.flatten(x, 1) # flatten all dimensions except batch\n", - " x = F.relu(self.fc1(x))\n", - " x = F.relu(self.fc2(x))\n", - " x = self.fc3(x)\n", - " return x\n", - "\n", - "\n", - "model = Net().cuda()" - ], - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0\n", + "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,598 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n", + "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,602 INFO: Added key: store_based_barrier_key:2 to store for rank: 0\n", + "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,605 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n", + "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,608 INFO: Added key: store_based_barrier_key:3 to store for rank: 0\n", + "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,610 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "tgsszAmM9dYZ" - }, - "source": [ - "Define a Loss 
function and optimizer. And then we use them to initialize `Engine` and `Trainer`. We provide various training / evaluating hooks. In this case, we just use the simplest hooks which can compute and print loss and accuracy." - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "process rank 0 is bound to device 0\n", + "initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1124,the default parallel seed is ParallelMode.DATA.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ppjmMxc_81TK" + }, + "source": [ + "Load and normalize the CIFAR10 training and test datasets using `colossalai.nn.data`. Note that we have wrapped `torchvision.transforms`, so that we can simply use the config dict to use them." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZyGhyD47-dUY", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "98bbf2d1-a1c4-4bb4-b6df-600777b1e8f5" + }, + "source": [ + "transform_cfg = [\n", + " dict(type='ToTensor'),\n", + " dict(type='Normalize',\n", + " mean=[0.4914, 0.4822, 0.4465],\n", + " std=[0.2023, 0.1994, 0.2010]),\n", + "]\n", + "\n", + "batch_size = 128\n", + "\n", + "trainset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=True)\n", + "trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)\n", + "\n", + "testset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=False)\n", + "testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)" + ], + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "YtaDoCax1BCf", - "outputId": "b33b1641-03d8-4597-c8c2-1a4c1d61e9b0" - }, - "source": [ - "import torch.optim as optim\n", - "\n", - "criterion = nn.CrossEntropyLoss()\n", - "optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n", - "schedule = NoPipelineSchedule()\n", - "engine = Engine(\n", - " model=model,\n", - " criterion=criterion,\n", - " optimizer=optimizer,\n", - " lr_scheduler=None,\n", - " schedule=schedule\n", - " )\n", - "trainer = Trainer(engine=engine,\n", - " hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],\n", - " verbose=True)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Files already downloaded and verified\n", + "Files already downloaded and verified\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NvPbfLLR9NzC" + }, + "source": [ + "We just define a simple Convolutional Neural Network here." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "cQ_y7lBG09LS" + }, + "source": [ + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.conv1 = nn.Conv2d(3, 6, 5)\n", + " self.pool = nn.MaxPool2d(2, 2)\n", + " self.conv2 = nn.Conv2d(6, 16, 5)\n", + " self.fc1 = nn.Linear(16 * 5 * 5, 120)\n", + " self.fc2 = nn.Linear(120, 84)\n", + " self.fc3 = nn.Linear(84, 10)\n", + "\n", + " def forward(self, x):\n", + " x = self.pool(F.relu(self.conv1(x)))\n", + " x = self.pool(F.relu(self.conv2(x)))\n", + " x = torch.flatten(x, 1) # flatten all dimensions except batch\n", + " x = F.relu(self.fc1(x))\n", + " x = F.relu(self.fc2(x))\n", + " x = self.fc3(x)\n", + " return x\n", + "\n", + "\n", + "model = Net().cuda()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tgsszAmM9dYZ" + }, + "source": [ + "Define a Loss function and optimizer. And then we use them to initialize `Engine` and `Trainer`. We provide various training / evaluating hooks. In this case, we just use the simplest hooks which can compute and print loss and accuracy." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "YtaDoCax1BCf", + "outputId": "b33b1641-03d8-4597-c8c2-1a4c1d61e9b0" + }, + "source": [ + "import torch.optim as optim\n", + "\n", + "criterion = nn.CrossEntropyLoss()\n", + "optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n", + "schedule = NoPipelineSchedule()\n", + "engine = Engine(\n", + " model=model,\n", + " criterion=criterion,\n", + " optimizer=optimizer,\n", + " lr_scheduler=None,\n", + " schedule=schedule\n", + " )\n", + "trainer = Trainer(engine=engine,\n", + " hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],\n", + " verbose=True)" + ], + "execution_count": null, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "_JR2TuvH99Ik" - }, - "source": [ - "Then we set training configs. We train our model for 10 epochs and it will be evaluated every 1 epoch. Set `display_progress` to `True` to display the training / evaluating progress bar." - ] + "output_type": "stream", + "name": "stderr", + "text": [ + "colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n", + "colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n", + "colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n", + "colossalai - rank_0 - 2021-10-15 03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_JR2TuvH99Ik" + }, + "source": [ + "Then we set training configs. We train our model for 10 epochs and it will be evaluated every 1 epoch. Set `display_progress` to `True` to display the training / evaluating progress bar." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "w-J3IP-J1sfx", + "outputId": "bdb76939-04f1-4124-ce5e-3af44c0d902c" + }, + "source": [ + "num_epochs = 10\n", + "test_interval = 1\n", + "trainer.fit(\n", + " train_dataloader=trainloader,\n", + " test_dataloader=testloader,\n", + " max_epochs=num_epochs,\n", + " display_progress=True,\n", + " test_interval=test_interval\n", + " )" + ], + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "w-J3IP-J1sfx", - "outputId": "bdb76939-04f1-4124-ce5e-3af44c0d902c" - }, - "source": [ - "num_epochs = 10\n", - "test_interval = 1\n", - "trainer.fit(\n", - " train_dataloader=trainloader,\n", - " test_dataloader=testloader,\n", - " max_epochs=num_epochs,\n", - " display_progress=True,\n", - " test_interval=test_interval\n", - " )" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[Epoch 0 train]: 0%| | 0/391 [00:00=0.9 numpy tqdm psutil -tensorboardX +tensorboard packaging \ No newline at end of file diff --git a/setup.py b/setup.py index d71876bb9938..8541b0a6ce3a 100644 --- a/setup.py +++ b/setup.py @@ -121,7 +121,7 @@ def fetch_requirements(path): install_requires = fetch_requirements('requirements/requirements.txt') setup( - name='colossal-ai', + name='colossalai', version='0.0.1-beta', packages=find_packages(exclude=('csrc', 'tests', diff --git a/tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py b/tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py index 907605317834..c97ed18044e2 100644 --- a/tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py +++ b/tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py @@ -27,8 +27,6 @@ dataloader=dict( batch_size=BATCH_SIZE, pin_memory=True, - # num_workers=1, - # shuffle=True, ) ) @@ -63,14 +61,6 @@ type='CrossEntropyLoss2D', ) -# model = dict( -# type='VanillaResNet', -# block_type='ResNetBasicBlock', -# layers=[2, 2, 2, 2], -# num_cls=10 -# ) - - model = dict( type='VisionTransformerFromConfig', tensor_splitting_cfg=dict( @@ -135,25 +125,26 @@ fp16 = dict( mode=AMP_TYPE.PARALLEL, - initial_scale=2 ** 8 ) -# fp16 = dict( -# mode=None, -# ) - -schedule = dict( - num_microbatches=2 -) -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 +engine = dict( + schedule=dict( + num_microbatches=2 + ) ) +hooks = [ + dict( + type='LRSchedulerHook', + by_epoch=True, + lr_scheduler_cfg=dict( + type='LinearWarmupLR', + warmup_steps=5 + ) + ), +] num_epochs = 60 logging = dict( root_path='test_vit_2d_log' ) - -seed = 100 diff --git a/tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py b/tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py index d41ecea89440..fd9c89eb434f 100644 --- a/tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py +++ b/tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py @@ -124,14 +124,21 @@ tensor=dict(size=4, depth=1, mode='2.5d'), ) -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 -) +hooks = [ + dict( + type='LRSchedulerHook', + by_epoch=True, + lr_scheduler_cfg=dict( + type='LinearWarmupLR', + warmup_steps=5 + ) + ), +] +engine = dict( schedule = dict( num_microbatches=2 ) +) num_epochs = 60 -num_microbatches = 1 diff --git a/tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py b/tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py index 
9ffd0a1ec12d..b68a58cea3e3 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py +++ b/tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py @@ -9,21 +9,22 @@ import colossalai from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.nn.layer._parallel_utilities import _gather CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py') -def eval(engine): +def eval(engine, test_dataloader): engine.eval() accumulated_loss = 0 correct_sum = 0 total_sum = 0 + num_steps = len(test_dataloader) + data_iter = iter(test_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) if gpc.is_last_rank(ParallelMode.PIPELINE): # loss = sum(loss) @@ -43,20 +44,22 @@ def eval(engine): correct = torch.sum(label == output) correct_sum += correct total_sum += label.size(0) - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps return correct_sum, total_sum, avg_loss -def train(engine): +def train(engine, train_dataloader): engine.train() accumulated_loss = 0 + num_steps = len(train_dataloader) + data_iter = iter(train_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) if gpc.is_last_rank(ParallelMode.PIPELINE): accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps return avg_loss @@ -64,25 +67,16 @@ def train(engine): @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_2d_parallel_vision_transformer(): # init dist - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize( - CONFIG_PATH) + engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) logger = get_global_dist_logger() - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) - for epoch in range(gpc.config.num_epochs): - train_loss = train(engine) + train_loss = train(engine, train_dataloader) if gpc.is_last_rank(ParallelMode.PIPELINE): logger.info(f'epoch {epoch} - train loss: {train_loss}') if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine) + correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) if gpc.is_last_rank(ParallelMode.PIPELINE): logger.info( f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' diff --git a/tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py b/tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py index 33d56360a7b4..70857f1e8d9a 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py +++ b/tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py @@ -6,20 +6,22 @@ import colossalai from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.nn.layer._parallel_utilities import _gather CONFIG_PATH = 
Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py')


-def eval(engine):
+
+def eval(engine, test_dataloader):
     engine.eval()
     accumulated_loss = 0
     correct_sum = 0
     total_sum = 0
+    num_steps = len(test_dataloader)
+    data_iter = iter(test_dataloader)

-    for i in range(engine.schedule.num_steps):
-        output, label, loss = engine.step()
+    for i in range(num_steps):
+        output, label, loss = engine.step(data_iter)

         if gpc.is_last_rank(ParallelMode.PIPELINE):
             accumulated_loss += loss.detach().cpu().numpy()
@@ -43,21 +45,24 @@ def eval(engine):
         correct = torch.sum(label == output)
         correct_sum += correct
         total_sum += label.size(0)
-    avg_loss = accumulated_loss / engine.schedule.num_steps
+    avg_loss = accumulated_loss / num_steps
     return correct_sum, total_sum, avg_loss


-def train(engine):
+def train(engine, train_dataloader):
     engine.train()
     accumulated_loss = 0
+    num_steps = len(train_dataloader)
+    data_iter = iter(train_dataloader)
+
+    for i in range(num_steps):
+        output, label, loss = engine.step(data_iter)

-    for i in range(engine.schedule.num_steps):
-        output, label, loss = engine.step()
-
         if gpc.is_last_rank(ParallelMode.PIPELINE):
             accumulated_loss += loss.detach().cpu().numpy()
-    avg_loss = accumulated_loss / engine.schedule.num_steps
+    avg_loss = accumulated_loss / num_steps
     return avg_loss


@@ -65,25 +69,16 @@ def train(engine):
 @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
 def test_2p5d_parallel_vision_transformer():
     # init dist
-    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
-        CONFIG_PATH)
+    engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)
     logger = get_global_dist_logger()

-    engine = Engine(model=model,
-                    train_dataloader=train_dataloader,
-                    test_dataloader=test_dataloader,
-                    criterion=criterion,
-                    optimizer=optimizer,
-                    lr_scheduler=lr_scheduler,
-                    schedule=schedule)
-
     for epoch in range(gpc.config.num_epochs):
-        train_loss = train(engine)
+        train_loss = train(engine, train_dataloader)
         if gpc.is_last_rank(ParallelMode.PIPELINE):
             logger.info(f'epoch {epoch} - train loss: {train_loss}')

         if epoch % 2 == 0:
-            correct_sum, total_sum, eval_loss = eval(engine)
+            correct_sum, total_sum, eval_loss = eval(engine, test_dataloader)
             if gpc.is_last_rank(ParallelMode.PIPELINE):
                 logger.info(
                     f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
@@ -91,4 +86,4 @@ def test_2p5d_parallel_vision_transformer():

 if __name__ == '__main__':
-    test_2p5d_parallel_vision_transformer()
\ No newline at end of file
+    test_2p5d_parallel_vision_transformer()
diff --git a/tests/test_engine/configs/non_pipeline_resnet.py b/tests/test_engine/configs/non_pipeline_resnet.py
index de78154ecb54..19f2d61d2795 100644
--- a/tests/test_engine/configs/non_pipeline_resnet.py
+++ b/tests/test_engine/configs/non_pipeline_resnet.py
@@ -38,5 +38,3 @@

 loss = dict(type='CrossEntropyLoss')
-# set_device_func = lambda global_rank, world_size: global_rank % 4
-seed = 1024
diff --git a/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py b/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py
index b6300b8c4ed7..f845d98420fb 100644
--- a/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py
+++ b/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py
@@ -40,6 +40,3 @@
 loss = dict(type='CrossEntropyLoss')

 fp16 = dict(mode=AMP_TYPE.APEX)
-
-# set_device_func = lambda global_rank, world_size: global_rank % 4
-seed = 1024
diff --git 
a/tests/test_engine/configs/non_pipeline_resnet_torch_amp.py b/tests/test_engine/configs/non_pipeline_resnet_torch_amp.py index 87fd68554166..ab4517e92ae7 100644 --- a/tests/test_engine/configs/non_pipeline_resnet_torch_amp.py +++ b/tests/test_engine/configs/non_pipeline_resnet_torch_amp.py @@ -40,6 +40,3 @@ loss = dict(type='CrossEntropyLoss') fp16 = dict(mode=AMP_TYPE.TORCH) - -# set_device_func = lambda global_rank, world_size: global_rank % 4 -seed = 1024 diff --git a/tests/test_engine/configs/pipeline_vanilla_resnet.py b/tests/test_engine/configs/pipeline_vanilla_resnet.py index 9820d3b82a34..a47f40613129 100644 --- a/tests/test_engine/configs/pipeline_vanilla_resnet.py +++ b/tests/test_engine/configs/pipeline_vanilla_resnet.py @@ -38,11 +38,9 @@ tensor=dict(size=1, mode=None) ) -schedule = dict( - num_microbatches=4 +engine = dict( + schedule=dict( + num_microbatches=4 + ) ) -num_pipeling_batches = 2 -seed = 1024 -lr_scheduler = dict(type='LinearWarmupLR', warmup_steps=5) - num_epochs = 10 diff --git a/tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py b/tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py index fe6b4010b24b..98c2b807256d 100644 --- a/tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py +++ b/tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py @@ -8,7 +8,6 @@ from colossalai import initialize from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.utils import report_memory_usage @@ -24,20 +23,13 @@ def run_no_pipeline(config): - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = initialize(config) + engine, train_dataloader, test_dataloader = initialize(config) logger = get_global_dist_logger() rank = torch.distributed.get_rank() - engine = Engine(model=model, - train_dataloader=train_dataloader, - criterion=criterion, - optimizer=optimizer, - schedule=schedule) engine.train() - logger.info('lr = %g' % engine.get_lr()) - output, label, loss = engine.step() + output, label, loss = engine.step(iter(train_dataloader)) logger.info('Rank {} returns: {}'.format(rank, loss.item())) - logger.info('lr = %g' % engine.get_lr()) gpc.destroy() logger.info('Test engine finished') diff --git a/tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py b/tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py index 865f2b04e674..effb65e02441 100644 --- a/tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py +++ b/tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py @@ -8,7 +8,6 @@ from colossalai import initialize from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.utils import report_memory_usage @@ -26,21 +25,14 @@ def test_no_pipeline(config): print('Test no pipeline engine start') - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = initialize(config) + engine, train_dataloader, test_dataloader = initialize(config) logger = get_global_dist_logger() rank = torch.distributed.get_rank() - engine = Engine(model=model, - train_dataloader=train_dataloader, - criterion=criterion, - optimizer=optimizer, - schedule=schedule) engine.train() - logger.info('lr = %g' % engine.get_lr()) - output, label, loss = engine.step() + output, label, loss = engine.step(iter(train_dataloader)) logger.info('Rank {} returns: 
{}'.format(rank, loss.item()))
-    logger.info('lr = %g' % engine.get_lr())

     gpc.destroy()
     logger.info('Test engine finished')
diff --git a/tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py b/tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py
index 83c6927f38b2..a4c496a7db8b 100644
--- a/tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py
+++ b/tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py
@@ -8,7 +8,6 @@

 from colossalai import initialize
 from colossalai.core import global_context as gpc
-from colossalai.engine import Engine
 from colossalai.logging import get_global_dist_logger
 from colossalai.utils import report_memory_usage

@@ -26,21 +25,14 @@ def test_no_pipeline(config):
     print('Test no pipeline engine start')
-    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = initialize(config)
+    engine, train_dataloader, test_dataloader = initialize(config)

     logger = get_global_dist_logger()
     rank = torch.distributed.get_rank()
-    engine = Engine(model=model,
-                    train_dataloader=train_dataloader,
-                    criterion=criterion,
-                    optimizer=optimizer,
-                    schedule=schedule)

     engine.train()
-    logger.info('lr = %g' % engine.get_lr())
-    output, label, loss = engine.step()
+    output, label, loss = engine.step(iter(train_dataloader))
     logger.info('Rank {} returns: {}'.format(rank, loss.item()))
-    logger.info('lr = %g' % engine.get_lr())

     gpc.destroy()
     logger.info('Test engine finished')
diff --git a/tests/test_engine/test_pipeline/test_schedule.py b/tests/test_engine/test_pipeline/test_schedule.py
index 32fcaafc1f22..9125fb3eed84 100644
--- a/tests/test_engine/test_pipeline/test_schedule.py
+++ b/tests/test_engine/test_pipeline/test_schedule.py
@@ -5,6 +5,7 @@

 import pytest

+from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import initialize
 from colossalai.logging import get_global_dist_logger
@@ -22,13 +23,25 @@
 @pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_schedule():
-    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = initialize(CONFIG_PATH)
+    engine, train_dataloader, test_dataloader = initialize(CONFIG_PATH)
     logger = get_global_dist_logger()

-    schedule.zero_grad()
-    output, label, losses = schedule.forward_backward_step(forward_only=False)
-    schedule.step()
-    logger.info('losses: {}'.format([loss.item() for loss in losses]))
+    model = engine.model
+    optimizer = engine.optimizer
+    criterion = engine.criterion
+    schedule = engine._schedule
+
+    output, label, loss = schedule.forward_backward_step(
+        data_iter=iter(train_dataloader),
+        model=model,
+        optimizer=optimizer,
+        criterion=criterion,
+        forward_only=False
+    )
+    schedule.optimizer_step(model, optimizer)
+
+    if gpc.is_last_rank(ParallelMode.PIPELINE):
+        logger.info('losses: {}'.format(loss))

     gpc.destroy()
     logger.info('training finished')
diff --git a/tests/test_engine/test_pipeline_engine/test_engine.py b/tests/test_engine/test_pipeline_engine/test_engine.py
index 7ed0b0a3d11f..9d6c9f59f206 100644
--- a/tests/test_engine/test_pipeline_engine/test_engine.py
+++ b/tests/test_engine/test_pipeline_engine/test_engine.py
@@ -9,7 +9,6 @@
 from colossalai import initialize
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.engine import Engine
 from colossalai.logging import get_global_dist_logger

 NUM_BATCH = 128
@@ -23,22 +22,14 @@
 def 
run_pipeline(config): - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = initialize(config) + engine, train_dataloader, test_dataloader = initialize(config) logger = get_global_dist_logger() rank = torch.distributed.get_rank() - engine = Engine(model=model, - train_dataloader=train_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) engine.train() - logger.info('lr = %g' % engine.get_lr()) - outputs, labels, loss = engine.step() + outputs, labels, loss = engine.step(iter(train_dataloader)) if gpc.is_last_rank(ParallelMode.PIPELINE): logger.info('losses: {}'.format(rank, loss.item())) - logger.info('lr = %g' % engine.get_lr()) gpc.destroy() logger.info('Test engine pipeline finished') diff --git a/tests/test_fp16_optimizer/configs/vit_2d.py b/tests/test_fp16_optimizer/configs/vit_2d.py index bcef5e2d4be5..6283dea9b2d3 100644 --- a/tests/test_fp16_optimizer/configs/vit_2d.py +++ b/tests/test_fp16_optimizer/configs/vit_2d.py @@ -132,9 +132,12 @@ initial_scale=2 ** 4 ) +num_epochs = 60 + + lr_scheduler = dict( type='LinearWarmupLR', - warmup_epochs=5 + warmup_steps=5, + total_steps=num_epochs ) -num_epochs = 60 diff --git a/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py b/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py index a02ede90c2a3..45c36f3843d2 100644 --- a/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py +++ b/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py @@ -7,23 +7,25 @@ import torch.autograd import colossalai +from colossalai.builder import build_lr_scheduler from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.nn.layer._parallel_utilities import _gather CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py') -def eval(engine): +def eval(engine, test_dataloader): engine.eval() accumulated_loss = 0 correct_sum = 0 total_sum = 0 + num_steps = len(test_dataloader) + data_iter = iter(test_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.detach().cpu().numpy() output = _gather( @@ -40,18 +42,21 @@ def eval(engine): correct = torch.sum(label[0] == output) correct_sum += correct total_sum += label[0].size(0) - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps return correct_sum, total_sum, avg_loss -def train(engine): +def train(engine, train_dataloader, lr_scheduler): engine.train() accumulated_loss = 0 + num_steps = len(train_dataloader) + data_iter = iter(train_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.squeeze(0).detach().cpu().numpy() - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps + lr_scheduler.step() return avg_loss @@ -59,26 +64,18 @@ def train(engine): @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_2d_parallel_vision_transformer(): # init dist - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize( - CONFIG_PATH) + engine, train_dataloader, test_dataloader = 
colossalai.initialize(CONFIG_PATH) + lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, engine.optimizer) logger = get_global_dist_logger() - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) - logger.info('start training') for epoch in range(gpc.config.num_epochs): - train_loss = train(engine) + train_loss = train(engine, train_dataloader, lr_scheduler) logger.info(f'epoch {epoch} - train loss: {train_loss}') if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine) + correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) logger.info( f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' f'correct: {correct_sum}, acc: {correct_sum / total_sum}') diff --git a/tests/test_models/test_vision_transformer/configs/vit_2d.py b/tests/test_models/test_vision_transformer/configs/vit_2d.py index 92706e8cd78d..1fd1102fba88 100644 --- a/tests/test_models/test_vision_transformer/configs/vit_2d.py +++ b/tests/test_models/test_vision_transformer/configs/vit_2d.py @@ -102,6 +102,6 @@ tensor=dict(size=4, mode='2d'), ) -lr_scheduler = dict(type='LinearWarmupLR', warmup_epochs=5) - num_epochs = 60 + +lr_scheduler = dict(type='LinearWarmupLR', warmup_steps=5, total_steps=num_epochs) diff --git a/tests/test_models/test_vision_transformer/configs/vit_2p5d.py b/tests/test_models/test_vision_transformer/configs/vit_2p5d.py index f788cb704d9f..3c16d684a8b1 100644 --- a/tests/test_models/test_vision_transformer/configs/vit_2p5d.py +++ b/tests/test_models/test_vision_transformer/configs/vit_2p5d.py @@ -125,13 +125,6 @@ tensor=dict(size=4, depth=1, mode='2.5d'), ) -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 -) - -schedule = dict( - num_microbatches=8 -) - num_epochs = 60 + +lr_scheduler = dict(type='LinearWarmupLR', warmup_steps=5, total_steps=num_epochs) diff --git a/tests/test_models/test_vision_transformer/configs/vit_3d.py b/tests/test_models/test_vision_transformer/configs/vit_3d.py index c66212f0469d..ad041efd0a22 100644 --- a/tests/test_models/test_vision_transformer/configs/vit_3d.py +++ b/tests/test_models/test_vision_transformer/configs/vit_3d.py @@ -116,9 +116,14 @@ weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT, ), dict(type='LossHook'), - # dict(type='TensorboardHook', log_dir='./tfb_logs'), - # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), - # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') + dict( + type='LRSchedulerHook', + by_epoch=True, + lr_scheduler_cfg=dict( + type='LinearWarmupLR', + warmup_steps=5 + ) + ), ] parallel = dict( @@ -127,12 +132,4 @@ tensor=dict(mode='3d', size=8), ) -# fp16 = dict(mode=AMP_TYPE.PARALLEL, initial_scale=2 ** 6) - -lr_scheduler = dict(type='LinearWarmupLR', warmup_epochs=5) - -# schedule = dict(num_microbatches=4) - num_epochs = 60 - -seed = 42 diff --git a/tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py b/tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py index fb32bea491fc..487ba335bafc 100644 --- a/tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py +++ b/tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py @@ -7,23 +7,25 @@ import torch.autograd import colossalai +from colossalai.builder import build_lr_scheduler from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from 
colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.nn.layer._parallel_utilities import _gather CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py') -def eval(engine): +def eval(engine, test_dataloader): engine.eval() accumulated_loss = 0 correct_sum = 0 total_sum = 0 + num_steps = len(test_dataloader) + data_iter = iter(test_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.detach().cpu().numpy() output = _gather( @@ -40,18 +42,21 @@ def eval(engine): correct = torch.sum(label[0] == output) correct_sum += correct total_sum += label[0].size(0) - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps return correct_sum, total_sum, avg_loss -def train(engine): +def train(engine, train_dataloader, lr_scheduler): engine.train() accumulated_loss = 0 + num_steps = len(train_dataloader) + data_iter = iter(train_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps + lr_scheduler.step() return avg_loss @@ -59,25 +64,17 @@ def train(engine): @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_2d_parallel_vision_transformer(): # init dist - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize( - CONFIG_PATH) + engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) + lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, engine.optimizer) logger = get_global_dist_logger() - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) - logger.info('start training') for epoch in range(gpc.config.num_epochs): - train_loss = train(engine) + train_loss = train(engine, train_dataloader, lr_scheduler) logger.info(f'epoch {epoch} - train loss: {train_loss}') if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine) + correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) logger.info( f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' f'correct: {correct_sum}, acc: {correct_sum / total_sum}') diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py b/tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py index 1a576d039065..a8361d2e6ec8 100644 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py +++ b/tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py @@ -4,22 +4,25 @@ import torch.autograd import colossalai +from colossalai.builder import build_lr_scheduler from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.nn.layer._parallel_utilities import _gather CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py') -def eval(engine): + +def eval(engine, test_dataloader): engine.eval() accumulated_loss = 0 
correct_sum = 0 total_sum = 0 + num_steps = len(test_dataloader) + data_iter = iter(test_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.detach().cpu().numpy() output = _gather( @@ -41,18 +44,21 @@ def eval(engine): correct = torch.sum(label[0] == output) correct_sum += correct total_sum += label[0].size(0) - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps return correct_sum, total_sum, avg_loss -def train(engine): +def train(engine, train_dataloader, lr_scheduler): engine.train() accumulated_loss = 0 + num_steps = len(train_dataloader) + data_iter = iter(train_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps + lr_scheduler.step() return avg_loss @@ -60,29 +66,21 @@ def train(engine): @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_2p5d_parallel_vision_transformer(): # init dist - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize( - CONFIG_PATH) + engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) + lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, engine.optimizer) logger = get_global_dist_logger() - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) - logger.info('start training') for epoch in range(gpc.config.num_epochs): - train_loss = train(engine) + train_loss = train(engine, train_dataloader, lr_scheduler) logger.info(f'epoch {epoch} - train loss: {train_loss}') if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine) + correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) logger.info( f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' f'correct: {correct_sum}, acc: {correct_sum / total_sum}') if __name__ == '__main__': - test_2p5d_parallel_vision_transformer() \ No newline at end of file + test_2p5d_parallel_vision_transformer() diff --git a/tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py b/tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py index db78e9967943..7bee2c78b4b2 100644 --- a/tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py +++ b/tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py @@ -1,16 +1,14 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- - import time from pathlib import Path import torch from tqdm import tqdm -from colossalai import initialize +import colossalai from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.trainer import Trainer from colossalai.trainer.metric import Accuracy3D @@ -29,7 +27,7 @@ def _train_epoch(epoch, engine): num_samples = 0 now = time.time() epoch_start = now - progress = range(engine.schedule.num_steps) + progress = range(engine._schedule.num_steps) if gpc.get_global_rank() == 0: progress = tqdm(progress, 
desc='[Epoch %d]' % epoch, miniters=1) for step in progress: @@ -68,7 +66,7 @@ def _eval(epoch, engine): ParallelMode.PARALLEL_3D_WEIGHT) total = 0 with torch.no_grad(): - for _ in range(engine.schedule.num_steps): + for _ in range(engine._schedule.num_steps): outputs, targets, loss = engine.step() if isinstance(outputs, (list, tuple)): outputs = outputs[0] @@ -80,32 +78,25 @@ def _eval(epoch, engine): print_rank_0( '[Epoch %d] Evaluation loss: %.3f | Acc: %.3f%%' % - (epoch, eval_loss / engine.schedule.num_steps, + (epoch, eval_loss / engine._schedule.num_steps, acc.get_accumulated_value() * 100), logger) def train(): - model, train_dataloader, test_dataloader, criterion, \ - optimizer, schedule, lr_scheduler = initialize(CONFIG_PATH) - + # init dist + engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) logger = get_global_dist_logger() - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) logger.info("Engine is built", ranks=[0]) - trainer = Trainer(engine=engine, hooks_cfg=gpc.config.hooks, verbose=True) + trainer = Trainer(engine=engine, verbose=True) logger.info("Trainer is built", ranks=[0]) logger.info("Train start", ranks=[0]) trainer.fit(train_dataloader=train_dataloader, test_dataloader=test_dataloader, - max_epochs=gpc.config.num_epochs, + epochs=gpc.config.num_epochs, + hooks_cfg=gpc.config.hooks, display_progress=True, test_interval=1) diff --git a/tests/test_trainer/configs/test_trainer_resnet.py b/tests/test_trainer/configs/test_trainer_resnet.py index 8979f4b09ae3..ff48d4e6c2cc 100644 --- a/tests/test_trainer/configs/test_trainer_resnet.py +++ b/tests/test_trainer/configs/test_trainer_resnet.py @@ -3,6 +3,7 @@ BATCH_SIZE = 128 IMG_SIZE = 32 +num_epochs = 200 # resnet 50 model = dict( @@ -77,18 +78,14 @@ dict(type='AccuracyHook'), dict(type='LossHook'), dict(type='TensorboardHook', log_dir='./tfb_logs'), + dict( + type='LRSchedulerHook', + by_epoch=True, + lr_scheduler_cfg=dict( + type='CosineAnnealingLR', + warmup_steps=5 + ) + ), dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), - # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') ] -# fp16 = dict( -# mode=AMP_TYPE.PARALLEL, -# initial_scale=1 -# ) - -lr_scheduler = dict( - type='CosineAnnealingLR', - T_max=200 -) - -num_epochs = 200 diff --git a/tests/test_trainer/configs/test_trainer_vit_2d.py b/tests/test_trainer/configs/test_trainer_vit_2d.py index 15c799039ab2..1769f4afe26a 100644 --- a/tests/test_trainer/configs/test_trainer_vit_2d.py +++ b/tests/test_trainer/configs/test_trainer_vit_2d.py @@ -11,6 +11,7 @@ SUMMA_DIM = 2 NUM_CLASSES = 10 DEPTH = 6 +num_epochs = 60 train_data = dict( dataset=dict(type='CIFAR10Dataset', @@ -52,13 +53,6 @@ loss = dict(type='CrossEntropyLoss2D', ) -# model = dict( -# type='VanillaResNet', -# block_type='ResNetBasicBlock', -# layers=[2, 2, 2, 2], -# num_cls=10 -# ) - model = dict( type='VisionTransformerFromConfig', tensor_splitting_cfg=dict(type='ViTInputSplitter2D', ), @@ -114,8 +108,15 @@ dict(type='Accuracy2DHook'), dict(type='LossHook'), dict(type='TensorboardHook', log_dir='./tfb_logs'), + dict( + type='LRSchedulerHook', + by_epoch=True, + lr_scheduler_cfg=dict( + type='LinearWarmupLR', + warmup_steps=5 + ) + ), dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), - # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') ] parallel = dict( @@ 
-125,11 +126,8 @@ fp16 = dict(mode=AMP_TYPE.PARALLEL, initial_scale=2 ** 8) -lr_scheduler = dict(type='LinearWarmupLR', warmup_epochs=5) - -schedule = dict(num_microbatches=1) - -num_epochs = 60 -num_microbatches = 1 +engine = dict( + schedule=dict(num_microbatches=1) +) logging = dict(root_path='./logs') diff --git a/tests/test_trainer/test_trainer.py b/tests/test_trainer/test_trainer.py index 0c0a458b3406..6a7681d00adb 100644 --- a/tests/test_trainer/test_trainer.py +++ b/tests/test_trainer/test_trainer.py @@ -1,25 +1,16 @@ import colossalai from colossalai.core import global_context as gpc -from colossalai.engine import Engine from colossalai.logging import get_global_dist_logger from colossalai.trainer import Trainer def test_trainer(): - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize() + engine, train_dataloader, test_dataloader = colossalai.initialize() logger = get_global_dist_logger() - engine = Engine( - model=model, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule - ) logger.info("engine is built", ranks=[0]) trainer = Trainer(engine=engine, - hooks_cfg=gpc.config.hooks, verbose=True) logger.info("trainer is built", ranks=[0]) @@ -27,7 +18,8 @@ def test_trainer(): trainer.fit( train_dataloader=train_dataloader, test_dataloader=test_dataloader, - max_epochs=gpc.config.num_epochs, + hooks_cfg=gpc.config.hooks, + epochs=gpc.config.num_epochs, display_progress=False, test_interval=5 ) diff --git a/tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py b/tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py index 6533b3a6d440..5c78dfcc22bc 100644 --- a/tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py +++ b/tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py @@ -18,14 +18,16 @@ CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py') -def eval(engine): +def eval_epoch(engine: Engine, test_dataloader): engine.eval() accumulated_loss = 0 correct_sum = 0 total_sum = 0 + num_steps = len(test_dataloader) + data_iter = iter(test_dataloader) - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.detach().cpu().numpy() output = _gather( @@ -42,18 +44,19 @@ def eval(engine): correct = torch.sum(label[0] == output) correct_sum += correct total_sum += label[0].size(0) - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps return correct_sum, total_sum, avg_loss -def train(engine): +def train_epoch(engine, train_dataloader): engine.train() accumulated_loss = 0 - - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() + num_steps = len(train_dataloader) + data_iter = iter(train_dataloader) + for i in range(num_steps): + output, label, loss = engine.step(data_iter) accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / engine.schedule.num_steps + avg_loss = accumulated_loss / num_steps return avg_loss @@ -61,30 +64,17 @@ def train(engine): @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_2d_parallel_vision_transformer(): # init dist - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize( - CONFIG_PATH) + engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) logger = 
get_global_dist_logger() - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) - - # for param in model.parameters(): - # if isinstance(param, torch.HalfTensor): - # print(param.shape) - logger.info('start training') for epoch in range(gpc.config.num_epochs): - train_loss = train(engine) + train_loss = train_epoch(engine, train_dataloader) logger.info(f'epoch {epoch} - train loss: {train_loss}') if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine) + correct_sum, total_sum, eval_loss = eval_epoch(engine, test_dataloader) logger.info( f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' f'correct: {correct_sum}, acc: {correct_sum / total_sum}') From dbe62c67b8ee5a29039972900c289550b76f8e8e Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 18 Nov 2021 23:45:09 +0800 Subject: [PATCH 04/10] add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) --- colossalai/nn/optimizer/lamb.py | 2 +- examples/vit-b16/README.md | 14 +++ examples/vit-b16/acc.jpeg | Bin 0 -> 19776 bytes examples/vit-b16/dataloader/__init__.py | 0 .../dataloader/imagenet_dali_dataloader.py | 112 ++++++++++++++++++ examples/vit-b16/hooks.py | 15 +++ examples/vit-b16/loss.jpeg | Bin 0 -> 22964 bytes examples/vit-b16/mixup.py | 12 ++ examples/vit-b16/train_dali.py | 70 +++++++++++ examples/vit-b16/vit-b16.py | 78 ++++++++++++ 10 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 examples/vit-b16/README.md create mode 100755 examples/vit-b16/acc.jpeg create mode 100755 examples/vit-b16/dataloader/__init__.py create mode 100755 examples/vit-b16/dataloader/imagenet_dali_dataloader.py create mode 100644 examples/vit-b16/hooks.py create mode 100755 examples/vit-b16/loss.jpeg create mode 100644 examples/vit-b16/mixup.py create mode 100644 examples/vit-b16/train_dali.py create mode 100755 examples/vit-b16/vit-b16.py diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py index 68531e92a249..f7248bd68fe7 100644 --- a/colossalai/nn/optimizer/lamb.py +++ b/colossalai/nn/optimizer/lamb.py @@ -94,7 +94,7 @@ def step(self, closure=None): # * math.sqrt(bias_correction2) / bias_correction1 step_size = group['lr'] - weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) + weight_norm = p.data.pow(2).sum().sqrt() adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) if group['weight_decay'] != 0: diff --git a/examples/vit-b16/README.md b/examples/vit-b16/README.md new file mode 100644 index 000000000000..83b924c2e655 --- /dev/null +++ b/examples/vit-b16/README.md @@ -0,0 +1,14 @@ +# Overview + +Here is an example of training ViT-B/16 on Imagenet-1K. We use 8x A100 in this example. For simplicity and speed, we didn't apply `RandAug` and we just used `Mixup`. With `LAMB` optimizer, we can scale the batch size to 32K with a little accuracy loss. 
+
+# How to run
+Using slurm:
+```shell
+srun python train_dali.py --local_rank=$SLURM_PROCID --world_size=$SLURM_NPROCS --host=$HOST --port=29500 --config=vit-b16.py
+```
+
+# Results
+
+![Loss Curve](./loss.jpeg)
+![Accuracy](./acc.jpeg)
diff --git a/examples/vit-b16/acc.jpeg b/examples/vit-b16/acc.jpeg
new file mode 100755
index 0000000000000000000000000000000000000000..43f67fd39167963235a372abb26f11c41b8dade6
GIT binary patch
[binary JPEG data for examples/vit-b16/acc.jpeg (accuracy curve), 19776 bytes, omitted]
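A note on the LAMB change bundled with this example: the `colossalai/nn/optimizer/lamb.py` hunk above drops the `.clamp(0, 10)` on `weight_norm`, which feeds LAMB's layer-wise trust ratio. A minimal sketch of that computation, assuming the standard LAMB formulation (the function name and the zero-norm fallback below are illustrative, not Colossal-AI's internal API):

```python
import torch


def trust_ratio(p: torch.Tensor, adam_step: torch.Tensor) -> torch.Tensor:
    # Layer-wise trust ratio in LAMB: ||w|| / ||adam_step||.
    # This patch removes the old clamp(0, 10) on the weight norm, so layers
    # with large weight norms are no longer capped when scaling their update.
    weight_norm = p.data.pow(2).sum().sqrt()
    adam_norm = adam_step.pow(2).sum().sqrt()
    if weight_norm == 0 or adam_norm == 0:
        # Fall back to a neutral ratio when either norm degenerates.
        return torch.ones((), device=p.device, dtype=p.dtype)
    return weight_norm / adam_norm
```

The update is then roughly `p -= lr * trust_ratio * adam_step`, which is the mechanism LAMB relies on for the 32K batch size mentioned in the README above.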
zp0^D8mqeYTB=zF&w&$7c4uh1p0pV!wh|0~ec9paZeIaZ8?CHs-H2uSG)S-J?ajp^a zWK1>McAqpqi;8wD>{@+Sx_?Z;>fFzhXrejz3}$n@1SO&k($YH77M(rxvJ<1+qQR`$`At=^sbh@YNKAJhCcaPmAl(HxZL>U8lI%-I1?-J z_{sirS3VHv(mM+j7sa9~!||pOQAHa6t@~Iao!w91>96Jm|BPpUJh#X!Zqyj5w11lI z&D%U_QdJpe836S}lu|g`3iIJj*KJ}rKx_`?iE?+bX8q#abAPg4icL~D%i_M8z)ED^ zRsyi59GLC_BB#`hm|R^JooV2!8rf9u$0&{UM+zZ|y>K^k6hI~_K5U3@HzrdoA(c7Z z%C$Zn*WQ29N|7o&dtZ-2dZOh1L;&`^$5y%M61-b5lV~_wvZP2(eY!dGsq$`iS-5#q z!Z_82O&HhRl~*Dd6RI7UP6qCTZwXQ$?>bp0w=qNP;d%Uyr=pR!BnYg(tIyKoRP4?I z8{CbN3FD2kp^kD=uTF^pQXF_j8FLz{h+6#5zb_fva*(|@(Ln!F=Yc%9CtOy2@bKUq z_d+`-*77Db>)W@3t2gT0k*Waipbg_q`BFctkoE(xil1>AW;ok6n@_T@&=#}x)#orD z-(^!Q&#dezm7{;X8u&?N{BczPvu;Ef^8?ttW@9Wxi7Jga%)NABL2v@d!e&|*h7?z? zwnnvY@KljgHiNF@U@&mq2H{yz5mT?p=@2n?Qp@M}ATnA1kNr=d6{dq7FyE$!i`lYD zX^+?yExe$vw!lsH!zB=W>E2CUN5TLfxF4PlN8g6hE4 z%q#gGaW)llCD*#9SC*gfqgU-DNqDI48cBIM`eg8{`jU+OuC!GidvRP}+u{y%8=EJ1 z9mzKv^@$|Exl7Xi;H5)|h;06ehGSJ!sucv8@h*uSJMm{%Ep$wnGVV7SCy#A#7|6?H4VuNljV%IBjHhu?g)%u7B z;bKR!N!1Hr-cbmny?CV<^~~cJ!SYqQ8qhlVUhz0-TUmJ*5ahZUm*lcLrYUvpocxiR zECM6C0*=5aQ{Fl-kLHc1Ez|ldx%^+t@&B&-eus2mXw{68j0(mISUDV@z z#%7B1UMdTr4SzXk`IxBq?M}QKq<3^69#iub~jlh3rwU-zdhme$5??WNR$4zGx5uJxLKzKAVa()^*)^CmdQ5@JZ zO3PAqUEZXJXSr-6*}}JKgE1c1DX)MA^S?sOed~(OWLE~4ln$mNF7bq3K7Y7LsHPw0 z)+#98e^MOHFhUf~Fv>(7s45IMQh9Fe+1%>e%6VRJ$=-Pjv`|X_AGcueVPg&a#W(dv z(5{qLJGTxs;pup+=Zl=!u389Us_Y0QH2DM8^5!KIR_+NtVXk8w2AxRnA?Vl7q)Ov~ z<1QIw(XI3hKfzQKYyF%^JJzOjb5mlM)qXdOqiIDK#;d^x>oLdc8~}heGS(oRVJmGu z=IV8KgGN0M!nbi>ZT&8M=;-gzSAt59s!Em`g*uUUT|4+)N_Pj^u(phhoSX2?8zBv{ z1&o-{hDCJ(%h)7Kk@nJVT9tMrk+9Pfh)@e2H)Zd+Ys3o}6r!8>uPFfep{V4K$2Gsy zpubbw8J2{zE4EsvPZ{DhjIz;$$C6@}TCO_e5?yZ<6l=)c^Y);7WZEGpc#BCT!%YAKz&aav~-rVVmJNeg?X=_~G{ZIw}OR}Hd>R_K8tE)V~u zzWiVO{y)W#e_wX|w#|gNW^eCBJ=aFH=jlVS+6*GE-D=}|arr;!nz0e*O@nU7ea zdbFJH!3}w82mMOE)sNBRhqAX{$uC*_lrHOsrOS35how5Pu4Dsm1=^6h9lk{!~Cdf~GARea;UFDLTW4Rc{pYP~$$CG_nHr}2sQ zmG#-?Eg+E*%8dK;UxftyIPM<{Tz(NM!1dWcvFaO}#%RGOna>;wyGrLePn~G^2rbs9 z949(CYVX&L?74DHROIO<>M2dN$xqu^kb55UpWW`g{OZc}K@4tYWf0_|TSJG(H zr>OKgt-qq=GbxA+F(S}HzHg>QB?m_zgy5o`Dg~ysd(>+cjUkQCHr8t%s|9*hkpveZ zaHqm54ZVo6Jwe+aES75DOWXP=T9Nz9HleVNOR2ET_e_|>_xc67cIS7PT$~bQXMdaj zDi)U;5;4fMsuD74e#2wo|8<6SiOU*?fGHHu?CqKlrHXTU{Y0djq#}3oIu*DfJkH224^yQNMo$*4K+& ze&BxOzpG2RMa@By7){xTcoh%#X`vWk=(($1rT6$zK2TRkGuVQJMoSCaHFZ5o(8wT1otO-(7Qy3azjLfb2jIuNXW$P zDkGNqpuMqG4yqG_lSob}HEEb5p=)BDOX8PUR=(%-@v!o9s~`438fL`5ughX3b0qNw zwsq-G!@&mj(BAhiJc8uiO?P0Wj)h$UTPb7z)5QKi*L@>po;) zo*0e~u#~pLYT#6}N#Iv5rzFqu*x%$>S=j1##b@c~o@RZBSZK(4BL(Kk@>h7Rs#k+7 zfE)@t)}Za#l!|}ydP}sTc48}PXx<--${&U55T)3*lTg)~f@^=bG_U9LM$|)h`!gKJ zcYRPjsqTzrny^$u_xzVg=yg7YFW>ardKXL^PVTDLXV3!k6ki^4oM5CJ!;KC90#DZ# zD_jg`mT3;{+SyIm)D7#$uY3=|_)JD1rq72YM>gK+u34mzeJKJ?a53C zUeu83Jey}0P8M(Rq$`?M!aJBOUQgYV!Md-K4!nxA0oL<=aVWPRBJwc%+woB1bUI#uiR&fqZSjYq(Nm zUEkEh!UUN5TJULx{A>qU$@G$$JKPlxtb#lEtBp<5?~})Cl*s}Ts7ka@WN?BVZ;p3^M&WA?&dDd!a zQ5;#Z2n4D`=DKFMBZV72OvAkW6XPvL1NSlWg{{F^9>}YAv?Xf#E)S($s!|w+%+Bk* zacl{cXf{Bnc^T$(IeKd|M3l3{#N;mu^p(1zEKAlwDd^+HB^hxq5KlhTS}%m&iRhdl zd-Fl;3tYAEo7ObU9*=q{p}|v~sqV?^J(?Sh3m<##m5UGGns1WUCabo35x=H20%s!4iN4BXTdsdi`Doi0l~`Zi4|Fj5Aad1E5i}|Pv1*C_ zxlK1bO{(Wev1OMrXZe0|QJFk%+1wmw#ISufwschZ*IiHQW!&jyxZbq8haQ>Ikq_VM z@gG#a&Z2duE+;f`#MivjSsqn^zDyyIW8e`l1#OAZ3_4_}m{K==S7ApPErZnd8cBN@ zqh^pMLfR?Yb%tGWKPMkEA2T<5(x!achH(Gg@I_fS2Ul_}T817cw1)=Ft9;OTFfJl% zRl@g%_v&sYt(8JQ9y3WV)2p-h9LO{!aVj>zu{VtoxN>6ETMXoKcRNmy+9NA0>K!o| z5-~CjO~P=oaC>~*97d};^#hz!iEFi>%9SFnjnQL|$d{#O<@3`~b-)&k_BbD;IGt{Pu|BrV#>-tSyxj0 z@{bj~FCZR+B4Of256^R6Z7&-LT%Jju?xSOgpo@nb97 zV|e=fWO6Vga{c6yffQ+j+Oh2h!Nnb)r4IDZZ0XOQ+g0CrbcOJ0QDu+A;;K>kK!W_( znRb{qs;r67QLSL1 
z&~C1I_C_S#Xtd#~J4S1%(+MX|Dj`NtH) znL5vbdQ+49S_xzCkTfj$>VpS?H-TbB_{(b6`@*QnnO0vMQncRrwf2gl^wK&F=#B8s z2pdIMg4ZlLntDy6=|kmuiQ%GJp1pJC5-bsDaC+$$i9N|Ykb&XTw%$vEQ`U*rRi=>U zy@C$cv$Ya4S?7ysY_KIJIDQh&rj_{Emf66eLiUTx<0580@NiXO9>|2vm*BY zY#beorP^HR?EcIr>%u{VJ(txfYxkg8c~BS1#p{q3k;5^ZP5rLay!C_!;pV*Qqw2eQ z2FA&|*};8>pCxhmH1}xi7bXtAN(gT<2fpnh=^O%gB1>{+Q5QLhej^H}s|-Fcp9Oj5A%tZTtu@!?vCQ*_IV7Z)x`VU~@6JC6?hVrhv? z6M;bp&&CPm&I-1OnuyxdDV2Uk3lV2W(j-h7Ti>kg}(rZ{;GWb8}7lqE%tEZEv&y=Ia>0G?fK&%VmpzB>#fRH zI5hCxl}~NNutx106w@EOg{xa_FEp^`#lLzG2&*k(KzpToFLI1@uF{=H6&dF5pvWZE z8C7QK@DCdrOgBz@`YVVHuZg+Wnq-l;zM%>;4Rd7lT|fmZN1rM=d4d}!n)p}MP9rNY z5fc4csDPR-&Mr= self._size or self._size < 0: + self.reset() + return self + + def __next__(self): + data = super().__next__() + img, label = data[0]['data'], data[0]['label'] + label = label.squeeze() + if self.mixup_alpha > 0.0: + if self.training: + lam = np.random.beta(self.mixup_alpha, self.mixup_alpha) + idx = torch.randperm(img.size(0)).to(img.device) + img = lam * img + (1 - lam) * img[idx, :] + label_a, label_b = label, label[idx] + lam = torch.tensor([lam], device=img.device, dtype=img.dtype) + label = (label_a, label_b, lam) + else: + label = (label, label, torch.ones( + 1, device=img.device, dtype=img.dtype)) + return (img,), label + return (img,), (label,) diff --git a/examples/vit-b16/hooks.py b/examples/vit-b16/hooks.py new file mode 100644 index 000000000000..b6c306ed7184 --- /dev/null +++ b/examples/vit-b16/hooks.py @@ -0,0 +1,15 @@ +from colossalai.registry import HOOKS +from colossalai.trainer import BaseHook +from colossalai.core import global_context as gpc +from colossalai.context import ParallelMode + + +@HOOKS.register_module +class TotalBatchsizeHook(BaseHook): + def __init__(self, trainer, priority: int = 2) -> None: + super().__init__(trainer, priority) + + def before_train(self): + total_batch_size = gpc.config.BATCH_SIZE * \ + gpc.config.engine.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA) + self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0]) diff --git a/examples/vit-b16/loss.jpeg b/examples/vit-b16/loss.jpeg new file mode 100755 index 0000000000000000000000000000000000000000..a16c333cc8e945ec9e3713017160d57d273ae3d8 GIT binary patch literal 22964 zcmeIa1z225(my;n1PdA*5-hj}hd_b{cTa*taMvLu!Ce#FU4jk{3GTs$Fu3dBF2jHB z-o5+o=I-vhckkW({@?R`@HEYw>2pq3byxMTs;m0$``seozPz-YGynks0bmOM1>DU5 zo&nHNQPEIQ(9zJ)Ffh>XVG&|uVPayD;uGK!Qj$?qQIb(m(9p3n(mZ0NrJ#7s&&0~f z#mmb}%_t};z%9bg!^{2CLl7`9Ft9MONU*UP5?CkfPez;?N1&4^A7?d5;6)Z8al>3On8IZ`v61)BqT&+Boq{6WO!>|`1b&0 zJQVzgToR}Rs;|%F zi_5E@=t2M>{Y=(hDEomfJUCs5$jC^@Xg|?~fane{NO;I754lkBB~;N~ITAeLev3{h z8Ix7{1%sAH?SSaD)95{7I^HGv!=Fg|6J`GzVFCXZWq%>;f9Qe$u#gbogNK9%5C@!J zJN+;UKjnp+$B$z;uOS78=#@Un!hP_Ozr+;4_lw(DITKgN@dL~>~vr`n<9A z{l2AJcg3d3=IK|qZFZ5m^nhaOkAl)7hzI}^5f!u&Wf3j#k+Mh_1s9XMo4~zH)3^(5 zN~;w%Z_OG*W!Z!GAR&!+F5Ck?(L+LRF2RvOg|qE3p)-ep8IHvl9Qxx>Zacd-=AjQW zn!C`E4!17A#vhvXGsOc=u0!}(6g2BEWYetmK5u!?~Iib^p0=8@`pyj4VTAKP< z`0{6i9YYcN0<#Uhn=*^mJf&6v51A%xkt6B~#Xzl7b$q9O-$6bB%5`?5qF zMzmA}?z!ON&pgAftxI!XHN3RMld8Fh25&lXX=xUz1CiBUyq z-W|wY$dzTSF}ABMgD+Ze_sN}#?sY_R?HhJiI+o}|axBk?r>c`WWE`Dds?>`F;8i(C za?Shm+GLE1XxbE8oogq(CPxlZO;J{7UBT&lDD{QLsVDn-C3(8q`x9%Hcdtp{S~ThF zNX6}lS=9F4^Y#jgCQ#`9(&<{wKqFJL>1h3kCjUoXgzdwNsAq|g^>tATC2w!_un?`S zd!4={8<-c9A0~Ov#-(#XXZfAwOHK^NER;M=0zE4Rd?P_8Q5ObYE;a@Fth7>z#!Rmf zFmnqFyJV34cM+h;y>%0>zWH!12jhS3wgFzq>y4q?qq5na*F5Km>@Dr8-;az(*t&8n z%ilhzftY7mdE=**o{h8mQ%hB-a(OCAq%;X%F)_FVPgyNYW z#kOlqz^7niGeqV+b+B759cA}AYDkx005|DP{j4tziMN*M`OTw+gZvxeRo|#8y2#!4 
zXHh+OAn{gK`J3s;<9C1>TRvgesW~r_Wz7oC zICXop^}3vRTC_ATxsFY-Tq&Kk_iBe8b)T`Yv}kqD?-_VHC^&@BZsZfDCLg7U^E|(Z zS&BkauFp^Nbrws94qpqhtugb==AbHP5@`)6HcY-g)XdK+x_Axobj6HsHdGp{?Pp>% zvFENBsJ4Ban+K>oFV-eTdiyUYUs2*WavqMxD~;Y``1C z;$IZLH((1O$gm#xb}Ec^QMRm>1RkWXu9WRfP&&KaM@mG|l3&Kc$odjAY!k#o}wS zbdlo5~e>gY`PEI-twY>6Lu$w!Jc@to8wb$)@0d~Kk)vLJ0Y23KI ztiA)R5CXsQ72g5A?UVx-Qp90gPbn$Qm6wKawr`AQEgM~?{8tU+tRonJ^R#>!|3Y_a zVNv{R7OD3dU{p5^B2fcH_IA>3 zXkmC|+OQx)XV}g39pErje2Hulxam42zCQ$iCrKvuOp}7mwJw`I+j2c9-ZqvgT1eo8 z=D=!n<_mU}z7&?@Og1{0UWG^l3k4~YWQFMMztGWypr8{&cJ~aWF;>=fU<-XqmGsal z4Hh`UIpc3xtorMR0bE=au4!K!VNp+m)8#Ut4B`*nKDpZ+b49f@rOi1Hnce~ZmZ>s7 z@EoD>4K!kVKE*?EtoyZYKlTy^a-m&49a5Ba@Rpuqz&6q=>sL+A^qqe4&`*&F79<@6 zZ~IP&qJG_zUrmR+#E{&r@~BsN(|T18BgimaBnxHfrO^P<7*v_s58M~aOmrr-p`liOuA6h5fD(hXP`@<3Y8-$~jSTcV#dw(Ux+Nz^9Ew5BvMV|~`ng~ibBlZ(s z+o2I2=4~yrSbq3W7&&=9QxRML;j^@g$(jIT%(e~n|2D#(YGyFQuDvfy%W|7R$Px~R zOtCc5IAB^@+Hz*heca8)w0?taGN})Ek;iiws<|0rtrm!@^&>}za#~nAyE=AyXznKwte-itQhb(JM`}sX2H%O z`^O*`xqduy*}MdO3Va0Q@Uz~l(AZGGs>_^uC-5NU4)EG&&Pv2AE!czseMtyf|v<`_wvJ=8I!K|#NF$#}nXX(s>3bCZE+k$FrB7`=y+uA>PN(v zw7BpkpJ&p^#iBh2qzhGR4Tj!(@>GN>+n-Ye=?*}*-(h@)S$78*RWKxSZbi5Q#JVq4 zT%*hYH}lFv{qM149F`5uMFp~Ru&#ToyCQ1#T+&!ft_{g!ve!QKu_wz9$Eh+aEbm} z9%yn0F!N0rAa9*ox1WnkwOt|U+0i697B=eaHz+=9C_0n91GI45-q;`J?8e*yyo|3e z7FG<-Z?u7@9Kcm;@#UsFK#}(yfL8OGW1pi9hLBMQ+qrIAOv9a~g`KpWAGUQzRmt3L z0ryvc5zY&T@b^!^&Fh$woZ4%d>4(?GNoQ&AhK{PM-2`XS9g{yjODgvJ@YQFI?3BjQ zf~ZsL$Y3}9W%px<6Y1G0geNXMd?ioefBIC;-vu$O%UW5u7nTy%tGUwjVrG5-p;w9) zh2-Vw)&!21mQj+!8xg$Xi#YX(+OPusk^vW~R#)^4Ys*P(MoXHv$EgXF=tx8;=^sQ? z^k)V{0-a*A+|UhY0H#aYbQv!(csHTTv3#ypb)S*#%+U_fVvG!l%Zj(4%eQRhroj@K zqPy~ZRvbq21_Hea3A%FRO0%gi@xsB)Pe6L-VTt`KI+HZD#<+UQYmm8S-Gtnao}biH z7yU#m1PBr4ePhM<7485)@*67}a~rMMu3**%Q4fYH5!}MrZ5ZmP{l4ijdqItK;tLaR z6J(gEjot!GH1`hRc?bBEiDeJC1M~?WOx*@R+fLM`qzv}`fvag3Hn1;l3s+1w*_nu; zkNVM{z7=1>WN zOLv4>QzUhd*Bem32jgapY-g_0;?Tzspo;Z;2`+>;rQR|2+h=M6M}zZ@ohTlO^x1UV z@>~^Bq*pz#_co6~P&2A6@rIf^0IwBYOT2N4v6rIXiygS=PYnJe`TxDw1NUJf>vsT$ ziIQ`fv26I1jS_5|YM9Ge$olyIRA}S+4UhC~>U$k`w`W!e* z+wj3u#n+)de0*G~GO}+sc6iQ~+UiGt|Vt5z#R0(QS!1 z30Fi4vPR^#)XITOk2o7;-h`HsZofz9PfZtc)>^5m4Frm@SgP4Gy*ei5Bp*!(-= zE>L`yFBUUbt(_(^-dkM}*tA4)gMjDzA;LB~j{?xAtflZ#G^qS>0EJLE7KJFEXIv&% zYnJ`xe?a#E?oC`9Ht;+9%pE|A%|&zg0u3g&5ZJc+!1fN1o!H3yca``rgUD0?D{Bo* zQv{ycp70`!{CMw%&yQ*rfNc753Ul$@nezP|Jx|ftr=m|1pWU)Fmg7tjixaL!6+p6M z{F+%azJZnHzxhriEoN+5wUBxWv~^WX?EhEh)uiwouF^!FTsicn4-o~;8%O9&Y`uK8 z$VOLFT2B3~qJEQ8*!3!fJ0C&8y|6^koWI zXHCG%ed^m&83=qFHkyIwlP0Gk;y2gEPJCDMw3iF>gKZbjYm_gK#m`USHeu~jps)vR zApH!5HIcNlzX0J#Eqr)JJnjJd#bQ(TcL33qJ3woRybF`M`+K#M=Ne^d*)^*RC4&3- zde$t<*qAz%<7*T6Jx~nZ`+a0`A@>0tgJ$Xh%|}90Y`GUg+r2Ye$$?QN^F7#`pX|o< z%xYQIFGJS}7_pl1v(QL}Cj+QWiTabtXH2&A(z9}r zqFtlYy>^Ii<5Ix|Z=tS~jM8LcSp^CRHJ(olF5$8n81s<`(iMbk;S|}jlXY#_c5@yD zyUKyO-2sm0#jm&h8H&Mh(*(+RJ#sT7{+G+2>~N<^qNSGyc{b&Km!4z}tSyB0EIsZf5kP8CMhN1x#BjxIJ^%hGa5Pb>7-I z!MzxCIJoLhNvM9n2@MNBXz!;e4t@;cuMg)V+)$AQ$P!Pm8*q<_=C#odnT&+Jg+B%j zf3nX9Yuy3J`txJoLFq5Hu9BbW3;|tXWUwGlDQ+fW%h>0F+A1BJR;NWDRiy|4|B0ez zOBIk93Q#QB6lZ=?k$F>OnTZT(eRq^&eTCcqPru0drY3$>cSYt3Ur{DO-@c)VFH&DJ z`jOoMJ{y8yp?aGoX+J0xjekgiw)jc}qB6mSuuqnqgjGOmYTt7%IAU~|Fmyp(1Qq6;^F=r96C_EdG=sT zI#0FSc1$wo8!;MD(Qi;pMExFDvm@Evgl6G^Tz3FOHI|Z6Wul53S<`r|k7;z&=ijKo z4Tnk-X7uC64Fy-^O9|WE9OW5aHm7t4Wh0Oqmb{_VIJ;2g#)yJbq6&iwe`<@M^~zzz zh}pV3K%CtjV7jEZyuWyoGE$!XWO?Cbb|?$a)9ld4fd59xZ~LQ0?5DY?ZhWini!Jms z@UTS6_+hyJ&KklT2*S7t(1;SD@Xj|BbW&8P7HC6P zJOkd}CVqbdO8hq8bJ`7;%$~Lcx5A{)#bW-E#JJ`Ks_x*&`CEwUy#aQ346P}o2(x6QD?*5X3o z4q!2B4MDmovv3-dZ2jEE5_LRZ{beU8UOZNLukD7=h~vd`is6mll7nhRTFxON+AolvLoR*t){cgv_-s8G|wS`SntSJMTL* 
zW;7?S*Zw@bxOj3EU4%PX+iqIbPpi8`N8h~({Z0djwF| zFc5na)Tou17@KzoNV7hx9PF%!N47&SCvmRF2_h9Nuh9O+dKyq=FjPn@{T{)lP}vT5 zLV6o_sOR{B|s0XA;_HMaf3?RJ|#jbFjfq6=__{qHL} zq3YB7bM5S<>mX73h`3{t(gmDZd9bpX(1+x{=0KWNPNaAGL~pTjln>2j)k>I+nRyjK zYpj{8<_$L939r;wVcve?pC)A%@`QkVM+*lxp{xrBQ5T!lH+4c|Z=0$NH~_%*pmpV1 z*V>T_aJT+GmI1jdo3owPGGz{?PNw-BArrZ zW#*e!P*8ioie@zuB1`tRC{duyQL)$)6RrGAk8cH?n(sl^yDd;vBl2oEtGzH=FJo-I5QCQ#14?U zG=PQF{|$B3ol}vuQi|7}?l}gL4*vTy+FAu8-^Do|<(k9C`l8R28d|F>6Df6O#P=B* zV>O)yxnc=R{qddn@r>R*Mfn*?f_+=t^5$$ndaMs@7_w|gVm%~5y#@LDBqOxp8_D1P zbsO^+ee}uzslX$D-bK7|g0SbY_=4(D#+E+M7wkbctV6m=fezY2Gum0Q{zG2d+eDCK zc%K!sQrp{&OISh~VHmF*Geu6JnONo&AGK5I(&p3%u*;88(cUkfaXVwc{tFC^ZPCgE zR6F7FdYitoC4pGvcr7ma`$-=kSetgazF@Rs@lZ3>627>GcX;9naCgggi++KnCxB^7 zV@iXaN)gZ|$o9+j{SIAkuHa6fFEVp2{NqPkFyn_4s#34t{6fNitp0;k>bj5JqagX3 z32KcXrR1+r3s9mZnXA>k8gG%>hp92NFF6FiKn?j!JfdV8Cd+nLD;q*FcgXV^K~^~& zJ6-6L5 z8FC}5fLp>55zyyj!2fXlUs+{2)*eWTm=wvaUG8!shkgvZM(kVe7<6;#) zCl)lPPoe-IrZ<$2`HKvEJ!@?JbV$X*lEywpL?*t!W+;*GJ~-U1_Cldy8a~x51fhoA z6{t;I>bcuOzNN2lKZ3NB1xWdXE76HimTj)2!xbYAC-@%o07T{t|KtTh&`s%ItB2NVz;Ss=% zlmDz;^2gXCTI>}|qa|~BJ!jn&7PW|;ZrVm_RC0YJJ0r%g^fgq_#B(p?XXL@Cg|2LB zXKS|ebBfTBWIWP@6eJ4MY?$V^pP^Z|vp*B#)hld#i( z>n#IZrG zIJ7x6e%J_$ZCnp>EvSCOUD107n0x7jyN;rH&|1k}RM5;HvbO!ie%0_H>;U#~*9+#_ zU?3B+GI=9fG@=>qbAVnej@Dd7p! zS#}a4HO{BN0*=}bvw0s~F=ZuC@+UuZy+ERl*yFAIJzKGrp{fQ2u7aL5Uamb!pG93z zI10jB6&yb!k8M}a)9>;Tj7sBsIgfq18(Tc?WxT8u19JV#=Gb5CiuQ4SE4@L8`sXLd zNRD>0-Q1pVbTntLK$aGR_d&l9W=SVP^wN21foLT7eX{1mLzzQE?UCck(V$ejt&ydq z#VhfoHD~$EmfFax?&UFqOXu#Uu(M*HGV#; zG-_LS=+3oLo|d};#>~TwihkE$BAsh@ z`&oHX$2^iDOXmX~gC4>&7Q|dk1`?iV{#ab4pyTo+$<&9QE#5$5+Jx84!*2OC`?o%Q znYAs4&EToV_m5^IBbvXn$qmd>4Z_~RvBi|?PBQj-s7p!(*0g8fka=)V6XW^2K*60r zx;wyW2}n~zr8r9i3xEnBtW5%0ZPq})O>J+L8MRz}G{Of*R%d>p?<|}e4M;-gB;lkp zeLF5NWH*Jd?2Dv2#*sTEBA_*mEUnNxTG?D;H!Qr-Ha_3vjoDjEDK>vZ@Npwtwk>!| zzLfT`!{WjdCw%QjaIcM;yNgg&C*{MR_1WKW|DRp^8B~uK{Yf8m&TWSvKo%XMD=l{AkR|AsH;#{${ok20rja}P91ZX z+jvoSJXQEgk+L>vStN_}q2y*6Zcy#b7heonlX$aO4cfIgXS;A-Q@GV;lv;w5!5NM= zj%4OK{TOZm1GD#iBe9+IfMN6pbCckxWx@4^L{w#VxDQ993XN^t-*JSIp^<{fYzU=7 z^D5_);;2uxmkTiG4#3lr?eE0>xZFASXH1zB>0xd~%%lo=0$I|*FZgyv28a#KBQCLc zQs$U1!)^y1HI3o=MwJSqJtm4;5yzXoa94i2n;B{IDfNRc_ft)^K?n3<<`pHi*wDEc z=9qAUlu@$A^o9b zb?4Q|O-YuwkdU0=sSM9q!f9BImw5XU8^a-g1aE!WdH~#Pu8$NB@+mD>M;XcInW41F zG)9u3Dj@IiW!|z9Xe9IF4p5z?8h{$=6{>y!U~RlrP-T7I`n8HTl1zALE`FD{gpI3Q zZwvviA%2J*L-^wzfFsW!JM>H4+;pQH3k}>aj3uq|^SFz4Ip{?+!w?ha&$_*5(&#Qj z*m7D=gJa@nHu>G*do%lmWlR&RUfYd1Lh+a2WWH*_1#}w_890sE8vW6PZHDO~bmq&N zlCUX|d9x(?&?xe3D%vAXF>q-=Gg(~hv6Ihc53((#5{_7cyTqYwU8N&iUMIuXOEc!g zYW{svw>>lx5&Sy4CQN!sWjWKcw~gfC*KKMW%w&%bJsmiW4% zM2vDOMO&_Ov$BtLmaF}-7xh@hcB83KS$wo?QF9SquQoky;mwDa)?(x>I&3TBb2zqO zh)-GrV8eaG$Y3R;W8~cWz))yse68`wPo@Cl)gj*PO3i{%vg_sU%_w-1Qe7T2>2h5n zr>PCa^R%T=t@rYiN$wuEzmZJjvLDmZkh;gin7E1~h}P!i;>UV>MY?MWs>NMXU;)kg z5QLkgCyY~kjg?nxwx8p>q8M-2u*+8yUS)!!$&aDH)(pwWt--C^Zv_g?79(~b&} zm8{b}>-`|HhgQk&IrWYE!BgI<{PLVt4J34U?WmfD;?CF%T#0?y+Pun`Vx^d-u!tFpYQHW~ z(C-+BVRUa!j*Y{C>f2j4fi&?hJ=n)1ql+ZvG*(7r#-ssYr*dXQgbA6ydorHs_zB5K|+o9yM~B!I!yJJo)XjZDC>K%^hA*6d%e4~z~xLmnYPpQbl&-E4|4$1@w)K}(sWQv_G+72_&Qi>>? 
zy--$b3hLC36>O^2@AAX}dLYdEpNzcm#p(1o!TqCDZ)L^i1ct0eb(;Cub)}^H^}o;T@OkDivK;L??U)pMcn{i&ad~59 zxziG~n090OV(h>axp9eSTPo78tT;P0*%|O(vJkxcuw!VbnWS8sIA)`MuTfybnEx7I39 z*VXj1)de04zntQ6U&>(7stf4nR}^y)8{?L}RPtn;3axO@p95NQ1dXQbvUe}j-cEkG z^h+sWT4DKmUa;EMd3NcS|M{y*LYex|mFp!eNZqky>#Uw&Uh;k6OH3z;O-B_Y8w{J$ zX{+mJzM1$#8BJT4F}h?jLC9Bh3y}L!buaFf$TwKFlWT8{izi6K29{?7+dLc;>>Cl;qiny*pwM*6~!R8ATko!p{@~4*EMg;{b` z-iY<&X;)9NvA?e3po!G4VU0=mq`=E4b5h#^olUuc`LSdtW<|el&~W;s`jO;EkZmhm zUt7NAc-d6uf9~6HGUPl!9lS9{rFDGKc9PM#6^Vq~S9-I*27)DAP%<=UAKd{I#`Qn0 z$M-$DD#lv8*6^cj4g}BDG^}f;lLrOznhXG(aCg);uUNrjQr^CVR>1=FsWuF0%{fLL_edfFIm%ls6jO+|*%VIEbB17I-SWDY1+k>OI4 zU5%(OREk6k>D$R2;8AV;*_RpP zZThYh4VJ!j+Y6j_=gTQY4hzjZb(w{R0d0t?lE$fF6IGO%JJD_U1BbXZWwPFeod6#7 zDWoN+zQ#8`a&esPx&UE%4rKmt^kaSDB(!|ar&ln(1uZ_SBZYyiTITdI{++1_xC$Bk z>CYb=%Jvty^urb?5z!#SGT0KiO43wXnVUA~MYx&rYhjIA?6robs`T&5d=1G@!WA(P zR(iYF=6pj2fg^Ya5aB z=)Q34^*z8nyjVJVCJXH1sS%Dz(~Hf^> zcX2Y(`MAO^y!g`B*J~rOQj!fOwxdBw(Bh?r%o#)HOemCfFQ>ICV2|Qby}OFRYcV`(Lx+m_-~%Wa`V#S(JzuLN zHwOx^A*$VEtU>(VMeC)pY@KEBmi4L~_t8ZooX)9;>cdp|lWhkbb{!5n+o5WH=Szl)L?3+k zgtnZ^>NbZqmzh=VWA>b5E{e%zJIS&|+(Q_Xn6p=bZ6{@h99p%L(80PvCmd38grsfe zuCBb6F>d91vZYN*OEWvVNig!7<`}w=^wF=o=I_>KTi#y1)Do#LwwKOrdIP&{8=OhO zRJBeu_MMt*yCjR5(;wjJpg3%#L=tjO@JgY(0jR)U)t=2Ix;}T=ZM0~MN+>r8Coa0zX zo`rj%lT35CJt?NhK^Dg?9Y=WLEIBV>dl-CN!ipV2o=J5FQ1;r4@KOV@#94V_(iICp6QO{l9qMt#ll zAR`Lh3Z3WVURa@rvC6kQKqc)1el;n!n%MpVD|6eK6iPI>zpUMRLd~iXo;kAD+`6t zwfYPi6rA?~xD9}0c!=oxK->#?c&MRI^}27y@G-w*>g}wr;E2C9xr59=xpk zK2G+Xa5p*))}lqZ4Ym(;`4)a(d072EAIlSU{Nw9Q%_j+B5m6QLt$GBL!u;bSH^J+j z&!o$fj5&@)8A4Z27O1xxBeN5~k}gGzvxtPfpMT)|@EqypWVj+oL;@{@cd7*xC>vvD zS=T_if+pgPXK<7wz|u!~zf8$))VzgIRX6!p-nu`|8~0NbRq}6+`A7J7Jo|s|9%*h) zxX$M`Iap5x@vF-r4%$&W>Jr}1P*zfua9{dp9-pEM8D_dK!>jtZGgQbIx5aMMU<5;= z+(`4*bAPJgTczuG8xO3xPO$5s=Cf>-o^^HSE5_Z$!dE18t0p2Fe2>TY;@KYA44c~N z8UZviUZtarfeMRsm|2z-y`HqGr9Y{@D7{sK$1Ej#0|{mgJe6IMnL`cD-usl!#cyPc zl~T$Wf)vNMFqjK3!+8*u>#4^;mRYY1>{isYPutepda0Nc@Z`18DAY8 zk0tj46`r2qJnZE?RW)IctY$G7LQV6oL1OWgsMLs^iC)_R#0(l~OZpVP@*%}`Z1Qy7 z{nD>hhCbh8oui6*`|#4;yWu`o5n8mT;8xUzUQSoC@^q~Er+Cm!^>>AeiF*^xRiL)& zMDQ}bhpA+Et{W|dmo_DTN4)OC&=Jih1%7h&G40v0$29)+030bk`I?MF{L`pQ&ICPG zJ#~8++-Q5qa zfqiygrOkY(8qgvgcrG;84~DXkzZ)6oMRan8s%cYp3u}uWI%=mr=xdwQ*;2sZw^Y0t zyCrRtdYN%gx5N(A&6g~y2aYm?75AhiYw&ud?;}i(7$i?jo?eSml)7_n0A?qIErvODSrfzu-%SF z0rT8>wH9=Xyg|;?#bdRxppEl%FPPAz%N)dYE!;>^bZZrE(%r@jzhRi<}!$pem$46@(6%F(IyPv5}w3Xq+}DlXd8 z|K=Pu43a=Lr3l?9YbMBq^tp}d-Xy7dDwA_DKM)jl?%hB;ASq)be*xe1H3=X0JwQsumC*L~E#b!& z)VFv6NjLo{Cu;4X5jgghdVP*-K-2!FGp08dP&CpW1FeARi@`$i83;8k%pYz*A1MM{ z;jZalG4qB zvO3ZdsTO|T#^175>{s*gztf_=s9eMXH+Z)u3L1fRn-BKR;J;=tP8zUGvJ)E}_L*;3 zv2PYV+EFjUiZH()ux( z=1f$LV-!od=Nz;b&E`+WFW-o7SU2#53U1*XD09G*#>dF1Nuy=X&Voj6C4LrN_rJ}z zEnUFfvUql|{wep<`|f@)3y_z6?wXL@F)n-sT=S1(U;fHJkHn(~=52X2DPBw0>5jcd zCrSJ%j}BksQNU2UV7rKB1ixDKv}h@3qsNW!WDk>q zTqC5Vf?kA@H@Vlxy${qr>L(U`@Q}jvYNbVCk<)DaRDV@kUmvQqygucWFo#Ej1~_|7 z8dUh^JQJt0dG>^_e|uxk z7uF=|wm05oB#B;TYiH~;#=K5{gocl(gCnu+Nr*6Md{NSI^(6Pq-8}@mgU~bd>uyog znmS8y!iz1t!tVADVuFT?39sNsbjIsEwsF}*=5HlCBK8M zHOUGY+Gx4-JbFX$WhOP)pY5-#e}p*KWm|>;{kqXTc0-PG9agzjS^86f6`3 zkjzOR&81rw6$M3UDE=~A6~Ss#DS1^^+Si0Ak?=~&os7ft;2K+D{e#+;y&@}nUb5m@ z`PhEKT%+eZ&F@B?o{kiiwkT+gpPU{JNh=p`8#BxnV8THE1-Sl&{=D@K@ zbZp*XJPAf5y6bPKKCNt{ufDxG&2gpGF^bo>2!-g9N!|fmkTu!jqU2-Y2PQ>5&(LI} zPkYbWEz?jN`~6*NT;-(BYKD^Bo{j&(aLy8&@{)4U%SqS5SCM@$aSjbXCS!c9R2j}C z6hN!rQeYdJoC@4$R0tbVkQ}d_k%R7;W7EY#a-JVWJANV}cf=l+OjmjgcNj-_D^97I zk8fC5lCW0x5p>a+hDY=+a`vv8I_qh6U-+HP=pXKtm0GElURjPq%M=HFdRY69D(`tc znRG=yn+x6jAIT~e74IgaD)1#rD_i-rj`a$EW&nTaZQNRQp?3Sm8e`~eh0u7jcGksI 
z7g(*Q^gDfens5rHaSdaJr&#whhQT!z7EaahI`|nQy=W@)%m)`!+DbN^*G_$BFXt6@ z@hPZThGsrMaWZaH$U^f+z?5NrBCxZ3FBb&wDFu;5YpOU+0tTra z_fcrksNG);KSr9|EL@a{oTrNc31wo<7f>X}{^b-!u2XwLD%U69CVD5j)EU}G_+I+O2(iA%IRcNn)wfpPX6_o5~U9|L(Sq$5H{7c(@ zC`vFjd=q|3X0eF)=(sbvtPWURQOm(SS@|KJm5WxkHM5al^e`E)WBF${GgEATwB%8* zLvY^56#=BUu?$rM11T#Tw!!@>5|v(P_|nrp@dSYB_Y08J1Up>-OF^j4GEbGEexKb7 zlaeSdZDww6>8C?v{2?`LZhKPWcF&W<)Oy|94|!uumy*09wqiX=DXyndF5*GK`+S7o zYJ(2RhjNXjv-u=h-BBJA&~)T-oL-q)8P~%Hjq0{xA)G{zrV87 zk!yD*KCML>0@&eQcwngqKSbkuseNa4^bpl+S6#_eu88j_(Ky#~WsOI-)GF1JLH;!@ z{Nnaw!QPva_raElBLuncF67)a0jJvn6)OMuWQbq%!tbh=n%#Nt05kLCwGn)ti)nj4 zGq&spGkxsezfg0=2%x3wWg9B!jGs~xJRdF1uMrA09AL2-Z<+XfX{3ez$h<#o)pk~x zO0XBrSIVK%qeG`La$F&BzF*YIP}&>A)B$RjE}vwD7(f!x$CJZWarkwuS1e)PNlj5IaxxH+jp1`vjd%52$avw7 zktiHD4~4A8jQYws^xT^sbl>!gjh?-2k8>bF20S3?7U*Wlg=ZcH1)9XbhriC+>%(px zf17QjE<|(?rY} Date: Mon, 29 Nov 2021 10:25:38 +0800 Subject: [PATCH 05/10] add explanation for ViT example (#35) (#36) --- examples/vit-b16/README.md | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/examples/vit-b16/README.md b/examples/vit-b16/README.md index 83b924c2e655..c28c7ed4477b 100644 --- a/examples/vit-b16/README.md +++ b/examples/vit-b16/README.md @@ -1,9 +1,10 @@ # Overview -Here is an example of training ViT-B/16 on Imagenet-1K. We use 8x A100 in this example. For simplicity and speed, we didn't apply `RandAug` and we just used `Mixup`. With `LAMB` optimizer, we can scale the batch size to 32K with a little accuracy loss. +Here is an example of training ViT-B/16 on Imagenet-1K with batch size 32K. +We use 8x NVIDIA A100 GPU in this example. # How to run -Using slurm: +Using [Slurm](https://slurm.schedmd.com/documentation.html): ```shell srun python train_dali.py --local_rank=$SLURM_PROCID --world_size=$SLURM_NPROCS --host=$HOST --port=29500 --config=vit-b16.py ``` @@ -12,3 +13,28 @@ srun python train_dali.py --local_rank=$SLURM_PROCID --world_size=$SLURM_NPROCS ![Loss Curve](./loss.jpeg) ![Accuracy](./acc.jpeg) + +# Details +`vit-b16.py` + +It is a [config file](https://colossalai.org/config.html), which is used by ColossalAI to define all kinds of training arguments, such as the model, dataset, and training method (optimizer, lr_scheduler, epoch, etc.). You can access config content by `gpc.config`. + +In this example, we train the ViT-Base patch 16 model 300 epochs on ImageNet-1K. The batch size is set to 32K through data parallel (4K on each GPU from 16x gradient accumulation with batch size 256). Since the batch size is very large than common usage, leading to convergence difficulties, we use a +large batch optimizer [LAMB](https://arxiv.org/abs/1904.00962), and we can scale the batch size to 32K with a little accuracy loss. The learning rate and weight decay of the optimizer are set to 1.8e-2 and 0.1, respectively. We use a linear warmup learning rate scheduler and warmup 150 epochs. +We introduce FP16 mixed precision to accelerate training and use gradient clipping to help convergence. +For simplicity and speed, we didn't apply `RandAug` and just used [Mixup](https://arxiv.org/abs/1710.09412) in data augmentation. + +If you have enough computing resources, you can expand this example conveniently with data parallel on a very large scale without gradient accumulation, and finish the training process even within one hour. + + +`imagenet_dali_dataloader.py` +To accelerate the training process, we use [DALI](https://github.com/NVIDIA/DALI) as data loader. 
From 4981f8f686b5a0224aa4567a6a4d8e9e9502ebd3 Mon Sep 17 00:00:00 2001
From: ver217
Date: Tue, 30 Nov 2021 23:48:54 +0800
Subject: [PATCH 06/10] support torch ddp

---
 colossalai/engine/_base_engine.py | 46 ++++++++++++++++++-------------
 colossalai/initialize.py          |  6 ++--
 colossalai/utils/__init__.py      |  5 ++--
 colossalai/utils/common.py        | 25 ++++++++++++++---
 4 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py
index a99aa91e73c3..2dd5325459b6 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -10,7 +10,8 @@
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_global_dist_logger
 from colossalai.nn import (ZeroRedundancyOptimizer_Level_2,
-                           ZeroRedundancyOptimizer_Level_3)
+                           ZeroRedundancyOptimizer_Level_3, model)
+from colossalai.utils import is_using_ddp, ConditionalContext
 
 from .schedule import BaseSchedule
 
@@ -71,13 +72,14 @@ def __init__(self,
                 "Training with zero is detected, ZeROGradientHandler is automatically "
                 "added even though not specified in the configuration",
                 ranks=[0])
-        elif gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(
-                ParallelMode.DATA) > 1:
-            gradient_handlers = [dict(type='DataParallelGradientHandler')]
-            self._logger.info(
-                "Data parallel training is detected, DataParallelGradientHandler is automatically "
-                "added even though not specified in the configuration",
-                ranks=[0])
+        # elif gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(
+        #         ParallelMode.DATA) > 1:
+        #     gradient_handlers = [dict(type='DataParallelGradientHandler')]
+        #     self._logger.info(
+        #         "Data parallel training is detected, DataParallelGradientHandler is automatically "
+        #         "added even though not specified in the configuration",
+        #         ranks=[0])
+        # FIXME: check compatibility with pipeline
 
         if gradient_handlers is None:
             self._logger.warning(
@@ -147,17 +149,23 @@ def step(self,
 
         # differentiate training and eval with grad accum
         if self.training:
-            for i in range(self._grad_accum_size):
-                output, label, loss = self._schedule.forward_backward_step(
-                    data_iter, self._model, self._criterion, self._optimizer,
-                    forward_only=False,
-                    grad_accum_size=self._grad_accum_size,
-                    return_loss=return_loss)
-
-                if i == self._grad_accum_size - 1:
-                    # all reduce gradients
-                    self.handle_gradient()
-                    self._schedule.optimizer_step(self._model, self._optimizer, self._grad_clip)
+            with ConditionalContext(self._model.no_sync(), enable=is_using_ddp()):
+                for i in range(self._grad_accum_size - 1):
+                    # FIXME: accum output tensors
+                    output, label, loss = self._schedule.forward_backward_step(
+                        data_iter, self._model, self._criterion, self._optimizer,
+                        forward_only=False,
+                        grad_accum_size=self._grad_accum_size,
+                        return_loss=return_loss)
+            output, label, loss = self._schedule.forward_backward_step(
+                data_iter, self._model, self._criterion, self._optimizer,
+                forward_only=False,
+                grad_accum_size=self._grad_accum_size,
+                return_loss=return_loss)
+            # all reduce gradients
+            self.handle_gradient()
+            self._schedule.optimizer_step(
+                self._model, self._optimizer, self._grad_clip)
         else:
             output, label, loss = self._schedule.forward_backward_step(
                 data_iter, self._model, self._criterion, self._optimizer,
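The rewritten `step()` above implements the standard DDP gradient-accumulation idiom: run the first `grad_accum_size - 1` micro-batches under `no_sync()` so gradients only accumulate locally, then let the final backward pass, executed outside that context, trigger the single gradient all-reduce. A minimal plain-PyTorch sketch of the same pattern (all names below are placeholders):

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

def accumulate_and_step(model: DDP, optimizer, criterion, micro_batches):
    """Accumulate gradients over micro_batches with a single all-reduce."""
    n = len(micro_batches)
    optimizer.zero_grad()
    with model.no_sync():  # suppress gradient all-reduce for the first n-1 backwards
        for data, label in micro_batches[:-1]:
            loss = criterion(model(data), label) / n
            loss.backward()  # gradients accumulate locally on each rank
    data, label = micro_batches[-1]
    loss = criterion(model(data), label) / n
    loss.backward()  # outside no_sync: this backward performs the all-reduce
    optimizer.step()
```

Dividing each micro-batch loss by `n` keeps the accumulated gradient equal to that of one large-batch step when the criterion uses mean reduction.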
diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 6806d86eb61c..c1c4d7c17873 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -11,7 +11,7 @@
 import numpy as np
 import torch
 from torch.utils.data import DataLoader
-
+from torch.nn.parallel import DistributedDataParallel as DDP
 from colossalai.engine import AMP_TYPE, NoPipelineSchedule, PipelineSchedule
 from colossalai.engine import Engine
 from colossalai.logging import get_global_dist_logger, init_global_dist_logger
@@ -22,7 +22,7 @@
                       build_optimizer_wrapper, build_schedule)
 from .context import Config, ParallelMode
 from .core import global_context as gpc
-from .utils import get_current_device, sync_model_param_in_dp
+from .utils import get_current_device, sync_model_param_in_dp, is_using_ddp
 
 
 def parse_args():
@@ -276,6 +276,8 @@ def initialize(config: Union[str, dict] = None,
             model = model.half()
             logger.info("Model is cast to fp16", ranks=[0])
 
+    if is_using_ddp():
+        model = DDP(model, process_group=gpc.get_group(ParallelMode.DATA))
     # training data
     if callable(train_dataloader):
         logger.info(
diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py
index f7ef2259bed0..66300d844b53 100644
--- a/colossalai/utils/__init__.py
+++ b/colossalai/utils/__init__.py
@@ -1,5 +1,5 @@
 from .activation_checkpoint import checkpoint
-from .common import print_rank_0, sync_model_param_in_dp, is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage
+from .common import print_rank_0, sync_model_param_in_dp, is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage, is_using_ddp, ConditionalContext
 from .cuda import get_current_device, synchronize, empty_cache, set_to_cuda
 from .memory import report_memory_usage
 from .timer import MultiTimer, Timer
@@ -18,5 +18,6 @@ def set_global_multitimer_status(mode: bool):
 __all__ = ['checkpoint', 'print_rank_0', 'sync_model_param_in_dp', 'get_current_device',
            'synchronize', 'empty_cache', 'set_to_cuda', 'report_memory_usage', 'Timer',
            'MultiTimer', 'get_global_multitimer', 'set_global_multitimer_status',
-           'is_dp_rank_0', 'is_tp_rank_0', 'is_no_pp_or_last_stage'
+           'is_dp_rank_0', 'is_tp_rank_0', 'is_no_pp_or_last_stage',
+           'is_using_ddp', 'ConditionalContext'
            ]
diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index d8c6663ba626..f05756d27cd2 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -2,7 +2,7 @@
 # -*- encoding: utf-8 -*-
 
 import torch.distributed as dist
-
+from contextlib import contextmanager
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 
@@ -26,17 +26,34 @@ def sync_model_param_in_dp(model):
 
     :param model: A pyTorch nn.model on whose parameters you check the consistency
     '''
-
+
     if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1:
         for param in model.parameters():
             ranks = gpc.get_ranks_in_group(ParallelMode.DATA)
-            dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.DATA))
+            dist.broadcast(
+                param, src=ranks[0], group=gpc.get_group(ParallelMode.DATA))
+
 
 def is_dp_rank_0():
     return not gpc.is_initialized(ParallelMode.DATA) or gpc.is_first_rank(ParallelMode.DATA)
 
+
 def is_tp_rank_0():
     return not gpc.is_initialized(ParallelMode.TENSOR) or gpc.is_first_rank(ParallelMode.TENSOR)
 
+
 def is_no_pp_or_last_stage():
-    return not gpc.is_initialized(ParallelMode.PIPELINE) or gpc.is_last_rank(ParallelMode.PIPELINE)
\ No newline at end of file
+    return not gpc.is_initialized(ParallelMode.PIPELINE) or gpc.is_last_rank(ParallelMode.PIPELINE)
+
+
+def is_using_ddp():
+    return gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1
+
+
+@contextmanager
+def ConditionalContext(context_manager, enable=True):
+    if enable:
+        with context_manager:
+            yield
+    else:
+        yield
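As a design note, `ConditionalContext` introduced above is a generic "enter this context only if enabled" helper. Since Python 3.7 the disabled branch can also be expressed with the standard library, so an equivalent formulation looks like the sketch below (`model` is a placeholder). One subtlety either way: `model.no_sync()` is constructed eagerly even when it will not be entered, which is harmless because DDP's `no_sync` only takes effect on entry.

```python
from contextlib import nullcontext

# Equivalent to ConditionalContext(model.no_sync(), enable=...): pick the real
# context manager when gradient sync should be suppressed, else a no-op one.
ctx = model.no_sync() if is_using_ddp() else nullcontext()
with ctx:
    ...  # micro-batches whose backward passes should not trigger all-reduce
```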
From a97564d167695724cc1207a0cb2417dcd9d44bd8 Mon Sep 17 00:00:00 2001
From: ver217
Date: Wed, 1 Dec 2021 14:27:23 +0800
Subject: [PATCH 07/10] fix loss accumulation

---
 colossalai/engine/_base_engine.py | 43 ++++++++++++++++++-----------
 colossalai/initialize.py          |  4 +--
 colossalai/utils/__init__.py      |  4 +--
 colossalai/utils/common.py        |  4 +++
 4 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py
index 2dd5325459b6..71589f495636 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -1,17 +1,17 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+import torch
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
 from torch.optim import Optimizer
 
 from colossalai.builder import build_gradient_handler
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
 from colossalai.logging import get_global_dist_logger
 from colossalai.nn import (ZeroRedundancyOptimizer_Level_2,
-                           ZeroRedundancyOptimizer_Level_3, model)
-from colossalai.utils import is_using_ddp, ConditionalContext
+                           ZeroRedundancyOptimizer_Level_3)
+from colossalai.utils import is_using_ddp, ConditionalContext, is_using_pp
+from colossalai.utils.cuda import get_current_device
 
 from .schedule import BaseSchedule
 
@@ -72,14 +72,12 @@ def __init__(self,
                 "Training with zero is detected, ZeROGradientHandler is automatically "
                 "added even though not specified in the configuration",
                 ranks=[0])
-        # elif gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(
-        #         ParallelMode.DATA) > 1:
-        #     gradient_handlers = [dict(type='DataParallelGradientHandler')]
-        #     self._logger.info(
-        #         "Data parallel training is detected, DataParallelGradientHandler is automatically "
-        #         "added even though not specified in the configuration",
-        #         ranks=[0])
-        # FIXME: check compatibility with pipeline
+        elif is_using_ddp() and is_using_pp():
+            gradient_handlers = [dict(type='DataParallelGradientHandler')]
+            self._logger.info(
+                "Data parallel training is detected when using pipeline parallel, DataParallelGradientHandler is automatically "
+                "added even though not specified in the configuration",
+                ranks=[0])
 
         if gradient_handlers is None:
             self._logger.warning(
@@ -147,19 +147,30 @@ def step(self,
 
         # differentiate training and eval with grad accum
         if self.training:
-            with ConditionalContext(self._model.no_sync(), enable=is_using_ddp()):
+            outputs = []
+            labels = []
+            loss = torch.zeros(1, device=get_current_device())
+            with ConditionalContext(self._model.no_sync(), enable=is_using_ddp() and not is_using_pp()):
                 for i in range(self._grad_accum_size - 1):
                     # FIXME: accum output tensors
-                    output, label, loss = self._schedule.forward_backward_step(
+                    output, label, loss_ = self._schedule.forward_backward_step(
                         data_iter, self._model, self._criterion, self._optimizer,
                         forward_only=False,
                         grad_accum_size=self._grad_accum_size,
                         return_loss=return_loss)
-            output, label, loss = self._schedule.forward_backward_step(
+                    outputs.append(output)
+                    labels.append(label)
+                    loss.add_(loss_)
+            output, label, loss_ = self._schedule.forward_backward_step(
                 data_iter, self._model, self._criterion, self._optimizer,
                 forward_only=False,
                 grad_accum_size=self._grad_accum_size,
                 return_loss=return_loss)
+            outputs.append(output)
+            labels.append(label)
+            loss.add_(loss_)
+            output = self._accum_outputs(outputs)
+            label = self._accum_outputs(labels)
             # all reduce gradients
             self.handle_gradient()
             self._schedule.optimizer_step(
@@ -182,3 +191,7 @@ def step(self,
             break
 
         return output, label, loss
+
+    @staticmethod
+    def _accum_outputs(tensor_tuples):
+        return tuple([torch.cat(x) for x in zip(*tensor_tuples)])
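The new `_accum_outputs` helper merges the per-micro-batch results collected above: each element of its argument is a tuple of tensors, `zip(*...)` regroups them position-wise, and `torch.cat` joins each group along the batch dimension. A small self-contained illustration:

```python
import torch

# Two micro-batches, each producing a 1-tuple of logits of shape (4, 10).
step_outputs = [(torch.randn(4, 10),), (torch.randn(4, 10),)]
merged = tuple(torch.cat(x) for x in zip(*step_outputs))
assert merged[0].shape == (8, 10)  # batch dimensions concatenated: 4 + 4
```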
diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index c1c4d7c17873..ebc3992d3ad1 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -22,7 +22,7 @@
                       build_optimizer_wrapper, build_schedule)
 from .context import Config, ParallelMode
 from .core import global_context as gpc
-from .utils import get_current_device, sync_model_param_in_dp, is_using_ddp
+from .utils import get_current_device, sync_model_param_in_dp, is_using_ddp, is_using_pp
 
 
 def parse_args():
@@ -276,7 +276,7 @@ def initialize(config: Union[str, dict] = None,
             model = model.half()
             logger.info("Model is cast to fp16", ranks=[0])
 
-    if is_using_ddp():
+    if is_using_ddp() and not is_using_pp():
         model = DDP(model, process_group=gpc.get_group(ParallelMode.DATA))
     # training data
     if callable(train_dataloader):
diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py
index 66300d844b53..64aafab740e9 100644
--- a/colossalai/utils/__init__.py
+++ b/colossalai/utils/__init__.py
@@ -1,5 +1,5 @@
 from .activation_checkpoint import checkpoint
-from .common import print_rank_0, sync_model_param_in_dp, is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage, is_using_ddp, ConditionalContext
+from .common import print_rank_0, sync_model_param_in_dp, is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage, is_using_ddp, is_using_pp, ConditionalContext
 from .cuda import get_current_device, synchronize, empty_cache, set_to_cuda
 from .memory import report_memory_usage
 from .timer import MultiTimer, Timer
@@ -19,5 +19,5 @@ def set_global_multitimer_status(mode: bool):
            'synchronize', 'empty_cache', 'set_to_cuda', 'report_memory_usage', 'Timer',
            'MultiTimer', 'get_global_multitimer', 'set_global_multitimer_status',
            'is_dp_rank_0', 'is_tp_rank_0', 'is_no_pp_or_last_stage',
-           'is_using_ddp', 'ConditionalContext'
+           'is_using_ddp', 'ConditionalContext', 'is_using_pp'
            ]
diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index f05756d27cd2..29becadf5a3d 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -50,6 +50,10 @@ def is_using_ddp():
     return gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1
 
 
+def is_using_pp():
+    return gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1
+
+
 @contextmanager
 def ConditionalContext(context_manager, enable=True):
     if enable:
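A note on semantics: with this fix the engine reports one loss tensor built up with `loss.add_(loss_)` over all micro-steps. Assuming each micro-step loss comes back already scaled by `1 / grad_accum_size` (plausibly why the schedule receives `grad_accum_size`; the schedule code is not part of this series, so this is an assumption), the accumulated value approximates the mean loss over the whole effective batch:

```python
import torch

K = 4                                # stand-in for grad_accum_size
micro_losses = [0.9, 1.1, 1.0, 1.2]  # per-micro-batch mean losses (made up)
loss = torch.zeros(1)
for l in micro_losses:
    loss.add_(torch.tensor([l]) / K)  # mirrors loss.add_(loss_) in step()
print(loss.item())                    # ~1.05 == sum(micro_losses) / K
```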
From e907f1bf05dd27da7cb622453db38607089af8a4 Mon Sep 17 00:00:00 2001
From: ver217
Date: Wed, 1 Dec 2021 18:27:05 +0800
Subject: [PATCH 08/10] add log for ddp

---
 colossalai/engine/_base_engine.py | 1 -
 colossalai/initialize.py          | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py
index 71589f495636..642f0295d5f4 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -152,7 +152,6 @@ def step(self,
             loss = torch.zeros(1, device=get_current_device())
             with ConditionalContext(self._model.no_sync(), enable=is_using_ddp() and not is_using_pp()):
                 for i in range(self._grad_accum_size - 1):
-                    # FIXME: accum output tensors
                     output, label, loss_ = self._schedule.forward_backward_step(
                         data_iter, self._model, self._criterion, self._optimizer,
                         forward_only=False,
diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index ebc3992d3ad1..46047afac012 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -278,6 +278,8 @@ def initialize(config: Union[str, dict] = None,
 
     if is_using_ddp() and not is_using_pp():
         model = DDP(model, process_group=gpc.get_group(ParallelMode.DATA))
+        logger.info(
+            'Model is using torch.nn.parallel.DistributedDataParallel', ranks=[0])
     # training data
     if callable(train_dataloader):
         logger.info(

From 610e51d8eaf6c93b9825abdf142b9e15b0524800 Mon Sep 17 00:00:00 2001
From: ver217
Date: Wed, 1 Dec 2021 18:53:28 +0800
Subject: [PATCH 09/10] change seed

---
 colossalai/initialize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 46047afac012..3c94c1cbfed3 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -292,7 +292,7 @@ def initialize(config: Union[str, dict] = None,
         logger.info('Train dataset is ready.', ranks=[0])
 
         train_dataloader = get_dataloader(train_dataset,
-                                          gpc.config.get('seed', 1024),
+                                          gpc.config.get('seed', 42),
                                           True,
                                           **gpc.config.train_data.dataloader,
                                           )

From e556694609d8e03a7e90b42e8b387209e59f9a35 Mon Sep 17 00:00:00 2001
From: ver217
Date: Fri, 3 Dec 2021 18:56:23 +0800
Subject: [PATCH 10/10] modify timing hook

---
 colossalai/trainer/hooks/_log_hook.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/colossalai/trainer/hooks/_log_hook.py b/colossalai/trainer/hooks/_log_hook.py
index 3c3fdfc43ef8..d4d84dff76f2 100644
--- a/colossalai/trainer/hooks/_log_hook.py
+++ b/colossalai/trainer/hooks/_log_hook.py
@@ -170,19 +170,23 @@ def __init__(self,
                  trainer: Trainer,
                  interval: int = 1,
                  priority: int = 10,
-                 log_eval: bool = True
+                 log_eval: bool = True,
+                 ignore_num_train_steps: int = 0
                  ) -> None:
         super().__init__(trainer=trainer, interval=interval, priority=priority)
         set_global_multitimer_status(True)
         self._global_timer = get_global_multitimer()
         self._log_eval = log_eval
         self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0()
+        self.ignore_num_train_steps = ignore_num_train_steps
 
     def _get_message(self):
         msg = []
         for timer_name, timer in self._global_timer:
             last_elapsed_time = timer.get_elapsed_time()
             if timer.has_history:
+                if timer_name == 'train-step':
+                    timer._history = timer._history[self.ignore_num_train_steps:]
                 history_mean = timer.get_history_mean()
                 history_sum = timer.get_history_sum()
                 msg.append(
@@ -201,7 +205,7 @@ def after_train_epoch(self):
         if self._is_epoch_to_log() and self._is_rank_to_log:
             msg = self._get_message()
             self.logger.info(
-                f'Training - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}: {msg}')
+                f'Training - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}: {msg}, num steps per epoch={self.trainer.steps_per_epoch}')
 
     def after_test_epoch(self):
         """Writes log after finishing a testing epoch.