diff --git a/colossalai/amp/apex_amp/apex_amp.py b/colossalai/amp/apex_amp/apex_amp.py index e6bdbe4520f9..ba603ca0975c 100644 --- a/colossalai/amp/apex_amp/apex_amp.py +++ b/colossalai/amp/apex_amp/apex_amp.py @@ -10,11 +10,11 @@ from torch import Tensor -from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.interface import OptimizerWrapper from colossalai.utils import clip_grad_norm_fp32 -class ApexAMPOptimizer(ColossalaiOptimizer): +class ApexAMPOptimizer(OptimizerWrapper): """ A wrapper class for APEX optimizer and it implements apex-specific backward and clip_grad_norm methods """ diff --git a/colossalai/amp/naive_amp/naive_amp.py b/colossalai/amp/naive_amp/naive_amp.py index 6a39d518d3f4..c09f09f8118b 100644 --- a/colossalai/amp/naive_amp/naive_amp.py +++ b/colossalai/amp/naive_amp/naive_amp.py @@ -13,12 +13,12 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.interface import OptimizerWrapper from ._fp16_optimizer import FP16Optimizer -class NaiveAMPOptimizer(ColossalaiOptimizer): +class NaiveAMPOptimizer(OptimizerWrapper): """A wrapper class for optimizer to cast all parameters to fp16 Args: diff --git a/colossalai/amp/torch_amp/torch_amp.py b/colossalai/amp/torch_amp/torch_amp.py index 65718d77c2e0..452b3d8a00fc 100644 --- a/colossalai/amp/torch_amp/torch_amp.py +++ b/colossalai/amp/torch_amp/torch_amp.py @@ -7,13 +7,13 @@ from torch.nn.modules.loss import _Loss from torch.optim import Optimizer -from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.interface import OptimizerWrapper from colossalai.utils import clip_grad_norm_fp32 from ._grad_scaler import GradScaler -class TorchAMPOptimizer(ColossalaiOptimizer): +class TorchAMPOptimizer(OptimizerWrapper): """A wrapper class which integrate Pytorch AMP with an optimizer Args: diff --git a/colossalai/auto_parallel/offload/amp_optimizer.py b/colossalai/auto_parallel/offload/amp_optimizer.py index 19d85b80dd3d..353133bd6f2d 100644 --- a/colossalai/auto_parallel/offload/amp_optimizer.py +++ b/colossalai/auto_parallel/offload/amp_optimizer.py @@ -5,8 +5,8 @@ from torch.optim import Optimizer from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler +from colossalai.interface import OptimizerWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils import get_current_device from .base_offload_module import BaseOffloadModule @@ -19,7 +19,7 @@ class OptimState(Enum): UNSCALED = 1 -class AMPOptimizer(ColossalaiOptimizer): +class AMPOptimizer(OptimizerWrapper): """ A wrapper for Optimizer. Code reference: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/optimizer/zero_optimizer.py diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 3441eca38ce7..664ac63e45ac 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -13,7 +13,6 @@ from torch.optim import Optimizer from colossalai.interface import ModelWrapper, OptimizerWrapper -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.tensor.d_tensor import ( is_customized_distributed_tensor, is_distributed_tensor, @@ -130,10 +129,7 @@ def unwrap_optimizer(optimizer: OptimizerWrapper): This method should be used before saving/loading it to/from sharded checkpoints. 
''' - # TODO(Baizhou): ColossalaiOptimizer will be replaced with OptimizerWrapper in the future unwrapped_optim = optimizer.optim - if isinstance(unwrapped_optim, ColossalaiOptimizer): - unwrapped_optim = unwrapped_optim.optim return unwrapped_optim diff --git a/colossalai/cli/benchmark/__init__.py b/colossalai/cli/benchmark/__init__.py deleted file mode 100644 index 618ff8c61dd4..000000000000 --- a/colossalai/cli/benchmark/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -import click - -from colossalai.context import Config - -from .benchmark import run_benchmark -from .utils import * - -__all__ = ['benchmark'] - - -@click.command() -@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.") -@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.") -@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.") -@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.") -@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.") -@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.") -@click.option("-l", "--layers", type=int, default=2) -@click.option("-m", - "--model", - type=click.Choice(['mlp'], case_sensitive=False), - default='mlp', - help="Select the model to benchmark, currently only supports MLP") -def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int, - layers: int, model: str): - args_dict = locals() - args = Config(args_dict) - run_benchmark(args) diff --git a/colossalai/cli/benchmark/benchmark.py b/colossalai/cli/benchmark/benchmark.py deleted file mode 100644 index 97a9f45722dd..000000000000 --- a/colossalai/cli/benchmark/benchmark.py +++ /dev/null @@ -1,105 +0,0 @@ -from functools import partial -from typing import Dict, List - -import click -import torch.multiprocessing as mp - -import colossalai -from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model -from colossalai.context import Config -from colossalai.context.random import reset_seeds -from colossalai.core import global_context as gpc -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.testing import free_port -from colossalai.utils import MultiTimer - -from .models import MLP - - -def run_benchmark(args: Config) -> None: - """ - Run benchmarking with torch.multiprocessing. - """ - - # sanity checks - if args.gpus is None: - click.echo("Error: --num_gpus is not given") - exit() - if args.gpus <= 1: - click.echo("Warning: tensor parallel will be activated with at least 2 devices.") - - click.echo("=== Benchmarking Parameters ===") - for k, v in args.items(): - click.echo(f'{k}: {v}') - click.echo('') - - config_list = find_all_configs(args.gpus) - - avail_ports = [free_port() for _ in range(len(config_list))] - run_func = partial(run_dist_profiling, - world_size=args.gpus, - port_list=avail_ports, - config_list=config_list, - hyperparams=args) - mp.spawn(run_func, nprocs=args.gpus) - - -def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict], - hyperparams: Config) -> None: - """ - A function executed for profiling, this function should be spawn by torch.multiprocessing. 
- - Args: - rank (int): rank of the process - world_size (int): the number of processes - port_list (List[int]): a list of free ports for initializing distributed networks - config_list (List[Dict]): a list of configuration - hyperparams (Config): the hyperparameters given by the user - - """ - - # disable logging for clean output - disable_existing_loggers() - logger = get_dist_logger() - logger.set_level('WARNING') - - for config, port in zip(config_list, port_list): - colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') - timer = MultiTimer() - - # 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size. - if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0: - click.echo( - "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size." - ) - continue - - if hyperparams.model == 'mlp': - model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers) - else: - if gpc.get_global_rank() == 0: - click.echo("Error: Invalid argument for --model") - exit() - - data_func = partial(get_batch_data, - dim=hyperparams.dimension, - batch_size=hyperparams.batch_size, - seq_length=hyperparams.seq_len, - mode=config.parallel.tensor.mode) - - fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model, - warmup_steps=hyperparams.warmup_steps, - profile_steps=hyperparams.profile_steps, - data_func=data_func, - timer=timer) - - gpc.destroy() - reset_seeds() - - if gpc.get_global_rank() == 0: - config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()]) - click.echo(f"=== {config_str} ===") - click.echo(f"Average forward time: {fwd_time}") - click.echo(f"Average backward time: {bwd_time}") - click.echo(f"Max allocated GPU memory: {max_allocated}") - click.echo(f"Max cached GPU memory: {max_cached}\n") diff --git a/colossalai/cli/benchmark/models.py b/colossalai/cli/benchmark/models.py deleted file mode 100644 index 385b485b6016..000000000000 --- a/colossalai/cli/benchmark/models.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch - -import colossalai.legacy.nn as col_nn - - -class MLP(torch.nn.Module): - - def __init__(self, dim: int, layers: int): - super().__init__() - self.layers = torch.nn.ModuleList() - - for _ in range(layers): - self.layers.append(col_nn.Linear(dim, dim)) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return x diff --git a/colossalai/cli/benchmark/utils.py b/colossalai/cli/benchmark/utils.py deleted file mode 100644 index ee7d92d6ea6a..000000000000 --- a/colossalai/cli/benchmark/utils.py +++ /dev/null @@ -1,159 +0,0 @@ -import math -import time -from typing import Callable, Dict, List, Tuple - -import torch - -from colossalai.context import Config, ParallelMode -from colossalai.utils import MultiTimer - - -def get_time_stamp() -> int: - """ - Return the time stamp for profiling. - - Returns: - time_stamp (int): the time given by time.time() - """ - - torch.cuda.synchronize() - time_stamp = time.time() - return time_stamp - - -def get_memory_states() -> Tuple[float]: - """ - Return the memory statistics. 
- - Returns: - max_allocated (float): the allocated CUDA memory - max_cached (float): the cached CUDA memory - """ - - max_allocated = torch.cuda.max_memory_allocated() / (1024**3) - max_cached = torch.cuda.max_memory_reserved() / (1024**3) - torch.cuda.reset_peak_memory_stats() - torch.cuda.empty_cache() - return max_allocated, max_cached - - -def find_all_configs(device_cnt: int) -> List[Dict]: - """ - Find all possible configurations for tensor parallelism - - Args: - device_cnt (int): the number of devices - - Returns: - config_list (List[Dict]): a list of configurations - """ - - def _is_square(num): - # 2D parallel should be implemented with at least 2 devices. - if num <= 1: - return False - return math.floor(math.sqrt(num))**2 == num - - def _is_cube(num): - # 3D parallel should be implemented with at least 2 devices. - if num <= 1: - return False - return math.floor(num**(1. / 3.))**3 == num - - config_list = [] - - # add non-parallel config - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None))) - config_list.append(config) - - # add 1D config - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d'))) - config_list.append(config) - - # add 2D config only if device_cnt is a square - if _is_square(device_cnt): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d'))) - config_list.append(config) - - # check for 2.5D - # iterate over depth - for depth in range(1, device_cnt): - if device_cnt % depth == 0 and _is_square(device_cnt // depth): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth))) - config_list.append(config) - - # check for 3D if device_cnt is a cube - if _is_cube(device_cnt): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d'))) - config_list.append(config) - - config_list = [Config(cfg) for cfg in config_list] - return config_list - - -def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable, - timer: MultiTimer) -> Tuple[float]: - """ - Profile the forward and backward of a model - - Args: - model (torch.nn.Module): a PyTorch model - warmup_steps (int): the number of steps for warmup - profile_steps (int): the number of steps for profiling - data_func (Callable): a function to generate random data - timer (colossalai.utils.Multitimer): a timer instance for time recording - - Returns: - fwd_time (float): the average forward time taken by forward pass in second - bwd_time (float): the average backward time taken by forward pass in second - max_allocated (float): the maximum GPU memory allocated in GB - max_cached (float): the maximum GPU memory cached in GB - """ - - def _run_step(data): - timer.start('forward') - out = model(data) - timer.stop('forward', keep_in_history=True) - timer.start('backward') - out.mean().backward() - timer.stop('backward', keep_in_history=True) - - data_list = [data_func() for _ in range(warmup_steps)] - for data in data_list: - _run_step(data) - timer.reset('forward') - timer.reset('backward') - - for _ in range(profile_steps): - data = data_func() - _run_step(data) - - max_allocated, max_cached = get_memory_states() - fwd_time = timer.get_timer('forward').get_history_mean() - bwd_time = timer.get_timer('backward').get_history_mean() - return fwd_time, bwd_time, max_allocated, max_cached - - -def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor: - """ - Return a random data of shape (batch_size, seq_length, dim) for profiling. 
- - Args: - dim (int): hidden size - batch_size (int): the number of data samples - seq_length (int): the number of tokens - mode (ParallelMode): Colossal-AI ParallelMode enum - - Returns: - data (torch.Tensor): random data - """ - - if mode in ['2d', '2.5d']: - batch_size = batch_size // 2 - dim = dim // 2 - elif mode == '3d': - batch_size = batch_size // 4 - dim = dim // 2 - - data = torch.rand(batch_size, seq_length, dim).cuda() - return data diff --git a/colossalai/cli/cli.py b/colossalai/cli/cli.py index a94e1150e49f..0dea7c504957 100644 --- a/colossalai/cli/cli.py +++ b/colossalai/cli/cli.py @@ -1,6 +1,5 @@ import click -from .benchmark import benchmark from .check import check from .launcher import run @@ -19,7 +18,6 @@ def cli(): cli.add_command(run) cli.add_command(check) -cli.add_command(benchmark) if __name__ == '__main__': cli() diff --git a/colossalai/initialize.py b/colossalai/initialize.py index a1694e059fb4..0de04e24091e 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -20,6 +20,7 @@ from colossalai.context import Config, ConfigException, ParallelMode from colossalai.context.moe_context import MOE_CONTEXT from colossalai.core import global_context as gpc +from colossalai.interface import OptimizerWrapper from colossalai.legacy.builder.builder import build_gradient_handler from colossalai.legacy.engine import Engine from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient @@ -30,7 +31,6 @@ get_tensor_shape, ) from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param from colossalai.utils.moe import sync_moe_model_param from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2 @@ -445,9 +445,9 @@ def initialize(model: nn.Module, else: gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg] - # check if optimizer is ColossalaiOptimizer - if not isinstance(optimizer, (ColossalaiOptimizer, ShardedOptimizerV2)): - optimizer = ColossalaiOptimizer(optim=optimizer) + # check if optimizer is OptimizerWrapper + if not isinstance(optimizer, (OptimizerWrapper, ShardedOptimizerV2)): + optimizer = OptimizerWrapper(optim=optimizer) # gradient accumulation grad_accum_size = gpc.config.get('gradient_accumulation', None) diff --git a/colossalai/legacy/engine/_base_engine.py b/colossalai/legacy/engine/_base_engine.py index 9af4469f403f..9a1a2dc325a3 100644 --- a/colossalai/legacy/engine/_base_engine.py +++ b/colossalai/legacy/engine/_base_engine.py @@ -8,6 +8,7 @@ from torch.nn import Module from torch.nn.modules.loss import _Loss +from colossalai.interface import OptimizerWrapper from colossalai.legacy.engine.gradient_handler import BaseGradientHandler from colossalai.legacy.engine.schedule import ( BaseSchedule, @@ -16,7 +17,6 @@ PipelineSchedule, ) from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively @@ -27,7 +27,7 @@ class Engine: Args: model (``torch.nn.Module``): The neural network model. - optimizer (``colossalai.nn.optimizer.ColossalaiOptimizer``): Optimizer for updating the parameters. + optimizer (``colossalai.interface.OptimizerWrapper``): Optimizer for updating the parameters. criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss. 
gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward. clip_grad_norm (float, optional): The norm of gradient clipping. @@ -61,7 +61,7 @@ class Engine: def __init__(self, model: Module, - optimizer: "ColossalaiOptimizer", + optimizer: "OptimizerWrapper", criterion: Optional[_Loss] = None, gradient_handlers: Optional[List[BaseGradientHandler]] = None, clip_grad_norm: float = 0.0, @@ -157,7 +157,7 @@ def step(self): """Execute parameter update """ self._all_reduce_gradients() - self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm) + self.optimizer.clip_grad_by_norm(self._clip_grad_norm) return self.optimizer.step() def backward(self, loss: Tensor): diff --git a/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py index c466f7e2d03b..c2270dc53a50 100644 --- a/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py +++ b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py @@ -10,12 +10,12 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader +from colossalai.interface import OptimizerWrapper from colossalai.legacy.engine import BaseGradientHandler -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils import conditional_context -class GradAccumOptimizer(ColossalaiOptimizer): +class GradAccumOptimizer(OptimizerWrapper): """A wrapper for the optimizer to enable gradient accumulation by skipping the steps before accumulation size is reached. @@ -74,7 +74,7 @@ def clip_grad_norm(self, model: nn.Module, max_norm: float) -> None: if self.accumulate_step < self.accumulate_size: pass else: - self.optim.clip_grad_norm(model, max_norm) + self.optim.clip_grad_by_norm(max_norm) def backward(self, loss: Tensor) -> None: """Execute backward pass. 
diff --git a/colossalai/nn/optimizer/__init__.py b/colossalai/nn/optimizer/__init__.py index 06072648beba..7e310793f515 100644 --- a/colossalai/nn/optimizer/__init__.py +++ b/colossalai/nn/optimizer/__init__.py @@ -1,10 +1,9 @@ -from .colossalai_optimizer import ColossalaiOptimizer +from .cpu_adam import CPUAdam from .fused_adam import FusedAdam from .fused_lamb import FusedLAMB from .fused_sgd import FusedSGD +from .hybrid_adam import HybridAdam from .lamb import Lamb from .lars import Lars -from .cpu_adam import CPUAdam -from .hybrid_adam import HybridAdam -__all__ = ['ColossalaiOptimizer', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam'] +__all__ = ['FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam'] diff --git a/colossalai/nn/optimizer/colossalai_optimizer.py b/colossalai/nn/optimizer/colossalai_optimizer.py deleted file mode 100644 index 34f5a9541975..000000000000 --- a/colossalai/nn/optimizer/colossalai_optimizer.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -from torch import Tensor -from torch.optim import Optimizer -from colossalai.utils import clip_grad_norm_fp32 - - -class ColossalaiOptimizer(Optimizer): - - def __init__(self, optim: Optimizer): - self.optim = optim - - @property - def param_groups(self): - return self.optim.param_groups - - @property - def defaults(self): - return self.optim.defaults - - def add_param_group(self, *args, **kwargs): - return self.optim.add_param_group(*args, **kwargs) - - def step(self, *args, **kwargs): - return self.optim.step(*args, **kwargs) - - def zero_grad(self, *args, **kwargs): - self.optim.zero_grad(*args, **kwargs) - - def load_state_dict(self, *args, **kwargs): - self.optim.load_state_dict(*args, **kwargs) - - def state_dict(self): - return self.optim.state_dict() - - def backward(self, loss: Tensor): - loss.backward() - - def backward_by_grad(self, tensor: Tensor, grad: Tensor): - torch.autograd.backward(tensors=tensor, grad_tensors=grad) - - def clip_grad_norm(self, model: nn.Module, max_norm: float): - if max_norm > 0.0: - clip_grad_norm_fp32(model.parameters(), max_norm) diff --git a/colossalai/utils/checkpoint/module_checkpoint.py b/colossalai/utils/checkpoint/module_checkpoint.py index d390da864cd3..ee8773e5059c 100644 --- a/colossalai/utils/checkpoint/module_checkpoint.py +++ b/colossalai/utils/checkpoint/module_checkpoint.py @@ -1,25 +1,27 @@ +from typing import Dict, Optional + import torch import torch.distributed as dist + +from colossalai.interface import OptimizerWrapper from colossalai.tensor import ColoTensor -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils.checkpoint.utils import gather_tensor, scatter_tensor -from typing import Optional, Dict def save_checkpoint(path: str, epoch: int, model: torch.nn.Module, - optimizer: Optional[ColossalaiOptimizer] = None, + optimizer: Optional[OptimizerWrapper] = None, lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None, *args, **kwargs): - """save_checkpoint + """save_checkpoint save a model, whose parameters are `ColoTensor`s. Args: path (str): directory to save the checkpoint files. epoch (int): the number of epoch model (torch.nn.Module): a torch module initialized by ColoInitContext - optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None. + optimizer (OptimizerWrapper, optional): optimizers. Defaults to None. lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None. 
""" rank = dist.get_rank() @@ -74,17 +76,17 @@ def save_checkpoint(path: str, def load_checkpoint(path: str, epoch: int, model: torch.nn.Module, - optimizer: Optional[ColossalaiOptimizer] = None, + optimizer: Optional[OptimizerWrapper] = None, lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None, torch_load_kwargs: Optional[Dict] = None, load_state_dict_kwargs: Optional[Dict] = None): - """load_checkpoint + """load_checkpoint load a model, whose parameters are `ColoTensor`s. Args: path (str): directory to save the checkpoint files. epoch (int): the number of epoch model (torch.nn.Module): a torch module initialized by ColoInitContext - optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None. + optimizer (OptimizerWrapper, optional): optimizers. Defaults to None. lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None. torch_load_kwargs: (dict, optional): The kwargs of torch.load inside the function load_state_dict_kwargs (dict, optional): The kwargs of load_state_dict inside the function diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py index 41dd174cb65a..7efe25142a27 100644 --- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py +++ b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py @@ -14,8 +14,8 @@ from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.interface import OptimizerWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage @@ -28,7 +28,7 @@ class OptimState(Enum): UNSCALED = 2 -class ShardedOptimizerV2(ColossalaiOptimizer): +class ShardedOptimizerV2(OptimizerWrapper): """A wrapper for optimizer. ``ShardedOptimizerV2`` and ``ShardedModelV2`` implement Zero Redundancy Optimizer (ZeRO). By default the ZeRO optimizer stage 3 offload Optimizer States on CPU. diff --git a/docs/source/en/basics/command_line_tool.md b/docs/source/en/basics/command_line_tool.md index 48b199cf78e9..4c278aaa0c6a 100644 --- a/docs/source/en/basics/command_line_tool.md +++ b/docs/source/en/basics/command_line_tool.md @@ -30,24 +30,4 @@ This command will inform you information regarding the version compatibility and To launch distributed jobs on single or multiple nodes, the command `colossalai run` can be used for process launching. You may refer to [Launch Colossal-AI](./launch_colossalai.md) for more details. -## Tensor Parallel Micro-Benchmarking - -As Colossal-AI provides an array of tensor parallelism methods, it is not intuitive to choose one for your hardware and -model. Therefore, we provide a simple benchmarking to evaluate the performance of various tensor parallelisms on your system. -This benchmarking is run on a simple MLP model where the input data is of the shape `(batch_size, seq_length, hidden_size)`. -Based on the number of GPUs, the CLI will look for all possible tensor parallel configurations and display the benchmarking results. -You can customize the benchmarking configurations by checking out `colossalai benchmark --help`. 
- -```shell -# run on 4 GPUs -colossalai benchmark --gpus 4 - -# run on 8 GPUs -colossalai benchmark --gpus 8 -``` - -:::caution - -Only single-node benchmarking is supported currently. - -::: + diff --git a/docs/source/zh-Hans/basics/command_line_tool.md b/docs/source/zh-Hans/basics/command_line_tool.md index 9b0275a6cedd..5c4c18989c17 100644 --- a/docs/source/zh-Hans/basics/command_line_tool.md +++ b/docs/source/zh-Hans/basics/command_line_tool.md @@ -26,22 +26,4 @@ Colossal-AI给用户提供了命令行工具,目前命令行工具可以用来 在分布式训练时,我们可以使用`colossalai run`来启动单节点或者多节点的多进程,详细的内容可以参考[启动 Colossal-AI](./launch_colossalai.md)。 -## 张量并行基准测试 - -Colossal-AI提供了多种张量并行,想要充分理解这些方法需要一定的学习成本,对于新手来说很难靠经验选择一个并行方式。 -所以我们提供了一个简单的基准测试,能够让用户在自己的机器上测试不同张量并行的性能。这个基准测试跑一个并行的MLP模型, -输入数据的维度为`(批大小,序列长度,隐藏层维度)`。通过指定GPU的数量,Colossal-AI会搜索所有可行的并行配置。用户可以通过查看`colossalai benchmark --help`来自定义相关的测试参数。 - -```shell -# 使用4个GPU -colossalai benchmark --gpus 4 - -# 使用8个GPU -colossalai benchmark --gpus 8 -``` - -:::caution - -目前仅支持单节点的基准测试。 - -::: +
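The net effect of this patch is that `ColossalaiOptimizer` disappears and every wrapper class constructs or subclasses `colossalai.interface.OptimizerWrapper` instead, with `clip_grad_norm(model, max_norm)` call sites rewritten to `clip_grad_by_norm(max_norm)`; the `colossalai benchmark` CLI command and its documentation are removed outright, with no replacement wired into `colossalai/cli/cli.py`. Below is a minimal sketch of what downstream code looks like after the change, assuming `OptimizerWrapper` exposes the same `backward`/`step`/`zero_grad` passthroughs that the deleted `ColossalaiOptimizer` provided and the `optim=` constructor argument and positional `clip_grad_by_norm(max_norm)` call shown in the updated call sites above; the model and optimizer here are illustrative only.

```python
import torch
import torch.nn as nn

from colossalai.interface import OptimizerWrapper

# Hypothetical toy model and optimizer, used only to illustrate the wrapper API.
model = nn.Linear(16, 16)
optimizer = OptimizerWrapper(optim=torch.optim.Adam(model.parameters(), lr=1e-3))

loss = model(torch.randn(4, 16)).mean()
optimizer.backward(loss)      # was ColossalaiOptimizer.backward(loss)

# ColossalaiOptimizer.clip_grad_norm(model, max_norm) took the model as an argument;
# the call sites updated in this diff pass only the max norm to clip_grad_by_norm.
optimizer.clip_grad_by_norm(1.0)

optimizer.step()
optimizer.zero_grad()
```

Checkpointing code no longer needs the extra unwrapping step either: as the `colossalai/checkpoint_io/utils.py` hunk shows, `unwrap_optimizer` now simply returns `optimizer.optim`, since there is no nested `ColossalaiOptimizer` left to peel away.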