diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 0de04e24091e..510c4555cc9b 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -30,11 +30,11 @@
     PipelineSchedule,
     get_tensor_shape,
 )
+from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
 from colossalai.utils.moe import sync_moe_model_param
-from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
 
 
 def get_default_parser():
diff --git a/colossalai/legacy/engine/_base_engine.py b/colossalai/legacy/engine/_base_engine.py
index 9a1a2dc325a3..930caf20c1dd 100644
--- a/colossalai/legacy/engine/_base_engine.py
+++ b/colossalai/legacy/engine/_base_engine.py
@@ -16,8 +16,8 @@
     NonPipelineSchedule,
     PipelineSchedule,
 )
+from colossalai.legacy.zero.gemini import BaseOpHook, register_ophooks_recursively
 from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
 
 
 class Engine:
diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule.py b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
index 4571fd679e8c..227729501be2 100644
--- a/colossalai/legacy/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
@@ -157,7 +157,7 @@ def load_micro_batch(self):
         return self._move_to_device(micro_batch_data)
 
     def pre_processing(self, engine):
-        from colossalai.zero.legacy import ShardedModelV2
+        from colossalai.legacy.zero import ShardedModelV2
 
         # TODO: remove this after testing new zero with pipeline parallelism
         model = engine.model
diff --git a/colossalai/zero/legacy/__init__.py b/colossalai/legacy/zero/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/__init__.py
rename to colossalai/legacy/zero/__init__.py
diff --git a/colossalai/zero/legacy/gemini/__init__.py b/colossalai/legacy/zero/gemini/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/__init__.py
rename to colossalai/legacy/zero/gemini/__init__.py
diff --git a/colossalai/zero/legacy/gemini/gemini_context.py b/colossalai/legacy/zero/gemini/gemini_context.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/gemini_context.py
rename to colossalai/legacy/zero/gemini/gemini_context.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/__init__.py b/colossalai/legacy/zero/gemini/ophooks/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/__init__.py
rename to colossalai/legacy/zero/gemini/ophooks/__init__.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py b/colossalai/legacy/zero/gemini/ophooks/_shard_grad_ophook.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py
rename to colossalai/legacy/zero/gemini/ophooks/_shard_grad_ophook.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py b/colossalai/legacy/zero/gemini/ophooks/_shard_param_ophook.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py
rename to colossalai/legacy/zero/gemini/ophooks/_shard_param_ophook.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py b/colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py
similarity index 98%
rename from colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py
rename to colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py
index f40d6ced1ee0..eebcf86e0e58 100644
--- a/colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py
+++ b/colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py
@@ -5,9 +5,9 @@
 
 import torch
 
+from colossalai.legacy.zero.gemini.tensor_utils import alloc_storage, free_storage
 from colossalai.tensor.param_op_hook import ColoParamOpHook
 from colossalai.zero.gemini.memory_tracer import MemStats, SyncCudaMemoryMonitor
-from colossalai.zero.legacy.gemini.tensor_utils import alloc_storage, free_storage
 
 
 class TrainingPhase(Enum):
diff --git a/colossalai/zero/legacy/gemini/ophooks/utils.py b/colossalai/legacy/zero/gemini/ophooks/utils.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/utils.py
rename to colossalai/legacy/zero/gemini/ophooks/utils.py
diff --git a/colossalai/zero/legacy/gemini/paramhooks/__init__.py b/colossalai/legacy/zero/gemini/paramhooks/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/paramhooks/__init__.py
rename to colossalai/legacy/zero/gemini/paramhooks/__init__.py
diff --git a/colossalai/zero/legacy/gemini/paramhooks/_param_hookmgr.py b/colossalai/legacy/zero/gemini/paramhooks/_param_hookmgr.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/paramhooks/_param_hookmgr.py
rename to colossalai/legacy/zero/gemini/paramhooks/_param_hookmgr.py
diff --git a/colossalai/zero/legacy/gemini/stateful_tensor.py b/colossalai/legacy/zero/gemini/stateful_tensor.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/stateful_tensor.py
rename to colossalai/legacy/zero/gemini/stateful_tensor.py
diff --git a/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py b/colossalai/legacy/zero/gemini/stateful_tensor_mgr.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/stateful_tensor_mgr.py
rename to colossalai/legacy/zero/gemini/stateful_tensor_mgr.py
diff --git a/colossalai/zero/legacy/gemini/tensor_placement_policy.py b/colossalai/legacy/zero/gemini/tensor_placement_policy.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/tensor_placement_policy.py
rename to colossalai/legacy/zero/gemini/tensor_placement_policy.py
diff --git a/colossalai/zero/legacy/gemini/tensor_utils.py b/colossalai/legacy/zero/gemini/tensor_utils.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/tensor_utils.py
rename to colossalai/legacy/zero/gemini/tensor_utils.py
diff --git a/colossalai/zero/legacy/init_ctx/__init__.py b/colossalai/legacy/zero/init_ctx/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/init_ctx/__init__.py
rename to colossalai/legacy/zero/init_ctx/__init__.py
diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/legacy/zero/init_ctx/init_context.py
similarity index 97%
rename from colossalai/zero/legacy/init_ctx/init_context.py
rename to colossalai/legacy/zero/init_ctx/init_context.py
index 84e2d2f4f8e1..85a1f893d632 100644
--- a/colossalai/zero/legacy/init_ctx/init_context.py
+++ b/colossalai/legacy/zero/init_ctx/init_context.py
@@ -11,12 +11,12 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.context.singleton_meta import SingletonMeta
 from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
+from colossalai.legacy.zero.sharded_model.sharded_model_v2 import ShardedModelV2
+from colossalai.legacy.zero.sharded_param import ShardedParamV2
 from colossalai.logging import get_dist_logger
 from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
-from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2
-from colossalai.zero.legacy.sharded_param import ShardedParamV2
 
 
 @dataclass
diff --git a/colossalai/zero/legacy/shard_utils/__init__.py b/colossalai/legacy/zero/shard_utils/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/shard_utils/__init__.py
rename to colossalai/legacy/zero/shard_utils/__init__.py
diff --git a/colossalai/zero/legacy/shard_utils/base_shard_strategy.py b/colossalai/legacy/zero/shard_utils/base_shard_strategy.py
similarity index 90%
rename from colossalai/zero/legacy/shard_utils/base_shard_strategy.py
rename to colossalai/legacy/zero/shard_utils/base_shard_strategy.py
index 7ca951091640..9fb80f57ae77 100644
--- a/colossalai/zero/legacy/shard_utils/base_shard_strategy.py
+++ b/colossalai/legacy/zero/shard_utils/base_shard_strategy.py
@@ -3,7 +3,7 @@
 
 import torch.distributed as dist
 
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
 
 
 class BaseShardStrategy(ABC):
diff --git a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py b/colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py
similarity index 97%
rename from colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py
rename to colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py
index d663104831ce..1f7baad57816 100644
--- a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py
+++ b/colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py
@@ -4,8 +4,8 @@
 import torch.distributed as dist
 from torch._utils import _flatten_dense_tensors as flatten
 
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
 
 from .tensor_shard_strategy import TensorShardStrategy
 
diff --git a/colossalai/zero/legacy/shard_utils/commons.py b/colossalai/legacy/zero/shard_utils/commons.py
similarity index 100%
rename from colossalai/zero/legacy/shard_utils/commons.py
rename to colossalai/legacy/zero/shard_utils/commons.py
diff --git a/colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py b/colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py
similarity index 90%
rename from colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py
rename to colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py
index d1df4803b820..cc43907f6655 100644
--- a/colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py
+++ b/colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py
@@ -3,11 +3,11 @@
 import torch
 import torch.distributed as dist
 
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.shard_utils.commons import get_shard
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.shard_utils.commons import get_shard
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
 
 
 class TensorShardStrategy(BaseShardStrategy):
diff --git a/colossalai/zero/legacy/sharded_model/__init__.py b/colossalai/legacy/zero/sharded_model/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_model/__init__.py
rename to colossalai/legacy/zero/sharded_model/__init__.py
diff --git a/colossalai/zero/legacy/sharded_model/_utils.py b/colossalai/legacy/zero/sharded_model/_utils.py
similarity index 97%
rename from colossalai/zero/legacy/sharded_model/_utils.py
rename to colossalai/legacy/zero/sharded_model/_utils.py
index f1d642cf3f13..b8a618ef5a0d 100644
--- a/colossalai/zero/legacy/sharded_model/_utils.py
+++ b/colossalai/legacy/zero/sharded_model/_utils.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn.functional as F
 
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor
 
 
 def get_gradient_predivide_factor(world_size: int) -> float:
diff --git a/colossalai/zero/legacy/sharded_model/reduce_scatter.py b/colossalai/legacy/zero/sharded_model/reduce_scatter.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_model/reduce_scatter.py
rename to colossalai/legacy/zero/sharded_model/reduce_scatter.py
diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/legacy/zero/sharded_model/sharded_model_v2.py
similarity index 98%
rename from colossalai/zero/legacy/sharded_model/sharded_model_v2.py
rename to colossalai/legacy/zero/sharded_model/sharded_model_v2.py
index e7064277fb3c..353f09fbaaaf 100644
--- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
+++ b/colossalai/legacy/zero/sharded_model/sharded_model_v2.py
@@ -13,18 +13,18 @@
 
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.gemini.ophooks import register_ophooks_recursively
+from colossalai.legacy.zero.gemini.paramhooks import BaseParamHookMgr
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_move_to_cpu
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
 from colossalai.logging import get_dist_logger
 from colossalai.utils import disposable, get_current_device
 from colossalai.utils.memory import colo_device_memory_capacity
 from colossalai.zero.gemini.memory_tracer import MemStatsCollector, StaticMemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import register_ophooks_recursively
-from colossalai.zero.legacy.gemini.paramhooks import BaseParamHookMgr
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_move_to_cpu
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model.reduce_scatter import ReduceScatterBucketer
 
 from ._utils import (
     cast_float_arguments,
diff --git a/colossalai/zero/legacy/sharded_model/utils.py b/colossalai/legacy/zero/sharded_model/utils.py
similarity index 92%
rename from colossalai/zero/legacy/sharded_model/utils.py
rename to colossalai/legacy/zero/sharded_model/utils.py
index 08806e78ea3b..7a411669900b 100644
--- a/colossalai/zero/legacy/sharded_model/utils.py
+++ b/colossalai/legacy/zero/sharded_model/utils.py
@@ -2,7 +2,7 @@
 
 import torch
 
-from colossalai.zero.legacy.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model import ShardedModelV2
 
 
 def col_model_deepcopy(sharded_model: ShardedModelV2, other_model: torch.nn.Module):
diff --git a/colossalai/zero/legacy/sharded_model/zero_hook.py b/colossalai/legacy/zero/sharded_model/zero_hook.py
similarity index 94%
rename from colossalai/zero/legacy/sharded_model/zero_hook.py
rename to colossalai/legacy/zero/sharded_model/zero_hook.py
index 1815bee3a9e0..3fc373e5ca44 100644
--- a/colossalai/zero/legacy/sharded_model/zero_hook.py
+++ b/colossalai/legacy/zero/sharded_model/zero_hook.py
@@ -4,13 +4,13 @@
 import torch.distributed as dist
 
 from colossalai.legacy.registry import OPHOOKS
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device
 from colossalai.zero.gemini.memory_tracer import MemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
 
 
 @OPHOOKS.register_module
diff --git a/colossalai/zero/legacy/sharded_optim/__init__.py b/colossalai/legacy/zero/sharded_optim/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_optim/__init__.py
rename to colossalai/legacy/zero/sharded_optim/__init__.py
diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py
similarity index 98%
rename from colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py
rename to colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py
index 7efe25142a27..936fd538bcf2 100644
--- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py
@@ -15,12 +15,12 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.interface import OptimizerWrapper
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.legacy.zero.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_fp32
 from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
-from colossalai.zero.legacy.sharded_model import ShardedModelV2
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp32
 
 
 class OptimState(Enum):
diff --git a/colossalai/zero/legacy/sharded_param/__init__.py b/colossalai/legacy/zero/sharded_param/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_param/__init__.py
rename to colossalai/legacy/zero/sharded_param/__init__.py
diff --git a/colossalai/zero/legacy/sharded_param/sharded_param.py b/colossalai/legacy/zero/sharded_param/sharded_param.py
similarity index 96%
rename from colossalai/zero/legacy/sharded_param/sharded_param.py
rename to colossalai/legacy/zero/sharded_param/sharded_param.py
index 4bcc4b62104a..454a722cf7e7 100644
--- a/colossalai/zero/legacy/sharded_param/sharded_param.py
+++ b/colossalai/legacy/zero/sharded_param/sharded_param.py
@@ -2,8 +2,8 @@
 
 import torch
 
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_utils import colo_tensor_mem_usage
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_utils import colo_tensor_mem_usage
 
 from .sharded_tensor import ShardedTensor
 
diff --git a/colossalai/zero/legacy/sharded_param/sharded_tensor.py b/colossalai/legacy/zero/sharded_param/sharded_tensor.py
similarity index 94%
rename from colossalai/zero/legacy/sharded_param/sharded_tensor.py
rename to colossalai/legacy/zero/sharded_param/sharded_tensor.py
index af60312600f2..43c7576b93b5 100644
--- a/colossalai/zero/legacy/sharded_param/sharded_tensor.py
+++ b/colossalai/legacy/zero/sharded_param/sharded_tensor.py
@@ -1,6 +1,6 @@
 import torch
 
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
 
 
 class ShardedTensor(StatefulTensor):
diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py
index edd986ef5e82..9aeab9f44a6d 100644
--- a/colossalai/nn/layer/__init__.py
+++ b/colossalai/nn/layer/__init__.py
@@ -1,2 +1,2 @@
-from .moe import *
+# from .moe import *
 from .utils import *
diff --git a/colossalai/nn/layer/moe/experts.py b/colossalai/nn/layer/moe/experts.py
index 56b11f4d9e08..55604a65e055 100644
--- a/colossalai/nn/layer/moe/experts.py
+++ b/colossalai/nn/layer/moe/experts.py
@@ -8,8 +8,8 @@
 
 from colossalai.context import ParallelMode, seed
 from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_decrator
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
 
 
 class MoeExperts(nn.Module):
diff --git a/colossalai/nn/layer/moe/layers.py b/colossalai/nn/layer/moe/layers.py
index 03f55d91f3a8..9293d3208f11 100644
--- a/colossalai/nn/layer/moe/layers.py
+++ b/colossalai/nn/layer/moe/layers.py
@@ -6,6 +6,7 @@
 import torch.nn.functional as F
 
 from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
 from colossalai.nn.layer.moe._operation import (
     COL_MOE_KERNEL_FLAG,
     AllGather,
@@ -18,7 +19,6 @@
 from colossalai.nn.layer.moe.routers import MoeRouter, Top1Router, Top2Router
 from colossalai.nn.layer.moe.utils import NormalNoiseGenerator, UniformNoiseGenerator
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero_decrator
 
 
 @no_shard_zero_decrator(is_replicated=True)
diff --git a/colossalai/zero/gemini/memory_tracer/memstats_collector.py b/colossalai/zero/gemini/memory_tracer/memstats_collector.py
index 0694be48550a..abb3dcc74b27 100644
--- a/colossalai/zero/gemini/memory_tracer/memstats_collector.py
+++ b/colossalai/zero/gemini/memory_tracer/memstats_collector.py
@@ -70,7 +70,7 @@ def record_model_data_volume(self) -> None:
         Sampling model data statistics.
         """
         if self._start_flag and not self.use_outside_memstats:
-            from colossalai.zero.legacy.gemini import StatefulTensor
+            from colossalai.legacy.zero.gemini import StatefulTensor
 
             # The following code work for ZeroInitContext, which is deprecated in v0.1.12
             cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']
diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
index e5466965cc48..6656821fef74 100644
--- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
+++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
@@ -1,12 +1,12 @@
 import torch.nn
 
-from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import _cast_float
-from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import (
+from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
     GradMemStats,
     GradMemTracerHook,
     ParamMemTracerHook,
 )
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
+from colossalai.utils import _cast_float
 
 from .memory_stats import MemStats
 
diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
index b239b626c07f..d813e41af5a8 100644
--- a/examples/language/gpt/titans/train_gpt.py
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -11,11 +11,11 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.legacy.trainer import Trainer, hooks
+from colossalai.legacy.zero.init_ctx import ZeroInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn import LinearWarmupLR
 from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
 from colossalai.utils.timer import MultiTimer
-from colossalai.zero.legacy.init_ctx import ZeroInitContext
 
 
 def calc_local_model_size(model: torch.nn.Module):
diff --git a/examples/tutorial/opt/opt/colossalai_zero.py b/examples/tutorial/opt/opt/colossalai_zero.py
index 7c2c152450c5..8fbed6e83d52 100644
--- a/examples/tutorial/opt/opt/colossalai_zero.py
+++ b/examples/tutorial/opt/opt/colossalai_zero.py
@@ -2,7 +2,7 @@
     from colossalai.zero.shard_utils import TensorShardStrategy
 except ImportError:
     # colossalai > 0.2.8
-    from colossalai.zero.legacy import TensorShardStrategy
+    from colossalai.legacy.zero import TensorShardStrategy
 
 zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
                               tensor_placement_policy="auto",
diff --git a/tests/test_utils/test_commons.py b/tests/test_legacy/test_zero/test_commons.py
similarity index 90%
rename from tests/test_utils/test_commons.py
rename to tests/test_legacy/test_zero/test_commons.py
index 2633d7da21aa..377549ed996e 100644
--- a/tests/test_utils/test_commons.py
+++ b/tests/test_legacy/test_zero/test_commons.py
@@ -1,9 +1,9 @@
 import torch
 
 import colossalai
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.sharded_param import ShardedTensor
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.sharded_param import ShardedTensor
 
 
 def run_tensor_move(rank, world_size, port):
diff --git a/tests/test_utils/test_zero_gradient_clippling.py b/tests/test_utils/test_zero_gradient_clippling.py
deleted file mode 100644
index e99cf388e929..000000000000
--- a/tests/test_utils/test_zero_gradient_clippling.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from functools import partial
-
-import pytest
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.utils import clip_grad_norm_
-
-import colossalai
-from colossalai.logging import disable_existing_loggers
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.utils import checkpoint, clip_grad_norm_fp32
-from colossalai.zero.legacy.shard_utils.tensor_shard_strategy import TensorShardStrategy
-from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2
-
-
-def checkpoint_wrapper(module, enable=True):
-    if enable:
-        module.forward = partial(checkpoint, module.forward, False)
-    return module
-
-
-class Net(nn.Module):
-
-    def __init__(self, checkpoint=False) -> None:
-        super().__init__()
-        self.fc1 = nn.Linear(5, 5)
-        self.fc2 = nn.Linear(5, 5)
-        self.fc3 = nn.Linear(5, 1)
-        if checkpoint:
-            self.fc1 = checkpoint_wrapper(self.fc1)
-        self.layers = [self.fc1, self.fc2, self.fc1, self.fc2, self.fc3]
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-
-
-def run_step(model, optimizer, x, enable_autocast=False, norm_type=2.0):
-    model.train()
-    optimizer.zero_grad()
-    with torch.cuda.amp.autocast(enabled=enable_autocast):
-        y = model(x)
-        loss = y.sum()
-    loss = loss.float()
-    loss.backward()
-    clip_grad(model, norm_type)
-    optimizer.step()
-
-
-def clip_grad(model, norm_type):
-    if isinstance(model, DDP):
-        clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=norm_type)
-    else:
-        clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=norm_type)
-
-
-def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
-    if loose:
-        return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)
-    return torch.allclose(tensor_a, tensor_b)
-
-
-def check_grads(model, zero_model, loose=False):
-    rank = dist.get_rank()
-    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
-        zero_grad = zero_p.grad.clone().to(p.device)
-        chunks = torch.flatten(p.grad).chunk(4)
-        if rank >= len(chunks):
-            continue
-        grad = chunks[rank]
-        if zero_p.zero_shard_padding > 0:
-            zero_grad = zero_grad[:-zero_p.zero_shard_padding]
-        assert grad.dtype == zero_grad.dtype
-        assert allclose(grad, zero_grad, loose=loose)
-
-
-def check_params(model, zero_model, loose=False):
-    rank = dist.get_rank()
-    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
-        zero_shard_padding = zero_p.zero_shard_padding
-        zero_p = zero_p.clone().to(p.device)
-        chunks = torch.flatten(p).chunk(4)
-        if rank >= len(chunks):
-            continue
-        p = chunks[rank]
-        if zero_shard_padding > 0:
-            zero_p = zero_p[:-zero_shard_padding]
-        assert p.dtype == zero_p.dtype
-        assert allclose(p, zero_p, loose=loose)
-
-
-def run_dist(rank, world_size, port):
-    disable_existing_loggers()
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-
-
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_zero_clip_grad():
-    world_size = 4
-    spawn(run_dist, world_size)
-
-
-if __name__ == '__main__':
-    test_zero_clip_grad()
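
Note (not part of the patch): downstream code that must run against releases on both sides of this rename can keep its imports working with the same try/except fallback this patch applies to examples/tutorial/opt/opt/colossalai_zero.py. A minimal sketch, assuming TensorShardStrategy stays importable from each of the three locations that appear in this diff:

    try:
        # colossalai <= 0.2.8: original public location
        from colossalai.zero.shard_utils import TensorShardStrategy
    except ImportError:
        try:
            # interim releases: the colossalai.zero.legacy package
            from colossalai.zero.legacy import TensorShardStrategy
        except ImportError:
            # after this rename: the colossalai.legacy.zero package
            from colossalai.legacy.zero import TensorShardStrategy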