4 changes: 2 additions & 2 deletions colossalai/initialize.py
@@ -30,11 +30,11 @@
PipelineSchedule,
get_tensor_shape,
)
+from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
from colossalai.utils.moe import sync_moe_model_param
-from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook


def get_default_parser():
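Taken together, this hunk and the ones below move the legacy ZeRO utilities from colossalai.zero.legacy to colossalai.legacy.zero. A minimal migration sketch for downstream code, assuming the exported names (ShardedOptimizerV2, convert_to_zero_v2, BaseOpHook) are unchanged by the move:

    # old import path (before this change)
    # from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
    # new import path (after this change)
    from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
    from colossalai.legacy.zero.gemini.ophooks import BaseOpHook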
2 changes: 1 addition & 1 deletion colossalai/legacy/engine/_base_engine.py
@@ -16,8 +16,8 @@
NonPipelineSchedule,
PipelineSchedule,
)
+from colossalai.legacy.zero.gemini import BaseOpHook, register_ophooks_recursively
from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively


class Engine:
2 changes: 1 addition & 1 deletion colossalai/legacy/engine/schedule/_pipeline_schedule.py
@@ -157,7 +157,7 @@ def load_micro_batch(self):
return self._move_to_device(micro_batch_data)

def pre_processing(self, engine):
-from colossalai.zero.legacy import ShardedModelV2
+from colossalai.legacy.zero import ShardedModelV2

# TODO: remove this after testing new zero with pipeline parallelism
model = engine.model
File renamed without changes.
@@ -5,9 +5,9 @@

import torch

+from colossalai.legacy.zero.gemini.tensor_utils import alloc_storage, free_storage
from colossalai.tensor.param_op_hook import ColoParamOpHook
from colossalai.zero.gemini.memory_tracer import MemStats, SyncCudaMemoryMonitor
-from colossalai.zero.legacy.gemini.tensor_utils import alloc_storage, free_storage


class TrainingPhase(Enum):
@@ -11,12 +11,12 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.context.singleton_meta import SingletonMeta
from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
+from colossalai.legacy.zero.sharded_model.sharded_model_v2 import ShardedModelV2
+from colossalai.legacy.zero.sharded_param import ShardedParamV2
from colossalai.logging import get_dist_logger
from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
-from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2
-from colossalai.zero.legacy.sharded_param import ShardedParamV2


@dataclass
@@ -3,7 +3,7 @@

import torch.distributed as dist

-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor


class BaseShardStrategy(ABC):
@@ -4,8 +4,8 @@
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors as flatten

+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor

from .tensor_shard_strategy import TensorShardStrategy

@@ -3,11 +3,11 @@
import torch
import torch.distributed as dist

+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.shard_utils.commons import get_shard
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.shard_utils.commons import get_shard
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor


class TensorShardStrategy(BaseShardStrategy):
@@ -3,7 +3,7 @@
import torch
import torch.nn.functional as F

-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor


def get_gradient_predivide_factor(world_size: int) -> float:
@@ -13,18 +13,18 @@

from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.gemini.ophooks import register_ophooks_recursively
+from colossalai.legacy.zero.gemini.paramhooks import BaseParamHookMgr
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_move_to_cpu
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
from colossalai.logging import get_dist_logger
from colossalai.utils import disposable, get_current_device
from colossalai.utils.memory import colo_device_memory_capacity
from colossalai.zero.gemini.memory_tracer import MemStatsCollector, StaticMemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import register_ophooks_recursively
-from colossalai.zero.legacy.gemini.paramhooks import BaseParamHookMgr
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_move_to_cpu
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model.reduce_scatter import ReduceScatterBucketer

from ._utils import (
cast_float_arguments,
@@ -2,7 +2,7 @@

import torch

-from colossalai.zero.legacy.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model import ShardedModelV2


def col_model_deepcopy(sharded_model: ShardedModelV2, other_model: torch.nn.Module):
@@ -4,13 +4,13 @@
import torch.distributed as dist

from colossalai.legacy.registry import OPHOOKS
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device
from colossalai.zero.gemini.memory_tracer import MemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy


@OPHOOKS.register_module
@@ -15,12 +15,12 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.interface import OptimizerWrapper
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.legacy.zero.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_fp32
from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
-from colossalai.zero.legacy.sharded_model import ShardedModelV2
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp32


class OptimState(Enum):
@@ -2,8 +2,8 @@

import torch

-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_utils import colo_tensor_mem_usage
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_utils import colo_tensor_mem_usage

from .sharded_tensor import ShardedTensor

@@ -1,6 +1,6 @@
import torch

-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState


class ShardedTensor(StatefulTensor):
2 changes: 1 addition & 1 deletion colossalai/nn/layer/__init__.py
@@ -1,2 +1,2 @@
-from .moe import *
+# from .moe import *
from .utils import *
2 changes: 1 addition & 1 deletion colossalai/nn/layer/moe/experts.py
@@ -8,8 +8,8 @@

from colossalai.context import ParallelMode, seed
from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_decrator
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator


class MoeExperts(nn.Module):
2 changes: 1 addition & 1 deletion colossalai/nn/layer/moe/layers.py
@@ -6,6 +6,7 @@
import torch.nn.functional as F

from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
from colossalai.nn.layer.moe._operation import (
COL_MOE_KERNEL_FLAG,
AllGather,
@@ -18,7 +19,6 @@
from colossalai.nn.layer.moe.routers import MoeRouter, Top1Router, Top2Router
from colossalai.nn.layer.moe.utils import NormalNoiseGenerator, UniformNoiseGenerator
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero_decrator


@no_shard_zero_decrator(is_replicated=True)
2 changes: 1 addition & 1 deletion colossalai/zero/gemini/memory_tracer/memstats_collector.py
@@ -70,7 +70,7 @@ def record_model_data_volume(self) -> None:
Sampling model data statistics.
"""
if self._start_flag and not self.use_outside_memstats:
-from colossalai.zero.legacy.gemini import StatefulTensor
+from colossalai.legacy.zero.gemini import StatefulTensor

# The following code work for ZeroInitContext, which is deprecated in v0.1.12
cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']
6 changes: 3 additions & 3 deletions colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
@@ -1,12 +1,12 @@
import torch.nn

-from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import _cast_float
-from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import (
+from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
GradMemStats,
GradMemTracerHook,
ParamMemTracerHook,
)
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
+from colossalai.utils import _cast_float

from .memory_stats import MemStats

2 changes: 1 addition & 1 deletion examples/language/gpt/titans/train_gpt.py
@@ -11,11 +11,11 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.trainer import Trainer, hooks
+from colossalai.legacy.zero.init_ctx import ZeroInitContext
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import LinearWarmupLR
from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
from colossalai.utils.timer import MultiTimer
-from colossalai.zero.legacy.init_ctx import ZeroInitContext


def calc_local_model_size(model: torch.nn.Module):
2 changes: 1 addition & 1 deletion examples/tutorial/opt/opt/colossalai_zero.py
@@ -2,7 +2,7 @@
from colossalai.zero.shard_utils import TensorShardStrategy
except ImportError:
# colossalai > 0.2.8
-from colossalai.zero.legacy import TensorShardStrategy
+from colossalai.legacy.zero import TensorShardStrategy

zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
tensor_placement_policy="auto",
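The hunk above keeps the existing two-way ImportError fallback and only retargets the newer branch. For a script that has to run against all three layouts seen in this file's history, a nested fallback along the following lines would work; this is an illustrative sketch, not part of the diff, and the inline version notes are assumptions rather than statements from the PR.

    try:
        # path after this change
        from colossalai.legacy.zero import TensorShardStrategy
    except ImportError:
        try:
            # intermediate layout (the fallback this file used before the change)
            from colossalai.zero.legacy import TensorShardStrategy
        except ImportError:
            # oldest layout, still handled by the original try block
            from colossalai.zero.shard_utils import TensorShardStrategy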
@@ -1,9 +1,9 @@
import torch

import colossalai
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.sharded_param import ShardedTensor
from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.sharded_param import ShardedTensor


def run_tensor_move(rank, world_size, port):
111 changes: 0 additions & 111 deletions tests/test_utils/test_zero_gradient_clippling.py

This file was deleted.