diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 0de04e24091e..510c4555cc9b 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -30,11 +30,11 @@
     PipelineSchedule,
     get_tensor_shape,
 )
+from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
 from colossalai.utils.moe import sync_moe_model_param
-from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
 
 
 def get_default_parser():
diff --git a/colossalai/legacy/engine/_base_engine.py b/colossalai/legacy/engine/_base_engine.py
index 9a1a2dc325a3..930caf20c1dd 100644
--- a/colossalai/legacy/engine/_base_engine.py
+++ b/colossalai/legacy/engine/_base_engine.py
@@ -16,8 +16,8 @@
     NonPipelineSchedule,
     PipelineSchedule,
 )
+from colossalai.legacy.zero.gemini import BaseOpHook, register_ophooks_recursively
 from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
 
 
 class Engine:
diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule.py b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
index 4571fd679e8c..227729501be2 100644
--- a/colossalai/legacy/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
@@ -157,7 +157,7 @@ def load_micro_batch(self):
         return self._move_to_device(micro_batch_data)
 
     def pre_processing(self, engine):
-        from colossalai.zero.legacy import ShardedModelV2
+        from colossalai.legacy.zero import ShardedModelV2
 
         # TODO: remove this after testing new zero with pipeline parallelism
         model = engine.model
diff --git a/colossalai/zero/legacy/__init__.py b/colossalai/legacy/zero/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/__init__.py
rename to colossalai/legacy/zero/__init__.py
diff --git a/colossalai/zero/legacy/gemini/__init__.py b/colossalai/legacy/zero/gemini/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/__init__.py
rename to colossalai/legacy/zero/gemini/__init__.py
diff --git a/colossalai/zero/legacy/gemini/gemini_context.py b/colossalai/legacy/zero/gemini/gemini_context.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/gemini_context.py
rename to colossalai/legacy/zero/gemini/gemini_context.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/__init__.py b/colossalai/legacy/zero/gemini/ophooks/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/__init__.py
rename to colossalai/legacy/zero/gemini/ophooks/__init__.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py b/colossalai/legacy/zero/gemini/ophooks/_shard_grad_ophook.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py
rename to colossalai/legacy/zero/gemini/ophooks/_shard_grad_ophook.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py b/colossalai/legacy/zero/gemini/ophooks/_shard_param_ophook.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py
rename to colossalai/legacy/zero/gemini/ophooks/_shard_param_ophook.py
diff --git a/colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py b/colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py
similarity index 98%
rename from colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py
rename to colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py
index f40d6ced1ee0..eebcf86e0e58 100644
--- a/colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py
+++ b/colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py
@@ -5,9 +5,9 @@
 
 import torch
 
+from colossalai.legacy.zero.gemini.tensor_utils import alloc_storage, free_storage
 from colossalai.tensor.param_op_hook import ColoParamOpHook
 from colossalai.zero.gemini.memory_tracer import MemStats, SyncCudaMemoryMonitor
-from colossalai.zero.legacy.gemini.tensor_utils import alloc_storage, free_storage
 
 
 class TrainingPhase(Enum):
diff --git a/colossalai/zero/legacy/gemini/ophooks/utils.py b/colossalai/legacy/zero/gemini/ophooks/utils.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/ophooks/utils.py
rename to colossalai/legacy/zero/gemini/ophooks/utils.py
diff --git a/colossalai/zero/legacy/gemini/paramhooks/__init__.py b/colossalai/legacy/zero/gemini/paramhooks/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/paramhooks/__init__.py
rename to colossalai/legacy/zero/gemini/paramhooks/__init__.py
diff --git a/colossalai/zero/legacy/gemini/paramhooks/_param_hookmgr.py b/colossalai/legacy/zero/gemini/paramhooks/_param_hookmgr.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/paramhooks/_param_hookmgr.py
rename to colossalai/legacy/zero/gemini/paramhooks/_param_hookmgr.py
diff --git a/colossalai/zero/legacy/gemini/stateful_tensor.py b/colossalai/legacy/zero/gemini/stateful_tensor.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/stateful_tensor.py
rename to colossalai/legacy/zero/gemini/stateful_tensor.py
diff --git a/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py b/colossalai/legacy/zero/gemini/stateful_tensor_mgr.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/stateful_tensor_mgr.py
rename to colossalai/legacy/zero/gemini/stateful_tensor_mgr.py
diff --git a/colossalai/zero/legacy/gemini/tensor_placement_policy.py b/colossalai/legacy/zero/gemini/tensor_placement_policy.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/tensor_placement_policy.py
rename to colossalai/legacy/zero/gemini/tensor_placement_policy.py
diff --git a/colossalai/zero/legacy/gemini/tensor_utils.py b/colossalai/legacy/zero/gemini/tensor_utils.py
similarity index 100%
rename from colossalai/zero/legacy/gemini/tensor_utils.py
rename to colossalai/legacy/zero/gemini/tensor_utils.py
diff --git a/colossalai/zero/legacy/init_ctx/__init__.py b/colossalai/legacy/zero/init_ctx/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/init_ctx/__init__.py
rename to colossalai/legacy/zero/init_ctx/__init__.py
diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/legacy/zero/init_ctx/init_context.py
similarity index 97%
rename from colossalai/zero/legacy/init_ctx/init_context.py
rename to colossalai/legacy/zero/init_ctx/init_context.py
index 84e2d2f4f8e1..85a1f893d632 100644
--- a/colossalai/zero/legacy/init_ctx/init_context.py
+++ b/colossalai/legacy/zero/init_ctx/init_context.py
@@ -11,12 +11,12 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.context.singleton_meta import SingletonMeta
 from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
+from colossalai.legacy.zero.sharded_model.sharded_model_v2 import ShardedModelV2
+from colossalai.legacy.zero.sharded_param import ShardedParamV2
 from colossalai.logging import get_dist_logger
 from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
-from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2
-from colossalai.zero.legacy.sharded_param import ShardedParamV2
 
 
 @dataclass
diff --git a/colossalai/zero/legacy/shard_utils/__init__.py b/colossalai/legacy/zero/shard_utils/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/shard_utils/__init__.py
rename to colossalai/legacy/zero/shard_utils/__init__.py
diff --git a/colossalai/zero/legacy/shard_utils/base_shard_strategy.py b/colossalai/legacy/zero/shard_utils/base_shard_strategy.py
similarity index 90%
rename from colossalai/zero/legacy/shard_utils/base_shard_strategy.py
rename to colossalai/legacy/zero/shard_utils/base_shard_strategy.py
index 7ca951091640..9fb80f57ae77 100644
--- a/colossalai/zero/legacy/shard_utils/base_shard_strategy.py
+++ b/colossalai/legacy/zero/shard_utils/base_shard_strategy.py
@@ -3,7 +3,7 @@
 
 import torch.distributed as dist
 
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
 
 
 class BaseShardStrategy(ABC):
diff --git a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py b/colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py
similarity index 97%
rename from colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py
rename to colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py
index d663104831ce..1f7baad57816 100644
--- a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py
+++ b/colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py
@@ -4,8 +4,8 @@
 import torch.distributed as dist
 from torch._utils import _flatten_dense_tensors as flatten
 
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
 
 from .tensor_shard_strategy import TensorShardStrategy
 
diff --git a/colossalai/zero/legacy/shard_utils/commons.py b/colossalai/legacy/zero/shard_utils/commons.py
similarity index 100%
rename from colossalai/zero/legacy/shard_utils/commons.py
rename to colossalai/legacy/zero/shard_utils/commons.py
diff --git a/colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py b/colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py
similarity index 90%
rename from colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py
rename to colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py
index d1df4803b820..cc43907f6655 100644
--- a/colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py
+++ b/colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py
@@ -3,11 +3,11 @@
 import torch
 import torch.distributed as dist
 
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.shard_utils.commons import get_shard
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.shard_utils.commons import get_shard
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
 
 
 class TensorShardStrategy(BaseShardStrategy):
diff --git a/colossalai/zero/legacy/sharded_model/__init__.py b/colossalai/legacy/zero/sharded_model/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_model/__init__.py
rename to colossalai/legacy/zero/sharded_model/__init__.py
diff --git a/colossalai/zero/legacy/sharded_model/_utils.py b/colossalai/legacy/zero/sharded_model/_utils.py
similarity index 97%
rename from colossalai/zero/legacy/sharded_model/_utils.py
rename to colossalai/legacy/zero/sharded_model/_utils.py
index f1d642cf3f13..b8a618ef5a0d 100644
--- a/colossalai/zero/legacy/sharded_model/_utils.py
+++ b/colossalai/legacy/zero/sharded_model/_utils.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn.functional as F
 
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor
 
 
 def get_gradient_predivide_factor(world_size: int) -> float:
diff --git a/colossalai/zero/legacy/sharded_model/reduce_scatter.py b/colossalai/legacy/zero/sharded_model/reduce_scatter.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_model/reduce_scatter.py
rename to colossalai/legacy/zero/sharded_model/reduce_scatter.py
diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/legacy/zero/sharded_model/sharded_model_v2.py
similarity index 98%
rename from colossalai/zero/legacy/sharded_model/sharded_model_v2.py
rename to colossalai/legacy/zero/sharded_model/sharded_model_v2.py
index e7064277fb3c..353f09fbaaaf 100644
--- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
+++ b/colossalai/legacy/zero/sharded_model/sharded_model_v2.py
@@ -13,18 +13,18 @@
 
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.gemini.ophooks import register_ophooks_recursively
+from colossalai.legacy.zero.gemini.paramhooks import BaseParamHookMgr
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_move_to_cpu
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
 from colossalai.logging import get_dist_logger
 from colossalai.utils import disposable, get_current_device
 from colossalai.utils.memory import colo_device_memory_capacity
 from colossalai.zero.gemini.memory_tracer import MemStatsCollector, StaticMemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import register_ophooks_recursively
-from colossalai.zero.legacy.gemini.paramhooks import BaseParamHookMgr
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_move_to_cpu
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model.reduce_scatter import ReduceScatterBucketer
 
 from ._utils import (
     cast_float_arguments,
diff --git a/colossalai/zero/legacy/sharded_model/utils.py b/colossalai/legacy/zero/sharded_model/utils.py
similarity index 92%
rename from colossalai/zero/legacy/sharded_model/utils.py
rename to colossalai/legacy/zero/sharded_model/utils.py
index 08806e78ea3b..7a411669900b 100644
--- a/colossalai/zero/legacy/sharded_model/utils.py
+++ b/colossalai/legacy/zero/sharded_model/utils.py
@@ -2,7 +2,7 @@
 
 import torch
 
-from colossalai.zero.legacy.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model import ShardedModelV2
 
 
 def col_model_deepcopy(sharded_model: ShardedModelV2, other_model: torch.nn.Module):
diff --git a/colossalai/zero/legacy/sharded_model/zero_hook.py b/colossalai/legacy/zero/sharded_model/zero_hook.py
similarity index 94%
rename from colossalai/zero/legacy/sharded_model/zero_hook.py
rename to colossalai/legacy/zero/sharded_model/zero_hook.py
index 1815bee3a9e0..3fc373e5ca44 100644
--- a/colossalai/zero/legacy/sharded_model/zero_hook.py
+++ b/colossalai/legacy/zero/sharded_model/zero_hook.py
@@ -4,13 +4,13 @@
 import torch.distributed as dist
 
 from colossalai.legacy.registry import OPHOOKS
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device
 from colossalai.zero.gemini.memory_tracer import MemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
 
 
 @OPHOOKS.register_module
diff --git a/colossalai/zero/legacy/sharded_optim/__init__.py b/colossalai/legacy/zero/sharded_optim/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_optim/__init__.py
rename to colossalai/legacy/zero/sharded_optim/__init__.py
diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py
similarity index 98%
rename from colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py
rename to colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py
index 7efe25142a27..936fd538bcf2 100644
--- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py
@@ -15,12 +15,12 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.interface import OptimizerWrapper
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.legacy.zero.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_fp32
 from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
-from colossalai.zero.legacy.sharded_model import ShardedModelV2
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp32
 
 
 class OptimState(Enum):
diff --git a/colossalai/zero/legacy/sharded_param/__init__.py b/colossalai/legacy/zero/sharded_param/__init__.py
similarity index 100%
rename from colossalai/zero/legacy/sharded_param/__init__.py
rename to colossalai/legacy/zero/sharded_param/__init__.py
diff --git a/colossalai/zero/legacy/sharded_param/sharded_param.py b/colossalai/legacy/zero/sharded_param/sharded_param.py
similarity index 96%
rename from colossalai/zero/legacy/sharded_param/sharded_param.py
rename to colossalai/legacy/zero/sharded_param/sharded_param.py
index 4bcc4b62104a..454a722cf7e7 100644
--- a/colossalai/zero/legacy/sharded_param/sharded_param.py
+++ b/colossalai/legacy/zero/sharded_param/sharded_param.py
@@ -2,8 +2,8 @@
 
 import torch
 
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_utils import colo_tensor_mem_usage
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_utils import colo_tensor_mem_usage
 
 from .sharded_tensor import ShardedTensor
 
diff --git a/colossalai/zero/legacy/sharded_param/sharded_tensor.py b/colossalai/legacy/zero/sharded_param/sharded_tensor.py
similarity index 94%
rename from colossalai/zero/legacy/sharded_param/sharded_tensor.py
rename to colossalai/legacy/zero/sharded_param/sharded_tensor.py
index af60312600f2..43c7576b93b5 100644
--- a/colossalai/zero/legacy/sharded_param/sharded_tensor.py
+++ b/colossalai/legacy/zero/sharded_param/sharded_tensor.py
@@ -1,6 +1,6 @@
 import torch
 
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
 
 
 class ShardedTensor(StatefulTensor):
diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py
index edd986ef5e82..9aeab9f44a6d 100644
--- a/colossalai/nn/layer/__init__.py
+++ b/colossalai/nn/layer/__init__.py
@@ -1,2 +1,2 @@
-from .moe import *
+# from .moe import *
 from .utils import *
diff --git a/colossalai/nn/layer/moe/experts.py b/colossalai/nn/layer/moe/experts.py
index 56b11f4d9e08..55604a65e055 100644
--- a/colossalai/nn/layer/moe/experts.py
+++ b/colossalai/nn/layer/moe/experts.py
@@ -8,8 +8,8 @@
 
 from colossalai.context import ParallelMode, seed
 from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_decrator
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
 
 
 class MoeExperts(nn.Module):
diff --git a/colossalai/nn/layer/moe/layers.py b/colossalai/nn/layer/moe/layers.py
index 03f55d91f3a8..9293d3208f11 100644
--- a/colossalai/nn/layer/moe/layers.py
+++ b/colossalai/nn/layer/moe/layers.py
@@ -6,6 +6,7 @@
 import torch.nn.functional as F
 
 from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
 from colossalai.nn.layer.moe._operation import (
     COL_MOE_KERNEL_FLAG,
     AllGather,
@@ -18,7 +19,6 @@
 from colossalai.nn.layer.moe.routers import MoeRouter, Top1Router, Top2Router
 from colossalai.nn.layer.moe.utils import NormalNoiseGenerator, UniformNoiseGenerator
 from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero_decrator
 
 
 @no_shard_zero_decrator(is_replicated=True)
diff --git a/colossalai/zero/gemini/memory_tracer/memstats_collector.py b/colossalai/zero/gemini/memory_tracer/memstats_collector.py
index 0694be48550a..abb3dcc74b27 100644
--- a/colossalai/zero/gemini/memory_tracer/memstats_collector.py
+++ b/colossalai/zero/gemini/memory_tracer/memstats_collector.py
@@ -70,7 +70,7 @@ def record_model_data_volume(self) -> None:
         Sampling model data statistics.
         """
         if self._start_flag and not self.use_outside_memstats:
-            from colossalai.zero.legacy.gemini import StatefulTensor
+            from colossalai.legacy.zero.gemini import StatefulTensor
 
             # The following code work for ZeroInitContext, which is deprecated in v0.1.12
             cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']
diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
index e5466965cc48..6656821fef74 100644
--- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
+++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
@@ -1,12 +1,12 @@
 import torch.nn
 
-from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import _cast_float
-from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import (
+from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
     GradMemStats,
     GradMemTracerHook,
     ParamMemTracerHook,
 )
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
+from colossalai.utils import _cast_float
 
 from .memory_stats import MemStats
 
diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py
index b239b626c07f..d813e41af5a8 100644
--- a/examples/language/gpt/titans/train_gpt.py
+++ b/examples/language/gpt/titans/train_gpt.py
@@ -11,11 +11,11 @@
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.legacy.trainer import Trainer, hooks
+from colossalai.legacy.zero.init_ctx import ZeroInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn import LinearWarmupLR
 from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
 from colossalai.utils.timer import MultiTimer
-from colossalai.zero.legacy.init_ctx import ZeroInitContext
 
 
 def calc_local_model_size(model: torch.nn.Module):
diff --git a/examples/tutorial/opt/opt/colossalai_zero.py b/examples/tutorial/opt/opt/colossalai_zero.py
index 7c2c152450c5..8fbed6e83d52 100644
--- a/examples/tutorial/opt/opt/colossalai_zero.py
+++ b/examples/tutorial/opt/opt/colossalai_zero.py
@@ -2,7 +2,7 @@
     from colossalai.zero.shard_utils import TensorShardStrategy
 except ImportError:
     # colossalai > 0.2.8
-    from colossalai.zero.legacy import TensorShardStrategy
+    from colossalai.legacy.zero import TensorShardStrategy
 
 zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
                               tensor_placement_policy="auto",
diff --git a/tests/test_utils/test_commons.py b/tests/test_legacy/test_zero/test_commons.py
similarity index 90%
rename from tests/test_utils/test_commons.py
rename to tests/test_legacy/test_zero/test_commons.py
index 2633d7da21aa..377549ed996e 100644
--- a/tests/test_utils/test_commons.py
+++ b/tests/test_legacy/test_zero/test_commons.py
@@ -1,9 +1,9 @@
 import torch
 
 import colossalai
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.sharded_param import ShardedTensor
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.sharded_param import ShardedTensor
 
 
 def run_tensor_move(rank, world_size, port):
diff --git a/tests/test_utils/test_zero_gradient_clippling.py b/tests/test_utils/test_zero_gradient_clippling.py
deleted file mode 100644
index e99cf388e929..000000000000
--- a/tests/test_utils/test_zero_gradient_clippling.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from functools import partial
-
-import pytest
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.utils import clip_grad_norm_
-
-import colossalai
-from colossalai.logging import disable_existing_loggers
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.utils import checkpoint, clip_grad_norm_fp32
-from colossalai.zero.legacy.shard_utils.tensor_shard_strategy import TensorShardStrategy
-from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2
-
-
-def checkpoint_wrapper(module, enable=True):
-    if enable:
-        module.forward = partial(checkpoint, module.forward, False)
-    return module
-
-
-class Net(nn.Module):
-
-    def __init__(self, checkpoint=False) -> None:
-        super().__init__()
-        self.fc1 = nn.Linear(5, 5)
-        self.fc2 = nn.Linear(5, 5)
-        self.fc3 = nn.Linear(5, 1)
-        if checkpoint:
-            self.fc1 = checkpoint_wrapper(self.fc1)
-        self.layers = [self.fc1, self.fc2, self.fc1, self.fc2, self.fc3]
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-
-
-def run_step(model, optimizer, x, enable_autocast=False, norm_type=2.0):
-    model.train()
-    optimizer.zero_grad()
-    with torch.cuda.amp.autocast(enabled=enable_autocast):
-        y = model(x)
-        loss = y.sum()
-    loss = loss.float()
-    loss.backward()
-    clip_grad(model, norm_type)
-    optimizer.step()
-
-
-def clip_grad(model, norm_type):
-    if isinstance(model, DDP):
-        clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=norm_type)
-    else:
-        clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=norm_type)
-
-
-def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
-    if loose:
-        return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)
-    return torch.allclose(tensor_a, tensor_b)
-
-
-def check_grads(model, zero_model, loose=False):
-    rank = dist.get_rank()
-    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
-        zero_grad = zero_p.grad.clone().to(p.device)
-        chunks = torch.flatten(p.grad).chunk(4)
-        if rank >= len(chunks):
-            continue
-        grad = chunks[rank]
-        if zero_p.zero_shard_padding > 0:
-            zero_grad = zero_grad[:-zero_p.zero_shard_padding]
-        assert grad.dtype == zero_grad.dtype
-        assert allclose(grad, zero_grad, loose=loose)
-
-
-def check_params(model, zero_model, loose=False):
-    rank = dist.get_rank()
-    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
-        zero_shard_padding = zero_p.zero_shard_padding
-        zero_p = zero_p.clone().to(p.device)
-        chunks = torch.flatten(p).chunk(4)
-        if rank >= len(chunks):
-            continue
-        p = chunks[rank]
-        if zero_shard_padding > 0:
-            zero_p = zero_p[:-zero_shard_padding]
-        assert p.dtype == zero_p.dtype
-        assert allclose(p, zero_p, loose=loose)
-
-
-def run_dist(rank, world_size, port):
-    disable_existing_loggers()
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-
-
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_zero_clip_grad():
-    world_size = 4
-    spawn(run_dist, world_size)
-
-
-if __name__ == '__main__':
-    test_zero_clip_grad()
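
Note (not part of the patch): downstream code that must run against releases on both sides of this rename can keep its imports working with the same try/except fallback this patch applies to examples/tutorial/opt/opt/colossalai_zero.py. A minimal sketch, assuming TensorShardStrategy stays importable from each of the three locations that appear in this diff:

    try:
        # colossalai <= 0.2.8: original public location
        from colossalai.zero.shard_utils import TensorShardStrategy
    except ImportError:
        try:
            # interim releases: the colossalai.zero.legacy package
            from colossalai.zero.legacy import TensorShardStrategy
        except ImportError:
            # after this rename: the colossalai.legacy.zero package
            from colossalai.legacy.zero import TensorShardStrategy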