4 changes: 2 additions & 2 deletions colossalai/initialize.py
@@ -30,11 +30,11 @@
PipelineSchedule,
get_tensor_shape,
)
+from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
from colossalai.utils.moe import sync_moe_model_param
-from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook


def get_default_parser():
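Taken together, this hunk and the ones below move the legacy ZeRO utilities from colossalai.zero.legacy to colossalai.legacy.zero. A minimal migration sketch for downstream code, assuming the exported names (ShardedOptimizerV2, convert_to_zero_v2, BaseOpHook) are unchanged by the move:

    # old import path (before this change)
    # from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
    # new import path (after this change)
    from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
    from colossalai.legacy.zero.gemini.ophooks import BaseOpHook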
2 changes: 1 addition & 1 deletion colossalai/legacy/engine/_base_engine.py
@@ -16,8 +16,8 @@
NonPipelineSchedule,
PipelineSchedule,
)
+from colossalai.legacy.zero.gemini import BaseOpHook, register_ophooks_recursively
from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively


class Engine:
2 changes: 1 addition & 1 deletion colossalai/legacy/engine/schedule/_pipeline_schedule.py
@@ -157,7 +157,7 @@ def load_micro_batch(self):
return self._move_to_device(micro_batch_data)

def pre_processing(self, engine):
-from colossalai.zero.legacy import ShardedModelV2
+from colossalai.legacy.zero import ShardedModelV2

# TODO: remove this after testing new zero with pipeline parallelism
model = engine.model
File renamed without changes.
@@ -5,9 +5,9 @@

import torch

+from colossalai.legacy.zero.gemini.tensor_utils import alloc_storage, free_storage
from colossalai.tensor.param_op_hook import ColoParamOpHook
from colossalai.zero.gemini.memory_tracer import MemStats, SyncCudaMemoryMonitor
-from colossalai.zero.legacy.gemini.tensor_utils import alloc_storage, free_storage


class TrainingPhase(Enum):
@@ -11,12 +11,12 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.context.singleton_meta import SingletonMeta
from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
+from colossalai.legacy.zero.sharded_model.sharded_model_v2 import ShardedModelV2
+from colossalai.legacy.zero.sharded_param import ShardedParamV2
from colossalai.logging import get_dist_logger
from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16
-from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2
-from colossalai.zero.legacy.sharded_param import ShardedParamV2


@dataclass
@@ -3,7 +3,7 @@

import torch.distributed as dist

-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor


class BaseShardStrategy(ABC):
@@ -4,8 +4,8 @@
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors as flatten

+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor

from .tensor_shard_strategy import TensorShardStrategy

@@ -3,11 +3,11 @@
import torch
import torch.distributed as dist

+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.shard_utils.commons import get_shard
+from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.shard_utils.commons import get_shard
-from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor


class TensorShardStrategy(BaseShardStrategy):
@@ -3,7 +3,7 @@
import torch
import torch.nn.functional as F

-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor


def get_gradient_predivide_factor(world_size: int) -> float:
@@ -13,18 +13,18 @@

from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
+from colossalai.legacy.zero.gemini.ophooks import register_ophooks_recursively
+from colossalai.legacy.zero.gemini.paramhooks import BaseParamHookMgr
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_move_to_cpu
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
+from colossalai.legacy.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
from colossalai.logging import get_dist_logger
from colossalai.utils import disposable, get_current_device
from colossalai.utils.memory import colo_device_memory_capacity
from colossalai.zero.gemini.memory_tracer import MemStatsCollector, StaticMemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import register_ophooks_recursively
-from colossalai.zero.legacy.gemini.paramhooks import BaseParamHookMgr
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_move_to_cpu
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.legacy.sharded_model.reduce_scatter import ReduceScatterBucketer

from ._utils import (
cast_float_arguments,
@@ -2,7 +2,7 @@

import torch

-from colossalai.zero.legacy.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model import ShardedModelV2


def col_model_deepcopy(sharded_model: ShardedModelV2, other_model: torch.nn.Module):
@@ -4,13 +4,13 @@
import torch.distributed as dist

from colossalai.legacy.registry import OPHOOKS
+from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
+from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.legacy.zero.shard_utils import BaseShardStrategy
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device
from colossalai.zero.gemini.memory_tracer import MemStatsCollector
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
-from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
-from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.zero.legacy.shard_utils import BaseShardStrategy


@OPHOOKS.register_module
@@ -15,12 +15,12 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.interface import OptimizerWrapper
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.legacy.zero.sharded_model import ShardedModelV2
+from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_fp32
from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
-from colossalai.zero.legacy.sharded_model import ShardedModelV2
-from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp32


class OptimState(Enum):
@@ -2,8 +2,8 @@

import torch

-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.zero.legacy.gemini.tensor_utils import colo_tensor_mem_usage
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.tensor_utils import colo_tensor_mem_usage

from .sharded_tensor import ShardedTensor

@@ -1,6 +1,6 @@
import torch

-from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
+from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState


class ShardedTensor(StatefulTensor):
2 changes: 1 addition & 1 deletion colossalai/nn/layer/__init__.py
@@ -1,2 +1,2 @@
-from .moe import *
+# from .moe import *
from .utils import *
2 changes: 1 addition & 1 deletion colossalai/nn/layer/moe/experts.py
@@ -8,8 +8,8 @@

from colossalai.context import ParallelMode, seed
from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_decrator
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator


class MoeExperts(nn.Module):
2 changes: 1 addition & 1 deletion colossalai/nn/layer/moe/layers.py
@@ -6,6 +6,7 @@
import torch.nn.functional as F

from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.legacy.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
from colossalai.nn.layer.moe._operation import (
COL_MOE_KERNEL_FLAG,
AllGather,
@@ -18,7 +19,6 @@
from colossalai.nn.layer.moe.routers import MoeRouter, Top1Router, Top2Router
from colossalai.nn.layer.moe.utils import NormalNoiseGenerator, UniformNoiseGenerator
from colossalai.utils import get_current_device
-from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero_decrator


@no_shard_zero_decrator(is_replicated=True)
2 changes: 1 addition & 1 deletion colossalai/zero/gemini/memory_tracer/memstats_collector.py
@@ -70,7 +70,7 @@ def record_model_data_volume(self) -> None:
Sampling model data statistics.
"""
if self._start_flag and not self.use_outside_memstats:
-from colossalai.zero.legacy.gemini import StatefulTensor
+from colossalai.legacy.zero.gemini import StatefulTensor

# The following code work for ZeroInitContext, which is deprecated in v0.1.12
cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']
6 changes: 3 additions & 3 deletions colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
@@ -1,12 +1,12 @@
import torch.nn

-from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import _cast_float
-from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import (
+from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
GradMemStats,
GradMemTracerHook,
ParamMemTracerHook,
)
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
+from colossalai.utils import _cast_float

from .memory_stats import MemStats

2 changes: 1 addition & 1 deletion examples/language/gpt/titans/train_gpt.py
@@ -11,11 +11,11 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.trainer import Trainer, hooks
+from colossalai.legacy.zero.init_ctx import ZeroInitContext
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import LinearWarmupLR
from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
from colossalai.utils.timer import MultiTimer
-from colossalai.zero.legacy.init_ctx import ZeroInitContext


def calc_local_model_size(model: torch.nn.Module):
2 changes: 1 addition & 1 deletion examples/tutorial/opt/opt/colossalai_zero.py
@@ -2,7 +2,7 @@
from colossalai.zero.shard_utils import TensorShardStrategy
except ImportError:
# colossalai > 0.2.8
-from colossalai.zero.legacy import TensorShardStrategy
+from colossalai.legacy.zero import TensorShardStrategy

zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
tensor_placement_policy="auto",
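The hunk above keeps the existing two-way ImportError fallback and only retargets the newer branch. For a script that has to run against all three layouts seen in this file's history, a nested fallback along the following lines would work; this is an illustrative sketch, not part of the diff, and the inline version notes are assumptions rather than statements from the PR.

    try:
        # path after this change
        from colossalai.legacy.zero import TensorShardStrategy
    except ImportError:
        try:
            # intermediate layout (the fallback this file used before the change)
            from colossalai.zero.legacy import TensorShardStrategy
        except ImportError:
            # oldest layout, still handled by the original try block
            from colossalai.zero.shard_utils import TensorShardStrategy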
@@ -1,9 +1,9 @@
import torch

import colossalai
+from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
+from colossalai.legacy.zero.sharded_param import ShardedTensor
from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
-from colossalai.zero.legacy.sharded_param import ShardedTensor


def run_tensor_move(rank, world_size, port):
111 changes: 0 additions & 111 deletions tests/test_utils/test_zero_gradient_clippling.py

This file was deleted.