2 changes: 1 addition & 1 deletion colossalai/amp/apex_amp/apex_amp.py
@@ -11,7 +11,7 @@
 from torch import Tensor

 from colossalai.interface import OptimizerWrapper
-from colossalai.utils import clip_grad_norm_fp32
+from colossalai.legacy.utils import clip_grad_norm_fp32


 class ApexAMPOptimizer(OptimizerWrapper):
2 changes: 1 addition & 1 deletion colossalai/amp/naive_amp/__init__.py
@@ -3,7 +3,7 @@
 import torch.nn as nn
 from torch.optim import Optimizer

-from colossalai.utils import is_no_pp_or_last_stage
+from colossalai.legacy.utils import is_no_pp_or_last_stage

 from ._fp16_optimizer import FP16Optimizer
 from .grad_scaler import ConstantGradScaler, DynamicGradScaler
3 changes: 2 additions & 1 deletion colossalai/amp/naive_amp/_fp16_optimizer.py
@@ -9,8 +9,9 @@
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.kernel.op_builder import FusedOptimBuilder
+from colossalai.legacy.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes
 from colossalai.logging import get_dist_logger
-from colossalai.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes, multi_tensor_applier
+from colossalai.utils import multi_tensor_applier

 from ._utils import has_inf_or_nan, zero_gard_by_list
 from .grad_scaler import BaseGradScaler
2 changes: 1 addition & 1 deletion colossalai/amp/torch_amp/torch_amp.py
@@ -8,7 +8,7 @@
 from torch.optim import Optimizer

 from colossalai.interface import OptimizerWrapper
-from colossalai.utils import clip_grad_norm_fp32
+from colossalai.legacy.utils import clip_grad_norm_fp32

 from ._grad_scaler import GradScaler

3 changes: 2 additions & 1 deletion colossalai/initialize.py
@@ -30,10 +30,11 @@
     PipelineSchedule,
     get_tensor_shape,
 )
+from colossalai.legacy.utils import is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
 from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
 from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
 from colossalai.logging import get_dist_logger
-from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
+from colossalai.utils import get_current_device
 from colossalai.utils.moe import sync_moe_model_param


2 changes: 1 addition & 1 deletion colossalai/legacy/engine/schedule/_pipeline_schedule.py
@@ -10,8 +10,8 @@
 from colossalai.amp.naive_amp import NaiveAMPModel
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.utils import switch_virtual_pipeline_parallel_rank
 from colossalai.logging import get_dist_logger
-from colossalai.utils import switch_virtual_pipeline_parallel_rank
 from colossalai.utils.cuda import get_current_device

 from ._base_schedule import BaseSchedule
4 changes: 2 additions & 2 deletions colossalai/legacy/nn/layer/parallel_1d/layers.py
@@ -16,12 +16,12 @@
 from colossalai.kernel import LayerNorm
 from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn import init as init
-from colossalai.utils.checkpointing import (
+from colossalai.legacy.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
     partition_tensor_parallel_state_dict,
 )
+from colossalai.nn import init as init
 from colossalai.utils.cuda import get_current_device

 from ..base_layer import ParallelLayer
5 changes: 4 additions & 1 deletion colossalai/legacy/nn/layer/parallel_2d/layers.py
@@ -13,8 +13,11 @@
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
+from colossalai.legacy.utils.checkpointing import (
+    gather_tensor_parallel_state_dict,
+    partition_tensor_parallel_state_dict,
+)
 from colossalai.nn import init as init
-from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
 from colossalai.utils.cuda import get_current_device

 from ..base_layer import ParallelLayer
4 changes: 2 additions & 2 deletions colossalai/legacy/nn/layer/parallel_2p5d/layers.py
@@ -13,12 +13,12 @@
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn import init as init
-from colossalai.utils.checkpointing import (
+from colossalai.legacy.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
     partition_tensor_parallel_state_dict,
 )
+from colossalai.nn import init as init
 from colossalai.utils.cuda import get_current_device

 from ..base_layer import ParallelLayer
4 changes: 2 additions & 2 deletions colossalai/legacy/nn/layer/parallel_3d/layers.py
@@ -15,12 +15,12 @@
 from colossalai.legacy.communication import all_reduce, broadcast
 from colossalai.legacy.nn.layer.base_layer import ParallelLayer
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn import init as init
-from colossalai.utils.checkpointing import (
+from colossalai.legacy.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
     partition_tensor_parallel_state_dict,
 )
+from colossalai.nn import init as init
 from colossalai.utils.cuda import get_current_device

 from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple
2 changes: 1 addition & 1 deletion colossalai/legacy/nn/layer/utils/common.py
@@ -10,7 +10,7 @@

 from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
 from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.utils import checkpoint
+from colossalai.legacy.utils import checkpoint


 class CheckpointModule(nn.Module):
3 changes: 2 additions & 1 deletion colossalai/legacy/trainer/_trainer.py
@@ -6,8 +6,9 @@

 from colossalai.legacy.engine import Engine
 from colossalai.legacy.trainer.hooks import BaseHook
+from colossalai.legacy.utils import is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
 from colossalai.logging import DistributedLogger
-from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
+from colossalai.utils import MultiTimer


 class Trainer:
2 changes: 1 addition & 1 deletion colossalai/legacy/trainer/hooks/_checkpoint_hook.py
@@ -4,8 +4,8 @@

 from colossalai.legacy.registry import HOOKS
 from colossalai.legacy.trainer.hooks import BaseHook
+from colossalai.legacy.utils.checkpointing import save_checkpoint
 from colossalai.logging import get_dist_logger
-from colossalai.utils.checkpointing import save_checkpoint

 from ._lr_scheduler_hook import LRSchedulerHook

3 changes: 2 additions & 1 deletion colossalai/legacy/trainer/hooks/_log_hook.py
@@ -9,8 +9,9 @@
 from colossalai.core import global_context as gpc
 from colossalai.legacy.registry import HOOKS
 from colossalai.legacy.trainer.hooks._metric_hook import ThroughputMetric
+from colossalai.legacy.utils import is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage
 from colossalai.logging import DistributedLogger
-from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage
+from colossalai.utils import MultiTimer

 from ._base_hook import BaseHook
 from ._commons_ import _format_number
3 changes: 2 additions & 1 deletion colossalai/legacy/trainer/hooks/_metric_hook.py
@@ -11,7 +11,8 @@
 from colossalai.core import global_context as gpc
 from colossalai.legacy.communication import all_reduce
 from colossalai.legacy.registry import HOOKS
-from colossalai.utils import get_current_device, is_no_pp_or_last_stage
+from colossalai.legacy.utils import is_no_pp_or_last_stage
+from colossalai.utils import get_current_device

 from ._base_hook import BaseHook
 from ._commons_ import _format_number
53 changes: 53 additions & 0 deletions colossalai/legacy/utils/__init__.py
@@ -0,0 +1,53 @@
+from .checkpointing import load_checkpoint, save_checkpoint
+from .common import (
+    clip_grad_norm_fp32,
+    copy_tensor_parallel_attributes,
+    count_zeros_fp32,
+    is_dp_rank_0,
+    is_model_parallel_parameter,
+    is_no_pp_or_last_stage,
+    is_tp_rank_0,
+    is_using_ddp,
+    is_using_pp,
+    is_using_sequence,
+    param_is_not_tensor_parallel_duplicate,
+    print_rank_0,
+    switch_virtual_pipeline_parallel_rank,
+    sync_model_param,
+)
+from .data_sampler import DataParallelSampler, get_dataloader
+from .memory import (
+    colo_device_memory_capacity,
+    colo_device_memory_used,
+    colo_get_cpu_memory_capacity,
+    colo_set_cpu_memory_capacity,
+    colo_set_process_memory_fraction,
+    report_memory_usage,
+)
+
+__all__ = [
+    'DataParallelSampler',
+    'get_dataloader',
+    'save_checkpoint',
+    'load_checkpoint',
+    'colo_device_memory_capacity',
+    'colo_device_memory_used',
+    'colo_get_cpu_memory_capacity',
+    'colo_set_cpu_memory_capacity',
+    'colo_set_process_memory_fraction',
+    'report_memory_usage',
+    'clip_grad_norm_fp32',
+    'copy_tensor_parallel_attributes',
+    'count_zeros_fp32',
+    'is_dp_rank_0',
+    'is_model_parallel_parameter',
+    'is_no_pp_or_last_stage',
+    'is_tp_rank_0',
+    'is_using_ddp',
+    'is_using_pp',
+    'is_using_sequence',
+    'param_is_not_tensor_parallel_duplicate',
+    'print_rank_0',
+    'switch_virtual_pipeline_parallel_rank',
+    'sync_model_param',
+]
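Every import change in this PR follows the pattern made explicit by this new __init__.py: the helpers keep their names and are simply re-exported from the new colossalai.legacy.utils package. A minimal sketch of the corresponding change in downstream code, assuming ColossalAI at this revision is installed; the chosen helpers are just examples taken from the __all__ list above, and the call sites themselves are unchanged:

# Old location, removed throughout this PR:
#   from colossalai.utils import clip_grad_norm_fp32, is_using_ddp, sync_model_param
# New location introduced by this PR:
from colossalai.legacy.utils import clip_grad_norm_fp32, is_using_ddp, sync_model_param

# Only the import path moves; the functions are the same objects re-exported
# from colossalai/legacy/utils/common.py via the __init__.py shown above.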
@@ -1,13 +1,13 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

+import weakref
+
 import torch
 from torch.utils.checkpoint import check_backward_validity, detach_variable

-from colossalai.context.random import get_states, get_current_mode, set_seed_states, set_mode, sync_states
-from .cuda import get_current_device
-
-import weakref
+from colossalai.context.random import get_current_mode, get_states, set_mode, set_seed_states, sync_states
+from colossalai.utils import get_current_device


 def copy_to_device(obj, device):
@@ -143,7 +143,7 @@ def checkpoint(function, activation_offload, *args, use_reentrant: bool = True):

     Args:
         function: Describe the forward pass function. It should know how to handle the input tuples.
-        activation_offload: The variable to check whether we should offload activation to cpu 
+        activation_offload: The variable to check whether we should offload activation to cpu
         args (list): Tuple containing the parameters of the function
         use_reentrant: Bool type to check if we need to use_reentrant, if use_reentrant=False, there
             might be more flexibility for user to define there checkpoint function
@@ -227,12 +227,12 @@ def inner_unpack(packed):
         # rerun forward, the inner_pack will store all the activations in storage
         if has_autocast_in_fwd:
             with torch.enable_grad(), \
-                 torch.cuda.amp.autocast(), \
-                 torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack):
+                    torch.cuda.amp.autocast(), \
+                    torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack):
                 _unused = function(*args)
         else:
             with torch.enable_grad(), \
-                 torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack):
+                    torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack):
                 _unused = function(*args)

         if x not in storage:
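The hunks above only touch the import block and whitespace of the activation-checkpoint utility; the checkpoint(function, activation_offload, *args, use_reentrant=...) interface described in its docstring is unchanged, and colossalai/legacy/nn/layer/utils/common.py earlier in this diff imports it as colossalai.legacy.utils.checkpoint. A minimal usage sketch based only on that docstring and import path: the toy forward function and tensor sizes are placeholders, and it assumes a CUDA device plus an initialized ColossalAI context, since the utility tracks CUDA RNG states.

import torch

# import path used by colossalai/legacy/nn/layer/utils/common.py in this PR
from colossalai.legacy.utils import checkpoint


def forward_fn(x, weight):
    # stand-in for part of a model's forward pass
    return torch.matmul(x, weight).relu()


x = torch.randn(4, 8, device='cuda', requires_grad=True)
w = torch.randn(8, 8, device='cuda', requires_grad=True)

# activation_offload=False keeps activations on the device instead of offloading them to CPU;
# use_reentrant mirrors the flag documented in the docstring above.
out = checkpoint(forward_fn, False, x, w, use_reentrant=True)
out.sum().backward()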
3 changes: 3 additions & 0 deletions colossalai/legacy/utils/checkpoint/__init__.py
@@ -0,0 +1,3 @@
+from .module_checkpoint import load_checkpoint, save_checkpoint
+
+__all__ = ['save_checkpoint', 'load_checkpoint']
@@ -5,7 +5,8 @@

 from colossalai.interface import OptimizerWrapper
 from colossalai.tensor import ColoTensor
-from colossalai.utils.checkpoint.utils import gather_tensor, scatter_tensor
+
+from .utils import gather_tensor, scatter_tensor


 def save_checkpoint(path: str,