diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py
index 85e17b7e06bc..9c7a84d4841e 100644
--- a/deepspeed/runtime/bf16_optimizer.py
+++ b/deepspeed/runtime/bf16_optimizer.py
@@ -7,9 +7,9 @@
 import torch
 import sys
 import os
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from deepspeed import comm as dist
 from deepspeed.runtime.constants import PIPE_REPLICATED
-from deepspeed.ops.op_builder import UtilsBuilder
 from deepspeed.runtime import ZeROOptimizer
 from packaging import version as pkg_version
 
@@ -53,10 +53,9 @@ def __init__(self,
         self.dp_rank = dist.get_rank(group=self.dp_process_group)
         self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))]
 
-        # Load pre-built or JIT compile (un)flatten ops
-        util_ops = UtilsBuilder().load()
-        self.flatten = util_ops.flatten
-        self.unflatten = util_ops.unflatten
+        # Use torch (un)flatten ops
+        self.flatten = _flatten_dense_tensors
+        self.unflatten = _unflatten_dense_tensors
 
         #align nccl all-gather send buffers to 4-byte boundary
         self.nccl_start_alignment_factor = 2  # 4-byte alignment/sizeof(fp16) = 2
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index f0fef9bcc465..7233aecfc3c3 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -16,6 +16,7 @@
 from torch.nn.parameter import Parameter
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
 from typing import Callable, Dict, Union, Iterable
 
@@ -93,7 +94,6 @@
 
 from deepspeed.utils.logging import print_json_dist, print_configuration
 
 from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import UtilsBuilder
 from deepspeed.runtime.config import DtypeEnum
 
@@ -360,10 +360,9 @@ def __init__(
         if self.dump_state():
             print_configuration(self, "DeepSpeedEngine")
 
-        # Load pre-installed or JIT compile (un)flatten ops
-        util_ops = UtilsBuilder().load()
-        self.flatten = util_ops.flatten
-        self.unflatten = util_ops.unflatten
+        # Use torch (un)flatten ops
+        self.flatten = _flatten_dense_tensors
+        self.unflatten = _unflatten_dense_tensors
 
     def destroy(self):
         if self.optimizer is not None and hasattr(self.optimizer, 'destroy'):
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 7448cb51ae25..098e7c22ee22 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -8,6 +8,7 @@
 import collections
 from typing import Deque, Dict, Tuple
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
 from deepspeed.runtime import ZeROOptimizer
 from deepspeed.utils import logger
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
@@ -23,7 +24,6 @@
 from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper
 from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE
 from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import UtilsBuilder
 
 # Toggle this to true to enable correctness test
 # with gradient partitioning and without
@@ -126,10 +126,9 @@ def __init__(self,
 
         self.optimizer = init_optimizer
 
-        # Load pre-built or JIT compile (un)flatten ops
-        util_ops = UtilsBuilder().load()
-        self.flatten = util_ops.flatten
-        self.unflatten = util_ops.unflatten
+        # Use torch (un)flatten ops
+        self.flatten = _flatten_dense_tensors
+        self.unflatten = _unflatten_dense_tensors
 
         self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
         self._global_grad_norm = 0.
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 8f15b87af0e2..06a776b83443 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -8,6 +8,7 @@
 from deepspeed import comm as dist
 from packaging import version as pkg_version
 from collections import OrderedDict
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
 from deepspeed.runtime import ZeROOptimizer
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
@@ -23,7 +24,6 @@
 from deepspeed.runtime.constants import PIPE_REPLICATED
 
 from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import UtilsBuilder
 from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT,
                                             SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE, CLIP_GRAD,
@@ -150,10 +150,9 @@ def __init__(self,
             raise SystemError("Cannot use fp16 without accelerator.")
         self.optimizer = init_optimizer
 
-        # Load pre-built or JIT compile (un)flatten ops
-        util_ops = UtilsBuilder().load()
-        self.flatten = util_ops.flatten
-        self.unflatten = util_ops.unflatten
+        # Use torch (un)flatten ops
+        self.flatten = _flatten_dense_tensors
+        self.unflatten = _unflatten_dense_tensors
 
         # ZeRO stage 1 (False) or 2 (True)
         self.partition_gradients = partition_grads
diff --git a/op_builder/utils.py b/op_builder/utils.py
deleted file mode 100644
index c10953397775..000000000000
--- a/op_builder/utils.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-from .builder import OpBuilder
-
-
-class UtilsBuilder(OpBuilder):
-    BUILD_VAR = "DS_BUILD_UTILS"
-    NAME = "utils"
-
-    def __init__(self):
-        super().__init__(name=self.NAME)
-
-    def absolute_name(self):
-        return f'deepspeed.ops.{self.NAME}_op'
-
-    def sources(self):
-        return ['csrc/utils/flatten_unflatten.cpp']
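For context, the `torch._utils` helpers swapped in above act as drop-in replacements for the removed JIT-compiled op: `_flatten_dense_tensors` packs a list of tensors into one contiguous 1-D buffer, and `_unflatten_dense_tensors` returns views into that buffer with the originals' shapes. A minimal sketch (not part of this diff) of that contract:

```python
# Sketch only: demonstrates the torch._utils flatten/unflatten contract
# that replaces the deleted UtilsBuilder op in this PR.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

params = [torch.randn(2, 3), torch.randn(5)]

# Flatten: concatenate all tensors into one contiguous 1-D buffer.
flat = _flatten_dense_tensors(params)
assert flat.shape == (11,)  # 2*3 + 5 elements

# Unflatten: reinterpret the flat buffer as views shaped like the originals.
views = _unflatten_dense_tensors(flat, params)
for p, v in zip(params, views):
    assert v.shape == p.shape and torch.equal(p, v)
```

Since both functions ship with PyTorch itself, callers no longer need to pre-build or JIT-compile `csrc/utils/flatten_unflatten.cpp` at startup, which is why `op_builder/utils.py` can be deleted outright.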