8 changes: 0 additions & 8 deletions CMakeLists.txt
@@ -15,13 +15,6 @@ set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")
option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)

include(CMakeDependentOption)
cmake_dependent_option(NVFUSER_DISTRIBUTED "" ON "USE_DISTRIBUTED" OFF)
if (NVFUSER_DISTRIBUTED)
add_compile_definitions(NVFUSER_DISTRIBUTED)
endif()
message(STATUS "Setting NVFUSER_DISTRIBUTED=${NVFUSER_DISTRIBUTED}")

# We try to update which C++ standard we use together in lockstep across all
# built libraries, and these variables control which that is. Generally we are
# on C++20, but we still support a version of CUDA (11) that does not recognize
@@ -769,7 +762,6 @@ message(STATUS "******** Nvfuser configuration summary ********")
message(STATUS " UCC_FOUND: ${UCC_FOUND}")
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")

if(NVFUSER_STANDALONE_BUILD_WITH_UCC)
134 changes: 0 additions & 134 deletions csrc/multidevice/c10d_mock.h

This file was deleted.

4 changes: 2 additions & 2 deletions csrc/multidevice/communication.cpp
@@ -6,7 +6,7 @@
*/
// clang-format on
#include <multidevice/communication.h>
#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL)
#if defined(USE_C10D_NCCL)
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
#endif
#include <utils.h>
@@ -229,7 +229,7 @@ c10::intrusive_ptr<c10d::Work> Reduce::post(
c10d::ReduceOptions options = {
.reduceOp = params_.redOp, .rootRank = root_relative_index_};
auto team_backend = comm.getBackendForTeam(params_.team, backend);
#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL)
#if defined(USE_C10D_NCCL)
auto nccl_backend = dynamic_cast<c10d::ProcessGroupNCCL*>(team_backend.get());
if (nccl_backend) {
#if NVF_TORCH_VERSION_NO_LESS(2, 3, 0)
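With the mock backend gone, `Reduce::post` always runs against a real c10d backend; the `dynamic_cast` above only gates an NCCL-specific fast path. For reference, a minimal Python sketch of the same reduction through the public torch.distributed API (the function name and the SUM default are illustrative, not nvFuser's):

```python
import torch
import torch.distributed as dist

def reduce_to_root(tensor: torch.Tensor, root: int = 0) -> None:
    # Mirrors c10d::ReduceOptions{.reduceOp = ..., .rootRank = ...} from the
    # hunk above; the backend chosen at init time services the call.
    dist.reduce(tensor, dst=root, op=dist.ReduceOp.SUM)
```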
4 changes: 0 additions & 4 deletions csrc/multidevice/communication.h
@@ -9,11 +9,7 @@

#include <multidevice/communicator.h>
#include <multidevice/multidevice.h>
#ifdef NVFUSER_DISTRIBUTED
#include <torch/csrc/distributed/c10d/Types.hpp>
#else
#include <multidevice/c10d_mock.h>
#endif
#include <type.h>
#include <visibility.h>

10 changes: 0 additions & 10 deletions csrc/multidevice/communicator.cpp
@@ -10,7 +10,6 @@
#include <netdb.h>
#include <map>

#ifdef NVFUSER_DISTRIBUTED
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
#ifdef USE_C10D_GLOO
#include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>
@@ -21,7 +20,6 @@
#if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC)
#include <torch/csrc/distributed/c10d/ProcessGroupUCC.hpp>
#endif
#endif

namespace nvfuser {

@@ -132,7 +130,6 @@ inline std::string getTeamKey(const Team& team, CommunicatorBackend backend) {
});
}

#ifdef NVFUSER_DISTRIBUTED
// creates and returns a process group backend
c10::intrusive_ptr<c10d::Backend> createBackend(
CommunicatorBackend backend,
@@ -164,7 +161,6 @@ c10::intrusive_ptr<c10d::Backend> createBackend(
#endif
NVF_ERROR(false, "no distributed backend available");
}
#endif
} // namespace

Communicator::Communicator(
@@ -187,7 +183,6 @@ Communicator::Communicator(
return;
}

#ifdef NVFUSER_DISTRIBUTED
c10d::TCPStoreOptions store_opts;
{
char hostname[HOST_NAME_MAX]; // NOLINT (modernize-avoid-c-arrays)
@@ -203,7 +198,6 @@ Communicator::Communicator(
c10d::TCPStoreOptions::kDefaultPort; // 29500
store_opts.port = master_port_ ? master_port_ : comm_master_port_default;
store_ = c10::make_intrusive<c10d::TCPStore>(master_addr_, store_opts);
#endif

#if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC)
ucc_available_ = true;
@@ -222,7 +216,6 @@ c10::intrusive_ptr<c10d::Backend> Communicator::getBackendForTeam(
// check if backend associated with the team is present in the cache
if (backends_.find(team_key) ==
backends_.end()) { // create the backend and cache it
#ifdef NVFUSER_DISTRIBUTED
// check that the caller's rank belongs to the requested team
auto rank_it = std::find(team.begin(), team.end(), deviceId());
NVF_ERROR(
@@ -237,9 +230,6 @@
c10::make_intrusive<c10d::PrefixStore>(team_key, store_),
team_rank,
static_cast<int64_t>(team.size()));
#else
backends_[team_key] = c10::make_intrusive<c10d::Backend>();
#endif
}
return backends_.at(team_key);
}
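The TCPStore rendezvous set up in this file is now unconditional. For readers unfamiliar with c10d, it is the same handshake as the following Python sketch (host, port, rank, and world size are assumed values; in nvFuser they come from the launcher environment, and 29500 is the kDefaultPort noted in the hunk above):

```python
import torch.distributed as dist

rank, world_size = 0, 1  # assumed; nvFuser derives these from MPI/launcher env vars
store = dist.TCPStore("127.0.0.1", 29500, world_size, is_master=(rank == 0))
# getBackendForTeam() namespaces per-team state the same way, via a PrefixStore.
team_store = dist.PrefixStore("team_key", store)
dist.init_process_group("gloo", store=store, rank=rank, world_size=world_size)
```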
31 changes: 12 additions & 19 deletions csrc/multidevice/communicator.h
@@ -10,32 +10,25 @@
#include <ATen/core/TensorBody.h>
#include <ATen/core/ivalue.h>
#include <c10/util/intrusive_ptr.h>

#include <exceptions.h>
#include <multidevice/multidevice.h>
#ifdef NVFUSER_DISTRIBUTED
#include <torch/csrc/distributed/c10d/Backend.hpp>
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
#include <torch/csrc/distributed/c10d/Work.hpp>
#else
#include <multidevice/c10d_mock.h>
#endif

#include <exceptions.h>
#include <multidevice/multidevice.h>
#include <visibility.h>

namespace nvfuser {

/*
This file implements the class Communicator which sets up the inter-process
Backend. This class contains inter-process information, such as the rank, the
world size, as well as the Process Group that can be called to perform
inter-process communications.

Each process is associated with a unique deviceId and device. The actual MPI
rank remains private to the class and should not be used by the user. The
communicator class holds privately the mappings ranks <-> device IDs <->
device.

*/
// This file implements the class Communicator which sets up the inter-process
// Backend. This class contains inter-process information, such as the rank, the
// world size, as well as the Process Group that can be called to perform
// inter-process communications.
//
// Each process is associated with a unique deviceId and device. The actual MPI
// rank remains private to the class and should not be used by the user. The
// communicator class holds privately the mappings ranks <-> device IDs <->
// device.

using RankType = DeviceIdxType;

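The comment block above (reflowed to // style in this PR) notes that the rank-to-device mapping is private to Communicator. A sketch of the convention such classes commonly follow, purely as an assumption for orientation rather than nvFuser's verified behavior:

```python
import torch

def device_for_rank(local_rank: int) -> torch.device:
    # Common convention: bind each local rank to one visible GPU.
    # The actual Communicator mapping is private and may differ.
    return torch.device("cuda", local_rank % torch.cuda.device_count())
```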
8 changes: 0 additions & 8 deletions csrc/multidevice/utils.cpp
@@ -20,14 +20,6 @@

namespace nvfuser {

NVF_API bool distributedEnabled() {
#ifdef NVFUSER_DISTRIBUTED
return true;
#else
return false;
#endif
}

namespace {

std::unordered_set<IterDomain*> getShardedIterDomains(TensorView* tv) {
3 changes: 0 additions & 3 deletions csrc/multidevice/utils.h
@@ -15,9 +15,6 @@

namespace nvfuser {

// Returns true iff nvFuser was compiled with distributed APIs enabled.
NVF_API bool distributedEnabled();

// Returns whether a TensorView has a non-reduction axis parallelized on DIDx.
// Checks that the other non-reduction axes are not parallelized on DIDx.
NVF_API bool isSharded(TensorView*);
15 changes: 4 additions & 11 deletions setup.py
@@ -26,9 +26,6 @@
# --build-with-ucc
# Build nvfuser with UCC support. You may need to specify environment variables of UCC_HOME, UCC_DIR, UCX_HOME, UCX_DIR.
#
# --build-without-distributed
# Build nvfuser without multidevice support
#
# --debug
# Building nvfuser in debug mode
#
@@ -74,7 +71,6 @@
NO_NINJA = False
BUILD_WITH_UCC = False
BUILD_WITH_ASAN = False
BUILD_WITHOUT_DISTRIBUTED = False
OVERWRITE_VERSION = False
VERSION_TAG = None
BUILD_TYPE = "Release"
@@ -106,9 +102,6 @@
if arg == "--build-with-asan":
BUILD_WITH_ASAN = True
continue
if arg == "--build-without-distributed":
BUILD_WITHOUT_DISTRIBUTED = True
continue
if arg == "--debug":
BUILD_TYPE = "Debug"
continue
@@ -306,7 +299,10 @@ def cmake(install_prefix: str = "./nvfuser"):

logger.setLevel(logger_level)

pytorch_use_distributed = get_pytorch_use_distributed()
if not get_pytorch_use_distributed():
raise RuntimeError(
"nvFuser requires PyTorch to be built with USE_DISTRIBUTED on."
)

# generate cmake directory
cmd_str = [
@@ -315,7 +311,6 @@
"-DCMAKE_BUILD_TYPE=" + BUILD_TYPE,
f"-DCMAKE_INSTALL_PREFIX={install_prefix}",
f"-DNVFUSER_CPP_STANDARD={CPP_STANDARD}",
f"-DUSE_DISTRIBUTED={pytorch_use_distributed}",
"-B",
cmake_build_dir,
]
@@ -333,8 +328,6 @@
cmd_str.append("-DBUILD_NVFUSER_BENCHMARK=ON")
if BUILD_WITH_ASAN:
cmd_str.append("-DNVFUSER_BUILD_WITH_ASAN=ON")
if BUILD_WITHOUT_DISTRIBUTED:
cmd_str.append("-DNVFUSER_DISTRIBUTED=OFF")
cmd_str.append(".")

print(f"Configuring CMake with {' '.join(cmd_str)}")
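Since the --build-without-distributed escape hatch is removed, the configure step now hard-fails when PyTorch lacks distributed support. A quick preflight before running setup.py (get_pytorch_use_distributed is nvFuser's own helper; the snippet below is the standard public equivalent):

```python
import torch.distributed as dist

# False here means PyTorch was built with USE_DISTRIBUTED=0, in which case
# nvFuser's cmake() will raise the RuntimeError added in this PR.
print(dist.is_available())
```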
3 changes: 1 addition & 2 deletions tests/cpp/test_multidevice_pipeline.cpp
@@ -43,8 +43,7 @@ using namespace torch::jit::fuser::cuda;
using namespace at::indexing;

// To run the following tests on several devices, pytorch must be installed with
// the flag USE_DISTRIBUTED=1 and nccl support. With that, nvFuser is built by
// default with NVFUSER_DISTRIBUTED defined. Then, on a node with at least 6
// the flag USE_DISTRIBUTED=1 and nccl support. Then, on a node with at least 6
// GPUs, run the test using mpirun: `mpirun -np 6 build/test_multidevice
// --gtest_filter=PipelineTestTwoStages*`.

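The updated comment drops the NVFUSER_DISTRIBUTED mention; the runtime requirements are unchanged. A small preflight sketch for the mpirun invocation described above (the 6-GPU threshold comes from that comment):

```python
import torch
import torch.distributed as dist

assert dist.is_available(), "PyTorch was built without USE_DISTRIBUTED"
assert dist.is_nccl_available(), "PyTorch was built without NCCL support"
assert torch.cuda.device_count() >= 6, "PipelineTestTwoStages needs >= 6 GPUs"
```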
3 changes: 0 additions & 3 deletions tests/cpp/test_resharding.cpp
@@ -320,9 +320,6 @@ TEST_F(ReshardingTest, InsertShardedAxisReordering) {
}

TEST_P(ReshardingTest, Insert) {
if (!distributedEnabled()) { // Test only works with distributed
GTEST_SKIP() << "Requires distributed API";
}
auto
[mesh0,
mesh1,