Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/mscclpp/gpu_data_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
#ifndef MSCCLPP_GPU_DATA_TYPES_HPP_
#define MSCCLPP_GPU_DATA_TYPES_HPP_

#if defined(__HIP_PLATFORM_AMD__)
#include <mscclpp/device.hpp>

#if defined(MSCCLPP_DEVICE_HIP)

#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
Expand Down Expand Up @@ -55,8 +57,6 @@ using __bfloat162 = __nv_bfloat162;

#endif

#include <mscclpp/device.hpp>

namespace mscclpp {

/// Data types supported by mscclpp operations.
Expand Down
2 changes: 1 addition & 1 deletion python/csrc/gpu_utils_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ using namespace mscclpp;
constexpr int BYTE_BITS = 8;

static DLDeviceType getDeviceType() {
#if defined(__HIP_PLATFORM_AMD__)
#if defined(MSCCLPP_USE_ROCM)
return kDLROCM;
#else
return kDLCUDA;
Expand Down
2 changes: 1 addition & 1 deletion src/executor/executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ struct Executor::Impl {
cudaStream_t stream, PacketType packetType) {
static uint32_t flag = 0;
#if defined(ENABLE_NPKIT)
#if defined(__HIP_PLATFORM_AMD__)
#if defined(MSCCLPP_USE_ROCM)
DeviceExecutionPlanKey key = context.currentDevicePlan;
int nthreadblocks = context.deviceExecutionPlans[key].size();
if (nthreadblocks > NPKIT_MAX_NUM_GPU_THREADBLOCKS) {
Expand Down
12 changes: 6 additions & 6 deletions src/gpu_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
#include "debug.h"

static inline bool isCudaTeardownError(cudaError_t err) {
#if defined(__HIP_PLATFORM_AMD__)
#if defined(MSCCLPP_USE_ROCM)
return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice;
#else // !defined(__HIP_PLATFORM_AMD__)
#else // !defined(MSCCLPP_USE_ROCM)
return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError ||
err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure;
#endif // !defined(__HIP_PLATFORM_AMD__)
#endif // !defined(MSCCLPP_USE_ROCM)
}

static inline bool isCuTeardownError(CUresult r) {
Expand Down Expand Up @@ -178,7 +178,7 @@ void* gpuCallocHost(size_t bytes, unsigned int flags) {
return ptr;
}

#if defined(__HIP_PLATFORM_AMD__)
#if defined(MSCCLPP_USE_ROCM)
void* gpuCallocUncached(size_t bytes) {
AvoidCudaGraphCaptureGuard cgcGuard;
void* ptr;
Expand All @@ -188,7 +188,7 @@ void* gpuCallocUncached(size_t bytes) {
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
return ptr;
}
#endif // defined(__HIP_PLATFORM_AMD__)
#endif // defined(MSCCLPP_USE_ROCM)

#if (CUDA_NVLS_API_AVAILABLE)
size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag) {
Expand Down Expand Up @@ -335,7 +335,7 @@ bool isNvlsSupported() {
}

bool isCuMemMapAllocated([[maybe_unused]] void* ptr) {
#if defined(__HIP_PLATFORM_AMD__)
#if defined(MSCCLPP_USE_ROCM)
return false;
#else
CUmemGenericAllocationHandle handle;
Expand Down
18 changes: 9 additions & 9 deletions src/ib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#endif // defined(USE_IBVERBS)
#include "logger.hpp"

#if !defined(__HIP_PLATFORM_AMD__)
#if !defined(MSCCLPP_USE_ROCM)

// Check if nvidia_peermem kernel module is loaded
[[maybe_unused]] static bool checkNvPeerMemLoaded() {
Expand All @@ -35,7 +35,7 @@
return false;
}

#endif // !defined(__HIP_PLATFORM_AMD__)
#endif // !defined(MSCCLPP_USE_ROCM)

namespace mscclpp {

Expand All @@ -50,11 +50,11 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
return cache[gpuId];
}
int dmaBufSupported = 0;
#if !defined(__HIP_PLATFORM_AMD__)
#if !defined(MSCCLPP_USE_ROCM)
CUdevice dev;
MSCCLPP_CUTHROW(cuDeviceGet(&dev, gpuId));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&dmaBufSupported, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
#endif // !defined(__HIP_PLATFORM_AMD__)
#endif // !defined(MSCCLPP_USE_ROCM)
bool ret = dmaBufSupported != 0;
if (!ret) {
DEBUG(NET, "GPU ", gpuId, " does not support DMABUF");
Expand All @@ -78,7 +78,7 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
int gpuId = detail::gpuIdFromAddress(buff_);
bool isGpuBuff = (gpuId != -1);
if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
#if !defined(__HIP_PLATFORM_AMD__)
#if !defined(MSCCLPP_USE_ROCM)
int fd;
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));

Expand All @@ -90,11 +90,11 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
if (mr_ == nullptr) {
THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
}
#else // defined(__HIP_PLATFORM_AMD__)
#else // defined(MSCCLPP_USE_ROCM)
THROW(NET, Error, ErrorCode::InvalidUsage, "We don't support DMABUF on HIP platforms yet");
#endif // defined(__HIP_PLATFORM_AMD__)
#endif // defined(MSCCLPP_USE_ROCM)
} else {
#if !defined(__HIP_PLATFORM_AMD__)
#if !defined(MSCCLPP_USE_ROCM)
if (isGpuBuff) {
if (isCuMemMapAllocated(buff_)) {
THROW(NET, Error, ErrorCode::InvalidUsage, "DMABUF is required but is not supported in this platform.");
Expand All @@ -104,7 +104,7 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
THROW(NET, Error, ErrorCode::SystemError, "nvidia_peermem kernel module is not loaded");
}
}
#endif // !defined(__HIP_PLATFORM_AMD__)
#endif // !defined(MSCCLPP_USE_ROCM)
mr_ = IBVerbs::ibv_reg_mr(pd, reinterpret_cast<void*>(addr), pages * pageSize,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC);
Expand Down
8 changes: 6 additions & 2 deletions src/include/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@
#define MSCCLPP_ATOMIC_HPP_

#if defined(MSCCLPP_USE_CUDA)
#ifndef MSCCLPP_DEVICE_CUDA
#define MSCCLPP_DEVICE_CUDA
#include <mscclpp/atomic_device.hpp>
#undef MSCCLPP_DEVICE_CUDA
#else // !defined(MSCCLPP_USE_CUDA)
#endif // !defined(MSCCLPP_DEVICE_CUDA)
#else // !defined(MSCCLPP_USE_CUDA)
#ifndef MSCCLPP_DEVICE_HIP
#define MSCCLPP_DEVICE_HIP
#include <mscclpp/atomic_device.hpp>
#undef MSCCLPP_DEVICE_HIP
#endif // !defined(MSCCLPP_DEVICE_HIP)
#endif // !defined(MSCCLPP_USE_CUDA)

#endif // MSCCLPP_ATOMIC_HPP_
#endif // MSCCLPP_ATOMIC_HPP_
25 changes: 13 additions & 12 deletions src/include/execution_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <mscclpp/npkit/npkit.hpp>
#endif
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/device.hpp>
#include <mscclpp/gpu_data_types.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/packet_device.hpp>
Expand Down Expand Up @@ -55,7 +56,7 @@ MSCCLPP_DEVICE_INLINE __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) {
// FP8 E4M3 addition using __hadd (single element)
template <>
MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// Optimized assembly for gfx942
float2 v;
uint32_t ival = 0;
Expand All @@ -71,7 +72,7 @@ MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
// FP8 E5M2 addition using __hadd (single element) - must come before helper functions
template <>
MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// Optimized assembly for gfx942 (bfloat8)
float2 v;
uint32_t ival = 0;
Expand All @@ -84,7 +85,7 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) {
#endif
}

#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// HIP gfx942 platform: Helper functions for vectorized FP8 operations
// We use separate function names because __fp8x2_e4m3 and __fp8x2_e5m2 are both uint16_t

Expand Down Expand Up @@ -131,7 +132,7 @@ MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e5m2(uint32_t a, uint32_t b) {
}
#endif

#if !defined(__HIP_PLATFORM_AMD__)
#if !defined(MSCCLPP_DEVICE_HIP)
// CUDA platform: Template specializations for vectorized FP8 operations

// FP8 E4M3 vectorized addition using __hadd2 for 2 elements (CUDA only)
Expand Down Expand Up @@ -202,7 +203,7 @@ MSCCLPP_DEVICE_INLINE int4 add_vectors<__bfloat16>(int4 a, int4 b) {
#if defined(__FP8_TYPES_EXIST__)
template <>
MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e4m3>(int4 a, int4 b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// HIP gfx942: Use helper functions that work with storage types
int4 ret;
ret.w = add_fp8x4_e4m3(a.w, b.w);
Expand All @@ -217,7 +218,7 @@ MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e4m3>(int4 a, int4 b) {

template <>
MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e5m2>(int4 a, int4 b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// HIP gfx942: Use helper functions that work with storage types
int4 ret;
ret.w = add_fp8x4_e5m2(a.w, b.w);
Expand Down Expand Up @@ -257,7 +258,7 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__bfloat16>(uint
#if defined(__FP8_TYPES_EXIST__)
template <>
MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e4m3>(uint2 a, uint2 b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// HIP gfx942: Use helper functions that work with storage types
uint2 ret;
ret.x = add_fp8x4_e4m3(a.x, b.x);
Expand All @@ -270,7 +271,7 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e4m3>(uint

template <>
MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e5m2>(uint2 a, uint2 b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// HIP gfx942: Use helper functions that work with storage types
uint2 ret;
ret.x = add_fp8x4_e5m2(a.x, b.x);
Expand Down Expand Up @@ -305,7 +306,7 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__bfloat16>(int a,
#if defined(__FP8_TYPES_EXIST__)
template <>
MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e4m3>(int a, int b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
return add_fp8x4_e4m3(a, b);
#else
return add_vectors_helper<__fp8x4_e4m3>(a, b);
Expand All @@ -314,7 +315,7 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e4m3>(int a,

template <>
MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e5m2>(int a, int b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
return add_fp8x4_e5m2(a, b);
#else
return add_vectors_helper<__fp8x4_e5m2>(a, b);
Expand Down Expand Up @@ -345,7 +346,7 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) {
#if defined(__FP8_TYPES_EXIST__)
template <>
MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e4m3>(uint32_t a, uint32_t b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
return add_fp8x4_e4m3(a, b);
#else
return add_vectors_helper<__fp8x4_e4m3>(a, b);
Expand All @@ -354,7 +355,7 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e4m3>(uint32_t a, uint32_t b) {

template <>
MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e5m2>(uint32_t a, uint32_t b) {
#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
return add_fp8x4_e5m2(a, b);
#else
return add_vectors_helper<__fp8x4_e5m2>(a, b);
Expand Down
2 changes: 1 addition & 1 deletion src/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ std::shared_ptr<void> getPeerMemoryHandle(cudaIpcMemHandle_t ipcHandle) {
INFO(mscclpp::P2P, "Closed CUDA IPC handle at pointer ", std::hex, p);
}
};
#if defined(__HIP_PLATFORM_AMD__)
#if defined(MSCCLPP_USE_ROCM)
// Unlike Nvidia, ROCm will not reuse the same IPC handle for the same memory region.
// We cache the opened ipc handles to avoid opening multiple times. (May exceed system limit on vm.max_map_count)
static auto peerMemoryHandleMap = std::make_shared<
Expand Down
8 changes: 4 additions & 4 deletions src/semaphore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,17 @@ struct SemaphoreStub::Impl {
Device device_;
};

std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken(std::shared_ptr<Context> context) {
std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken([[maybe_unused]] std::shared_ptr<Context> context) {
#if (CUDA_NVLS_API_AVAILABLE)
if (isNvlsSupported()) {
return context->pimpl_->getToken();
}
#endif // CUDA_NVLS_API_AVAILABLE
#if defined(__HIP_PLATFORM_AMD__)
#if defined(MSCCLPP_USE_ROCM)
return detail::gpuCallocUncachedShared<uint64_t>();
#else // !defined(__HIP_PLATFORM_AMD__)
#else // !defined(MSCCLPP_USE_ROCM)
return detail::gpuCallocShared<uint64_t>();
#endif // !defined(__HIP_PLATFORM_AMD__)
#endif // !defined(MSCCLPP_USE_ROCM)
}

SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection) {
Expand Down
Loading