From 3a9aaed6119566006c89867f66403e965331b031 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 10 Apr 2025 10:03:20 -0700 Subject: [PATCH 1/8] Improve packing launch efficiency for pipelined backends using CUDA graphs. --- CMakeLists.txt | 1 + include/internal/common.h | 10 ++ include/internal/graph.h | 64 +++++++++++++ include/internal/hashes.h | 66 ++++++++++++-- include/internal/transpose.h | 171 ++++++++++++++++++++++------------- include/internal/utils.h | 45 +++++++++ src/cudecomp.cc | 13 +++ src/graph.cc | 73 +++++++++++++++ 8 files changed, 376 insertions(+), 67 deletions(-) create mode 100644 include/internal/graph.h create mode 100644 include/internal/utils.h create mode 100644 src/graph.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 91c35d7..d76d65a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,6 +161,7 @@ target_sources(cudecomp ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc + ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc ) diff --git a/include/internal/common.h b/include/internal/common.h index 7d2ad51..3f06eb5 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -44,6 +44,7 @@ #include "cudecomp.h" #include "internal/checks.h" +#include "internal/graph.h" namespace cudecomp { typedef std::pair, unsigned int> mnnvl_info; @@ -98,6 +99,13 @@ struct cudecompHandle { std::vector rank_to_mnnvl_info; // list of mnnvl information (clusterUuid, cliqueId) by rank std::vector rank_to_clique; // list of rank to MNNVL clique mappings std::vector rank_to_clique_rank; // list of rank to MNNVL clique rank mappings + + // CUDA graphs +#if CUDART_VERSION > 11010 + bool cuda_graphs_enable = true; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends +#else + bool cuda_graphs_enable = false; +#endif }; // Structure with information about row/column communicator @@ -127,6 +135,8 @@ struct cudecompGridDesc { std::vector events{nullptr}; // CUDA events used for scheduling + cudecomp::graphCache graph_cache; // CUDA graph cache + bool initialized = false; }; diff --git a/include/internal/graph.h b/include/internal/graph.h new file mode 100644 index 0000000..a130cdb --- /dev/null +++ b/include/internal/graph.h @@ -0,0 +1,64 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CUDECOMP_GRAPH_H +#define CUDECOMP_GRAPH_H + +#include +#include + +#include + +#include "cudecomp.h" +#include "internal/checks.h" +#include "internal/hashes.h" +#include "internal/utils.h" + +namespace cudecomp { + +class graphCache { + using key_type = std::tuple; +public: + graphCache(); + ~graphCache(); + void replay(const key_type& key, cudaStream_t stream) const; + cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const; + void endCapture(const key_type& key); + bool cached(const key_type& key) const; + +private: + std::unordered_map graph_cache_; + cudaStream_t graph_stream_; +}; + +} // namespace cudecomp + +#endif // CUDECOMP_GRAPH_H + diff --git a/include/internal/hashes.h b/include/internal/hashes.h index a3ce284..5eda5ff 100644 --- a/include/internal/hashes.h +++ b/include/internal/hashes.h @@ -31,25 +31,79 @@ #ifndef CUDECOMP_HASHES_H #define CUDECOMP_HASHES_H +#include +#include +#include #include +#include "cudecomp.h" + #define MAGIC 0x9e3779b9 +template +inline void hash_combine(size_t& hash_value, const T& val) { + hash_value ^= std::hash{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2); +} + template struct std::hash> { - size_t operator()(const std::array& in) const { + size_t operator()(const std::array& array) const { + size_t hash_value = 0; + for (const auto& val : array) { + hash_combine(hash_value, val); + } + return hash_value; + } +}; + +template struct std::hash { + size_t operator()(const T(&array)[N]) const { size_t hash_value = 0; - for (const auto& val : in) { - hash_value ^= std::hash{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2); + for (size_t i = 0; i < N; ++i) { + hash_combine(hash_value, array[i]); } return hash_value; } }; template struct std::hash> { - size_t operator()(const std::pair& in) const { + size_t operator()(const std::pair& pair) const { + size_t hash_value = 0; + hash_combine(hash_value, pair.first); + hash_combine(hash_value, pair.second); + return hash_value; + } +}; + +template<> struct std::hash { + size_t operator()(const cudecompPencilInfo_t& info) const { + size_t hash_value = 0; + hash_combine(hash_value, info.shape); + hash_combine(hash_value, info.order); + hash_combine(hash_value, info.halo_extents); + hash_combine(hash_value, info.padding); + return hash_value; + } +}; + +template ::value - 1> +struct tuple_hasher { + static void apply(std::size_t& hash_value, const Tuple& tuple) { + tuple_hasher::apply(hash_value, tuple); + hash_combine(hash_value, std::get(tuple)); + } +}; + +template +struct tuple_hasher { + static void apply(std::size_t& hash_value, const Tuple& tuple) { + hash_combine(hash_value, std::get<0>(tuple)); + } +}; + +template struct std::hash> { + size_t operator()(const std::tuple& tuple) const { size_t hash_value = 0; - hash_value ^= std::hash{}(in.first) + MAGIC + (hash_value << 6) + (hash_value >> 2); - hash_value ^= std::hash{}(in.second) + MAGIC + (hash_value << 6) + (hash_value >> 2); + tuple_hasher>::apply(hash_value, tuple); return hash_value; } }; diff --git a/include/internal/transpose.h b/include/internal/transpose.h index f476b33..8852bf4 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -364,37 +364,61 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } if (pipelined) { - for (int j = 1; j < splits_a.size() + 1; ++j) { - int src_rank, dst_rank; - getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank); - if (j == splits_a.size()) dst_rank = comm_rank; + auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h); - size_t shift = offsets_a[dst_rank]; - for (int i = 0; i < 3; ++i) { - if (pinfo_a_h.order[i] == ax_a) break; - shift *= shape_g_a_h[pinfo_a_h.order[i]]; + if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) { + grid_desc->graph_cache.replay(key, stream); + } else { + cudaStream_t graph_stream = stream; + if (handle->cuda_graphs_enable && splits_a.size() > 1) { + graph_stream = grid_desc->graph_cache.startCapture(key, stream); } - T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents); - T* dst; - if (!direct_transpose) { - dst = o1 + send_offsets[dst_rank]; - } else { - size_t shift_b = offsets_b[src_rank]; + for (int j = 1; j < splits_a.size() + 1; ++j) { + int src_rank, dst_rank; + getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank); + if (j == splits_a.size()) dst_rank = comm_rank; + + size_t shift = offsets_a[dst_rank]; for (int i = 0; i < 3; ++i) { - if (pinfo_b_h.order[i] == ax_b) break; - shift *= shape_g_b_h[pinfo_b_h.order[i]]; + if (pinfo_a_h.order[i] == ax_a) break; + shift *= shape_g_a_h[pinfo_a_h.order[i]]; } - dst = o1 + shift + getPencilPtrOffset(pinfo_b_h, output_halo_extents); + T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents); + T* dst; + if (!direct_transpose) { + dst = o1 + send_offsets[dst_rank]; + } else { + size_t shift_b = offsets_b[src_rank]; + for (int i = 0; i < 3; ++i) { + if (pinfo_b_h.order[i] == ax_b) break; + shift *= shape_g_b_h[pinfo_b_h.order[i]]; + } + + dst = o1 + shift + getPencilPtrOffset(pinfo_b_h, output_halo_extents); + } + + for (int i = 0; i < 3; ++i) { + if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank]; + } + + localPermute(handle, extents, order, strides_in, strides_out, src, dst, graph_stream); +#if CUDART_VERSION >= 11010 + cudaStreamCaptureStatus capture_status; + CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status)); + CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream, + capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault)); +#else + CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream)); +#endif } - for (int i = 0; i < 3; ++i) { - if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank]; + if (handle->cuda_graphs_enable && splits_a.size() > 1) { + grid_desc->graph_cache.endCapture(key); + grid_desc->graph_cache.replay(key, stream); } - localPermute(handle, extents, order, strides_in, strides_out, src, dst, stream); - CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream)); } } else { T* src = i1 + getPencilPtrOffset(pinfo_a_h, input_halo_extents); @@ -413,54 +437,79 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c // Pack int memcpy_count = 0; cudecompBatchedD2DMemcpy3DParams memcpy_params; - for (int j = 1; j < splits_a.size() + 1; ++j) { - int src_rank, dst_rank; - getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank); - if (j == splits_a.size()) dst_rank = comm_rank; - size_t shift = offsets_a[dst_rank]; - for (int i = 0; i < 3; ++i) { - if (pinfo_a_h.order[i] == ax_a) break; - shift *= shape_g_a_h[pinfo_a_h.order[i]]; + auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h); + + if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) { + grid_desc->graph_cache.replay(key, stream); + } else { + cudaStream_t graph_stream = stream; + if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) { + graph_stream = grid_desc->graph_cache.startCapture(key, stream); } - T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents); - T* dst; - if (!direct_pack) { - dst = o1 + send_offsets[dst_rank]; - } else { - size_t shift_b = offsets_b[src_rank]; + for (int j = 1; j < splits_a.size() + 1; ++j) { + int src_rank, dst_rank; + getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank); + if (j == splits_a.size()) dst_rank = comm_rank; + + size_t shift = offsets_a[dst_rank]; for (int i = 0; i < 3; ++i) { - if (pinfo_b_h.order[i] == ax_b) break; - shift_b *= shape_g_b_h[pinfo_b_h.order[i]]; + if (pinfo_a_h.order[i] == ax_a) break; + shift *= shape_g_a_h[pinfo_a_h.order[i]]; } - dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents); - } - memcpy_params.src[memcpy_count] = src; - memcpy_params.dest[memcpy_count] = dst; - memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1]; - memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0]; - if (!direct_pack) { - memcpy_params.dest_strides[1][memcpy_count] = - (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0]; - memcpy_params.dest_strides[0][memcpy_count] = - memcpy_params.dest_strides[1][memcpy_count] * - ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]); - } else { - memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1]; - memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0]; + T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents); + T* dst; + if (!direct_pack) { + dst = o1 + send_offsets[dst_rank]; + } else { + size_t shift_b = offsets_b[src_rank]; + for (int i = 0; i < 3; ++i) { + if (pinfo_b_h.order[i] == ax_b) break; + shift_b *= shape_g_b_h[pinfo_b_h.order[i]]; + } + dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents); + } + + memcpy_params.src[memcpy_count] = src; + memcpy_params.dest[memcpy_count] = dst; + memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1]; + memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0]; + if (!direct_pack) { + memcpy_params.dest_strides[1][memcpy_count] = + (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0]; + memcpy_params.dest_strides[0][memcpy_count] = + memcpy_params.dest_strides[1][memcpy_count] * + ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]); + } else { + memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1]; + memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0]; + } + memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0]; + memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]; + memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2]; + memcpy_count++; + if (memcpy_count == memcpy_limit || j == splits_a.size()) { + memcpy_params.ncopies = memcpy_count; + cudecomp_batched_d2d_memcpy_3d(memcpy_params, graph_stream); + memcpy_count = 0; + } +#if CUDART_VERSION >= 11010 + if (pipelined) { + cudaStreamCaptureStatus capture_status; + CHECK_CUDA(cudaStreamIsCapturing(stream, &capture_status)); + CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream, + capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault)); + } +#else + if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], stream)); +#endif } - memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0]; - memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]; - memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2]; - memcpy_count++; - if (memcpy_count == memcpy_limit || j == splits_a.size()) { - memcpy_params.ncopies = memcpy_count; - cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream); - memcpy_count = 0; + if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) { + grid_desc->graph_cache.endCapture(key); + grid_desc->graph_cache.replay(key, stream); } - if (pipelined) CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream)); } } diff --git a/include/internal/utils.h b/include/internal/utils.h new file mode 100644 index 0000000..f30521c --- /dev/null +++ b/include/internal/utils.h @@ -0,0 +1,45 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CUDECOMP_UTILS_H +#define CUDECOMP_UTILS_H +inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) { + if (a.size != b.size) return false; + for (int i = 0; i < 3; ++i) { + if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) || + (a.order[i] != b.order[i]) || (a.halo_extents[i] != b.halo_extents[i]) || + (a.padding[i] != b.padding[i])) { + return false; + } + } + return true; +} + +#endif // CUDECOMP_UTILS_H diff --git a/src/cudecomp.cc b/src/cudecomp.cc index c7ded3a..272a158 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -314,6 +314,19 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) { handle->cuda_cumem_enable = false; } } +#endif + } + + // Check CUDECOMP_ENABLE_CUDA_GRAPHS (CUDA Graphs usage in pipelined backends) + const char* graphs_enable_str = std::getenv("CUDECOMP_ENABLE_CUDA_GRAPHS"); + if (graphs_enable_str) { handle->cuda_graphs_enable = std::strtol(graphs_enable_str, nullptr, 10) == 1; } + if (handle->cuda_graphs_enable) { +#if CUDART_VERSION < 11010 + if (handle->rank == 0) { + printf("CUDECOMP:WARN: CUDECOMP_ENABLE_CUDA_GRAPHS is set but CUDA version used for compilation does not " + "support cudaEventRecordWithFlags which is required. Disabling this feature.\n"); + } + handle->cuda_graphs_enable = false; #endif } } diff --git a/src/graph.cc b/src/graph.cc new file mode 100644 index 0000000..d9980a1 --- /dev/null +++ b/src/graph.cc @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include + +#include "cudecomp.h" +#include "internal/checks.h" +#include "internal/graph.h" +#include "internal/hashes.h" + +namespace cudecomp { + +graphCache::graphCache() { + CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking)); +} + +graphCache::~graphCache() { + CHECK_CUDA(cudaStreamDestroy(graph_stream_)); +} + +void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const { + CHECK_CUDA(cudaGraphLaunch(graph_cache_.at(key), stream)); +} + +cudaStream_t graphCache::startCapture(const graphCache::key_type& key, cudaStream_t stream) const { + CHECK_CUDA(cudaStreamBeginCapture(graph_stream_, cudaStreamCaptureModeGlobal)); + return graph_stream_; +} + +void graphCache::endCapture(const graphCache::key_type& key){ + cudaGraph_t graph; + cudaGraphExec_t graph_exec; + CHECK_CUDA(cudaStreamEndCapture(graph_stream_, &graph)); + CHECK_CUDA(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0)); + CHECK_CUDA(cudaGraphDestroy(graph)); + + graph_cache_[key] = graph_exec; +} + +bool graphCache::cached(const graphCache::key_type& key) const { return graph_cache_.count(key) > 0; } + +} // namespace cudecomp + From 5748e91a07e7b5dd52154fb346daa01a72bb0571 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 10 Apr 2025 10:27:41 -0700 Subject: [PATCH 2/8] Fixes. --- include/internal/common.h | 2 +- include/internal/transpose.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/internal/common.h b/include/internal/common.h index 3f06eb5..c0396f9 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -101,7 +101,7 @@ struct cudecompHandle { std::vector rank_to_clique_rank; // list of rank to MNNVL clique rank mappings // CUDA graphs -#if CUDART_VERSION > 11010 +#if CUDART_VERSION >= 11010 bool cuda_graphs_enable = true; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends #else bool cuda_graphs_enable = false; diff --git a/include/internal/transpose.h b/include/internal/transpose.h index 8852bf4..efdfea7 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -498,12 +498,12 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c #if CUDART_VERSION >= 11010 if (pipelined) { cudaStreamCaptureStatus capture_status; - CHECK_CUDA(cudaStreamIsCapturing(stream, &capture_status)); + CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status)); CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream, capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault)); } #else - if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], stream)); + if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream)); #endif } if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) { From 6e32a23de624927a86be3698561062cf9be66d78 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 10 Apr 2025 13:39:03 -0700 Subject: [PATCH 3/8] Destroy stored cuda graph exec entries in graphCache destructor. --- src/graph.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/graph.cc b/src/graph.cc index d9980a1..be406da 100644 --- a/src/graph.cc +++ b/src/graph.cc @@ -46,6 +46,10 @@ graphCache::graphCache() { graphCache::~graphCache() { CHECK_CUDA(cudaStreamDestroy(graph_stream_)); + + for (auto& entry : graph_cache_) { + CHECK_CUDA(cudaGraphExecDestroy(entry.second)); + } } void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const { From fa38b9bf458db9f114af826a294f0b5dce8f7596 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 11 Apr 2025 10:52:46 -0700 Subject: [PATCH 4/8] Clear CUDA graph cache during transpose autotuning. Disable CUDA graphs by default. --- include/internal/common.h | 6 +----- include/internal/graph.h | 1 + src/autotune.cc | 3 +++ src/graph.cc | 13 +++++++++---- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/internal/common.h b/include/internal/common.h index c0396f9..9b46246 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -101,11 +101,7 @@ struct cudecompHandle { std::vector rank_to_clique_rank; // list of rank to MNNVL clique rank mappings // CUDA graphs -#if CUDART_VERSION >= 11010 - bool cuda_graphs_enable = true; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends -#else - bool cuda_graphs_enable = false; -#endif + bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends }; // Structure with information about row/column communicator diff --git a/include/internal/graph.h b/include/internal/graph.h index a130cdb..0fe6603 100644 --- a/include/internal/graph.h +++ b/include/internal/graph.h @@ -52,6 +52,7 @@ class graphCache { cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const; void endCapture(const key_type& key); bool cached(const key_type& key) const; + void clear(); private: std::unordered_map graph_cache_; diff --git a/src/autotune.cc b/src/autotune.cc index b4fdf69..2e27548 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -449,6 +449,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID; #endif } + + // Clear CUDA graph cache between tested process decompositions + grid_desc->graph_cache.clear(); } // Free test data and workspace diff --git a/src/graph.cc b/src/graph.cc index be406da..966f432 100644 --- a/src/graph.cc +++ b/src/graph.cc @@ -46,10 +46,7 @@ graphCache::graphCache() { graphCache::~graphCache() { CHECK_CUDA(cudaStreamDestroy(graph_stream_)); - - for (auto& entry : graph_cache_) { - CHECK_CUDA(cudaGraphExecDestroy(entry.second)); - } + this->clear(); } void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const { @@ -73,5 +70,13 @@ void graphCache::endCapture(const graphCache::key_type& key){ bool graphCache::cached(const graphCache::key_type& key) const { return graph_cache_.count(key) > 0; } +void graphCache::clear() { + for (auto& entry : graph_cache_) { + CHECK_CUDA(cudaGraphExecDestroy(entry.second)); + } + + graph_cache_.clear(); +} + } // namespace cudecomp From 89b3e75c6389dc80594855f2462a082ed7832cf0 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 11 Apr 2025 11:02:27 -0700 Subject: [PATCH 5/8] Update docs. --- docs/env_vars.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index 06b315a..a7b607d 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -23,3 +23,12 @@ CUDECOMP_ENABLE_CUMEM some MPI distributions on multi-node NVLink (MNNVL) capable systems. Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. + +CUDECOMP_ENABLE_CUDA_GRAPHS +--------------------------- +(since v0.5.1, requires CUDA 11.1 driver/toolkit or newer) + +:code:`CUDECOMP_ENABLE_CUDA_GRAPHS` controls whether cuDecomp uses CUDA Graphs APIs to capture/replay packing operations for pipelined backends. This option can improve the launch efficiency +and communication overlap of packing kernels in large scale cases. + +Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. From f3e9fce301782bf6af222bb1a8a651211618a34e Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 11 Apr 2025 11:37:50 -0700 Subject: [PATCH 6/8] Formatting. --- include/cudecomp.h | 16 ++++++++-------- include/internal/common.h | 2 +- include/internal/graph.h | 2 +- include/internal/hashes.h | 17 ++++++----------- include/internal/transpose.h | 9 ++++++--- include/internal/utils.h | 5 ++--- src/graph.cc | 7 ++----- 7 files changed, 26 insertions(+), 32 deletions(-) diff --git a/include/cudecomp.h b/include/cudecomp.h index 631155c..41c8a6e 100644 --- a/include/cudecomp.h +++ b/include/cudecomp.h @@ -175,19 +175,19 @@ typedef struct { ///< in the following order: X-to-Y, Y-to-Z, Z-to-Y, Y-to-X ///< (default: [1.0, 1.0, 1.0, 1.0]) - int32_t transpose_input_halo_extents[4][3]; ///< input_halo_extents argument to use during autotuning by transpose - ///< operation; first index specifies operation in the following order: - ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent - ///< argument (default: all zeros, no halos) + int32_t transpose_input_halo_extents[4][3]; ///< input_halo_extents argument to use during autotuning by transpose + ///< operation; first index specifies operation in the following order: + ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent + ///< argument (default: all zeros, no halos) int32_t transpose_output_halo_extents[4][3]; ///< output_halo_extents argument to use during autotuning by transpose ///< operation; first index specifies operation in the following order: ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent ///< argument (default: all zeros, no halos) - int32_t transpose_input_padding[4][3]; ///< input_padding argument to use during autotuning by transpose operation; - ///< first index specifies operation in the following order: X-to-Y, Y-to-Z, - ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default: - ///< all zeros, no padding) + int32_t transpose_input_padding[4][3]; ///< input_padding argument to use during autotuning by transpose operation; + ///< first index specifies operation in the following order: X-to-Y, Y-to-Z, + ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default: + ///< all zeros, no padding) int32_t transpose_output_padding[4][3]; ///< output_padding argument to use during autotuning by transpose operation; ///< first index specifies operation in the following order: X-to-Y, Y-to-Z, ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default: diff --git a/include/internal/common.h b/include/internal/common.h index 9b46246..987f471 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -101,7 +101,7 @@ struct cudecompHandle { std::vector rank_to_clique_rank; // list of rank to MNNVL clique rank mappings // CUDA graphs - bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends + bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used }; // Structure with information about row/column communicator diff --git a/include/internal/graph.h b/include/internal/graph.h index 0fe6603..e4da24b 100644 --- a/include/internal/graph.h +++ b/include/internal/graph.h @@ -45,6 +45,7 @@ namespace cudecomp { class graphCache { using key_type = std::tuple; + public: graphCache(); ~graphCache(); @@ -62,4 +63,3 @@ class graphCache { } // namespace cudecomp #endif // CUDECOMP_GRAPH_H - diff --git a/include/internal/hashes.h b/include/internal/hashes.h index 5eda5ff..c45b065 100644 --- a/include/internal/hashes.h +++ b/include/internal/hashes.h @@ -40,8 +40,7 @@ #define MAGIC 0x9e3779b9 -template -inline void hash_combine(size_t& hash_value, const T& val) { +template inline void hash_combine(size_t& hash_value, const T& val) { hash_value ^= std::hash{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2); } @@ -56,7 +55,7 @@ template struct std::hash> { }; template struct std::hash { - size_t operator()(const T(&array)[N]) const { + size_t operator()(const T (&array)[N]) const { size_t hash_value = 0; for (size_t i = 0; i < N; ++i) { hash_combine(hash_value, array[i]); @@ -74,7 +73,7 @@ template struct std::hash> { } }; -template<> struct std::hash { +template <> struct std::hash { size_t operator()(const cudecompPencilInfo_t& info) const { size_t hash_value = 0; hash_combine(hash_value, info.shape); @@ -85,19 +84,15 @@ template<> struct std::hash { } }; -template ::value - 1> -struct tuple_hasher { +template ::value - 1> struct tuple_hasher { static void apply(std::size_t& hash_value, const Tuple& tuple) { tuple_hasher::apply(hash_value, tuple); hash_combine(hash_value, std::get(tuple)); } }; -template -struct tuple_hasher { - static void apply(std::size_t& hash_value, const Tuple& tuple) { - hash_combine(hash_value, std::get<0>(tuple)); - } +template struct tuple_hasher { + static void apply(std::size_t& hash_value, const Tuple& tuple) { hash_combine(hash_value, std::get<0>(tuple)); } }; template struct std::hash> { diff --git a/include/internal/transpose.h b/include/internal/transpose.h index efdfea7..25c5d88 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -408,7 +408,9 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c cudaStreamCaptureStatus capture_status; CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status)); CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream, - capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault)); + capture_status == cudaStreamCaptureStatusActive + ? cudaEventRecordExternal + : cudaEventRecordDefault)); #else CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream)); #endif @@ -418,7 +420,6 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c grid_desc->graph_cache.endCapture(key); grid_desc->graph_cache.replay(key, stream); } - } } else { T* src = i1 + getPencilPtrOffset(pinfo_a_h, input_halo_extents); @@ -500,7 +501,9 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c cudaStreamCaptureStatus capture_status; CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status)); CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream, - capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault)); + capture_status == cudaStreamCaptureStatusActive + ? cudaEventRecordExternal + : cudaEventRecordDefault)); } #else if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream)); diff --git a/include/internal/utils.h b/include/internal/utils.h index f30521c..c6bd9aa 100644 --- a/include/internal/utils.h +++ b/include/internal/utils.h @@ -33,9 +33,8 @@ inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) { if (a.size != b.size) return false; for (int i = 0; i < 3; ++i) { - if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) || - (a.order[i] != b.order[i]) || (a.halo_extents[i] != b.halo_extents[i]) || - (a.padding[i] != b.padding[i])) { + if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) || (a.order[i] != b.order[i]) || + (a.halo_extents[i] != b.halo_extents[i]) || (a.padding[i] != b.padding[i])) { return false; } } diff --git a/src/graph.cc b/src/graph.cc index 966f432..8e76d34 100644 --- a/src/graph.cc +++ b/src/graph.cc @@ -40,9 +40,7 @@ namespace cudecomp { -graphCache::graphCache() { - CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking)); -} +graphCache::graphCache() { CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking)); } graphCache::~graphCache() { CHECK_CUDA(cudaStreamDestroy(graph_stream_)); @@ -58,7 +56,7 @@ cudaStream_t graphCache::startCapture(const graphCache::key_type& key, cudaStrea return graph_stream_; } -void graphCache::endCapture(const graphCache::key_type& key){ +void graphCache::endCapture(const graphCache::key_type& key) { cudaGraph_t graph; cudaGraphExec_t graph_exec; CHECK_CUDA(cudaStreamEndCapture(graph_stream_, &graph)); @@ -79,4 +77,3 @@ void graphCache::clear() { } } // namespace cudecomp - From 6792fabc92f8f1f9bf6f4930e9b181adf43cd347 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 14 Apr 2025 11:46:25 -0700 Subject: [PATCH 7/8] Fix issue with graph cache clearing during autotuning. Need to clear between each pdim/backend combination. --- src/autotune.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/autotune.cc b/src/autotune.cc index 2e27548..6b8a74d 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -389,6 +389,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d } } + // Clear CUDA graph cache between backend/process decomposition pairs + grid_desc->graph_cache.clear(); + auto times = processTimings(handle, trial_times); auto times_w = processTimings(handle, trial_times_w); auto xy_times = processTimings(handle, trial_xy_times); @@ -450,8 +453,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d #endif } - // Clear CUDA graph cache between tested process decompositions - grid_desc->graph_cache.clear(); } // Free test data and workspace From 8d7cca07dea5f5202c24f432feb5664513207ecb Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 16 Apr 2025 09:40:10 -0700 Subject: [PATCH 8/8] Add datatype to graph cache key. --- include/internal/graph.h | 2 +- include/internal/transpose.h | 7 +++++-- include/internal/utils.h | 11 +++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/internal/graph.h b/include/internal/graph.h index e4da24b..d6db47b 100644 --- a/include/internal/graph.h +++ b/include/internal/graph.h @@ -44,7 +44,7 @@ namespace cudecomp { class graphCache { - using key_type = std::tuple; + using key_type = std::tuple; public: graphCache(); diff --git a/include/internal/transpose.h b/include/internal/transpose.h index 25c5d88..f2a3795 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -36,6 +36,7 @@ #include #include +#include #include #include @@ -364,7 +365,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } if (pipelined) { - auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h); + auto dtype = getCudecompDataType(); + auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype); if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) { grid_desc->graph_cache.replay(key, stream); @@ -439,7 +441,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c int memcpy_count = 0; cudecompBatchedD2DMemcpy3DParams memcpy_params; - auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h); + auto dtype = getCudecompDataType(); + auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype); if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) { grid_desc->graph_cache.replay(key, stream); diff --git a/include/internal/utils.h b/include/internal/utils.h index c6bd9aa..26dc8a9 100644 --- a/include/internal/utils.h +++ b/include/internal/utils.h @@ -30,6 +30,11 @@ #ifndef CUDECOMP_UTILS_H #define CUDECOMP_UTILS_H + +#include + +#include "cudecomp.h" + inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) { if (a.size != b.size) return false; for (int i = 0; i < 3; ++i) { @@ -41,4 +46,10 @@ inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t return true; } +inline cudecompDataType_t getCudecompDataType(float) { return CUDECOMP_FLOAT; } +inline cudecompDataType_t getCudecompDataType(double) { return CUDECOMP_DOUBLE; } +inline cudecompDataType_t getCudecompDataType(cuda::std::complex) { return CUDECOMP_FLOAT_COMPLEX; } +inline cudecompDataType_t getCudecompDataType(cuda::std::complex) { return CUDECOMP_DOUBLE_COMPLEX; } +template inline cudecompDataType_t getCudecompDataType() { return getCudecompDataType(T(0)); } + #endif // CUDECOMP_UTILS_H