diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91c35d7..d76d65a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,6 +161,7 @@ target_sources(cudecomp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc
   ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc
 )
diff --git a/docs/env_vars.rst b/docs/env_vars.rst
index 06b315a..a7b607d 100644
--- a/docs/env_vars.rst
+++ b/docs/env_vars.rst
@@ -23,3 +23,12 @@ CUDECOMP_ENABLE_CUMEM
 some MPI distributions on multi-node NVLink (MNNVL) capable systems.
 
 Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
+
+CUDECOMP_ENABLE_CUDA_GRAPHS
+---------------------------
+(since v0.5.1, requires CUDA 11.1 driver/toolkit or newer)
+
+:code:`CUDECOMP_ENABLE_CUDA_GRAPHS` controls whether cuDecomp uses CUDA Graphs APIs to capture/replay packing operations for pipelined backends. This option can improve the launch efficiency
+and communication overlap of packing kernels in large-scale cases.
+
+Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
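
As the `src/cudecomp.cc` hunk further below shows, the variable is parsed once in `getCudecompEnvVars`, i.e. during handle initialization, so it can also be set programmatically before the library is initialized. A minimal sketch (not part of this patch; `setenv` is POSIX, and error handling is omitted):

```cpp
#include <cstdlib>
#include <mpi.h>
#include "cudecomp.h"

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  // Equivalent to launching with CUDECOMP_ENABLE_CUDA_GRAPHS=1 in the environment;
  // must happen before cudecompInit, where the variable is read.
  setenv("CUDECOMP_ENABLE_CUDA_GRAPHS", "1", 1);
  cudecompHandle_t handle;
  cudecompInit(&handle, MPI_COMM_WORLD);
  // ... create grid descriptors and run pipelined transposes ...
  cudecompFinalize(handle);
  MPI_Finalize();
  return 0;
}
```
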
"internal/graph.h" namespace cudecomp { typedef std::pair, unsigned int> mnnvl_info; @@ -98,6 +99,9 @@ struct cudecompHandle { std::vector rank_to_mnnvl_info; // list of mnnvl information (clusterUuid, cliqueId) by rank std::vector rank_to_clique; // list of rank to MNNVL clique mappings std::vector rank_to_clique_rank; // list of rank to MNNVL clique rank mappings + + // CUDA graphs + bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used }; // Structure with information about row/column communicator @@ -127,6 +131,8 @@ struct cudecompGridDesc { std::vector events{nullptr}; // CUDA events used for scheduling + cudecomp::graphCache graph_cache; // CUDA graph cache + bool initialized = false; }; diff --git a/include/internal/graph.h b/include/internal/graph.h new file mode 100644 index 0000000..d6db47b --- /dev/null +++ b/include/internal/graph.h @@ -0,0 +1,65 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/include/internal/hashes.h b/include/internal/hashes.h
index a3ce284..c45b065 100644
--- a/include/internal/hashes.h
+++ b/include/internal/hashes.h
@@ -31,25 +31,74 @@
 #ifndef CUDECOMP_HASHES_H
 #define CUDECOMP_HASHES_H
 
+#include <array>
+#include <cstddef>
+#include <tuple>
 #include <utility>
 
+#include "cudecomp.h"
+
 #define MAGIC 0x9e3779b9
 
+template <typename T> inline void hash_combine(size_t& hash_value, const T& val) {
+  hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+}
+
 template <typename T, size_t N> struct std::hash<std::array<T, N>> {
-  size_t operator()(const std::array<T, N>& in) const {
+  size_t operator()(const std::array<T, N>& array) const {
+    size_t hash_value = 0;
+    for (const auto& val : array) {
+      hash_combine(hash_value, val);
+    }
+    return hash_value;
+  }
+};
+
+template <typename T, size_t N> struct std::hash<T[N]> {
+  size_t operator()(const T (&array)[N]) const {
     size_t hash_value = 0;
-    for (const auto& val : in) {
-      hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    for (size_t i = 0; i < N; ++i) {
+      hash_combine(hash_value, array[i]);
     }
     return hash_value;
   }
 };
 
 template <typename T, typename U> struct std::hash<std::pair<T, U>> {
-  size_t operator()(const std::pair<T, U>& in) const {
+  size_t operator()(const std::pair<T, U>& pair) const {
+    size_t hash_value = 0;
+    hash_combine(hash_value, pair.first);
+    hash_combine(hash_value, pair.second);
+    return hash_value;
+  }
+};
+
+template <> struct std::hash<cudecompPencilInfo_t> {
+  size_t operator()(const cudecompPencilInfo_t& info) const {
+    size_t hash_value = 0;
+    hash_combine(hash_value, info.shape);
+    hash_combine(hash_value, info.order);
+    hash_combine(hash_value, info.halo_extents);
+    hash_combine(hash_value, info.padding);
+    return hash_value;
+  }
+};
+
+template <typename Tuple, size_t N = std::tuple_size<Tuple>::value - 1> struct tuple_hasher {
+  static void apply(std::size_t& hash_value, const Tuple& tuple) {
+    tuple_hasher<Tuple, N - 1>::apply(hash_value, tuple);
+    hash_combine(hash_value, std::get<N>(tuple));
+  }
+};
+
+template <typename Tuple> struct tuple_hasher<Tuple, 0> {
+  static void apply(std::size_t& hash_value, const Tuple& tuple) { hash_combine(hash_value, std::get<0>(tuple)); }
+};
+
+template <typename... Args> struct std::hash<std::tuple<Args...>> {
+  size_t operator()(const std::tuple<Args...>& tuple) const {
     size_t hash_value = 0;
-    hash_value ^= std::hash<T>{}(in.first) + MAGIC + (hash_value << 6) + (hash_value >> 2);
-    hash_value ^= std::hash<U>{}(in.second) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    tuple_hasher<std::tuple<Args...>>::apply(hash_value, tuple);
     return hash_value;
   }
 };
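
These specializations compose: hashing a tuple folds each element through `hash_combine`, which dispatches to the element-type specializations (including the new C-array and `cudecompPencilInfo_t` ones used by the graph cache key). A small self-contained illustration, assuming the header above is on the include path:

```cpp
#include <cstdio>
#include <tuple>

#include "internal/hashes.h"

int main() {
  std::tuple<int, double, int> key{3, 1.5, 7};
  // tuple_hasher recursively folds std::get<0..2>(key) into one value via hash_combine.
  size_t h = std::hash<std::tuple<int, double, int>>{}(key);
  std::printf("%zu\n", h);
  return 0;
}
```
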
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index f476b33..f2a3795 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -36,6 +36,7 @@
 #include
 #include
+#include <tuple>
 #include
 #include
 
@@ -364,37 +365,63 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   }
 
   if (pipelined) {
-    for (int j = 1; j < splits_a.size() + 1; ++j) {
-      int src_rank, dst_rank;
-      getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
-      if (j == splits_a.size()) dst_rank = comm_rank;
+    auto dtype = getCudecompDataType<T>();
+    auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype);
 
-      size_t shift = offsets_a[dst_rank];
-      for (int i = 0; i < 3; ++i) {
-        if (pinfo_a_h.order[i] == ax_a) break;
-        shift *= shape_g_a_h[pinfo_a_h.order[i]];
+    if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
+      grid_desc->graph_cache.replay(key, stream);
+    } else {
+      cudaStream_t graph_stream = stream;
+      if (handle->cuda_graphs_enable && splits_a.size() > 1) {
+        graph_stream = grid_desc->graph_cache.startCapture(key, stream);
       }
 
-      T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
-      T* dst;
-      if (!direct_transpose) {
-        dst = o1 + send_offsets[dst_rank];
-      } else {
-        size_t shift_b = offsets_b[src_rank];
+      for (int j = 1; j < splits_a.size() + 1; ++j) {
+        int src_rank, dst_rank;
+        getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
+        if (j == splits_a.size()) dst_rank = comm_rank;
+
+        size_t shift = offsets_a[dst_rank];
         for (int i = 0; i < 3; ++i) {
-          if (pinfo_b_h.order[i] == ax_b) break;
-          shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          if (pinfo_a_h.order[i] == ax_a) break;
+          shift *= shape_g_a_h[pinfo_a_h.order[i]];
         }
-        dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
-      }
+        T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
+        T* dst;
+        if (!direct_transpose) {
+          dst = o1 + send_offsets[dst_rank];
+        } else {
+          size_t shift_b = offsets_b[src_rank];
+          for (int i = 0; i < 3; ++i) {
+            if (pinfo_b_h.order[i] == ax_b) break;
+            shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          }
 
-      for (int i = 0; i < 3; ++i) {
-        if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank];
+          dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
+        }
+
+        for (int i = 0; i < 3; ++i) {
+          if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank];
+        }
+
+        localPermute(handle, extents, order, strides_in, strides_out, src, dst, graph_stream);
+#if CUDART_VERSION >= 11010
+        cudaStreamCaptureStatus capture_status;
+        CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
+        CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
+                                            capture_status == cudaStreamCaptureStatusActive
+                                                ? cudaEventRecordExternal
+                                                : cudaEventRecordDefault));
+#else
+        CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], graph_stream));
+#endif
      }
 
-      localPermute(handle, extents, order, strides_in, strides_out, src, dst, stream);
-      CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream));
+      if (handle->cuda_graphs_enable && splits_a.size() > 1) {
+        grid_desc->graph_cache.endCapture(key);
+        grid_desc->graph_cache.replay(key, stream);
+      }
     }
   } else {
     T* src = i1 + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
@@ -413,54 +440,82 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
     // Pack
     int memcpy_count = 0;
     cudecompBatchedD2DMemcpy3DParams memcpy_params;
-    for (int j = 1; j < splits_a.size() + 1; ++j) {
-      int src_rank, dst_rank;
-      getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
-      if (j == splits_a.size()) dst_rank = comm_rank;
 
-      size_t shift = offsets_a[dst_rank];
-      for (int i = 0; i < 3; ++i) {
-        if (pinfo_a_h.order[i] == ax_a) break;
-        shift *= shape_g_a_h[pinfo_a_h.order[i]];
+    auto dtype = getCudecompDataType<T>();
+    auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype);
+
+    if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
+      grid_desc->graph_cache.replay(key, stream);
+    } else {
+      cudaStream_t graph_stream = stream;
+      if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) {
+        graph_stream = grid_desc->graph_cache.startCapture(key, stream);
       }
 
-      T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
-      T* dst;
-      if (!direct_pack) {
-        dst = o1 + send_offsets[dst_rank];
-      } else {
-        size_t shift_b = offsets_b[src_rank];
+      for (int j = 1; j < splits_a.size() + 1; ++j) {
+        int src_rank, dst_rank;
+        getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
+        if (j == splits_a.size()) dst_rank = comm_rank;
+
+        size_t shift = offsets_a[dst_rank];
         for (int i = 0; i < 3; ++i) {
-          if (pinfo_b_h.order[i] == ax_b) break;
-          shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          if (pinfo_a_h.order[i] == ax_a) break;
+          shift *= shape_g_a_h[pinfo_a_h.order[i]];
         }
-        dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
-      }
 
-      memcpy_params.src[memcpy_count] = src;
-      memcpy_params.dest[memcpy_count] = dst;
-      memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1];
-      memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0];
-      if (!direct_pack) {
-        memcpy_params.dest_strides[1][memcpy_count] =
-            (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
-        memcpy_params.dest_strides[0][memcpy_count] =
-            memcpy_params.dest_strides[1][memcpy_count] *
-            ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]);
-      } else {
-        memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1];
-        memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0];
+        T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
+        T* dst;
+        if (!direct_pack) {
+          dst = o1 + send_offsets[dst_rank];
+        } else {
+          size_t shift_b = offsets_b[src_rank];
+          for (int i = 0; i < 3; ++i) {
+            if (pinfo_b_h.order[i] == ax_b) break;
+            shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          }
+          dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
+        }
+
+        memcpy_params.src[memcpy_count] = src;
+        memcpy_params.dest[memcpy_count] = dst;
+        memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1];
+        memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0];
+        if (!direct_pack) {
+          memcpy_params.dest_strides[1][memcpy_count] =
+              (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
+          memcpy_params.dest_strides[0][memcpy_count] =
+              memcpy_params.dest_strides[1][memcpy_count] *
+              ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]);
+        } else {
+          memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1];
+          memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0];
+        }
+        memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
+        memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1];
+        memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2];
+        memcpy_count++;
+        if (memcpy_count == memcpy_limit || j == splits_a.size()) {
+          memcpy_params.ncopies = memcpy_count;
+          cudecomp_batched_d2d_memcpy_3d(memcpy_params, graph_stream);
+          memcpy_count = 0;
+        }
+#if CUDART_VERSION >= 11010
+        if (pipelined) {
+          cudaStreamCaptureStatus capture_status;
+          CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
+          CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
+                                              capture_status == cudaStreamCaptureStatusActive
+                                                  ? cudaEventRecordExternal
+                                                  : cudaEventRecordDefault));
+        }
+#else
+        if (pipelined) CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], graph_stream));
+#endif
       }
-      memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
-      memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1];
-      memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2];
-      memcpy_count++;
-      if (memcpy_count == memcpy_limit || j == splits_a.size()) {
-        memcpy_params.ncopies = memcpy_count;
-        cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream);
-        memcpy_count = 0;
+
+      if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) {
+        grid_desc->graph_cache.endCapture(key);
+        grid_desc->graph_cache.replay(key, stream);
       }
-      if (pipelined) CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream));
     }
   }
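
A note on the event handling in both hunks above: during stream capture, a plain `cudaEventRecord` becomes a graph-internal dependency, while the communication backend waits on these per-peer events outside the captured graph. Recording with the `cudaEventRecordExternal` flag (CUDA 11.1+) keeps the record visible across the graph boundary. The repeated pattern could be factored into a helper along these lines (a sketch under that assumption, not part of the patch):

```cpp
// Record an event on a stream that may or may not be in capture mode,
// using an external record node while capture is active.
static inline void recordEventCaptureSafe(cudaEvent_t event, cudaStream_t stream) {
  cudaStreamCaptureStatus status;
  CHECK_CUDA(cudaStreamIsCapturing(stream, &status));
  CHECK_CUDA(cudaEventRecordWithFlags(event, stream,
                                      status == cudaStreamCaptureStatusActive
                                          ? cudaEventRecordExternal
                                          : cudaEventRecordDefault));
}
```
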
diff --git a/include/internal/utils.h b/include/internal/utils.h
new file mode 100644
index 0000000..26dc8a9
--- /dev/null
+++ b/include/internal/utils.h
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CUDECOMP_UTILS_H
+#define CUDECOMP_UTILS_H
+
+#include <cuda/std/complex>
+
+#include "cudecomp.h"
+
+inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) {
+  if (a.size != b.size) return false;
+  for (int i = 0; i < 3; ++i) {
+    if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) || (a.order[i] != b.order[i]) ||
+        (a.halo_extents[i] != b.halo_extents[i]) || (a.padding[i] != b.padding[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline cudecompDataType_t getCudecompDataType(float) { return CUDECOMP_FLOAT; }
+inline cudecompDataType_t getCudecompDataType(double) { return CUDECOMP_DOUBLE; }
+inline cudecompDataType_t getCudecompDataType(cuda::std::complex<float>) { return CUDECOMP_FLOAT_COMPLEX; }
+inline cudecompDataType_t getCudecompDataType(cuda::std::complex<double>) { return CUDECOMP_DOUBLE_COMPLEX; }
+template <typename T> inline cudecompDataType_t getCudecompDataType() { return getCudecompDataType(T(0)); }
+
+#endif // CUDECOMP_UTILS_H
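
The overload set above gives a compile-time mapping from element type to the library enum, which the transpose hunks use when building graph cache keys. For instance:

```cpp
#include <cuda/std/complex>

#include "internal/utils.h"

void example() {
  auto a = getCudecompDataType<float>();                       // CUDECOMP_FLOAT
  auto b = getCudecompDataType<cuda::std::complex<double>>();  // CUDECOMP_DOUBLE_COMPLEX
  (void)a;
  (void)b;
}
```
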
diff --git a/src/autotune.cc b/src/autotune.cc
index b4fdf69..6b8a74d 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -389,6 +389,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       }
     }
 
+    // Clear CUDA graph cache between backend/process decomposition pairs
+    grid_desc->graph_cache.clear();
+
     auto times = processTimings(handle, trial_times);
     auto times_w = processTimings(handle, trial_times_w);
     auto xy_times = processTimings(handle, trial_xy_times);
@@ -449,6 +452,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
     }
+  }
 
   // Free test data and workspace
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index c7ded3a..272a158 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -314,6 +314,19 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
       handle->cuda_cumem_enable = false;
     }
   }
 #endif
   }
+
+  // Check CUDECOMP_ENABLE_CUDA_GRAPHS (CUDA Graphs usage in pipelined backends)
+  const char* graphs_enable_str = std::getenv("CUDECOMP_ENABLE_CUDA_GRAPHS");
+  if (graphs_enable_str) { handle->cuda_graphs_enable = std::strtol(graphs_enable_str, nullptr, 10) == 1; }
+  if (handle->cuda_graphs_enable) {
+#if CUDART_VERSION < 11010
+    if (handle->rank == 0) {
+      printf("CUDECOMP:WARN: CUDECOMP_ENABLE_CUDA_GRAPHS is set but CUDA version used for compilation does not "
+             "support cudaEventRecordWithFlags which is required. Disabling this feature.\n");
+    }
+    handle->cuda_graphs_enable = false;
+#endif
+  }
 }
diff --git a/src/graph.cc b/src/graph.cc
new file mode 100644
index 0000000..8e76d34
--- /dev/null
+++ b/src/graph.cc
@@ -0,0 +1,79 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <tuple>
+#include <unordered_map>
+
+#include <cuda_runtime.h>
+
+#include "cudecomp.h"
+#include "internal/checks.h"
+#include "internal/graph.h"
+#include "internal/hashes.h"
+
+namespace cudecomp {
+
+graphCache::graphCache() { CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking)); }
+
+graphCache::~graphCache() {
+  CHECK_CUDA(cudaStreamDestroy(graph_stream_));
+  this->clear();
+}
+
+void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const {
+  CHECK_CUDA(cudaGraphLaunch(graph_cache_.at(key), stream));
+}
+
+cudaStream_t graphCache::startCapture(const graphCache::key_type& key, cudaStream_t stream) const {
+  CHECK_CUDA(cudaStreamBeginCapture(graph_stream_, cudaStreamCaptureModeGlobal));
+  return graph_stream_;
+}
+
+void graphCache::endCapture(const graphCache::key_type& key) {
+  cudaGraph_t graph;
+  cudaGraphExec_t graph_exec;
+  CHECK_CUDA(cudaStreamEndCapture(graph_stream_, &graph));
+  CHECK_CUDA(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+  CHECK_CUDA(cudaGraphDestroy(graph));
+
+  graph_cache_[key] = graph_exec;
+}
+
+bool graphCache::cached(const graphCache::key_type& key) const { return graph_cache_.count(key) > 0; }
+
+void graphCache::clear() {
+  for (auto& entry : graph_cache_) {
+    CHECK_CUDA(cudaGraphExecDestroy(entry.second));
+  }
+
+  graph_cache_.clear();
+}
+
+} // namespace cudecomp
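
For reference, the capture/instantiate/replay lifecycle implemented by graphCache corresponds to this minimal standalone sequence, mirroring the 5-argument `cudaGraphInstantiate` form used in graph.cc (CUDA 11.x); names are illustrative and error checks are omitted:

```cuda
#include <cuda_runtime.h>

__global__ void incrementKernel(float* p) { p[threadIdx.x] += 1.0f; }

void captureAndReplay(float* d_buf, cudaStream_t capture_stream, cudaStream_t exec_stream) {
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeGlobal);
  incrementKernel<<<1, 32, 0, capture_stream>>>(d_buf);           // recorded, not executed
  cudaStreamEndCapture(capture_stream, &graph);
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);  // build executable graph
  cudaGraphDestroy(graph);              // executable graph is independent of the template
  cudaGraphLaunch(graph_exec, exec_stream);  // replay: one launch for the captured region
  cudaStreamSynchronize(exec_stream);
  cudaGraphExecDestroy(graph_exec);
}
```
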