NVIDIA · romerojosh · Apr 16, 2025 · Apr 10, 2025 · Apr 10, 2025 · Apr 10, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -161,6 +161,7 @@ target_sources(cudecomp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc
   ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc
 )
 

diff --git a/docs/env_vars.rst b/docs/env_vars.rst
@@ -23,3 +23,12 @@ CUDECOMP_ENABLE_CUMEM
 some MPI distributions on multi-node NVLink (MNNVL) capable systems.
 
 Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
+
+CUDECOMP_ENABLE_CUDA_GRAPHS
+---------------------------
+(since v0.5.1, requires CUDA 11.1 driver/toolkit or newer)
+
+:code:`CUDECOMP_ENABLE_CUDA_GRAPHS` controls whether cuDecomp uses the CUDA Graphs APIs to capture and replay packing operations for pipelined backends. This option can improve the launch efficiency
+and communication overlap of packing kernels in large-scale cases.
+
+Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
diff --git a/include/cudecomp.h b/include/cudecomp.h
@@ -175,19 +175,19 @@ typedef struct {
                                   ///< in the following order: X-to-Y, Y-to-Z, Z-to-Y, Y-to-X
                                   ///< (default: [1.0, 1.0, 1.0, 1.0])
 
-  int32_t transpose_input_halo_extents[4][3]; ///< input_halo_extents argument to use during autotuning by transpose
-                                              ///< operation; first index specifies operation in the following order:
-                                              ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent
-                                              ///< argument (default: all zeros, no halos)
+  int32_t transpose_input_halo_extents[4][3];  ///< input_halo_extents argument to use during autotuning by transpose
+                                               ///< operation; first index specifies operation in the following order:
+                                               ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent
+                                               ///< argument (default: all zeros, no halos)
   int32_t transpose_output_halo_extents[4][3]; ///< output_halo_extents argument to use during autotuning by transpose
                                                ///< operation; first index specifies operation in the following order:
                                                ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent
                                                ///< argument (default: all zeros, no halos)
 
-  int32_t transpose_input_padding[4][3]; ///< input_padding argument to use during autotuning by transpose operation;
-                                         ///< first index specifies operation in the following order: X-to-Y, Y-to-Z,
-                                         ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default:
-                                         ///< all zeros, no padding)
+  int32_t transpose_input_padding[4][3];  ///< input_padding argument to use during autotuning by transpose operation;
+                                          ///< first index specifies operation in the following order: X-to-Y, Y-to-Z,
+                                          ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default:
+                                          ///< all zeros, no padding)
   int32_t transpose_output_padding[4][3]; ///< output_padding argument to use during autotuning by transpose operation;
                                           ///< first index specifies operation in the following order: X-to-Y, Y-to-Z,
                                           ///< Z-to-Y, Y-to-X, second index specifies output_padding argument (default:

diff --git a/include/internal/common.h b/include/internal/common.h
@@ -44,6 +44,7 @@
 
 #include "cudecomp.h"
 #include "internal/checks.h"
+#include "internal/graph.h"
 
 namespace cudecomp {
 typedef std::pair<std::array<unsigned char, NVML_GPU_FABRIC_UUID_LEN>, unsigned int> mnnvl_info;
@@ -98,6 +99,9 @@ struct cudecompHandle {
   std::vector<cudecomp::mnnvl_info> rank_to_mnnvl_info; // list of mnnvl information (clusterUuid, cliqueId) by rank
   std::vector<unsigned int> rank_to_clique;             // list of rank to MNNVL clique mappings
   std::vector<int> rank_to_clique_rank;                 // list of rank to MNNVL clique rank mappings
+
+  // CUDA graphs
+  bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used
 };
 
 // Structure with information about row/column communicator
@@ -127,6 +131,8 @@ struct cudecompGridDesc {
 
   std::vector<cudaEvent_t> events{nullptr}; // CUDA events used for scheduling
 
+  cudecomp::graphCache graph_cache; // CUDA graph cache
+
   bool initialized = false;
 };
 

diff --git a/include/internal/graph.h b/include/internal/graph.h
@@ -0,0 +1,65 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CUDECOMP_GRAPH_H
+#define CUDECOMP_GRAPH_H
+
+#include <tuple>
+#include <unordered_map>
+
+#include <cuda_runtime.h>
+
+#include "cudecomp.h"
+#include "internal/checks.h"
+#include "internal/hashes.h"
+#include "internal/utils.h"
+
+namespace cudecomp {
+
+class graphCache { // Cache mapping a key_type tuple to an instantiated CUDA graph (cudaGraphExec_t); member definitions are not in this header
+  using key_type = std::tuple<void*, void*, int, int, cudecompPencilInfo_t, cudecompPencilInfo_t, cudecompDataType_t>;
+
+public:
+  graphCache();
+  ~graphCache(); // NOTE(review): copy/move operations are left implicit; if this destructor releases the members below, a copy would release them twice -- confirm
+  void replay(const key_type& key, cudaStream_t stream) const; // presumably launches the graph stored for key on stream -- TODO confirm
+  cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const; // returns a stream, presumably the one callers should enqueue captured work on -- TODO confirm
+  void endCapture(const key_type& key); // non-const, unlike startCapture; presumably stores the captured graph under key -- TODO confirm
+  bool cached(const key_type& key) const; // presumably true when graph_cache_ holds an entry for key -- TODO confirm
+  void clear(); // presumably drops every entry in graph_cache_ -- TODO confirm
+
+private:
+  std::unordered_map<key_type, cudaGraphExec_t> graph_cache_; // default hasher is std::hash<key_type>, i.e. the tuple specialization from internal/hashes.h
+  cudaStream_t graph_stream_; // no in-class initializer; presumably set up by the constructor -- TODO confirm
+};
+
+} // namespace cudecomp
+
+#endif // CUDECOMP_GRAPH_H
diff --git a/include/internal/hashes.h b/include/internal/hashes.h
@@ -31,25 +31,74 @@
 #ifndef CUDECOMP_HASHES_H
 #define CUDECOMP_HASHES_H
 
+#include <array>
+#include <functional>
+#include <tuple>
 #include <utility>
 
+#include "cudecomp.h"
+
 #define MAGIC 0x9e3779b9
 
+template <typename T> inline void hash_combine(size_t& hash_value, const T& val) { // Mixes std::hash<T>{}(val) into hash_value in place; requires a usable std::hash<T>
+  hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2); // uses the incoming hash_value, so the order of successive calls affects the result
+}
+
 template <typename T, size_t N> struct std::hash<std::array<T, N>> {
-  size_t operator()(const std::array<T, N>& in) const {
+  size_t operator()(const std::array<T, N>& array) const { // Hashes a std::array by folding its elements into one value, in iteration order
+    size_t hash_value = 0; // starting value of the running hash
+    for (const auto& val : array) {
+      hash_combine(hash_value, val); // needs std::hash<T> for the element type
+    }
+    return hash_value; // remains 0 if the loop body never runs
+  }
+};
+
+template <typename T, size_t N> struct std::hash<T[N]> { // Hash for built-in arrays; NOTE(review): T[N] is not a program-defined type, check the [namespace.std] rules for such specializations
+  size_t operator()(const T (&array)[N]) const { // array is taken by reference, so its extent N is part of the parameter type
     size_t hash_value = 0;
-    for (const auto& val : in) {
-      hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    for (size_t i = 0; i < N; ++i) {
+      hash_combine(hash_value, array[i]); // elements are folded in index order 0..N-1
     }
     return hash_value;
   }
 };
 
 template <typename U, typename V> struct std::hash<std::pair<U, V>> {
-  size_t operator()(const std::pair<U, V>& in) const {
+  size_t operator()(const std::pair<U, V>& pair) const { // Hashes a std::pair by combining first, then second
+    size_t hash_value = 0;
+    hash_combine(hash_value, pair.first); // needs std::hash<U>
+    hash_combine(hash_value, pair.second); // needs std::hash<V>; combined after first, so (a, b) and (b, a) generally hash differently
+    return hash_value;
+  }
+};
+
+template <> struct std::hash<cudecompPencilInfo_t> { // Hash for cudecompPencilInfo_t, built from four of its members
+  size_t operator()(const cudecompPencilInfo_t& info) const {
+    size_t hash_value = 0;
+    hash_combine(hash_value, info.shape); // NOTE(review): presumably a built-in array routed to std::hash<T[N]> -- confirm against cudecomp.h
+    hash_combine(hash_value, info.order);
+    hash_combine(hash_value, info.halo_extents);
+    hash_combine(hash_value, info.padding); // no other member of info is read here, so objects differing only elsewhere share a hash
+    return hash_value;
+  }
+};
+
+template <typename Tuple, std::size_t Index = std::tuple_size<Tuple>::value - 1> struct tuple_hasher { // Recursive helper that folds tuple elements 0..Index into a running hash
+  static void apply(std::size_t& hash_value, const Tuple& tuple) { // NOTE(review): for an empty tuple the default Index is 0 - 1 in std::size_t, which wraps instead of reaching the Index == 0 case
+    tuple_hasher<Tuple, Index - 1>::apply(hash_value, tuple); // recurse first so lower-indexed elements are combined before element Index
+    hash_combine(hash_value, std::get<Index>(tuple));
+  }
+};
+
+template <typename Tuple> struct tuple_hasher<Tuple, 0> { // Partial specialization for Index == 0: ends the recursion
+  static void apply(std::size_t& hash_value, const Tuple& tuple) { hash_combine(hash_value, std::get<0>(tuple)); } // combines only element 0
+};
+
+template <typename... Types> struct std::hash<std::tuple<Types...>> { // Hashes a std::tuple by combining its elements through tuple_hasher
+  size_t operator()(const std::tuple<Types...>& tuple) const { // each element type needs a usable std::hash
     size_t hash_value = 0;
-    hash_value ^= std::hash<U>{}(in.first) + MAGIC + (hash_value << 6) + (hash_value >> 2);
-    hash_value ^= std::hash<V>{}(in.second) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    tuple_hasher<std::tuple<Types...>>::apply(hash_value, tuple); // Index defaults to the last element; the recursion combines element 0 first
     return hash_value;
   }
 };