From 3a9aaed6119566006c89867f66403e965331b031 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Thu, 10 Apr 2025 10:03:20 -0700
Subject: [PATCH 1/8] Improve packing launch efficiency for pipelined backends
 using CUDA graphs.

---
 CMakeLists.txt               |   1 +
 include/internal/common.h    |  10 ++
 include/internal/graph.h     |  64 +++++++++++++
 include/internal/hashes.h    |  66 ++++++++++++--
 include/internal/transpose.h | 171 ++++++++++++++++++++++-------------
 include/internal/utils.h     |  45 +++++++++
 src/cudecomp.cc              |  13 +++
 src/graph.cc                 |  73 +++++++++++++++
 8 files changed, 376 insertions(+), 67 deletions(-)
 create mode 100644 include/internal/graph.h
 create mode 100644 include/internal/utils.h
 create mode 100644 src/graph.cc
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91c35d7..d76d65a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,6 +161,7 @@ target_sources(cudecomp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc
   ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc
 )
 
diff --git a/include/internal/common.h b/include/internal/common.h
index 7d2ad51..3f06eb5 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -44,6 +44,7 @@
 
 #include "cudecomp.h"
 #include "internal/checks.h"
+#include "internal/graph.h"
 
 namespace cudecomp {
 typedef std::pair<std::array<unsigned char, NVML_GPU_FABRIC_UUID_LEN>, unsigned int> mnnvl_info;
@@ -98,6 +99,13 @@ struct cudecompHandle {
   std::vector<cudecomp::mnnvl_info> rank_to_mnnvl_info; // list of mnnvl information (clusterUuid, cliqueId) by rank
   std::vector<unsigned int> rank_to_clique;             // list of rank to MNNVL clique mappings
   std::vector<int> rank_to_clique_rank;                 // list of rank to MNNVL clique rank mappings
+
+  // CUDA graphs
+#if CUDART_VERSION > 11010
+  bool cuda_graphs_enable = true; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends
+#else
+  bool cuda_graphs_enable = false;
+#endif
 };
 
 // Structure with information about row/column communicator
@@ -127,6 +135,8 @@ struct cudecompGridDesc {
 
   std::vector<cudaEvent_t> events{nullptr}; // CUDA events used for scheduling
 
+  cudecomp::graphCache graph_cache; // CUDA graph cache
+
   bool initialized = false;
 };
 
diff --git a/include/internal/graph.h b/include/internal/graph.h
new file mode 100644
index 0000000..a130cdb
--- /dev/null
+++ b/include/internal/graph.h
@@ -0,0 +1,64 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CUDECOMP_GRAPH_H
+#define CUDECOMP_GRAPH_H
+
+#include <tuple>
+#include <unordered_map>
+
+#include <cuda_runtime.h>
+
+#include "cudecomp.h"
+#include "internal/checks.h"
+#include "internal/hashes.h"
+#include "internal/utils.h"
+
+namespace cudecomp {
+
+class graphCache {
+  using key_type = std::tuple<void*, void*, int, int, cudecompPencilInfo_t, cudecompPencilInfo_t>;
+public:
+  graphCache();
+  ~graphCache();
+  void replay(const key_type& key, cudaStream_t stream) const;
+  cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const;
+  void endCapture(const key_type& key);
+  bool cached(const key_type& key) const;
+
+private:
+  std::unordered_map<key_type, cudaGraphExec_t> graph_cache_;
+  cudaStream_t graph_stream_;
+};
+
+} // namespace cudecomp
+
+#endif // CUDECOMP_GRAPH_H
+
diff --git a/include/internal/hashes.h b/include/internal/hashes.h
index a3ce284..5eda5ff 100644
--- a/include/internal/hashes.h
+++ b/include/internal/hashes.h
@@ -31,25 +31,79 @@
 #ifndef CUDECOMP_HASHES_H
 #define CUDECOMP_HASHES_H
 
+#include <array>
+#include <functional>
+#include <tuple>
 #include <utility>
 
+#include "cudecomp.h"
+
 #define MAGIC 0x9e3779b9
 
+template <typename T>
+inline void hash_combine(size_t& hash_value, const T& val) {
+  hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+}
+
 template <typename T, size_t N> struct std::hash<std::array<T, N>> {
-  size_t operator()(const std::array<T, N>& in) const {
+  size_t operator()(const std::array<T, N>& array) const {
+    size_t hash_value = 0;
+    for (const auto& val : array) {
+      hash_combine(hash_value, val);
+    }
+    return hash_value;
+  }
+};
+
+template <typename T, size_t N> struct std::hash<T[N]> {
+  size_t operator()(const T(&array)[N]) const {
     size_t hash_value = 0;
-    for (const auto& val : in) {
-      hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    for (size_t i = 0; i < N; ++i) {
+      hash_combine(hash_value, array[i]);
     }
     return hash_value;
   }
 };
 
 template <typename U, typename V> struct std::hash<std::pair<U, V>> {
-  size_t operator()(const std::pair<U, V>& in) const {
+  size_t operator()(const std::pair<U, V>& pair) const {
+    size_t hash_value = 0;
+    hash_combine(hash_value, pair.first);
+    hash_combine(hash_value, pair.second);
+    return hash_value;
+  }
+};
+
+template<> struct std::hash<cudecompPencilInfo_t> {
+  size_t operator()(const cudecompPencilInfo_t& info) const {
+    size_t hash_value = 0;
+    hash_combine(hash_value, info.shape);
+    hash_combine(hash_value, info.order);
+    hash_combine(hash_value, info.halo_extents);
+    hash_combine(hash_value, info.padding);
+    return hash_value;
+  }
+};
+
+template <typename Tuple, std::size_t Index = std::tuple_size<Tuple>::value - 1>
+struct tuple_hasher {
+  static void apply(std::size_t& hash_value, const Tuple& tuple) {
+    tuple_hasher<Tuple, Index - 1>::apply(hash_value, tuple);
+    hash_combine(hash_value, std::get<Index>(tuple));
+  }
+};
+
+template <typename Tuple>
+struct tuple_hasher<Tuple, 0> {
+  static void apply(std::size_t& hash_value, const Tuple& tuple) {
+    hash_combine(hash_value, std::get<0>(tuple));
+  }
+};
+
+template <typename... Types> struct std::hash<std::tuple<Types...>> {
+  size_t operator()(const std::tuple<Types...>& tuple) const {
     size_t hash_value = 0;
-    hash_value ^= std::hash<U>{}(in.first) + MAGIC + (hash_value << 6) + (hash_value >> 2);
-    hash_value ^= std::hash<V>{}(in.second) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    tuple_hasher<std::tuple<Types...>>::apply(hash_value, tuple);
     return hash_value;
   }
 };
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index f476b33..8852bf4 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -364,37 +364,61 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
       }
 
       if (pipelined) {
-        for (int j = 1; j < splits_a.size() + 1; ++j) {
-          int src_rank, dst_rank;
-          getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
-          if (j == splits_a.size()) dst_rank = comm_rank;
+        auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h);
 
-          size_t shift = offsets_a[dst_rank];
-          for (int i = 0; i < 3; ++i) {
-            if (pinfo_a_h.order[i] == ax_a) break;
-            shift *= shape_g_a_h[pinfo_a_h.order[i]];
+        if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
+          grid_desc->graph_cache.replay(key, stream);
+        } else {
+          cudaStream_t graph_stream = stream;
+          if (handle->cuda_graphs_enable && splits_a.size() > 1) {
+            graph_stream = grid_desc->graph_cache.startCapture(key, stream);
           }
 
-          T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
-          T* dst;
-          if (!direct_transpose) {
-            dst = o1 + send_offsets[dst_rank];
-          } else {
-            size_t shift_b = offsets_b[src_rank];
+          for (int j = 1; j < splits_a.size() + 1; ++j) {
+            int src_rank, dst_rank;
+            getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
+            if (j == splits_a.size()) dst_rank = comm_rank;
+
+            size_t shift = offsets_a[dst_rank];
             for (int i = 0; i < 3; ++i) {
-              if (pinfo_b_h.order[i] == ax_b) break;
-              shift *= shape_g_b_h[pinfo_b_h.order[i]];
+              if (pinfo_a_h.order[i] == ax_a) break;
+              shift *= shape_g_a_h[pinfo_a_h.order[i]];
             }
 
-            dst = o1 + shift + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
+            T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
+            T* dst;
+            if (!direct_transpose) {
+              dst = o1 + send_offsets[dst_rank];
+            } else {
+              size_t shift_b = offsets_b[src_rank];
+              for (int i = 0; i < 3; ++i) {
+                if (pinfo_b_h.order[i] == ax_b) break;
+                shift *= shape_g_b_h[pinfo_b_h.order[i]];
+              }
+
+              dst = o1 + shift + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
+            }
+
+            for (int i = 0; i < 3; ++i) {
+              if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank];
+            }
+
+            localPermute(handle, extents, order, strides_in, strides_out, src, dst, graph_stream);
+#if CUDART_VERSION >= 11010
+            cudaStreamCaptureStatus capture_status;
+            CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
+            CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
+                                                capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault));
+#else
+            CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream));
+#endif
           }
 
-          for (int i = 0; i < 3; ++i) {
-            if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank];
+          if (handle->cuda_graphs_enable && splits_a.size() > 1) {
+            grid_desc->graph_cache.endCapture(key);
+            grid_desc->graph_cache.replay(key, stream);
           }
 
-          localPermute(handle, extents, order, strides_in, strides_out, src, dst, stream);
-          CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream));
         }
       } else {
         T* src = i1 + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
@@ -413,54 +437,79 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
       // Pack
       int memcpy_count = 0;
       cudecompBatchedD2DMemcpy3DParams<T> memcpy_params;
-      for (int j = 1; j < splits_a.size() + 1; ++j) {
-        int src_rank, dst_rank;
-        getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
-        if (j == splits_a.size()) dst_rank = comm_rank;
 
-        size_t shift = offsets_a[dst_rank];
-        for (int i = 0; i < 3; ++i) {
-          if (pinfo_a_h.order[i] == ax_a) break;
-          shift *= shape_g_a_h[pinfo_a_h.order[i]];
+      auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h);
+
+      if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
+        grid_desc->graph_cache.replay(key, stream);
+      } else {
+        cudaStream_t graph_stream = stream;
+        if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) {
+          graph_stream = grid_desc->graph_cache.startCapture(key, stream);
         }
 
-        T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
-        T* dst;
-        if (!direct_pack) {
-          dst = o1 + send_offsets[dst_rank];
-        } else {
-          size_t shift_b = offsets_b[src_rank];
+        for (int j = 1; j < splits_a.size() + 1; ++j) {
+          int src_rank, dst_rank;
+          getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
+          if (j == splits_a.size()) dst_rank = comm_rank;
+
+          size_t shift = offsets_a[dst_rank];
           for (int i = 0; i < 3; ++i) {
-            if (pinfo_b_h.order[i] == ax_b) break;
-            shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+            if (pinfo_a_h.order[i] == ax_a) break;
+            shift *= shape_g_a_h[pinfo_a_h.order[i]];
           }
-          dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
-        }
 
-        memcpy_params.src[memcpy_count] = src;
-        memcpy_params.dest[memcpy_count] = dst;
-        memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1];
-        memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0];
-        if (!direct_pack) {
-          memcpy_params.dest_strides[1][memcpy_count] =
-              (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
-          memcpy_params.dest_strides[0][memcpy_count] =
-              memcpy_params.dest_strides[1][memcpy_count] *
-              ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]);
-        } else {
-          memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1];
-          memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0];
+          T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
+          T* dst;
+          if (!direct_pack) {
+            dst = o1 + send_offsets[dst_rank];
+          } else {
+            size_t shift_b = offsets_b[src_rank];
+            for (int i = 0; i < 3; ++i) {
+              if (pinfo_b_h.order[i] == ax_b) break;
+              shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+            }
+            dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
+          }
+
+          memcpy_params.src[memcpy_count] = src;
+          memcpy_params.dest[memcpy_count] = dst;
+          memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1];
+          memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0];
+          if (!direct_pack) {
+            memcpy_params.dest_strides[1][memcpy_count] =
+                (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
+            memcpy_params.dest_strides[0][memcpy_count] =
+                memcpy_params.dest_strides[1][memcpy_count] *
+                ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]);
+          } else {
+            memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1];
+            memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0];
+          }
+          memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
+          memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1];
+          memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2];
+          memcpy_count++;
+          if (memcpy_count == memcpy_limit || j == splits_a.size()) {
+            memcpy_params.ncopies = memcpy_count;
+            cudecomp_batched_d2d_memcpy_3d(memcpy_params, graph_stream);
+            memcpy_count = 0;
+          }
+#if CUDART_VERSION >= 11010
+          if (pipelined) {
+            cudaStreamCaptureStatus capture_status;
+            CHECK_CUDA(cudaStreamIsCapturing(stream, &capture_status));
+            CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
+                                                capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault));
+          }
+#else
+          if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], stream));
+#endif
         }
-        memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
-        memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1];
-        memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2];
-        memcpy_count++;
-        if (memcpy_count == memcpy_limit || j == splits_a.size()) {
-          memcpy_params.ncopies = memcpy_count;
-          cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream);
-          memcpy_count = 0;
+        if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) {
+          grid_desc->graph_cache.endCapture(key);
+          grid_desc->graph_cache.replay(key, stream);
         }
-        if (pipelined) CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream));
       }
     }
 
diff --git a/include/internal/utils.h b/include/internal/utils.h
new file mode 100644
index 0000000..f30521c
--- /dev/null
+++ b/include/internal/utils.h
@@ -0,0 +1,45 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CUDECOMP_UTILS_H
+#define CUDECOMP_UTILS_H
+inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) {
+  if (a.size != b.size) return false;
+  for (int i = 0; i < 3; ++i) {
+    if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) ||
+        (a.order[i] != b.order[i]) || (a.halo_extents[i] != b.halo_extents[i]) ||
+        (a.padding[i] != b.padding[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#endif // CUDECOMP_UTILS_H
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index c7ded3a..272a158 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -314,6 +314,19 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
         handle->cuda_cumem_enable = false;
       }
     }
+#endif
+  }
+
+  // Check CUDECOMP_ENABLE_CUDA_GRAPHS (CUDA Graphs usage in pipelined backends)
+  const char* graphs_enable_str = std::getenv("CUDECOMP_ENABLE_CUDA_GRAPHS");
+  if (graphs_enable_str) { handle->cuda_graphs_enable = std::strtol(graphs_enable_str, nullptr, 10) == 1; }
+  if (handle->cuda_graphs_enable) {
+#if CUDART_VERSION < 11010
+    if (handle->rank == 0) {
+      printf("CUDECOMP:WARN: CUDECOMP_ENABLE_CUDA_GRAPHS is set but CUDA version used for compilation does not "
+             "support cudaEventRecordWithFlags which is required. Disabling this feature.\n");
+    }
+    handle->cuda_graphs_enable = false;
 #endif
   }
 }
diff --git a/src/graph.cc b/src/graph.cc
new file mode 100644
index 0000000..d9980a1
--- /dev/null
+++ b/src/graph.cc
@@ -0,0 +1,73 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <tuple>
+#include <unordered_map>
+
+#include <cuda_runtime.h>
+
+#include "cudecomp.h"
+#include "internal/checks.h"
+#include "internal/graph.h"
+#include "internal/hashes.h"
+
+namespace cudecomp {
+
+graphCache::graphCache() {
+  CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking));
+}
+
+graphCache::~graphCache() {
+  CHECK_CUDA(cudaStreamDestroy(graph_stream_));
+}
+
+void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const {
+  CHECK_CUDA(cudaGraphLaunch(graph_cache_.at(key), stream));
+}
+
+cudaStream_t graphCache::startCapture(const graphCache::key_type& key, cudaStream_t stream) const {
+  CHECK_CUDA(cudaStreamBeginCapture(graph_stream_, cudaStreamCaptureModeGlobal));
+  return graph_stream_;
+}
+
+void graphCache::endCapture(const graphCache::key_type& key){
+  cudaGraph_t graph;
+  cudaGraphExec_t graph_exec;
+  CHECK_CUDA(cudaStreamEndCapture(graph_stream_, &graph));
+  CHECK_CUDA(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+  CHECK_CUDA(cudaGraphDestroy(graph));
+
+  graph_cache_[key] = graph_exec;
+}
+
+bool graphCache::cached(const graphCache::key_type& key) const { return graph_cache_.count(key) > 0; }
+
+} // namespace cudecomp
+

From 5748e91a07e7b5dd52154fb346daa01a72bb0571 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Thu, 10 Apr 2025 10:27:41 -0700
Subject: [PATCH 2/8] Fixes.

---
 include/internal/common.h    | 2 +-
 include/internal/transpose.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/internal/common.h b/include/internal/common.h
index 3f06eb5..c0396f9 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -101,7 +101,7 @@ struct cudecompHandle {
   std::vector<int> rank_to_clique_rank;                 // list of rank to MNNVL clique rank mappings
 
   // CUDA graphs
-#if CUDART_VERSION > 11010
+#if CUDART_VERSION >= 11010
   bool cuda_graphs_enable = true; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends
 #else
   bool cuda_graphs_enable = false;
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index 8852bf4..efdfea7 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -498,12 +498,12 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 #if CUDART_VERSION >= 11010
           if (pipelined) {
             cudaStreamCaptureStatus capture_status;
-            CHECK_CUDA(cudaStreamIsCapturing(stream, &capture_status));
+            CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
             CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
                                                 capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault));
           }
 #else
-          if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], stream));
+          if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream));
 #endif
         }
         if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) {

From 6e32a23de624927a86be3698561062cf9be66d78 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Thu, 10 Apr 2025 13:39:03 -0700
Subject: [PATCH 3/8] Destroy stored cuda graph exec entries in graphCache
 destructor.

---
 src/graph.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/graph.cc b/src/graph.cc
index d9980a1..be406da 100644
--- a/src/graph.cc
+++ b/src/graph.cc
@@ -46,6 +46,10 @@ graphCache::graphCache() {
 
 graphCache::~graphCache() {
   CHECK_CUDA(cudaStreamDestroy(graph_stream_));
+
+  for (auto& entry : graph_cache_) {
+    CHECK_CUDA(cudaGraphExecDestroy(entry.second));
+  }
 }
 
 void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const {

From fa38b9bf458db9f114af826a294f0b5dce8f7596 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Fri, 11 Apr 2025 10:52:46 -0700
Subject: [PATCH 4/8] Clear CUDA graph cache during transpose autotuning.
 Disable CUDA graphs by default.

---
 include/internal/common.h |  6 +-----
 include/internal/graph.h  |  1 +
 src/autotune.cc           |  3 +++
 src/graph.cc              | 13 +++++++++----
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/internal/common.h b/include/internal/common.h
index c0396f9..9b46246 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -101,11 +101,7 @@ struct cudecompHandle {
   std::vector<int> rank_to_clique_rank;                 // list of rank to MNNVL clique rank mappings
 
   // CUDA graphs
-#if CUDART_VERSION >= 11010
-  bool cuda_graphs_enable = true; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends
-#else
-  bool cuda_graphs_enable = false;
-#endif
+  bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends
 };
 
 // Structure with information about row/column communicator
diff --git a/include/internal/graph.h b/include/internal/graph.h
index a130cdb..0fe6603 100644
--- a/include/internal/graph.h
+++ b/include/internal/graph.h
@@ -52,6 +52,7 @@ class graphCache {
   cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const;
   void endCapture(const key_type& key);
   bool cached(const key_type& key) const;
+  void clear();
 
 private:
   std::unordered_map<key_type, cudaGraphExec_t> graph_cache_;
diff --git a/src/autotune.cc b/src/autotune.cc
index b4fdf69..2e27548 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -449,6 +449,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
     }
+
+    // Clear CUDA graph cache between tested process decompositions
+    grid_desc->graph_cache.clear();
   }
 
   // Free test data and workspace
diff --git a/src/graph.cc b/src/graph.cc
index be406da..966f432 100644
--- a/src/graph.cc
+++ b/src/graph.cc
@@ -46,10 +46,7 @@ graphCache::graphCache() {
 
 graphCache::~graphCache() {
   CHECK_CUDA(cudaStreamDestroy(graph_stream_));
-
-  for (auto& entry : graph_cache_) {
-    CHECK_CUDA(cudaGraphExecDestroy(entry.second));
-  }
+  this->clear();
 }
 
 void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const {
@@ -73,5 +70,13 @@ void graphCache::endCapture(const graphCache::key_type& key){
 
 bool graphCache::cached(const graphCache::key_type& key) const { return graph_cache_.count(key) > 0; }
 
+void graphCache::clear() {
+  for (auto& entry : graph_cache_) {
+    CHECK_CUDA(cudaGraphExecDestroy(entry.second));
+  }
+
+  graph_cache_.clear();
+}
+
 } // namespace cudecomp
 

From 89b3e75c6389dc80594855f2462a082ed7832cf0 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Fri, 11 Apr 2025 11:02:27 -0700
Subject: [PATCH 5/8] Update docs.

---
 docs/env_vars.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/env_vars.rst b/docs/env_vars.rst
index 06b315a..a7b607d 100644
--- a/docs/env_vars.rst
+++ b/docs/env_vars.rst
@@ -23,3 +23,12 @@ CUDECOMP_ENABLE_CUMEM
 some MPI distributions on multi-node NVLink (MNNVL) capable systems.
 
 Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
+
+CUDECOMP_ENABLE_CUDA_GRAPHS
+---------------------------
+(since v0.5.1, requires CUDA 11.1 driver/toolkit or newer)
+
+:code:`CUDECOMP_ENABLE_CUDA_GRAPHS` controls whether cuDecomp uses CUDA Graphs APIs to capture/replay packing operations for pipelined backends. This option can improve the launch efficiency
+and communication overlap of packing kernels in large scale cases.
+
+Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.

From f3e9fce301782bf6af222bb1a8a651211618a34e Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Fri, 11 Apr 2025 11:37:50 -0700
Subject: [PATCH 6/8] Formatting.

---
 include/cudecomp.h           | 16 ++++++++--------
 include/internal/common.h    |  2 +-
 include/internal/graph.h     |  2 +-
 include/internal/hashes.h    | 17 ++++++-----------
 include/internal/transpose.h |  9 ++++++---
 include/internal/utils.h     |  5 ++---
 src/graph.cc                 |  7 ++-----
 7 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/include/cudecomp.h b/include/cudecomp.h
index 631155c..41c8a6e 100644
--- a/include/cudecomp.h
+++ b/include/cudecomp.h
@@ -175,19 +175,19 @@ typedef struct {
                                   ///< in the following order: X-to-Y, Y-to-Z, Z-to-Y, Y-to-X
                                   ///< (default: [1.0, 1.0, 1.0, 1.0])
 
-  int32_t transpose_input_halo_extents[4][3]; ///< input_halo_extents argument to use during autotuning by transpose
-                                              ///< operation; first index specifies operation in the following order:
-                                              ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent
-                                              ///< argument (default: all zeros, no halos)
+  int32_t transpose_input_halo_extents[4][3];  ///< input_halo_extents argument to use during autotuning by transpose
+                                               ///< operation; first index specifies operation in the following order:
+                                               ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent
+                                               ///< argument (default: all zeros, no halos)
   int32_t transpose_output_halo_extents[4][3]; ///< output_halo_extents argument to use during autotuning by transpose
                                                ///< operation; first index specifies operation in the following order:
                                                ///< X-to-Y, Y-to-Z, Z-to-Y, Y-to-X, second index specifies halo_extent
                                                ///< argument (default: all zeros, no halos)
 
-  int32_t transpose_input_padding[4][3]; ///< input_padding argument to use during autotuning by transpose operation;
-                                         ///< first index specifies operation in the following order: X-to-Y, Y-to-Z,
-                                         ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default:
-                                         ///< all zeros, no padding)
+  int32_t transpose_input_padding[4][3];  ///< input_padding argument to use during autotuning by transpose operation;
+                                          ///< first index specifies operation in the following order: X-to-Y, Y-to-Z,
+                                          ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default:
+                                          ///< all zeros, no padding)
   int32_t transpose_output_padding[4][3]; ///< output_padding argument to use during autotuning by transpose operation;
                                           ///< first index specifies operation in the following order: X-to-Y, Y-to-Z,
                                           ///< Z-to-Y, Y-to-X, second index specifies input_padding argument (default:
diff --git a/include/internal/common.h b/include/internal/common.h
index 9b46246..987f471 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -101,7 +101,7 @@ struct cudecompHandle {
   std::vector<int> rank_to_clique_rank;                 // list of rank to MNNVL clique rank mappings
 
   // CUDA graphs
-  bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used for packing launches in pipelined backends
+  bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used
 };
 
 // Structure with information about row/column communicator
diff --git a/include/internal/graph.h b/include/internal/graph.h
index 0fe6603..e4da24b 100644
--- a/include/internal/graph.h
+++ b/include/internal/graph.h
@@ -45,6 +45,7 @@ namespace cudecomp {
 
 class graphCache {
   using key_type = std::tuple<void*, void*, int, int, cudecompPencilInfo_t, cudecompPencilInfo_t>;
+
 public:
   graphCache();
   ~graphCache();
@@ -62,4 +63,3 @@ class graphCache {
 } // namespace cudecomp
 
 #endif // CUDECOMP_GRAPH_H
-
diff --git a/include/internal/hashes.h b/include/internal/hashes.h
index 5eda5ff..c45b065 100644
--- a/include/internal/hashes.h
+++ b/include/internal/hashes.h
@@ -40,8 +40,7 @@
 
 #define MAGIC 0x9e3779b9
 
-template <typename T>
-inline void hash_combine(size_t& hash_value, const T& val) {
+template <typename T> inline void hash_combine(size_t& hash_value, const T& val) {
   hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
 }
 
@@ -56,7 +55,7 @@ template <typename T, size_t N> struct std::hash<std::array<T, N>> {
 };
 
 template <typename T, size_t N> struct std::hash<T[N]> {
-  size_t operator()(const T(&array)[N]) const {
+  size_t operator()(const T (&array)[N]) const {
     size_t hash_value = 0;
     for (size_t i = 0; i < N; ++i) {
       hash_combine(hash_value, array[i]);
@@ -74,7 +73,7 @@ template <typename U, typename V> struct std::hash<std::pair<U, V>> {
   }
 };
 
-template<> struct std::hash<cudecompPencilInfo_t> {
+template <> struct std::hash<cudecompPencilInfo_t> {
   size_t operator()(const cudecompPencilInfo_t& info) const {
     size_t hash_value = 0;
     hash_combine(hash_value, info.shape);
@@ -85,19 +84,15 @@ template<> struct std::hash<cudecompPencilInfo_t> {
   }
 };
 
-template <typename Tuple, std::size_t Index = std::tuple_size<Tuple>::value - 1>
-struct tuple_hasher {
+template <typename Tuple, std::size_t Index = std::tuple_size<Tuple>::value - 1> struct tuple_hasher {
   static void apply(std::size_t& hash_value, const Tuple& tuple) {
     tuple_hasher<Tuple, Index - 1>::apply(hash_value, tuple);
     hash_combine(hash_value, std::get<Index>(tuple));
   }
 };
 
-template <typename Tuple>
-struct tuple_hasher<Tuple, 0> {
-  static void apply(std::size_t& hash_value, const Tuple& tuple) {
-    hash_combine(hash_value, std::get<0>(tuple));
-  }
+template <typename Tuple> struct tuple_hasher<Tuple, 0> {
+  static void apply(std::size_t& hash_value, const Tuple& tuple) { hash_combine(hash_value, std::get<0>(tuple)); }
 };
 
 template <typename... Types> struct std::hash<std::tuple<Types...>> {
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index efdfea7..25c5d88 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -408,7 +408,9 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
             cudaStreamCaptureStatus capture_status;
             CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
             CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
-                                                capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault));
+                                                capture_status == cudaStreamCaptureStatusActive
+                                                    ? cudaEventRecordExternal
+                                                    : cudaEventRecordDefault));
 #else
             CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream));
 #endif
@@ -418,7 +420,6 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
             grid_desc->graph_cache.endCapture(key);
             grid_desc->graph_cache.replay(key, stream);
           }
-
         }
       } else {
         T* src = i1 + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
@@ -500,7 +501,9 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
             cudaStreamCaptureStatus capture_status;
             CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
             CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
-                                                capture_status == cudaStreamCaptureStatusActive ? cudaEventRecordExternal : cudaEventRecordDefault));
+                                                capture_status == cudaStreamCaptureStatusActive
+                                                    ? cudaEventRecordExternal
+                                                    : cudaEventRecordDefault));
           }
 #else
           if (pipelined) CHECK_CUDA(cudaEventRecord((grid_desc->events[dst_rank], graph_stream));
diff --git a/include/internal/utils.h b/include/internal/utils.h
index f30521c..c6bd9aa 100644
--- a/include/internal/utils.h
+++ b/include/internal/utils.h
@@ -33,9 +33,8 @@
 inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) {
   if (a.size != b.size) return false;
   for (int i = 0; i < 3; ++i) {
-    if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) ||
-        (a.order[i] != b.order[i]) || (a.halo_extents[i] != b.halo_extents[i]) ||
-        (a.padding[i] != b.padding[i])) {
+    if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) || (a.order[i] != b.order[i]) ||
+        (a.halo_extents[i] != b.halo_extents[i]) || (a.padding[i] != b.padding[i])) {
       return false;
     }
   }
diff --git a/src/graph.cc b/src/graph.cc
index 966f432..8e76d34 100644
--- a/src/graph.cc
+++ b/src/graph.cc
@@ -40,9 +40,7 @@
 
 namespace cudecomp {
 
-graphCache::graphCache() {
-  CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking));
-}
+graphCache::graphCache() { CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking)); }
 
 graphCache::~graphCache() {
   CHECK_CUDA(cudaStreamDestroy(graph_stream_));
@@ -58,7 +56,7 @@ cudaStream_t graphCache::startCapture(const graphCache::key_type& key, cudaStrea
   return graph_stream_;
 }
 
-void graphCache::endCapture(const graphCache::key_type& key){
+void graphCache::endCapture(const graphCache::key_type& key) {
   cudaGraph_t graph;
   cudaGraphExec_t graph_exec;
   CHECK_CUDA(cudaStreamEndCapture(graph_stream_, &graph));
@@ -79,4 +77,3 @@ void graphCache::clear() {
 }
 
 } // namespace cudecomp
-

From 6792fabc92f8f1f9bf6f4930e9b181adf43cd347 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 14 Apr 2025 11:46:25 -0700
Subject: [PATCH 7/8] Fix issue with graph cache clearing during autotuning.
 Need to clear between each pdim/backend combination.

---
 src/autotune.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/autotune.cc b/src/autotune.cc
index 2e27548..6b8a74d 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -389,6 +389,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
         }
       }
 
+      // Clear CUDA graph cache between backend/process decomposition pairs
+      grid_desc->graph_cache.clear();
+
       auto times = processTimings(handle, trial_times);
       auto times_w = processTimings(handle, trial_times_w);
       auto xy_times = processTimings(handle, trial_xy_times);
@@ -450,8 +453,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
 #endif
     }
 
-    // Clear CUDA graph cache between tested process decompositions
-    grid_desc->graph_cache.clear();
   }
 
   // Free test data and workspace

From 8d7cca07dea5f5202c24f432feb5664513207ecb Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Wed, 16 Apr 2025 09:40:10 -0700
Subject: [PATCH 8/8] Add datatype to graph cache key.

---
 include/internal/graph.h     |  2 +-
 include/internal/transpose.h |  7 +++++--
 include/internal/utils.h     | 11 +++++++++++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/internal/graph.h b/include/internal/graph.h
index e4da24b..d6db47b 100644
--- a/include/internal/graph.h
+++ b/include/internal/graph.h
@@ -44,7 +44,7 @@
 namespace cudecomp {
 
 class graphCache {
-  using key_type = std::tuple<void*, void*, int, int, cudecompPencilInfo_t, cudecompPencilInfo_t>;
+  using key_type = std::tuple<void*, void*, int, int, cudecompPencilInfo_t, cudecompPencilInfo_t, cudecompDataType_t>;
 
 public:
   graphCache();
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index 25c5d88..f2a3795 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -36,6 +36,7 @@
 #include <vector>
 
 #include <cuda_runtime.h>
+#include <cuda/std/complex>
 #include <cutensor.h>
 #include <mpi.h>
 
@@ -364,7 +365,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
       }
 
       if (pipelined) {
-        auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h);
+        auto dtype = getCudecompDataType<T>();
+        auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype);
 
         if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
           grid_desc->graph_cache.replay(key, stream);
@@ -439,7 +441,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
       int memcpy_count = 0;
       cudecompBatchedD2DMemcpy3DParams<T> memcpy_params;
 
-      auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h);
+      auto dtype = getCudecompDataType<T>();
+      auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype);
 
       if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
         grid_desc->graph_cache.replay(key, stream);
diff --git a/include/internal/utils.h b/include/internal/utils.h
index c6bd9aa..26dc8a9 100644
--- a/include/internal/utils.h
+++ b/include/internal/utils.h
@@ -30,6 +30,11 @@
 
 #ifndef CUDECOMP_UTILS_H
 #define CUDECOMP_UTILS_H
+
+#include <cuda/std/complex>
+
+#include "cudecomp.h"
+
 inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) {
   if (a.size != b.size) return false;
   for (int i = 0; i < 3; ++i) {
@@ -41,4 +46,10 @@ inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t
   return true;
 }
 
+inline cudecompDataType_t getCudecompDataType(float) { return CUDECOMP_FLOAT; }
+inline cudecompDataType_t getCudecompDataType(double) { return CUDECOMP_DOUBLE; }
+inline cudecompDataType_t getCudecompDataType(cuda::std::complex<float>) { return CUDECOMP_FLOAT_COMPLEX; }
+inline cudecompDataType_t getCudecompDataType(cuda::std::complex<double>) { return CUDECOMP_DOUBLE_COMPLEX; }
+template <typename T> inline cudecompDataType_t getCudecompDataType() { return getCudecompDataType(T(0)); }
+
 #endif // CUDECOMP_UTILS_H