diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91c35d7..d76d65a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,6 +161,7 @@ target_sources(cudecomp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc
   ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc
 )
diff --git a/docs/env_vars.rst b/docs/env_vars.rst
index 06b315a..a7b607d 100644
--- a/docs/env_vars.rst
+++ b/docs/env_vars.rst
@@ -23,3 +23,12 @@ CUDECOMP_ENABLE_CUMEM
 some MPI distributions on multi-node NVLink (MNNVL) capable systems.
 
 Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
+
+CUDECOMP_ENABLE_CUDA_GRAPHS
+---------------------------
+(since v0.5.1, requires CUDA 11.1 driver/toolkit or newer)
+
+:code:`CUDECOMP_ENABLE_CUDA_GRAPHS` controls whether cuDecomp uses CUDA Graphs APIs to capture/replay packing operations for pipelined backends. This option can improve the launch efficiency
+and communication overlap of packing kernels in large-scale cases.
+
+Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
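
As the `src/cudecomp.cc` hunk further below shows, the variable is parsed once in `getCudecompEnvVars`, i.e. during handle initialization, so it can also be set programmatically before the library is initialized. A minimal sketch (not part of this patch; `setenv` is POSIX, and error handling is omitted):

```cpp
#include <cstdlib>
#include <mpi.h>
#include "cudecomp.h"

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  // Equivalent to launching with CUDECOMP_ENABLE_CUDA_GRAPHS=1 in the environment;
  // must happen before cudecompInit, where the variable is read.
  setenv("CUDECOMP_ENABLE_CUDA_GRAPHS", "1", 1);
  cudecompHandle_t handle;
  cudecompInit(&handle, MPI_COMM_WORLD);
  // ... create grid descriptors and run pipelined transposes ...
  cudecompFinalize(handle);
  MPI_Finalize();
  return 0;
}
```
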
"internal/graph.h" namespace cudecomp { typedef std::pair, unsigned int> mnnvl_info; @@ -98,6 +99,9 @@ struct cudecompHandle { std::vector rank_to_mnnvl_info; // list of mnnvl information (clusterUuid, cliqueId) by rank std::vector rank_to_clique; // list of rank to MNNVL clique mappings std::vector rank_to_clique_rank; // list of rank to MNNVL clique rank mappings + + // CUDA graphs + bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used }; // Structure with information about row/column communicator @@ -127,6 +131,8 @@ struct cudecompGridDesc { std::vector events{nullptr}; // CUDA events used for scheduling + cudecomp::graphCache graph_cache; // CUDA graph cache + bool initialized = false; }; diff --git a/include/internal/graph.h b/include/internal/graph.h new file mode 100644 index 0000000..d6db47b --- /dev/null +++ b/include/internal/graph.h @@ -0,0 +1,65 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/include/internal/hashes.h b/include/internal/hashes.h
index a3ce284..c45b065 100644
--- a/include/internal/hashes.h
+++ b/include/internal/hashes.h
@@ -31,25 +31,74 @@
 #ifndef CUDECOMP_HASHES_H
 #define CUDECOMP_HASHES_H
 
+#include <array>
+#include <cstddef>
+#include <tuple>
 #include <utility>
 
+#include "cudecomp.h"
+
 #define MAGIC 0x9e3779b9
 
+template <typename T> inline void hash_combine(size_t& hash_value, const T& val) {
+  hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+}
+
 template <typename T, size_t N> struct std::hash<std::array<T, N>> {
-  size_t operator()(const std::array<T, N>& in) const {
+  size_t operator()(const std::array<T, N>& array) const {
+    size_t hash_value = 0;
+    for (const auto& val : array) {
+      hash_combine(hash_value, val);
+    }
+    return hash_value;
+  }
+};
+
+template <typename T, size_t N> struct std::hash<T[N]> {
+  size_t operator()(const T (&array)[N]) const {
     size_t hash_value = 0;
-    for (const auto& val : in) {
-      hash_value ^= std::hash<T>{}(val) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    for (size_t i = 0; i < N; ++i) {
+      hash_combine(hash_value, array[i]);
     }
     return hash_value;
   }
 };
 
 template <typename T, typename U> struct std::hash<std::pair<T, U>> {
-  size_t operator()(const std::pair<T, U>& in) const {
+  size_t operator()(const std::pair<T, U>& pair) const {
+    size_t hash_value = 0;
+    hash_combine(hash_value, pair.first);
+    hash_combine(hash_value, pair.second);
+    return hash_value;
+  }
+};
+
+template <> struct std::hash<cudecompPencilInfo_t> {
+  size_t operator()(const cudecompPencilInfo_t& info) const {
+    size_t hash_value = 0;
+    hash_combine(hash_value, info.shape);
+    hash_combine(hash_value, info.order);
+    hash_combine(hash_value, info.halo_extents);
+    hash_combine(hash_value, info.padding);
+    return hash_value;
+  }
+};
+
+template <typename Tuple, size_t N = std::tuple_size<Tuple>::value - 1> struct tuple_hasher {
+  static void apply(std::size_t& hash_value, const Tuple& tuple) {
+    tuple_hasher<Tuple, N - 1>::apply(hash_value, tuple);
+    hash_combine(hash_value, std::get<N>(tuple));
+  }
+};
+
+template <typename Tuple> struct tuple_hasher<Tuple, 0> {
+  static void apply(std::size_t& hash_value, const Tuple& tuple) { hash_combine(hash_value, std::get<0>(tuple)); }
+};
+
+template <typename... Args> struct std::hash<std::tuple<Args...>> {
+  size_t operator()(const std::tuple<Args...>& tuple) const {
     size_t hash_value = 0;
-    hash_value ^= std::hash<T>{}(in.first) + MAGIC + (hash_value << 6) + (hash_value >> 2);
-    hash_value ^= std::hash<U>{}(in.second) + MAGIC + (hash_value << 6) + (hash_value >> 2);
+    tuple_hasher<std::tuple<Args...>>::apply(hash_value, tuple);
     return hash_value;
   }
 };
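
These specializations compose: hashing a tuple folds each element through `hash_combine`, which dispatches to the element-type specializations (including the new C-array and `cudecompPencilInfo_t` ones used by the graph cache key). A small self-contained illustration, assuming the header above is on the include path:

```cpp
#include <cstdio>
#include <tuple>

#include "internal/hashes.h"

int main() {
  std::tuple<int, double, int> key{3, 1.5, 7};
  // tuple_hasher recursively folds std::get<0..2>(key) into one value via hash_combine.
  size_t h = std::hash<std::tuple<int, double, int>>{}(key);
  std::printf("%zu\n", h);
  return 0;
}
```
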
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index f476b33..f2a3795 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -36,6 +36,7 @@
 #include
 #include
+#include <tuple>
 #include
 #include
 
@@ -364,37 +365,63 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   }
 
   if (pipelined) {
-    for (int j = 1; j < splits_a.size() + 1; ++j) {
-      int src_rank, dst_rank;
-      getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
-      if (j == splits_a.size()) dst_rank = comm_rank;
+    auto dtype = getCudecompDataType<T>();
+    auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype);
 
-      size_t shift = offsets_a[dst_rank];
-      for (int i = 0; i < 3; ++i) {
-        if (pinfo_a_h.order[i] == ax_a) break;
-        shift *= shape_g_a_h[pinfo_a_h.order[i]];
+    if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
+      grid_desc->graph_cache.replay(key, stream);
+    } else {
+      cudaStream_t graph_stream = stream;
+      if (handle->cuda_graphs_enable && splits_a.size() > 1) {
+        graph_stream = grid_desc->graph_cache.startCapture(key, stream);
       }
 
-      T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
-      T* dst;
-      if (!direct_transpose) {
-        dst = o1 + send_offsets[dst_rank];
-      } else {
-        size_t shift_b = offsets_b[src_rank];
+      for (int j = 1; j < splits_a.size() + 1; ++j) {
+        int src_rank, dst_rank;
+        getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
+        if (j == splits_a.size()) dst_rank = comm_rank;
+
+        size_t shift = offsets_a[dst_rank];
         for (int i = 0; i < 3; ++i) {
-          if (pinfo_b_h.order[i] == ax_b) break;
-          shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          if (pinfo_a_h.order[i] == ax_a) break;
+          shift *= shape_g_a_h[pinfo_a_h.order[i]];
         }
-        dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
-      }
+        T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
+        T* dst;
+        if (!direct_transpose) {
+          dst = o1 + send_offsets[dst_rank];
+        } else {
+          size_t shift_b = offsets_b[src_rank];
+          for (int i = 0; i < 3; ++i) {
+            if (pinfo_b_h.order[i] == ax_b) break;
+            shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          }
 
-      for (int i = 0; i < 3; ++i) {
-        if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank];
+          dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
+        }
+
+        for (int i = 0; i < 3; ++i) {
+          if (ax_a == pinfo_a.order[i]) extents[i] = splits_a[dst_rank];
+        }
+
+        localPermute(handle, extents, order, strides_in, strides_out, src, dst, graph_stream);
+#if CUDART_VERSION >= 11010
+        cudaStreamCaptureStatus capture_status;
+        CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
+        CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
+                                            capture_status == cudaStreamCaptureStatusActive
+                                                ? cudaEventRecordExternal
+                                                : cudaEventRecordDefault));
+#else
+        CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], graph_stream));
+#endif
      }
 
-      localPermute(handle, extents, order, strides_in, strides_out, src, dst, stream);
-      CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream));
+      if (handle->cuda_graphs_enable && splits_a.size() > 1) {
+        grid_desc->graph_cache.endCapture(key);
+        grid_desc->graph_cache.replay(key, stream);
+      }
     }
   } else {
     T* src = i1 + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
@@ -413,54 +440,82 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
     // Pack
     int memcpy_count = 0;
     cudecompBatchedD2DMemcpy3DParams memcpy_params;
-    for (int j = 1; j < splits_a.size() + 1; ++j) {
-      int src_rank, dst_rank;
-      getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
-      if (j == splits_a.size()) dst_rank = comm_rank;
 
-      size_t shift = offsets_a[dst_rank];
-      for (int i = 0; i < 3; ++i) {
-        if (pinfo_a_h.order[i] == ax_a) break;
-        shift *= shape_g_a_h[pinfo_a_h.order[i]];
+    auto dtype = getCudecompDataType<T>();
+    auto key = std::tie(i1, o1, ax, dir, pinfo_a_h, pinfo_b_h, dtype);
+
+    if (handle->cuda_graphs_enable && grid_desc->graph_cache.cached(key)) {
+      grid_desc->graph_cache.replay(key, stream);
+    } else {
+      cudaStream_t graph_stream = stream;
+      if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) {
+        graph_stream = grid_desc->graph_cache.startCapture(key, stream);
       }
 
-      T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
-      T* dst;
-      if (!direct_pack) {
-        dst = o1 + send_offsets[dst_rank];
-      } else {
-        size_t shift_b = offsets_b[src_rank];
+      for (int j = 1; j < splits_a.size() + 1; ++j) {
+        int src_rank, dst_rank;
+        getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
+        if (j == splits_a.size()) dst_rank = comm_rank;
+
+        size_t shift = offsets_a[dst_rank];
         for (int i = 0; i < 3; ++i) {
-          if (pinfo_b_h.order[i] == ax_b) break;
-          shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          if (pinfo_a_h.order[i] == ax_a) break;
+          shift *= shape_g_a_h[pinfo_a_h.order[i]];
         }
-        dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
-      }
 
-      memcpy_params.src[memcpy_count] = src;
-      memcpy_params.dest[memcpy_count] = dst;
-      memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1];
-      memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0];
-      if (!direct_pack) {
-        memcpy_params.dest_strides[1][memcpy_count] =
-            (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
-        memcpy_params.dest_strides[0][memcpy_count] =
-            memcpy_params.dest_strides[1][memcpy_count] *
-            ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]);
-      } else {
-        memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1];
-        memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0];
+        T* src = i1 + shift + getPencilPtrOffset(pinfo_a_h, input_halo_extents);
+        T* dst;
+        if (!direct_pack) {
+          dst = o1 + send_offsets[dst_rank];
+        } else {
+          size_t shift_b = offsets_b[src_rank];
+          for (int i = 0; i < 3; ++i) {
+            if (pinfo_b_h.order[i] == ax_b) break;
+            shift_b *= shape_g_b_h[pinfo_b_h.order[i]];
+          }
+          dst = o1 + shift_b + getPencilPtrOffset(pinfo_b_h, output_halo_extents);
+        }
+
+        memcpy_params.src[memcpy_count] = src;
+        memcpy_params.dest[memcpy_count] = dst;
+        memcpy_params.src_strides[0][memcpy_count] = pinfo_a_h.shape[0] * pinfo_a_h.shape[1];
+        memcpy_params.src_strides[1][memcpy_count] = pinfo_a_h.shape[0];
+        if (!direct_pack) {
+          memcpy_params.dest_strides[1][memcpy_count] =
+              (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
+          memcpy_params.dest_strides[0][memcpy_count] =
+              memcpy_params.dest_strides[1][memcpy_count] *
+              ((ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1]);
+        } else {
+          memcpy_params.dest_strides[0][memcpy_count] = pinfo_b_h.shape[0] * pinfo_b_h.shape[1];
+          memcpy_params.dest_strides[1][memcpy_count] = pinfo_b_h.shape[0];
+        }
+        memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
+        memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1];
+        memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2];
+        memcpy_count++;
+        if (memcpy_count == memcpy_limit || j == splits_a.size()) {
+          memcpy_params.ncopies = memcpy_count;
+          cudecomp_batched_d2d_memcpy_3d(memcpy_params, graph_stream);
+          memcpy_count = 0;
+        }
+#if CUDART_VERSION >= 11010
+        if (pipelined) {
+          cudaStreamCaptureStatus capture_status;
+          CHECK_CUDA(cudaStreamIsCapturing(graph_stream, &capture_status));
+          CHECK_CUDA(cudaEventRecordWithFlags(grid_desc->events[dst_rank], graph_stream,
+                                              capture_status == cudaStreamCaptureStatusActive
+                                                  ? cudaEventRecordExternal
+                                                  : cudaEventRecordDefault));
+        }
+#else
+        if (pipelined) CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], graph_stream));
+#endif
       }
-      memcpy_params.extents[2][memcpy_count] = (ax_a == pinfo_a.order[0]) ? splits_a[dst_rank] : pinfo_a.shape[0];
-      memcpy_params.extents[1][memcpy_count] = (ax_a == pinfo_a.order[1]) ? splits_a[dst_rank] : pinfo_a.shape[1];
-      memcpy_params.extents[0][memcpy_count] = (ax_a == pinfo_a.order[2]) ? splits_a[dst_rank] : pinfo_a.shape[2];
-      memcpy_count++;
-      if (memcpy_count == memcpy_limit || j == splits_a.size()) {
-        memcpy_params.ncopies = memcpy_count;
-        cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream);
-        memcpy_count = 0;
+
+      if (handle->cuda_graphs_enable && pipelined && splits_a.size() > 1) {
+        grid_desc->graph_cache.endCapture(key);
+        grid_desc->graph_cache.replay(key, stream);
       }
-      if (pipelined) CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], stream));
     }
   }
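
A note on the event handling in both hunks above: during stream capture, a plain `cudaEventRecord` becomes a graph-internal dependency, while the communication backend waits on these per-peer events outside the captured graph. Recording with the `cudaEventRecordExternal` flag (CUDA 11.1+) keeps the record visible across the graph boundary. The repeated pattern could be factored into a helper along these lines (a sketch under that assumption, not part of the patch):

```cpp
// Record an event on a stream that may or may not be in capture mode,
// using an external record node while capture is active.
static inline void recordEventCaptureSafe(cudaEvent_t event, cudaStream_t stream) {
  cudaStreamCaptureStatus status;
  CHECK_CUDA(cudaStreamIsCapturing(stream, &status));
  CHECK_CUDA(cudaEventRecordWithFlags(event, stream,
                                      status == cudaStreamCaptureStatusActive
                                          ? cudaEventRecordExternal
                                          : cudaEventRecordDefault));
}
```
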
diff --git a/include/internal/utils.h b/include/internal/utils.h
new file mode 100644
index 0000000..26dc8a9
--- /dev/null
+++ b/include/internal/utils.h
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CUDECOMP_UTILS_H
+#define CUDECOMP_UTILS_H
+
+#include <cuda/std/complex>
+
+#include "cudecomp.h"
+
+inline bool operator==(const cudecompPencilInfo_t& a, const cudecompPencilInfo_t& b) {
+  if (a.size != b.size) return false;
+  for (int i = 0; i < 3; ++i) {
+    if ((a.shape[i] != b.shape[i]) || (a.lo[i] != b.lo[i]) || (a.hi[i] != b.hi[i]) || (a.order[i] != b.order[i]) ||
+        (a.halo_extents[i] != b.halo_extents[i]) || (a.padding[i] != b.padding[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline cudecompDataType_t getCudecompDataType(float) { return CUDECOMP_FLOAT; }
+inline cudecompDataType_t getCudecompDataType(double) { return CUDECOMP_DOUBLE; }
+inline cudecompDataType_t getCudecompDataType(cuda::std::complex<float>) { return CUDECOMP_FLOAT_COMPLEX; }
+inline cudecompDataType_t getCudecompDataType(cuda::std::complex<double>) { return CUDECOMP_DOUBLE_COMPLEX; }
+template <typename T> inline cudecompDataType_t getCudecompDataType() { return getCudecompDataType(T(0)); }
+
+#endif // CUDECOMP_UTILS_H
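
The overload set above gives a compile-time mapping from element type to the library enum, which the transpose hunks use when building graph cache keys. For instance:

```cpp
#include <cuda/std/complex>

#include "internal/utils.h"

void example() {
  auto a = getCudecompDataType<float>();                       // CUDECOMP_FLOAT
  auto b = getCudecompDataType<cuda::std::complex<double>>();  // CUDECOMP_DOUBLE_COMPLEX
  (void)a;
  (void)b;
}
```
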
diff --git a/src/autotune.cc b/src/autotune.cc
index b4fdf69..6b8a74d 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -389,6 +389,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       }
     }
 
+    // Clear CUDA graph cache between backend/process decomposition pairs
+    grid_desc->graph_cache.clear();
+
     auto times = processTimings(handle, trial_times);
     auto times_w = processTimings(handle, trial_times_w);
     auto xy_times = processTimings(handle, trial_xy_times);
@@ -449,6 +452,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
     }
+  }
 
   // Free test data and workspace
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index c7ded3a..272a158 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -314,6 +314,19 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
       handle->cuda_cumem_enable = false;
     }
   }
 #endif
   }
+
+  // Check CUDECOMP_ENABLE_CUDA_GRAPHS (CUDA Graphs usage in pipelined backends)
+  const char* graphs_enable_str = std::getenv("CUDECOMP_ENABLE_CUDA_GRAPHS");
+  if (graphs_enable_str) { handle->cuda_graphs_enable = std::strtol(graphs_enable_str, nullptr, 10) == 1; }
+  if (handle->cuda_graphs_enable) {
+#if CUDART_VERSION < 11010
+    if (handle->rank == 0) {
+      printf("CUDECOMP:WARN: CUDECOMP_ENABLE_CUDA_GRAPHS is set but CUDA version used for compilation does not "
+             "support cudaEventRecordWithFlags which is required. Disabling this feature.\n");
+    }
+    handle->cuda_graphs_enable = false;
+#endif
+  }
 }
diff --git a/src/graph.cc b/src/graph.cc
new file mode 100644
index 0000000..8e76d34
--- /dev/null
+++ b/src/graph.cc
@@ -0,0 +1,79 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <tuple>
+#include <unordered_map>
+
+#include <cuda_runtime.h>
+
+#include "cudecomp.h"
+#include "internal/checks.h"
+#include "internal/graph.h"
+#include "internal/hashes.h"
+
+namespace cudecomp {
+
+graphCache::graphCache() { CHECK_CUDA(cudaStreamCreateWithFlags(&graph_stream_, cudaStreamNonBlocking)); }
+
+graphCache::~graphCache() {
+  CHECK_CUDA(cudaStreamDestroy(graph_stream_));
+  this->clear();
+}
+
+void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const {
+  CHECK_CUDA(cudaGraphLaunch(graph_cache_.at(key), stream));
+}
+
+cudaStream_t graphCache::startCapture(const graphCache::key_type& key, cudaStream_t stream) const {
+  CHECK_CUDA(cudaStreamBeginCapture(graph_stream_, cudaStreamCaptureModeGlobal));
+  return graph_stream_;
+}
+
+void graphCache::endCapture(const graphCache::key_type& key) {
+  cudaGraph_t graph;
+  cudaGraphExec_t graph_exec;
+  CHECK_CUDA(cudaStreamEndCapture(graph_stream_, &graph));
+  CHECK_CUDA(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+  CHECK_CUDA(cudaGraphDestroy(graph));
+
+  graph_cache_[key] = graph_exec;
+}
+
+bool graphCache::cached(const graphCache::key_type& key) const { return graph_cache_.count(key) > 0; }
+
+void graphCache::clear() {
+  for (auto& entry : graph_cache_) {
+    CHECK_CUDA(cudaGraphExecDestroy(entry.second));
+  }
+
+  graph_cache_.clear();
+}
+
+} // namespace cudecomp
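
For reference, the capture/instantiate/replay lifecycle implemented by graphCache corresponds to this minimal standalone sequence, mirroring the 5-argument `cudaGraphInstantiate` form used in graph.cc (CUDA 11.x); names are illustrative and error checks are omitted:

```cuda
#include <cuda_runtime.h>

__global__ void incrementKernel(float* p) { p[threadIdx.x] += 1.0f; }

void captureAndReplay(float* d_buf, cudaStream_t capture_stream, cudaStream_t exec_stream) {
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeGlobal);
  incrementKernel<<<1, 32, 0, capture_stream>>>(d_buf);           // recorded, not executed
  cudaStreamEndCapture(capture_stream, &graph);
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);  // build executable graph
  cudaGraphDestroy(graph);              // executable graph is independent of the template
  cudaGraphLaunch(graph_exec, exec_stream);  // replay: one launch for the captured region
  cudaStreamSynchronize(exec_stream);
  cudaGraphExecDestroy(graph_exec);
}
```
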