diff --git a/CMakeLists.txt b/CMakeLists.txt index 241ed51..e4b39a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ target_sources(cudecomp ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/src/performance.cc ) set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu PROPERTIES COMPILE_FLAGS -rdc=true) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index a7b607d..2123b57 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -32,3 +32,61 @@ CUDECOMP_ENABLE_CUDA_GRAPHS and communication overlap of packing kernels in large scale cases. Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. + +CUDECOMP_ENABLE_PERFORMANCE_REPORT +------------------------------------ +(since v0.5.1) + +:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` controls whether cuDecomp performance reporting is enabled. + +Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. + +CUDECOMP_PERFORMANCE_REPORT_DETAIL +---------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` controls the verbosity of performance reporting when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. This setting determines whether individual sample data is printed in addition to the aggregated performance summary. + +The following values are supported: + +- :code:`0`: Aggregated report only - prints only the summary table with averaged performance statistics (default) +- :code:`1`: Per-sample reporting on rank 0 - prints individual sample data for each transpose/halo configuration, but only from rank 0 +- :code:`2`: Per-sample reporting on all ranks - prints individual sample data for each transpose/halo configuration from all ranks, gathered and sorted by rank on rank 0 + +Default setting is :code:`0`. + +CUDECOMP_PERFORMANCE_REPORT_SAMPLES +----------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_SAMPLES` controls the number of performance samples to keep for the final performance report. This setting determines the size of the circular buffer used to store timing measurements for each transpose/halo configuration. + +Default setting is :code:`20` samples. + +CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES +------------------------------------------ +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose/halo configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics. + +Default setting is :code:`3` warmup samples. Setting this to 0 disables warmup sample filtering. + +CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR +------------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR` controls the directory where CSV performance reports are written when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. When this variable is set, cuDecomp will write performance data to CSV files in the specified directory. 
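+
+For example, to enable performance reporting with per-sample output from rank 0
+and CSV files written to a chosen directory, settings like the following could be
+used (the directory path below is only illustrative):
+
+.. code-block:: bash
+
+   export CUDECOMP_ENABLE_PERFORMANCE_REPORT=1
+   export CUDECOMP_PERFORMANCE_REPORT_DETAIL=1
+   export CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR=./cudecomp-perf-reports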
+ +CSV files are created with descriptive names encoding the grid configuration, for example: +:code:`cudecomp-perf-report-transpose-aggregated-tcomm_1-hcomm_1-pdims_2x2-gdims_256x256x256-memorder_012012012.csv` + +The following CSV files are generated: + +- Aggregated transpose performance data +- Aggregated halo performance data +- Per-sample transpose data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0) +- Per-sample halo data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0) + +Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format. + +Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output. diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 789ba87..b14b0fe 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -150,14 +150,19 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ #endif template -static void -cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, - const std::vector& send_counts, const std::vector& send_offsets, - T* recv_buff, const std::vector& recv_counts, - const std::vector& recv_offsets, const std::vector& recv_offsets_nvshmem, - cudecompCommAxis comm_axis, cudaStream_t stream) { +static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, + const std::vector& send_counts, + const std::vector& send_offsets, T* recv_buff, + const std::vector& recv_counts, + const std::vector& recv_offsets, + const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, + cudaStream_t stream, cudecompTransposePerformanceSample* current_sample = nullptr) { nvtx::rangePush("cudecompAlltoall"); + if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) && @@ -269,18 +274,28 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ break; } } + + if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], stream)); + current_sample->alltoall_timing_count++; + } + nvtx::rangePop(); } template -static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, - const std::vector& send_counts, - const std::vector& send_offsets, T* recv_buff, - const std::vector& recv_counts, - const std::vector& recv_offsets, - const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, - const std::vector& src_ranks, const std::vector& dst_ranks, - cudaStream_t stream, bool& synced) { +static void +cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, + const std::vector& send_counts, const std::vector& send_offsets, + T* recv_buff, const std::vector& recv_counts, + const std::vector& recv_offsets, + const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, + const std::vector& src_ranks, const std::vector& dst_ranks, cudaStream_t stream, + bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) { + + 
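+  // Performance reporting note: when enabled, the alltoall below is bracketed with CUDA
+  // events recorded on the pipeline stream; self-copy chunks are skipped for timing since
+  // they are expected to overlap with other work.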
// If there are no transfers to complete, quick return + if (send_counts.size() == 0 && recv_counts.size() == 0) { return; } + std::ostringstream os; os << "cudecompAlltoallPipelined_"; for (int i = 0; i < src_ranks.size(); ++i) { @@ -289,6 +304,14 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude } nvtx::rangePush(os.str()); + int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank; + if (handle->performance_report_enable && src_ranks[0] != self_rank) { + // Note: skipping self-copy for timing as it should be overlapped + CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0)); + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], + handle->pl_stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) && @@ -459,6 +482,12 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude break; } } + + if (handle->performance_report_enable && src_ranks[0] != self_rank) { + CHECK_CUDA( + cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream)); + current_sample->alltoall_timing_count++; + } nvtx::rangePop(); } @@ -468,9 +497,14 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG const std::array& send_counts, const std::array& send_offsets, T* recv_buff, const std::array& recv_counts, - const std::array& recv_offsets, cudaStream_t stream = 0) { + const std::array& recv_offsets, cudaStream_t stream = 0, + cudecompHaloPerformanceSample* current_sample = nullptr) { nvtx::rangePush("cudecompSendRecvPair"); + if (handle->performance_report_enable && current_sample) { + CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_start_event, stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && haloBackendRequiresMpi(grid_desc->config.halo_comm_backend) && @@ -592,6 +626,11 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG break; } } + + if (handle->performance_report_enable && current_sample) { + CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_end_event, stream)); + } + nvtx::rangePop(); } diff --git a/include/internal/common.h b/include/internal/common.h index a299271..6738af5 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -33,8 +33,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -43,6 +45,10 @@ #include #include #include +#ifdef ENABLE_NVSHMEM +#include +#include +#endif #include "cudecomp.h" #include "internal/checks.h" @@ -108,6 +114,15 @@ struct cudecompHandle { // CUDA graphs bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used + + // Performance reporting related entries + bool performance_report_enable = false; // flag to track if performance reporting is enabled + int32_t performance_report_detail = + 0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks + int32_t performance_report_samples = 20; // number of performance samples to keep for final report + int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration + std::string 
performance_report_write_dir =
+      ""; // directory to write CSV performance reports, empty means no file writing
 };
 
 // Structure with information about row/column communicator
@@ -124,6 +139,41 @@ struct cudecompCommInfo {
 #endif
 };
 
+// Structure to contain data for transpose performance sample
+struct cudecompTransposePerformanceSample {
+  cudaEvent_t transpose_start_event;
+  cudaEvent_t transpose_end_event;
+  std::vector<cudaEvent_t> alltoall_start_events;
+  std::vector<cudaEvent_t> alltoall_end_events;
+  int32_t alltoall_timing_count = 0;
+  size_t alltoall_bytes = 0;
+  bool valid = false;
+};
+
+// Collection of transpose performance samples for a specific configuration
+struct cudecompTransposePerformanceSampleCollection {
+  std::vector<cudecompTransposePerformanceSample> samples;
+  int32_t sample_idx = 0;
+  int32_t warmup_count = 0;
+};
+
+// Structure to contain data for halo performance sample
+struct cudecompHaloPerformanceSample {
+  cudaEvent_t halo_start_event;
+  cudaEvent_t halo_end_event;
+  cudaEvent_t sendrecv_start_event;
+  cudaEvent_t sendrecv_end_event;
+  size_t sendrecv_bytes = 0;
+  bool valid = false;
+};
+
+// Collection of halo performance samples for a specific configuration
+struct cudecompHaloPerformanceSampleCollection {
+  std::vector<cudecompHaloPerformanceSample> samples;
+  int32_t sample_idx = 0;
+  int32_t warmup_count = 0;
+};
+
 // cuDecomp grid descriptor containing grid-specific information
 struct cudecompGridDesc {
   cudecompGridDescConfig_t config; // configuration struct
@@ -144,6 +194,23 @@ struct cudecompGridDesc {
   cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from
                                       // handle
 
+  // Performance reporting related entries
+  std::vector<cudaEvent_t> alltoall_start_events; // events for alltoall timing
+  std::vector<cudaEvent_t> alltoall_end_events;   // events for alltoall timing
+  int32_t alltoall_timing_count = 0; // count of alltoall timing event pairs (for pipelined alltoall)
+  cudaEvent_t transpose_start_event; // event for transpose timing
+  cudaEvent_t transpose_end_event;   // event for transpose timing
+
+  std::unordered_map<std::tuple<int, int, std::array<int32_t, 3>, std::array<int32_t, 3>,
+                                std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
+                     cudecompTransposePerformanceSampleCollection>
+      transpose_perf_samples_map;
+
+  std::unordered_map<std::tuple<int, int, std::array<int32_t, 3>, std::array<bool, 3>, std::array<int32_t, 3>,
+                                bool, cudecompDataType_t>,
+                     cudecompHaloPerformanceSampleCollection>
+      halo_perf_samples_map;
+
   bool initialized = false;
 };
 
diff --git a/include/internal/halo.h b/include/internal/halo.h
index 096292e..55a4a8f 100644
--- a/include/internal/halo.h
+++ b/include/internal/halo.h
@@ -41,6 +41,8 @@
 #include "internal/comm_routines.h"
 #include "internal/cudecomp_kernels.h"
 #include "internal/nvtx.h"
+#include "internal/performance.h"
+#include "internal/utils.h"
 
 namespace cudecomp {
 
@@ -73,6 +75,20 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
   // Quick return if no halos
   if (halo_extents[dim] == 0) { return; }
 
+  cudecompHaloPerformanceSample* current_sample = nullptr;
+  if (handle->performance_report_enable) {
+    auto& samples =
+        getOrCreateHaloPerformanceSamples(handle, grid_desc,
+                                          createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
+                                                           padding.data(), getCudecompDataType<T>()));
+    current_sample = &samples.samples[samples.sample_idx];
+    current_sample->sendrecv_bytes = 0;
+    current_sample->valid = true;
+
+    // Record start event
+    CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream));
+  }
+
   // Check if halos include more than one process (unsupported currently).
int count = 0; for (int i = 0; i < 3; ++i) { @@ -120,6 +136,13 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG c = 0; } else if (neighbors[0] == -1 && neighbors[1] == -1) { // Single rank in this dimension and not periodic. Return. + if (handle->performance_report_enable && current_sample) { + // Record end event and advance sample even for early return + CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream)); + advanceHaloPerformanceSample(handle, grid_desc, + createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), + padding.data(), getCudecompDataType())); + } return; } @@ -204,7 +227,14 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG std::array offsets{}; offsets[1] = halo_size; - cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream); + if (handle->performance_report_enable && current_sample) { + current_sample->sendrecv_bytes = 0; + for (int i = 0; i < 2; ++i) { + if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); } + } + } + cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream, + current_sample); // Unpack // Left @@ -261,10 +291,24 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG lx[dim] = shape_g_h_p[dim] - halo_extents[dim]; recv_offsets[1] = getPencilPtrOffset(pinfo_h, lx); - cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, - stream); + if (handle->performance_report_enable && current_sample) { + current_sample->sendrecv_bytes = 0; + for (int i = 0; i < 2; ++i) { + if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); } + } + } + cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, stream, + current_sample); } break; } + + if (handle->performance_report_enable && current_sample) { + // Record end event + CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream)); + advanceHaloPerformanceSample(handle, grid_desc, + createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), + padding.data(), getCudecompDataType())); + } } template diff --git a/include/internal/performance.h b/include/internal/performance.h new file mode 100644 index 0000000..82f54e2 --- /dev/null +++ b/include/internal/performance.h @@ -0,0 +1,149 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CUDECOMP_PERFORMANCE_H
+#define CUDECOMP_PERFORMANCE_H
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "internal/common.h"
+
+namespace cudecomp {
+
+using cudecompTransposeConfigKey = std::tuple<int,                    // ax
+                                              int,                    // dir
+                                              std::array<int32_t, 3>, // input_halo_extents
+                                              std::array<int32_t, 3>, // output_halo_extents
+                                              std::array<int32_t, 3>, // input_padding
+                                              std::array<int32_t, 3>, // output_padding
+                                              bool,                   // inplace
+                                              bool,                   // managed_memory
+                                              cudecompDataType_t      // datatype
+                                              >;
+
+using cudecompHaloConfigKey = std::tuple<int,                    // ax
+                                         int,                    // dim
+                                         std::array<int32_t, 3>, // halo_extents
+                                         std::array<bool, 3>,    // halo_periods
+                                         std::array<int32_t, 3>, // padding
+                                         bool,                   // managed_memory
+                                         cudecompDataType_t      // datatype
+                                         >;
+
+// Helper structure for transpose statistics
+struct TransposePerformanceStats {
+  std::string operation;
+  std::string datatype;
+  std::string input_halos;
+  std::string output_halos;
+  std::string input_padding;
+  std::string output_padding;
+  std::string inplace;
+  std::string managed;
+  int samples;
+  float total_time_avg;
+  float alltoall_time_avg;
+  float local_time_avg;
+  float alltoall_bw_avg;
+};
+
+// Helper structure for halo statistics
+struct HaloPerformanceStats {
+  std::string operation;
+  std::string datatype;
+  int dim;
+  std::string halos;
+  std::string periods;
+  std::string padding;
+  std::string managed;
+  int samples;
+  float total_time_avg;
+  float sendrecv_time_avg;
+  float local_time_avg;
+  float sendrecv_bw_avg;
+};
+
+// Helper structure to hold pre-computed transpose timing data
+struct TransposeConfigTimingData {
+  TransposePerformanceStats stats;
+  std::vector<float> total_times;
+  std::vector<float> alltoall_times;
+  std::vector<float> local_times;
+  std::vector<float> alltoall_bws;
+  std::vector<int> sample_indices;
+};
+
+// Helper structure to hold pre-computed halo timing data
+struct HaloConfigTimingData {
+  HaloPerformanceStats stats;
+  std::vector<float> total_times;
+  std::vector<float> sendrecv_times;
+  std::vector<float> local_times;
+  std::vector<float> sendrecv_bws;
+  std::vector<int> sample_indices;
+};
+
+void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc);
+
+void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc);
+
+void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config);
+
+cudecompTransposePerformanceSampleCollection&
+getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config);
+
+void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                  const cudecompHaloConfigKey& config);
+
+cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const cudecompHandle_t handle,
+                                                                           cudecompGridDesc_t grid_desc,
+                                                                           const 
cudecompHaloConfigKey& config); + +// Helper function to create transpose configuration key +cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, + const int32_t input_halo_extents_ptr[], + const int32_t output_halo_extents_ptr[], + const int32_t input_padding_ptr[], const int32_t output_padding_ptr[], + cudecompDataType_t datatype); + +// Helper function to create halo configuration key +cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, const int32_t halo_extents_ptr[], + const bool halo_periods_ptr[], const int32_t padding_ptr[], + cudecompDataType_t datatype); + +} // namespace cudecomp + +#endif // CUDECOMP_PERFORMANCE_H diff --git a/include/internal/transpose.h b/include/internal/transpose.h index fb47c76..afb6480 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -44,6 +44,8 @@ #include "internal/comm_routines.h" #include "internal/cudecomp_kernels.h" #include "internal/nvtx.h" +#include "internal/performance.h" +#include "internal/utils.h" namespace cudecomp { @@ -250,6 +252,22 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream)); } + cudecompTransposePerformanceSample* current_sample = nullptr; + if (handle->performance_report_enable) { + auto& samples = + getOrCreateTransposePerformanceSamples(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + current_sample = &samples.samples[samples.sample_idx]; + current_sample->alltoall_timing_count = 0; + current_sample->alltoall_bytes = pinfo_a.size * sizeof(T); + current_sample->valid = true; + + // Record start event + CHECK_CUDA(cudaEventRecord(current_sample->transpose_start_event, stream)); + } + // Adjust pointers to handle special cases bool direct_pack = false; bool direct_transpose = false; @@ -259,6 +277,14 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (inplace) { if (halos_padding_equal) { // Single rank, in place, Pack -> Unpack: No transpose necessary. + if (handle->performance_report_enable) { + // Record performance data + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + advanceTransposePerformanceSample(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + } return; } } else { @@ -333,6 +359,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } bool data_transposed = false; + if (o1 != i1) { if (pinfo_b.order[2] == ax_a && !orders_equal) { // Transpose/Pack @@ -523,6 +550,13 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o1 == output) { // o1 is output. Return. 
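+      // Close out the timing sample before this early return so it is still
+      // counted toward the performance report.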
+ if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + advanceTransposePerformanceSample(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + } return; } } else { @@ -539,7 +573,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (splits_a.size() > 1) { if (!pipelined) { cudecompAlltoall(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, stream); + recv_offsets_nvshmem, comm_axis, stream, current_sample); } } else { o2 = o1; @@ -609,7 +643,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, + current_sample); } if (o2 != o3) { @@ -704,7 +739,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, + current_sample); } } @@ -761,7 +797,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, + current_sample); } } @@ -794,6 +831,15 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } } } + + if (handle->performance_report_enable) { + // Record performance data + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + advanceTransposePerformanceSample(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + } } template diff --git a/src/autotune.cc b/src/autotune.cc index b49cf0b..4cb0dbe 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -45,6 +45,7 @@ #include "internal/checks.h" #include "internal/common.h" #include "internal/halo.h" +#include "internal/performance.h" #include "internal/transpose.h" namespace cudecomp { @@ -288,6 +289,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d if (transposeBackendRequiresNvshmem(comm)) { w = work_nvshmem; } #endif + // Reset performance samples + resetPerformanceSamples(handle, grid_desc); + // Warmup for (int i = 0; i < options->n_warmup_trials; ++i) { if (options->transpose_op_weights[0] != 0.0) { @@ -431,6 +435,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d } } + // Print performance report for this configuration if enabled + if (handle->performance_report_enable && !skip_case) { printPerformanceReport(handle, grid_desc); } + if (skip_case) continue; if (times_w[2] < t_best) { @@ -515,6 +522,9 @@ void 
autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d CHECK_MPI(MPI_Barrier(handle->mpi_comm)); CHECK_CUDA(cudaFree(tmp1)); CHECK_CUDA(cudaFree(tmp2)); + + // Reset performance samples after autotuning + resetPerformanceSamples(handle, grid_desc); } void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, @@ -682,6 +692,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, if (haloBackendRequiresNvshmem(comm)) { w = work_nvshmem; } #endif + // Reset performance samples + resetPerformanceSamples(handle, grid_desc); + // Warmup for (int i = 0; i < options->n_warmup_trials; ++i) { for (int dim = 0; dim < 3; ++dim) { @@ -742,6 +755,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, } } } + auto times = processTimings(handle, trial_times, 1000.); if (handle->rank == 0) { @@ -759,6 +773,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, } } + // Print performance report for this configuration if enabled + if (handle->performance_report_enable && !skip_case) { printPerformanceReport(handle, grid_desc); } + if (skip_case) continue; if (times[2] < t_best) { @@ -822,6 +839,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, CHECK_MPI(MPI_Barrier(handle->mpi_comm)); double t_end = MPI_Wtime(); if (handle->rank == 0) printf("CUDECOMP: halo autotuning time [s]: %f\n", t_end - t_start); + + // Reset performance samples after autotuning + resetPerformanceSamples(handle, grid_desc); } } // namespace cudecomp diff --git a/src/cudecomp.cc b/src/cudecomp.cc index bd4338e..d0852f7 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -329,6 +329,51 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) { handle->cuda_graphs_enable = false; #endif } + + // Check CUDECOMP_ENABLE_PERFORMANCE_REPORT (Performance reporting) + const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORT"); + if (performance_report_str) { + handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1; + } + + // Check CUDECOMP_PERFORMANCE_REPORT_DETAIL (Performance report detail level) + const char* performance_detail_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_DETAIL"); + if (performance_detail_str) { + int32_t detail = std::strtol(performance_detail_str, nullptr, 10); + if (detail >= 0 && detail <= 2) { + handle->performance_report_detail = detail; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_DETAIL value (%d). Using default (0).\n", detail); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_SAMPLES (Number of performance samples to keep) + const char* performance_samples_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES"); + if (performance_samples_str) { + int32_t samples = std::strtol(performance_samples_str, nullptr, 10); + if (samples > 0) { // Only require positive values + handle->performance_report_samples = samples; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_SAMPLES value (%d). 
Using default (%d).\n", samples, + handle->performance_report_samples); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES (Number of initial samples to ignore) + const char* performance_warmup_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES"); + if (performance_warmup_str) { + int32_t warmup_samples = std::strtol(performance_warmup_str, nullptr, 10); + if (warmup_samples >= 0) { // Only require non-negative values + handle->performance_report_warmup_samples = warmup_samples; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES value (%d). Using default (%d).\n", + warmup_samples, handle->performance_report_warmup_samples); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR (Directory for CSV performance reports) + const char* performance_write_dir_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR"); + if (performance_write_dir_str) { handle->performance_report_write_dir = std::string(performance_write_dir_str); } } #ifdef ENABLE_NVSHMEM @@ -582,14 +627,15 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes handle->nvshmem_initialized = true; handle->nvshmem_allocation_size = 0; } - if (!handle->pl_stream) { - int greatest_priority; - CHECK_CUDA(cudaDeviceGetStreamPriorityRange(nullptr, &greatest_priority)); - CHECK_CUDA(cudaStreamCreateWithPriority(&handle->pl_stream, cudaStreamNonBlocking, greatest_priority)); - } #endif } + if (!handle->pl_stream) { + int greatest_priority; + CHECK_CUDA(cudaDeviceGetStreamPriorityRange(nullptr, &greatest_priority)); + CHECK_CUDA(cudaStreamCreateWithPriority(&handle->pl_stream, cudaStreamNonBlocking, greatest_priority)); + } + // Create CUDA events for scheduling grid_desc->events.resize(handle->nranks); for (auto& event : grid_desc->events) { @@ -722,6 +768,37 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); } #endif + if (handle->performance_report_enable) { + // Print performance report before destroying events + printPerformanceReport(handle, grid_desc); + + // Destroy all transpose performance sample events in the map + for (auto& entry : grid_desc->transpose_perf_samples_map) { + auto& collection = entry.second; + for (auto& sample : collection.samples) { + CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event)); + CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event)); + for (auto& event : sample.alltoall_start_events) { + CHECK_CUDA(cudaEventDestroy(event)); + } + for (auto& event : sample.alltoall_end_events) { + CHECK_CUDA(cudaEventDestroy(event)); + } + } + } + + // Destroy all halo performance sample events in the map + for (auto& entry : grid_desc->halo_perf_samples_map) { + auto& collection = entry.second; + for (auto& sample : collection.samples) { + CHECK_CUDA(cudaEventDestroy(sample.halo_start_event)); + CHECK_CUDA(cudaEventDestroy(sample.halo_end_event)); + CHECK_CUDA(cudaEventDestroy(sample.sendrecv_start_event)); + CHECK_CUDA(cudaEventDestroy(sample.sendrecv_end_event)); + } + } + } + if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { // Release grid descriptor references to NCCL communicators diff --git a/src/performance.cc b/src/performance.cc new file mode 100644 index 0000000..1910e9d --- /dev/null +++ b/src/performance.cc @@ -0,0 +1,950 @@ +/* + * SPDX-FileCopyrightText: 
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <algorithm>
+#include <cstdio>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <map>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <mpi.h>
+
+#include "cudecomp.h"
+#include "internal/checks.h"
+#include "internal/performance.h"
+
+namespace cudecomp {
+
+// Helper function to create file name with grid descriptor information
+std::filesystem::path createPerformanceReportFileName(const std::string& write_dir, const std::string& table_type,
+                                                      const cudecompGridDesc_t grid_desc) {
+  std::ostringstream filename;
+  filename << "cudecomp-perf-report-" << table_type << "-";
+  filename << "tcomm_" << grid_desc->config.transpose_comm_backend << "-";
+  filename << "hcomm_" << grid_desc->config.halo_comm_backend << "-";
+  filename << "pdims_" << grid_desc->config.pdims[0] << "x" << grid_desc->config.pdims[1] << "-";
+  filename << "gdims_" << grid_desc->config.gdims[0] << "x" << grid_desc->config.gdims[1] << "x"
+           << grid_desc->config.gdims[2] << "-";
+  filename << "memorder_";
+  for (int axis = 0; axis < 3; ++axis) {
+    filename << grid_desc->config.transpose_mem_order[axis][0] << grid_desc->config.transpose_mem_order[axis][1]
+             << grid_desc->config.transpose_mem_order[axis][2];
+  }
+  filename << ".csv";
+
+  return std::filesystem::path(write_dir) / filename.str();
+}
+
+// Helper function to write CSV header comment with grid configuration information
+void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) {
+  file << "# Transpose backend: " << cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)
+       << "\n";
+  file << "# Halo backend: " << cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend) << "\n";
+  file << "# Process grid: [" << grid_desc->config.pdims[0] << ", " << grid_desc->config.pdims[1] << "]\n";
+  file << "# Global dimensions: [" << grid_desc->config.gdims[0] << ", " << grid_desc->config.gdims[1] << ", "
+       << grid_desc->config.gdims[2] << 
"]\n"; + file << "# Memory order: "; + for (int axis = 0; axis < 3; ++axis) { + file << "[" << grid_desc->config.transpose_mem_order[axis][0] << "," + << grid_desc->config.transpose_mem_order[axis][1] << "," << grid_desc->config.transpose_mem_order[axis][2] + << "]"; + if (axis < 2) file << "; "; + } + file << "\n#\n"; +} + +// Write transpose performance table to CSV +void writeTransposePerformanceTableCSV(const std::vector& all_transpose_config_data, + const cudecompGridDesc_t grid_desc, const std::string& write_dir) { + if (all_transpose_config_data.empty()) return; + + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose-aggregated", grid_desc); + std::ofstream file(filename); + if (!file.is_open()) { + printf("CUDECOMP:WARN: Could not open file %s for writing\n", filename.string().c_str()); + return; + } + + writeCSVHeader(file, grid_desc); + + // Write table header + file << "operation,dtype,input_halo_extents,output_halo_extents,input_padding,output_padding,inplace,managed,samples," + "total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; + + // Write table data rows + for (const auto& config_data : all_transpose_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + file << stats.operation << "," << stats.datatype << "," + << "\"" << stats.input_halos << "\"," + << "\"" << stats.output_halos << "\"," + << "\"" << stats.input_padding << "\"," + << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << stats.samples + << "," << std::fixed << std::setprecision(3) << stats.total_time_avg << "," << std::fixed + << std::setprecision(3) << stats.alltoall_time_avg << "," << std::fixed << std::setprecision(3) + << stats.local_time_avg << "," << std::fixed << std::setprecision(3) << stats.alltoall_bw_avg << "\n"; + } + } + + file.close(); + printf("CUDECOMP:\n"); + printf("CUDECOMP: Wrote transpose performance data to %s\n", filename.string().c_str()); +} + +// Write halo performance table to CSV +void writeHaloPerformanceTableCSV(const std::vector& all_halo_config_data, + const cudecompGridDesc_t grid_desc, const std::string& write_dir) { + if (all_halo_config_data.empty()) return; + + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-aggregated", grid_desc); + std::ofstream file(filename); + if (!file.is_open()) { + printf("CUDECOMP:WARN: Could not open file %s for writing\n", filename.string().c_str()); + return; + } + + writeCSVHeader(file, grid_desc); + + // Write table header + file << "operation,dtype,dim,halo_extent,periods,padding,managed,samples,total_ms,SR_ms,local_ms,SR_BW_GBps\n"; + + // Write table data rows + for (const auto& config_data : all_halo_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + file << stats.operation << "," << stats.datatype << "," << stats.dim << "," + << "\"" << stats.halos << "\"," + << "\"" << stats.periods << "\"," + << "\"" << stats.padding << "\"," << stats.managed << "," << stats.samples << "," << std::fixed + << std::setprecision(3) << stats.total_time_avg << "," << std::fixed << std::setprecision(3) + << stats.sendrecv_time_avg << "," << std::fixed << std::setprecision(3) << stats.local_time_avg << "," + << std::fixed << std::setprecision(3) << stats.sendrecv_bw_avg << "\n"; + } + } + + file.close(); + printf("CUDECOMP:\n"); + printf("CUDECOMP: Wrote halo performance data to %s\n", filename.string().c_str()); +} + +// Helper function to create transpose configuration key 
+cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output,
+                                                 const int32_t input_halo_extents_ptr[],
+                                                 const int32_t output_halo_extents_ptr[],
+                                                 const int32_t input_padding_ptr[], const int32_t output_padding_ptr[],
+                                                 cudecompDataType_t datatype) {
+  std::array<int32_t, 3> input_halo_extents{0, 0, 0};
+  std::array<int32_t, 3> output_halo_extents{0, 0, 0};
+  std::array<int32_t, 3> input_padding{0, 0, 0};
+  std::array<int32_t, 3> output_padding{0, 0, 0};
+
+  if (input_halo_extents_ptr) {
+    std::copy(input_halo_extents_ptr, input_halo_extents_ptr + 3, input_halo_extents.begin());
+  }
+  if (output_halo_extents_ptr) {
+    std::copy(output_halo_extents_ptr, output_halo_extents_ptr + 3, output_halo_extents.begin());
+  }
+  if (input_padding_ptr) { std::copy(input_padding_ptr, input_padding_ptr + 3, input_padding.begin()); }
+  if (output_padding_ptr) { std::copy(output_padding_ptr, output_padding_ptr + 3, output_padding.begin()); }
+
+  bool inplace = (input == output);
+  bool managed_memory = isManagedPointer(input) || isManagedPointer(output);
+
+  return std::make_tuple(ax, dir, input_halo_extents, output_halo_extents, input_padding, output_padding, inplace,
+                         managed_memory, datatype);
+}
+
+// Helper function to create halo configuration key
+cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, const int32_t halo_extents_ptr[],
+                                       const bool halo_periods_ptr[], const int32_t padding_ptr[],
+                                       cudecompDataType_t datatype) {
+  std::array<int32_t, 3> halo_extents{0, 0, 0};
+  std::array<bool, 3> halo_periods{false, false, false};
+  std::array<int32_t, 3> padding{0, 0, 0};
+
+  if (halo_extents_ptr) { std::copy(halo_extents_ptr, halo_extents_ptr + 3, halo_extents.begin()); }
+  if (halo_periods_ptr) { std::copy(halo_periods_ptr, halo_periods_ptr + 3, halo_periods.begin()); }
+  if (padding_ptr) { std::copy(padding_ptr, padding_ptr + 3, padding.begin()); }
+
+  bool managed_memory = isManagedPointer(input);
+
+  return std::make_tuple(ax, dim, halo_extents, halo_periods, padding, managed_memory, datatype);
+}
+
+// Helper function to get or create transpose performance sample collection for a configuration
+cudecompTransposePerformanceSampleCollection&
+getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config) {
+  auto& samples_map = grid_desc->transpose_perf_samples_map;
+
+  if (samples_map.find(config) == samples_map.end()) {
+    // Create new sample collection for this configuration
+    cudecompTransposePerformanceSampleCollection collection;
+    collection.samples.resize(handle->performance_report_samples);
+    collection.sample_idx = 0;
+
+    // Create events for each sample
+    for (auto& sample : collection.samples) {
+      CHECK_CUDA(cudaEventCreate(&sample.transpose_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.transpose_end_event));
+      sample.alltoall_start_events.resize(handle->nranks);
+      sample.alltoall_end_events.resize(handle->nranks);
+      for (auto& event : sample.alltoall_start_events) {
+        CHECK_CUDA(cudaEventCreate(&event));
+      }
+      for (auto& event : sample.alltoall_end_events) {
+        CHECK_CUDA(cudaEventCreate(&event));
+      }
+      sample.valid = false;
+    }
+
+    samples_map[config] = std::move(collection);
+  }
+
+  return samples_map[config];
+}
+
+// Helper function to get or create halo performance sample collection for a configuration
+cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const cudecompHandle_t handle,
+                                                                           cudecompGridDesc_t grid_desc,
+                                                                           const cudecompHaloConfigKey& config) {
+  auto& samples_map = 
grid_desc->halo_perf_samples_map;
+
+  if (samples_map.find(config) == samples_map.end()) {
+    // Create new sample collection for this configuration
+    cudecompHaloPerformanceSampleCollection collection;
+    collection.samples.resize(handle->performance_report_samples);
+    collection.sample_idx = 0;
+
+    // Create events for each sample
+    for (auto& sample : collection.samples) {
+      CHECK_CUDA(cudaEventCreate(&sample.halo_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.halo_end_event));
+      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_end_event));
+      sample.valid = false;
+    }
+
+    samples_map[config] = std::move(collection);
+  }
+
+  return samples_map[config];
+}
+
+// Helper function to format integer array as compact string
+std::string formatArray(const std::array<int32_t, 3>& arr) {
+  std::ostringstream oss;
+  oss << "[" << arr[0] << "," << arr[1] << "," << arr[2] << "]";
+  return oss.str();
+}
+
+// Helper function to format boolean array as compact string
+std::string formatArray(const std::array<bool, 3>& arr) {
+  std::ostringstream oss;
+  oss << "[" << (arr[0] ? "1" : "0") << "," << (arr[1] ? "1" : "0") << "," << (arr[2] ? "1" : "0") << "]";
+  return oss.str();
+}
+
+// Helper function to get operation name from transpose config
+std::string getTransposeOperationName(const cudecompTransposeConfigKey& config) {
+  int ax = std::get<0>(config);
+  int dir = std::get<1>(config);
+
+  if (ax == 0) {
+    return "TransposeXY";
+  } else if (ax == 1 && dir == 1) {
+    return "TransposeYZ";
+  } else if (ax == 2) {
+    return "TransposeZY";
+  } else if (ax == 1 && dir == -1) {
+    return "TransposeYX";
+  }
+  return "Unknown";
+}
+
+// Helper function to get operation name from halo config
+std::string getHaloOperationName(const cudecompHaloConfigKey& config) {
+  int ax = std::get<0>(config);
+
+  if (ax == 0) {
+    return "HaloX";
+  } else if (ax == 1) {
+    return "HaloY";
+  } else if (ax == 2) {
+    return "HaloZ";
+  }
+  return "Unknown";
+}
+
+// Helper function to convert datatype to string
+std::string getDatatypeString(cudecompDataType_t datatype) {
+  switch (datatype) {
+  case CUDECOMP_FLOAT: return "S";
+  case CUDECOMP_DOUBLE: return "D";
+  case CUDECOMP_FLOAT_COMPLEX: return "C";
+  case CUDECOMP_DOUBLE_COMPLEX: return "Z";
+  default: return "unknown";
+  }
+}
+
+// Comparison function to order transpose configurations.
+// Ordering is (operation, datatype, halos, padding, inplace, managed).
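+// Sorting on fixed operation/datatype priorities keeps report rows in a stable,
+// predictable order across runs.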
+bool compareTransposeConfigData(const TransposeConfigTimingData& a, const TransposeConfigTimingData& b) {
+  static const std::map<std::string, int> op_priority = {{"TransposeXY", 0},
+                                                         {"TransposeYZ", 1},
+                                                         {"TransposeZY", 2},
+                                                         {"TransposeYX", 3}};
+
+  static const std::map<std::string, int> dtype_priority = {{"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}};
+
+  if (a.stats.operation != b.stats.operation) {
+    return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation);
+  }
+  if (a.stats.datatype != b.stats.datatype) {
+    return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype);
+  }
+  if (a.stats.input_halos != b.stats.input_halos) { return a.stats.input_halos < b.stats.input_halos; }
+  if (a.stats.output_halos != b.stats.output_halos) { return a.stats.output_halos < b.stats.output_halos; }
+  if (a.stats.input_padding != b.stats.input_padding) { return a.stats.input_padding < b.stats.input_padding; }
+  if (a.stats.output_padding != b.stats.output_padding) { return a.stats.output_padding < b.stats.output_padding; }
+  if (a.stats.inplace != b.stats.inplace) { return a.stats.inplace < b.stats.inplace; }
+  return a.stats.managed < b.stats.managed;
+}
+
+// Comparison function to order halo configurations.
+// Ordering is (operation, dim, datatype, halos, periods, padding, managed).
+bool compareHaloConfigData(const HaloConfigTimingData& a, const HaloConfigTimingData& b) {
+  static const std::map<std::string, int> op_priority = {{"HaloX", 0}, {"HaloY", 1}, {"HaloZ", 2}};
+
+  static const std::map<std::string, int> dtype_priority = {{"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}};
+
+  if (a.stats.operation != b.stats.operation) {
+    return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation);
+  }
+  if (a.stats.dim != b.stats.dim) { return a.stats.dim < b.stats.dim; }
+  if (a.stats.datatype != b.stats.datatype) {
+    return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype);
+  }
+  if (a.stats.halos != b.stats.halos) { return a.stats.halos < b.stats.halos; }
+  if (a.stats.periods != b.stats.periods) { return a.stats.periods < b.stats.periods; }
+  if (a.stats.padding != b.stats.padding) { return a.stats.padding < b.stats.padding; }
+  return a.stats.managed < b.stats.managed;
+}
+
+// Function to compute average across ranks
+float computeGlobalAverage(const std::vector<float>& values, const cudecompHandle_t handle) {
+  if (values.size() == 0) { return 0.0f; }
+
+  float value = std::accumulate(values.begin(), values.end(), 0.0f);
+  value /= values.size();
+  CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &value, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm));
+  value /= handle->nranks;
+  return value;
+}
+
+// Function to gather data from all ranks
+void gatherSampleData(const std::vector<float>& local_data, std::vector<float>& all_data,
+                      const cudecompHandle_t handle) {
+  int num_samples = local_data.size();
+  if (handle->rank == 0) { all_data.resize(num_samples * handle->nranks); }
+
+  CHECK_MPI(MPI_Gather(local_data.data(), num_samples, MPI_FLOAT, all_data.data(), num_samples, MPI_FLOAT, 0,
+                       handle->mpi_comm));
+}
+
+// Process transpose timing data from sample collections
+TransposeConfigTimingData processTransposeConfig(const cudecompTransposeConfigKey& config,
+                                                 const cudecompTransposePerformanceSampleCollection& collection,
+                                                 const cudecompHandle_t handle) {
+  TransposeConfigTimingData config_data;
+
+  config_data.total_times.reserve(collection.samples.size());
+  config_data.alltoall_times.reserve(collection.samples.size());
+  config_data.local_times.reserve(collection.samples.size());
+  
config_data.alltoall_bws.reserve(collection.samples.size()); + + // Collect valid samples and compute elapsed times + for (int i = 0; i < collection.samples.size(); ++i) { + const auto& sample = collection.samples[i]; + if (!sample.valid) continue; + + float alltoall_timing_ms = 0.0f; + for (int j = 0; j < sample.alltoall_timing_count; ++j) { + float elapsed_time; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, sample.alltoall_start_events[j], sample.alltoall_end_events[j])); + alltoall_timing_ms += elapsed_time; + } + + float transpose_timing_ms; + CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, sample.transpose_start_event, sample.transpose_end_event)); + + config_data.total_times.push_back(transpose_timing_ms); + config_data.alltoall_times.push_back(alltoall_timing_ms); + config_data.local_times.push_back(transpose_timing_ms - alltoall_timing_ms); + + float alltoall_bw = (alltoall_timing_ms > 0) ? sample.alltoall_bytes * 1e-6 / alltoall_timing_ms : 0; + config_data.alltoall_bws.push_back(alltoall_bw); + } + + // Prepare aggregated statistics + TransposePerformanceStats& stats = config_data.stats; + stats.operation = getTransposeOperationName(config); + stats.datatype = getDatatypeString(std::get<8>(config)); + + auto input_halos = std::get<2>(config); + auto output_halos = std::get<3>(config); + auto input_padding = std::get<4>(config); + auto output_padding = std::get<5>(config); + + stats.input_halos = formatArray(input_halos); + stats.output_halos = formatArray(output_halos); + stats.input_padding = formatArray(input_padding); + stats.output_padding = formatArray(output_padding); + stats.inplace = std::get<6>(config) ? "T" : "F"; + stats.managed = std::get<7>(config) ? "T" : "F"; + stats.samples = config_data.total_times.size(); + + // Compute average statistics and reduce across all ranks + stats.total_time_avg = computeGlobalAverage(config_data.total_times, handle); + stats.alltoall_time_avg = computeGlobalAverage(config_data.alltoall_times, handle); + stats.local_time_avg = computeGlobalAverage(config_data.local_times, handle); + stats.alltoall_bw_avg = computeGlobalAverage(config_data.alltoall_bws, handle); + + return config_data; +} + +// Process halo timing data from sample collections +HaloConfigTimingData processHaloConfig(const cudecompHaloConfigKey& config, + const cudecompHaloPerformanceSampleCollection& collection, + const cudecompHandle_t handle) { + HaloConfigTimingData config_data; + + config_data.total_times.reserve(collection.samples.size()); + config_data.sendrecv_times.reserve(collection.samples.size()); + config_data.local_times.reserve(collection.samples.size()); + config_data.sendrecv_bws.reserve(collection.samples.size()); + + // Collect valid samples and compute elapsed times + for (int i = 0; i < collection.samples.size(); ++i) { + const auto& sample = collection.samples[i]; + if (!sample.valid) continue; + + float sendrecv_timing_ms = 0.0f; + if (sample.sendrecv_bytes > 0) { + CHECK_CUDA(cudaEventElapsedTime(&sendrecv_timing_ms, sample.sendrecv_start_event, sample.sendrecv_end_event)); + } + + float halo_timing_ms; + CHECK_CUDA(cudaEventElapsedTime(&halo_timing_ms, sample.halo_start_event, sample.halo_end_event)); + + config_data.total_times.push_back(halo_timing_ms); + config_data.sendrecv_times.push_back(sendrecv_timing_ms); + config_data.local_times.push_back(halo_timing_ms - sendrecv_timing_ms); + + float sendrecv_bw = (sendrecv_timing_ms > 0) ? 
sample.sendrecv_bytes * 1e-6 / sendrecv_timing_ms : 0;
+    config_data.sendrecv_bws.push_back(sendrecv_bw);
+  }
+
+  // Prepare aggregated statistics
+  HaloPerformanceStats& stats = config_data.stats;
+  stats.operation = getHaloOperationName(config);
+  stats.datatype = getDatatypeString(std::get<6>(config));
+  stats.dim = std::get<1>(config);
+
+  // Format halo extents, periods, and padding
+  auto halo_extents = std::get<2>(config);
+  auto halo_periods = std::get<3>(config);
+  auto padding = std::get<4>(config);
+
+  stats.halos = formatArray(halo_extents);
+  stats.periods = formatArray(halo_periods);
+  stats.padding = formatArray(padding);
+  stats.managed = std::get<5>(config) ? "T" : "F";
+  stats.samples = config_data.total_times.size();
+
+  // Compute average statistics across all ranks
+  stats.total_time_avg = computeGlobalAverage(config_data.total_times, handle);
+  stats.sendrecv_time_avg = computeGlobalAverage(config_data.sendrecv_times, handle);
+  stats.local_time_avg = computeGlobalAverage(config_data.local_times, handle);
+  stats.sendrecv_bw_avg = computeGlobalAverage(config_data.sendrecv_bws, handle);
+
+  return config_data;
+}
+
+// Print grid configuration information
+void printGridConfiguration(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) {
+  if (handle->rank != 0) return;
+
+  printf("CUDECOMP:\n");
+  printf("CUDECOMP: ===== Performance Summary =====\n");
+  printf("CUDECOMP: Grid Configuration:\n");
+  printf("CUDECOMP:\tTranspose backend: %s\n",
+         cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend));
+  printf("CUDECOMP:\tHalo backend: %s\n", cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend));
+  printf("CUDECOMP:\tProcess grid: [%d, %d]\n", grid_desc->config.pdims[0], grid_desc->config.pdims[1]);
+  printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", grid_desc->config.gdims[0], grid_desc->config.gdims[1],
+         grid_desc->config.gdims[2]);
+
+  // Print memory ordering information
+  printf("CUDECOMP:\tMemory order: ");
+  for (int axis = 0; axis < 3; ++axis) {
+    printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], grid_desc->config.transpose_mem_order[axis][1],
+           grid_desc->config.transpose_mem_order[axis][2]);
+    if (axis < 2) printf("; ");
+  }
+  printf("\n");
+  printf("CUDECOMP:\n");
+}
+
+// Print transpose performance table
+void printTransposePerformanceTable(const std::vector<TransposeConfigTimingData>& all_transpose_config_data) {
+  printf("CUDECOMP: Transpose Performance Data:\n");
+  printf("CUDECOMP:\n");
+
+  // Print compact table header
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", "operation", "dtype", "halo extents",
+         "padding", "inplace", "managed", "samples", "total", "A2A", "local", "A2A BW");
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", "", "", "", "", "", "", "", "[ms]",
+         "[ms]", "[ms]", "[GB/s]");
+  printf("CUDECOMP: ");
+  for (int i = 0; i < 120; ++i)
+    printf("-");
+  printf("\n");
+
+  // Print table rows
+  for (const auto& config_data : all_transpose_config_data) {
+    const auto& stats = config_data.stats;
+    if (stats.samples > 0) {
+      printf("CUDECOMP: %-12s %-6s %-7s/%-7s %-7s/%-7s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
+             stats.operation.c_str(), stats.datatype.c_str(), stats.input_halos.c_str(), stats.output_halos.c_str(),
+             stats.input_padding.c_str(), stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str(),
+             stats.samples, stats.total_time_avg, stats.alltoall_time_avg, stats.local_time_avg, 
+
+// Print halo performance table
+void printHaloPerformanceTable(const std::vector<HaloConfigTimingData>& all_halo_config_data) {
+  printf("CUDECOMP:\n");
+  printf("CUDECOMP: Halo Performance Data:\n");
+  printf("CUDECOMP:\n");
+
+  // Print compact table header
+  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", "operation", "dtype", "dim",
+         "halo extent", "periods", "padding", "managed", "samples", "total", "SR", "local", "SR BW");
+  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", "", "", "", "", "", "", "", "",
+         "[ms]", "[ms]", "[ms]", "[GB/s]");
+  printf("CUDECOMP: ");
+  for (int i = 0; i < 125; ++i)
+    printf("-");
+  printf("\n");
+
+  // Print table rows
+  for (const auto& config_data : all_halo_config_data) {
+    const auto& stats = config_data.stats;
+    if (stats.samples > 0) {
+      printf("CUDECOMP: %-12s %-6s %-5d %-12s %-12s %-12s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
+             stats.operation.c_str(), stats.datatype.c_str(), stats.dim, stats.halos.c_str(), stats.periods.c_str(),
+             stats.padding.c_str(), stats.managed.c_str(), stats.samples, stats.total_time_avg, stats.sendrecv_time_avg,
+             stats.local_time_avg, stats.sendrecv_bw_avg);
+    }
+  }
+}
+
+// Print per-sample transpose data for a single configuration
+void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& config_data,
+                                             const cudecompHandle_t handle, int detail_level,
+                                             std::ofstream* csv_file = nullptr) {
+  const auto& stats = config_data.stats;
+  const auto& total_times = config_data.total_times;
+  const auto& alltoall_times = config_data.alltoall_times;
+  const auto& local_times = config_data.local_times;
+  const auto& alltoall_bws = config_data.alltoall_bws;
+
+  if (total_times.empty()) return;
+
+  int num_samples = total_times.size();
+  std::vector<float> all_total_times, all_alltoall_times, all_local_times, all_alltoall_bws;
+
+  if (detail_level == 1) {
+    // Detail level 1: Only rank 0 prints its own data
+    if (handle->rank != 0) return;
+
+    all_total_times = total_times;
+    all_alltoall_times = alltoall_times;
+    all_local_times = local_times;
+    all_alltoall_bws = alltoall_bws;
+
+  } else if (detail_level == 2) {
+    // Detail level 2: Gather data from all ranks
+    gatherSampleData(total_times, all_total_times, handle);
+    gatherSampleData(alltoall_times, all_alltoall_times, handle);
+    gatherSampleData(local_times, all_local_times, handle);
+    gatherSampleData(alltoall_bws, all_alltoall_bws, handle);
+
+    if (handle->rank != 0) return;
+  }
+
+  printf("CUDECOMP: %s (dtype=%s, halo extents=%s/%s, padding=%s/%s, inplace=%s, managed=%s) samples:\n",
+         stats.operation.c_str(), stats.datatype.c_str(), stats.input_halos.c_str(), stats.output_halos.c_str(),
+         stats.input_padding.c_str(), stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str());
+
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "rank", "sample", "total", "A2A", "local", "A2A BW");
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
+
+  for (int r = 0; r < (detail_level == 1 ? 1 : handle->nranks); ++r) {
+    for (int s = 0; s < num_samples; ++s) {
+      int idx = r * num_samples + s;
+      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", r, s, all_total_times[idx], all_alltoall_times[idx],
+             all_local_times[idx], all_alltoall_bws[idx]);
+
+      // Write to CSV if file is provided
+      if (csv_file && csv_file->is_open()) {
+        *csv_file << stats.operation << "," << stats.datatype << ","
+                  << "\"" << stats.input_halos << "\","
+                  << "\"" << stats.output_halos << "\","
+                  << "\"" << stats.input_padding << "\","
+                  << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << r << ","
+                  << s << "," << std::fixed << std::setprecision(3) << all_total_times[idx] << "," << std::fixed
+                  << std::setprecision(3) << all_alltoall_times[idx] << "," << std::fixed << std::setprecision(3)
+                  << all_local_times[idx] << "," << std::fixed << std::setprecision(3) << all_alltoall_bws[idx] << "\n";
+      }
+    }
+  }
+  printf("CUDECOMP:\n");
+}
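+
+// Editorial note: after gathering, the per-sample vectors are laid out
+// rank-major, so the row index above (and in the halo variant below) is
+// idx = r * num_samples + s; with 20 samples per configuration, rank 1 /
+// sample 2 lands at idx 22. This indexing assumes every rank recorded the
+// same number of valid samples.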
+
+// Print per-sample halo data for a single configuration
+void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data, const cudecompHandle_t handle,
+                                        int detail_level, std::ofstream* csv_file = nullptr) {
+  const auto& stats = config_data.stats;
+  const auto& total_times = config_data.total_times;
+  const auto& sendrecv_times = config_data.sendrecv_times;
+  const auto& local_times = config_data.local_times;
+  const auto& sendrecv_bws = config_data.sendrecv_bws;
+
+  if (total_times.empty()) return;
+
+  int num_samples = total_times.size();
+  std::vector<float> all_total_times, all_sendrecv_times, all_local_times, all_sendrecv_bws;
+
+  if (detail_level == 1) {
+    // Detail level 1: Only rank 0 prints its own data
+    if (handle->rank != 0) return;
+
+    all_total_times = total_times;
+    all_sendrecv_times = sendrecv_times;
+    all_local_times = local_times;
+    all_sendrecv_bws = sendrecv_bws;
+
+  } else if (detail_level == 2) {
+    // Detail level 2: Gather data from all ranks
+    gatherSampleData(total_times, all_total_times, handle);
+    gatherSampleData(sendrecv_times, all_sendrecv_times, handle);
+    gatherSampleData(local_times, all_local_times, handle);
+    gatherSampleData(sendrecv_bws, all_sendrecv_bws, handle);
+
+    if (handle->rank != 0) return;
+  }
+
+  printf("CUDECOMP: %s (dtype=%s, dim=%d, halos=%s, periods=%s, padding=%s, managed=%s) samples:\n",
+         stats.operation.c_str(), stats.datatype.c_str(), stats.dim, stats.halos.c_str(), stats.periods.c_str(),
+         stats.padding.c_str(), stats.managed.c_str());
+
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "rank", "sample", "total", "SR", "local", "SR BW");
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
+
+  for (int r = 0; r < (detail_level == 1 ? 1 : handle->nranks); ++r) {
+    for (int s = 0; s < num_samples; ++s) {
+      int idx = r * num_samples + s;
+      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", r, s, all_total_times[idx], all_sendrecv_times[idx],
+             all_local_times[idx], all_sendrecv_bws[idx]);
+
+      // Write to CSV if file is provided
+      if (csv_file && csv_file->is_open()) {
+        *csv_file << stats.operation << "," << stats.datatype << "," << stats.dim << ","
+                  << "\"" << stats.halos << "\","
+                  << "\"" << stats.periods << "\","
+                  << "\"" << stats.padding << "\"," << stats.managed << "," << r << "," << s << "," << std::fixed
+                  << std::setprecision(3) << all_total_times[idx] << "," << std::fixed << std::setprecision(3)
+                  << all_sendrecv_times[idx] << "," << std::fixed << std::setprecision(3) << all_local_times[idx] << ","
+                  << std::fixed << std::setprecision(3) << all_sendrecv_bws[idx] << "\n";
+      }
+    }
+  }
+  printf("CUDECOMP:\n");
+}
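+
+// Editorial note: at detail level 2, the rank != 0 early return in the two
+// functions above is deliberately placed after the gatherSampleData calls;
+// those gathers appear to be collective operations, so returning earlier on
+// non-root ranks would leave rank 0 blocked in the gather.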
+
+// Print per-sample details for transpose configurations
+void printTransposePerSampleDetails(const std::vector<TransposeConfigTimingData>& all_transpose_config_data,
+                                    const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
+                                    const std::string& write_dir = "") {
+  std::ofstream csv_file;
+  bool csv_enabled = !write_dir.empty();
+
+  // Check if there are any samples before creating CSV file
+  bool has_samples = false;
+  for (const auto& config_data : all_transpose_config_data) {
+    if (config_data.stats.samples > 0) {
+      has_samples = true;
+      break;
+    }
+  }
+
+  if (csv_enabled && handle->rank == 0 && has_samples) {
+    std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc);
+    csv_file.open(filename);
+    if (csv_file.is_open()) {
+      writeCSVHeader(csv_file, grid_desc);
+      csv_file << "operation,dtype,input_halo_extents,output_halo_extents,input_padding,output_padding,inplace,managed,"
+                  "rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n";
+    } else {
+      printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str());
+    }
+  }
+
+  for (const auto& config_data : all_transpose_config_data) {
+    if (config_data.stats.samples == 0) continue;
+
+    printTransposePerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail,
+                                            csv_enabled && has_samples ? &csv_file : nullptr);
+  }
+
+  if (csv_file.is_open()) {
+    csv_file.close();
+    printf("CUDECOMP:\n");
+    printf("CUDECOMP: Wrote per-sample transpose data to %s\n",
+           createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).string().c_str());
+  }
+}
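+
+// Editorial note: the has_samples scan above keeps rank 0 from creating an
+// empty CSV file when no configuration has recorded post-warmup samples; the
+// halo variant below applies the same guard.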
+
+// Print per-sample details for halo configurations
+void printHaloPerSampleDetails(const std::vector<HaloConfigTimingData>& all_halo_config_data,
+                               const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
+                               const std::string& write_dir = "") {
+  std::ofstream csv_file;
+  bool csv_enabled = !write_dir.empty();
+
+  // Check if there are any samples before creating CSV file
+  bool has_samples = false;
+  for (const auto& config_data : all_halo_config_data) {
+    if (config_data.stats.samples > 0) {
+      has_samples = true;
+      break;
+    }
+  }
+
+  if (csv_enabled && handle->rank == 0 && has_samples) {
+    std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-samples", grid_desc);
+    csv_file.open(filename);
+    if (csv_file.is_open()) {
+      writeCSVHeader(csv_file, grid_desc);
+      csv_file
+          << "operation,dtype,dim,halo_extent,periods,padding,managed,rank,sample,total_ms,SR_ms,local_ms,SR_BW_GBps\n";
+    } else {
+      printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str());
+    }
+  }
+
+  for (const auto& config_data : all_halo_config_data) {
+    if (config_data.stats.samples == 0) continue;
+
+    printHaloPerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail,
+                                       csv_enabled && has_samples ? &csv_file : nullptr);
+  }
+
+  if (csv_file.is_open()) {
+    csv_file.close();
+    printf("CUDECOMP:\n");
+    printf("CUDECOMP: Wrote per-sample halo data to %s\n",
+           createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).string().c_str());
+  }
+}
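+
+// Usage sketch (editorial, not part of the library): with reporting enabled
+// through the environment variables documented in docs/env_vars.rst, e.g.
+//   CUDECOMP_ENABLE_PERFORMANCE_REPORT=1 \
+//   CUDECOMP_PERFORMANCE_REPORT_DETAIL=2 \
+//   CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR=./perf \
+//   mpirun -np 4 ./transpose_test ...
+// printPerformanceReport below prints the summary tables on rank 0, adds
+// per-sample detail when the detail level is above 0, and writes CSV files
+// into the given directory when one is set.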
+
+void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) {
+  // Synchronize to ensure all events are recorded
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  // Collect all transpose statistics and timing data
+  std::vector<TransposeConfigTimingData> all_transpose_config_data;
+  all_transpose_config_data.reserve(grid_desc->transpose_perf_samples_map.size());
+
+  for (const auto& entry : grid_desc->transpose_perf_samples_map) {
+    const auto& config = entry.first;
+    const auto& collection = entry.second;
+
+    TransposeConfigTimingData config_data = processTransposeConfig(config, collection, handle);
+    if (config_data.stats.samples > 0) { all_transpose_config_data.emplace_back(std::move(config_data)); }
+  }
+
+  // Sort transpose configuration data for consistent ordering
+  std::sort(all_transpose_config_data.begin(), all_transpose_config_data.end(), compareTransposeConfigData);
+
+  // Collect all halo statistics and timing data
+  std::vector<HaloConfigTimingData> all_halo_config_data;
+  all_halo_config_data.reserve(grid_desc->halo_perf_samples_map.size());
+
+  for (const auto& entry : grid_desc->halo_perf_samples_map) {
+    const auto& config = entry.first;
+    const auto& collection = entry.second;
+
+    HaloConfigTimingData config_data = processHaloConfig(config, collection, handle);
+    if (config_data.stats.samples > 0) { all_halo_config_data.emplace_back(std::move(config_data)); }
+  }
+
+  // Sort halo configuration data for consistent ordering
+  std::sort(all_halo_config_data.begin(), all_halo_config_data.end(), compareHaloConfigData);
+
+  // Check if CSV writing is enabled
+  const std::string& write_dir = handle->performance_report_write_dir;
+  bool csv_enabled = !write_dir.empty();
+
+  // Print grid configuration information
+  printGridConfiguration(handle, grid_desc);
+
+  if (handle->rank == 0) {
+    if (all_transpose_config_data.empty() && all_halo_config_data.empty()) {
+      printf("CUDECOMP: No performance data collected\n");
+      printf("CUDECOMP: ================================\n");
+      printf("CUDECOMP:\n");
+      return;
+    }
+
+    // Print transpose performance data
+    if (!all_transpose_config_data.empty()) {
+      printTransposePerformanceTable(all_transpose_config_data);
+
+      // Write transpose performance data to CSV if enabled
+      if (csv_enabled) { writeTransposePerformanceTableCSV(all_transpose_config_data, grid_desc, write_dir); }
+    }
+
+    // Print halo performance data
+    if (!all_halo_config_data.empty()) {
+      printHaloPerformanceTable(all_halo_config_data);
+
+      // Write halo performance data to CSV if enabled
+      if (csv_enabled) { writeHaloPerformanceTableCSV(all_halo_config_data, grid_desc, write_dir); }
+    }
+  }
+
+  // Print per-sample data if detail level > 0
+  if (handle->performance_report_detail > 0) {
+    if (handle->rank == 0) {
+      printf("CUDECOMP:\n");
+      printf("CUDECOMP: Per-Sample Details:\n");
+      printf("CUDECOMP:\n");
+    }
+
+    // Print transpose per-sample details (and write CSV if enabled)
+    printTransposePerSampleDetails(all_transpose_config_data, handle, grid_desc, csv_enabled ? write_dir : "");
+
+    // Print halo per-sample details (and write CSV if enabled)
+    printHaloPerSampleDetails(all_halo_config_data, handle, grid_desc, csv_enabled ? write_dir : "");
+  }
+
+  if (handle->rank == 0) {
+    printf("CUDECOMP: ================================\n");
+    printf("CUDECOMP:\n");
+  }
+}
+
+void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) {
+  if (!handle->performance_report_enable) return;
+
+  // Reset all transpose sample collections in the map
+  for (auto& entry : grid_desc->transpose_perf_samples_map) {
+    auto& collection = entry.second;
+    collection.sample_idx = 0;
+    collection.warmup_count = 0;
+
+    // Mark all samples as invalid and reset counters
+    for (auto& sample : collection.samples) {
+      sample.valid = false;
+      sample.alltoall_timing_count = 0;
+      sample.alltoall_bytes = 0;
+    }
+  }
+
+  // Reset all halo sample collections in the map
+  for (auto& entry : grid_desc->halo_perf_samples_map) {
+    auto& collection = entry.second;
+    collection.sample_idx = 0;
+    collection.warmup_count = 0;
+
+    // Mark all samples as invalid and reset counters
+    for (auto& sample : collection.samples) {
+      sample.valid = false;
+      sample.sendrecv_bytes = 0;
+    }
+  }
+}
+
+// Helper function to advance transpose sample index with warmup handling
+void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config) {
+  if (!handle->performance_report_enable) return;
+
+  auto& collection = getOrCreateTransposePerformanceSamples(handle, grid_desc, config);
+
+  // Check if we're still in warmup phase
+  if (collection.warmup_count < handle->performance_report_warmup_samples) {
+    collection.warmup_count++;
+    // During warmup, don't advance the circular buffer, just mark current sample as invalid
+    collection.samples[collection.sample_idx].valid = false;
+  } else {
+    // Past warmup, advance the circular buffer normally
+    collection.sample_idx = (collection.sample_idx + 1) % handle->performance_report_samples;
+  }
+}
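+
+// Worked example (editorial, assuming the advance helpers run after each
+// measured operation): with the documented defaults of 3 warmup samples and
+// 20 report samples, the first three measurements of a configuration land in
+// slot 0 and are immediately invalidated; from the fourth measurement on,
+// sample_idx advances as (sample_idx + 1) % 20, so the circular buffer
+// retains the 20 most recent post-warmup samples.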
+
+// Helper function to advance halo sample index with warmup handling
+void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                  const cudecompHaloConfigKey& config) {
+  if (!handle->performance_report_enable) return;
+
+  auto& collection = getOrCreateHaloPerformanceSamples(handle, grid_desc, config);
+
+  // Check if we're still in warmup phase
+  if (collection.warmup_count < handle->performance_report_warmup_samples) {
+    collection.warmup_count++;
+    // During warmup, don't advance the circular buffer, just mark current sample as invalid
+    collection.samples[collection.sample_idx].valid = false;
+  } else {
+    // Past warmup, advance the circular buffer normally
+    collection.sample_idx = (collection.sample_idx + 1) % handle->performance_report_samples;
+  }
+}
+
+} // namespace cudecomp
diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc
index 69e9dbc..895fe86 100644
--- a/tests/cc/halo_test.cc
+++ b/tests/cc/halo_test.cc
@@ -587,6 +587,21 @@ int main(int argc, char** argv) {
   }
 
   // Finalize
+  // Free grid descriptors
+  for (auto& entry : grid_desc_cache) {
+    auto& backend = entry.first;
+    auto& grid_desc = entry.second;
+    // If backend matches workspace, free workspace
+    if (std::get<0>(workspace) == static_cast<int>(backend)) {
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc, std::get<1>(workspace)));
+      std::get<0>(workspace) = -1;
+      std::get<1>(workspace) = nullptr;
+      std::get<2>(workspace) = 0;
+    }
+    CHECK_CUDECOMP(cudecompGridDescDestroy(handle, grid_desc));
+  }
+  grid_desc_cache.clear();
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle));
 
   CHECK_MPI_EXIT(MPI_Finalize());
diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc
index f9ad62c..c217a3c 100644
--- a/tests/cc/transpose_test.cc
+++ b/tests/cc/transpose_test.cc
@@ -651,6 +651,21 @@ int main(int argc, char** argv) {
   }
 
   // Finalize
+  // Free grid descriptors
+  for (auto& entry : grid_desc_cache) {
+    auto& backend = entry.first;
+    auto& grid_desc = entry.second;
+    // Free workspace using correct grid descriptor
+    if (std::get<0>(workspace) == static_cast<int>(backend)) {
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc, std::get<1>(workspace)));
+      std::get<0>(workspace) = -1;
+      std::get<1>(workspace) = nullptr;
+      std::get<2>(workspace) = 0;
+    }
+    CHECK_CUDECOMP(cudecompGridDescDestroy(handle, grid_desc));
+  }
+  grid_desc_cache.clear();
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle));
 
   CHECK_MPI_EXIT(MPI_Finalize());
diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90
index 9e67e1c..ac816c3 100644
--- a/tests/fortran/halo_test.f90
+++ b/tests/fortran/halo_test.f90
@@ -630,6 +630,17 @@ program main
     if (nfailed /= 0) retcode = 1;
   endif
 
+  ! Free grid descriptors
+  do i = 1, 5
+    if (grid_desc_cache_set(i)) then
+      ! Free workspace with correct grid descriptor
+      if (work_backend == i) then
+        CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc_cache(i), work_d))
+      endif
+      CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc_cache(i)))
+    endif
+  end do
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle))
 
   call MPI_Finalize(ierr)
diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90
index 02810df..57caf7c 100644
--- a/tests/fortran/transpose_test.f90
+++ b/tests/fortran/transpose_test.f90
@@ -654,6 +654,17 @@ program main
     if (nfailed /= 0) retcode = 1;
   endif
 
+  ! Free grid descriptors
+  do i = 1, 7
+    if (grid_desc_cache_set(i)) then
+      ! Free workspace with correct grid descriptor
+      if (work_backend == i) then
+        CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc_cache(i), work_d))
+      endif
+      CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc_cache(i)))
+    endif
+  end do
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle))
 
   call MPI_Finalize(ierr)