From 9fa1ee15e9d92a50a20ad03d6d73f8b67d5a7331 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 1 Jul 2025 13:13:48 -0700 Subject: [PATCH 01/13] Add option to print per-operation performance metrics during code runtime. --- docs/env_vars.rst | 9 +++++ include/internal/comm_routines.h | 28 +++++++++++++++ include/internal/common.h | 10 ++++++ include/internal/transpose.h | 58 ++++++++++++++++++++++++++++++++ src/cudecomp.cc | 36 +++++++++++++++++--- 5 files changed, 136 insertions(+), 5 deletions(-) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index a7b607d..8e679da 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -32,3 +32,12 @@ CUDECOMP_ENABLE_CUDA_GRAPHS and communication overlap of packing kernels in large scale cases. Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. + +CUDECOMP_ENABLE_PERFORMANCE_REPORTING +------------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` controls whether cuDecomp prints performance reports to the console. With this option enabled, cuDecomp will print performance metrics for each transpose operation called. +This option requires a device synchronization after each transpose operation to capture event timings which can impact performance. + +Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 789ba87..27efcc8 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -158,6 +158,10 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ cudecompCommAxis comm_axis, cudaStream_t stream) { nvtx::rangePush("cudecompAlltoall"); + if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_start_events[0], stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) && @@ -269,6 +273,12 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ break; } } + + if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_end_events[0], stream)); + grid_desc->alltoall_timing_count++; + } + nvtx::rangePop(); } @@ -281,6 +291,12 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, const std::vector& src_ranks, const std::vector& dst_ranks, cudaStream_t stream, bool& synced) { + + // If there are no transfers to complete, quick return + if (send_counts.size() == 0 && recv_counts.size() == 0) { + return; + } + std::ostringstream os; os << "cudecompAlltoallPipelined_"; for (int i = 0; i < src_ranks.size(); ++i) { @@ -289,6 +305,13 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude } nvtx::rangePush(os.str()); + int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? 
grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank; + if (handle->performance_report_enable && src_ranks[0] != self_rank) { + // Note: skipping self-copy for timing as it should be overlapped + CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0)); + CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_start_events[grid_desc->alltoall_timing_count], handle->pl_stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) && @@ -459,6 +482,11 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude break; } } + + if (handle->performance_report_enable && src_ranks[0] != self_rank) { + CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_end_events[grid_desc->alltoall_timing_count], handle->pl_stream)); + grid_desc->alltoall_timing_count++; + } nvtx::rangePop(); } diff --git a/include/internal/common.h b/include/internal/common.h index a299271..c367ad4 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -108,6 +108,9 @@ struct cudecompHandle { // CUDA graphs bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used + + // Performance reporting related entries + bool performance_report_enable = false; // Flag to enable performance reporting }; // Structure with information about row/column communicator @@ -144,6 +147,13 @@ struct cudecompGridDesc { cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle + // Performance reporting related entries + std::vector alltoall_start_events; // events for alltoall timing + std::vector alltoall_end_events; // events for alltoall timing + int32_t alltoall_timing_count = 0; // count of alltoall timing events pairs (for pipelined alltoall) + cudaEvent_t transpose_start_event; // event for transpose timing + cudaEvent_t transpose_end_event; // event for transpose timing + bool initialized = false; }; diff --git a/include/internal/transpose.h b/include/internal/transpose.h index fb47c76..a6a238c 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -157,6 +157,36 @@ static void localPermute(const cudecompHandle_t handle, const std::arrayalltoall_timing_count; ++i) { + float elapsed_time; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, grid_desc->alltoall_start_events[i], grid_desc->alltoall_end_events[i])); + alltoall_timing_ms += elapsed_time; + } + CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, grid_desc->transpose_start_event, grid_desc->transpose_end_event)); + // Report on rank 0 only for now. + if (handle->rank == 0) { + std::string op_name; + if (ax == 0) { + op_name = "cudecompTransposeXToY"; + } else if (ax == 1 && dir == 1) { + op_name = "cudecompTransposeYToZ"; + } else if (ax == 2) { + op_name = "cudecompTransposeZToY"; + } else if (ax == 1 && dir == -1) { + op_name = "cudecompTransposeYToX"; + } + float alltoall_bw = (alltoall_timing_ms > 0) ? 
alltoall_bytes * 1e-6/ alltoall_timing_ms : 0; + printf("CUDECOMP:PERFORMANCE: rank: %d, op: %s, total time: %.3f ms, alltoall time: %.3f ms, local operation time: %.3f ms, alltoall bw: %.3f GB/s\n", + handle->rank, op_name.c_str(), transpose_timing_ms, alltoall_timing_ms, transpose_timing_ms - alltoall_timing_ms, alltoall_bw); + } + +} + template static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr, @@ -250,6 +280,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream)); } + if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(grid_desc->transpose_start_event, stream)); + } + // Adjust pointers to handle special cases bool direct_pack = false; bool direct_transpose = false; @@ -259,6 +293,12 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (inplace) { if (halos_padding_equal) { // Single rank, in place, Pack -> Unpack: No transpose necessary. + if (handle->performance_report_enable) { + // Synchronize and print performance report + CHECK_CUDA(cudaEventRecord(grid_desc->transpose_end_event, stream)); + CHECK_CUDA(cudaDeviceSynchronize()); + printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T)); + } return; } } else { @@ -333,6 +373,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } bool data_transposed = false; + if (o1 != i1) { if (pinfo_b.order[2] == ax_a && !orders_equal) { // Transpose/Pack @@ -523,6 +564,12 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o1 == output) { // o1 is output. Return. 
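      // [Editorial sketch, not part of the patch] Unit check for the alltoall
      // bandwidth formula used in printPerformanceReport above: bytes * 1e-6 / ms
      // yields GB/s, since (bytes * 1e-9 GB) / (ms * 1e-3 s) simplifies to
      // bytes * 1e-6 / ms. The helper name below is hypothetical:
      //
      //   static inline float alltoallBandwidthGBs(size_t bytes, float ms) {
      //     // Guard against a zero interval, matching the ternary in the patch.
      //     return (ms > 0.0f) ? static_cast<float>(bytes) * 1e-6f / ms : 0.0f;
      //   }
      //
      //   // e.g. alltoallBandwidthGBs(2147483648ULL, 100.0f) ~= 21.5 GB/s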
+ if (handle->performance_report_enable) { + // Synchronize and print performance report + CHECK_CUDA(cudaEventRecord(grid_desc->transpose_end_event, stream)); + CHECK_CUDA(cudaDeviceSynchronize()); + printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T)); + } return; } } else { @@ -794,6 +841,17 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } } } + + if (handle->performance_report_enable) { + // Synchronize and print performance report + CHECK_CUDA(cudaEventRecord(grid_desc->transpose_end_event, stream)); + CHECK_CUDA(cudaDeviceSynchronize()); + printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T)); + + // Reset count for next report + grid_desc->alltoall_timing_count = 0; + } + } template diff --git a/src/cudecomp.cc b/src/cudecomp.cc index bd4338e..a7e8e0a 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -329,6 +329,10 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) { handle->cuda_graphs_enable = false; #endif } + + // Check CUDECOMP_ENABLE_PERFORMANCE_REPORTING (Performance reporting) + const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORTING"); + if (performance_report_str) { handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1; } } #ifdef ENABLE_NVSHMEM @@ -582,14 +586,15 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes handle->nvshmem_initialized = true; handle->nvshmem_allocation_size = 0; } - if (!handle->pl_stream) { - int greatest_priority; - CHECK_CUDA(cudaDeviceGetStreamPriorityRange(nullptr, &greatest_priority)); - CHECK_CUDA(cudaStreamCreateWithPriority(&handle->pl_stream, cudaStreamNonBlocking, greatest_priority)); - } #endif } + if (!handle->pl_stream) { + int greatest_priority; + CHECK_CUDA(cudaDeviceGetStreamPriorityRange(nullptr, &greatest_priority)); + CHECK_CUDA(cudaStreamCreateWithPriority(&handle->pl_stream, cudaStreamNonBlocking, greatest_priority)); + } + // Create CUDA events for scheduling grid_desc->events.resize(handle->nranks); for (auto& event : grid_desc->events) { @@ -599,6 +604,18 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming)); #endif + // Create timing events for AlltoAll operations + if (handle->performance_report_enable) { + grid_desc->alltoall_start_events.resize(handle->nranks); + grid_desc->alltoall_end_events.resize(handle->nranks); + for (int i = 0; i < handle->nranks; ++i) { + CHECK_CUDA(cudaEventCreate(&grid_desc->alltoall_start_events[i])); + CHECK_CUDA(cudaEventCreate(&grid_desc->alltoall_end_events[i])); + } + CHECK_CUDA(cudaEventCreate(&grid_desc->transpose_start_event)); + CHECK_CUDA(cudaEventCreate(&grid_desc->transpose_end_event)); + } + // Disable decompositions with empty pencils if (!autotune_pdims && (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) || @@ -722,6 +739,15 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); } #endif + // Destroy timing events for AlltoAll operations + if (handle->performance_report_enable) { + for (auto& event : grid_desc->alltoall_start_events) { CHECK_CUDA(cudaEventDestroy(event)); } + for (auto& event : grid_desc->alltoall_end_events) { CHECK_CUDA(cudaEventDestroy(event)); } + + 
CHECK_CUDA(cudaEventDestroy(grid_desc->transpose_start_event)); + CHECK_CUDA(cudaEventDestroy(grid_desc->transpose_end_event)); + } + if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { // Release grid descriptor references to NCCL communicators From 9e5117858b52d2c14379a3a80c74c491c8101c22 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 3 Jul 2025 11:33:02 -0700 Subject: [PATCH 02/13] More work on performance reporting. Adding more comprehensive final report mode that is less verbose. Adding sample count and warmup knobs. --- CMakeLists.txt | 1 + docs/env_vars.rst | 27 ++- include/internal/comm_routines.h | 24 +- include/internal/common.h | 33 ++- include/internal/performance.h | 80 +++++++ include/internal/transpose.h | 84 +++---- src/autotune.cc | 12 + src/cudecomp.cc | 65 ++++-- src/performance.cc | 374 +++++++++++++++++++++++++++++++ 9 files changed, 611 insertions(+), 89 deletions(-) create mode 100644 include/internal/performance.h create mode 100644 src/performance.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 241ed51..e4b39a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ target_sources(cudecomp ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/src/performance.cc ) set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu PROPERTIES COMPILE_FLAGS -rdc=true) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index 8e679da..37f5dc5 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -37,7 +37,28 @@ CUDECOMP_ENABLE_PERFORMANCE_REPORTING ------------------------------------- (since v0.5.1) -:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` controls whether cuDecomp prints performance reports to the console. With this option enabled, cuDecomp will print performance metrics for each transpose operation called. -This option requires a device synchronization after each transpose operation to capture event timings which can impact performance. +:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` controls the level of performance reporting cuDecomp prints to the console. This option requires a device synchronization after each transpose operation to capture event timings which can impact performance. -Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. +The following values are supported: + +- :code:`0`: Performance reporting disabled +- :code:`1`: Final performance summary only - prints a comprehensive table with averaged performance statistics for all transpose operation configurations when the grid descriptor is destroyed +- :code:`2`: Verbose reporting - prints both per-operation performance reports and the final performance summary + +Default setting is off (:code:`0`). + +CUDECOMP_PERFORMANCE_REPORT_SAMPLES +----------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_SAMPLES` controls the number of performance samples to keep for the final performance report. This setting determines the size of the circular buffer used to store timing measurements for each transpose configuration. + +Default setting is :code:`20` samples. Valid range is 1-1000 samples. 
+ +CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES +------------------------------------------ +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics. + +Default setting is :code:`2` warmup samples. Valid range is 0-100 samples. Setting this to 0 disables warmup sample filtering. diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 27efcc8..1bc98fa 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -155,11 +155,11 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ const std::vector& send_counts, const std::vector& send_offsets, T* recv_buff, const std::vector& recv_counts, const std::vector& recv_offsets, const std::vector& recv_offsets_nvshmem, - cudecompCommAxis comm_axis, cudaStream_t stream) { + cudecompCommAxis comm_axis, cudaStream_t stream, cudecompPerformanceSample* current_sample = nullptr) { nvtx::rangePush("cudecompAlltoall"); - if (handle->performance_report_enable) { - CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_start_events[0], stream)); + if (handle->performance_report_enable > 0) { + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], stream)); } #ifdef ENABLE_NVSHMEM @@ -274,9 +274,9 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ } } - if (handle->performance_report_enable) { - CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_end_events[0], stream)); - grid_desc->alltoall_timing_count++; + if (handle->performance_report_enable > 0) { + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], stream)); + current_sample->alltoall_timing_count++; } nvtx::rangePop(); @@ -290,7 +290,7 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude const std::vector& recv_offsets, const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, const std::vector& src_ranks, const std::vector& dst_ranks, - cudaStream_t stream, bool& synced) { + cudaStream_t stream, bool& synced, cudecompPerformanceSample* current_sample = nullptr) { // If there are no transfers to complete, quick return if (send_counts.size() == 0 && recv_counts.size() == 0) { @@ -306,10 +306,10 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude nvtx::rangePush(os.str()); int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? 
grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank; - if (handle->performance_report_enable && src_ranks[0] != self_rank) { + if (handle->performance_report_enable > 0 && src_ranks[0] != self_rank) { // Note: skipping self-copy for timing as it should be overlapped CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0)); - CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_start_events[grid_desc->alltoall_timing_count], handle->pl_stream)); + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], handle->pl_stream)); } #ifdef ENABLE_NVSHMEM @@ -483,9 +483,9 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude } } - if (handle->performance_report_enable && src_ranks[0] != self_rank) { - CHECK_CUDA(cudaEventRecord(grid_desc->alltoall_end_events[grid_desc->alltoall_timing_count], handle->pl_stream)); - grid_desc->alltoall_timing_count++; + if (handle->performance_report_enable > 0 && src_ranks[0] != self_rank) { + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream)); + current_sample->alltoall_timing_count++; } nvtx::rangePop(); } diff --git a/include/internal/common.h b/include/internal/common.h index c367ad4..992e296 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -33,8 +33,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -43,6 +45,10 @@ #include #include #include +#ifdef ENABLE_NVSHMEM +#include +#include +#endif #include "cudecomp.h" #include "internal/checks.h" @@ -110,7 +116,9 @@ struct cudecompHandle { bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used // Performance reporting related entries - bool performance_report_enable = false; // Flag to enable performance reporting + int32_t performance_report_enable = 0; // performance reporting level: 0=off, 1=final only, 2=verbose + int32_t performance_report_samples = 20; // number of performance samples to keep for final report + int32_t performance_report_warmup_samples = 2; // number of initial warmup samples to ignore for each configuration }; // Structure with information about row/column communicator @@ -127,6 +135,25 @@ struct cudecompCommInfo { #endif }; +// Structure to contain data for transpose performance sample +struct cudecompPerformanceSample { + cudaEvent_t transpose_start_event; + cudaEvent_t transpose_end_event; + std::vector alltoall_start_events; + std::vector alltoall_end_events; + int32_t alltoall_timing_count = 0; + size_t alltoall_bytes = 0; + bool valid = false; +}; + +// Collection of performance samples for a specific configuration +struct cudecompPerformanceSampleCollection { + std::vector samples; + int32_t sample_idx = 0; + int32_t warmup_count = 0; +}; + + // cuDecomp grid descriptor containing grid-specific information struct cudecompGridDesc { cudecompGridDescConfig_t config; // configuration struct @@ -154,6 +181,10 @@ struct cudecompGridDesc { cudaEvent_t transpose_start_event; // event for transpose timing cudaEvent_t transpose_end_event; // event for transpose timing + std::unordered_map, std::array, + std::array, std::array, bool, bool, cudecompDataType_t>, + cudecompPerformanceSampleCollection> perf_samples_map; + bool initialized = false; }; diff --git a/include/internal/performance.h b/include/internal/performance.h new file mode 100644 index 0000000..a45699c --- /dev/null +++ b/include/internal/performance.h @@ -0,0 
+1,80 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CUDECOMP_PERFORMANCE_H +#define CUDECOMP_PERFORMANCE_H + +#include +#include +#include +#include +#include + +#include "internal/common.h" + +namespace cudecomp { + +using cudecompTransposeConfigKey = std::tuple< + int32_t, // ax (axis) + int32_t, // dir (direction) + std::array, // input_halo_extents + std::array, // output_halo_extents + std::array, // input_padding + std::array, // output_padding + bool, // inplace + bool, // managed_memory + cudecompDataType_t // datatype +>; + +void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, + int ax, int dir, size_t alltoall_bytes, cudecompPerformanceSample* current_sample); + +void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc); + +void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc); + +void advancePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, + const cudecompTransposeConfigKey& config); + +cudecompPerformanceSampleCollection& getOrCreatePerformanceSamples(const cudecompHandle_t handle, + cudecompGridDesc_t grid_desc, + const cudecompTransposeConfigKey& config); + +// Helper function to create transpose configuration key +cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, + const int32_t input_halo_extents_ptr[], + const int32_t output_halo_extents_ptr[], + const int32_t input_padding_ptr[], + const int32_t output_padding_ptr[], + cudecompDataType_t datatype); + +} // namespace cudecomp + +#endif // CUDECOMP_PERFORMANCE_H diff --git a/include/internal/transpose.h b/include/internal/transpose.h index a6a238c..88f1019 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -44,6 +44,8 @@ #include "internal/comm_routines.h" #include "internal/cudecomp_kernels.h" 
#include "internal/nvtx.h" +#include "internal/performance.h" +#include "internal/utils.h" namespace cudecomp { @@ -157,36 +159,6 @@ static void localPermute(const cudecompHandle_t handle, const std::arrayalltoall_timing_count; ++i) { - float elapsed_time; - CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, grid_desc->alltoall_start_events[i], grid_desc->alltoall_end_events[i])); - alltoall_timing_ms += elapsed_time; - } - CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, grid_desc->transpose_start_event, grid_desc->transpose_end_event)); - // Report on rank 0 only for now. - if (handle->rank == 0) { - std::string op_name; - if (ax == 0) { - op_name = "cudecompTransposeXToY"; - } else if (ax == 1 && dir == 1) { - op_name = "cudecompTransposeYToZ"; - } else if (ax == 2) { - op_name = "cudecompTransposeZToY"; - } else if (ax == 1 && dir == -1) { - op_name = "cudecompTransposeYToX"; - } - float alltoall_bw = (alltoall_timing_ms > 0) ? alltoall_bytes * 1e-6/ alltoall_timing_ms : 0; - printf("CUDECOMP:PERFORMANCE: rank: %d, op: %s, total time: %.3f ms, alltoall time: %.3f ms, local operation time: %.3f ms, alltoall bw: %.3f GB/s\n", - handle->rank, op_name.c_str(), transpose_timing_ms, alltoall_timing_ms, transpose_timing_ms - alltoall_timing_ms, alltoall_bw); - } - -} - template static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr, @@ -280,8 +252,16 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream)); } - if (handle->performance_report_enable) { - CHECK_CUDA(cudaEventRecord(grid_desc->transpose_start_event, stream)); + cudecompPerformanceSample* current_sample = nullptr; + if (handle->performance_report_enable > 0) { + auto& samples = getOrCreatePerformanceSamples(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType())); + current_sample = &samples.samples[samples.sample_idx]; + current_sample->alltoall_timing_count = 0; + current_sample->alltoall_bytes = pinfo_a.size * sizeof(T); + current_sample->valid = true; + + // Record start event + CHECK_CUDA(cudaEventRecord(current_sample->transpose_start_event, stream)); } // Adjust pointers to handle special cases @@ -293,11 +273,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (inplace) { if (halos_padding_equal) { // Single rank, in place, Pack -> Unpack: No transpose necessary. 
- if (handle->performance_report_enable) { - // Synchronize and print performance report - CHECK_CUDA(cudaEventRecord(grid_desc->transpose_end_event, stream)); - CHECK_CUDA(cudaDeviceSynchronize()); - printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T)); + if (handle->performance_report_enable > 0) { + // Print performance report + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T), current_sample); + advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType())); } return; } @@ -564,11 +544,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o1 == output) { // o1 is output. Return. - if (handle->performance_report_enable) { - // Synchronize and print performance report - CHECK_CUDA(cudaEventRecord(grid_desc->transpose_end_event, stream)); - CHECK_CUDA(cudaDeviceSynchronize()); - printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T)); + if (handle->performance_report_enable > 0) { + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T), current_sample); + advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType())); } return; } @@ -586,7 +565,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (splits_a.size() > 1) { if (!pipelined) { cudecompAlltoall(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, stream); + recv_offsets_nvshmem, comm_axis, stream, current_sample); } } else { o2 = o1; @@ -656,7 +635,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, current_sample); } if (o2 != o3) { @@ -751,7 +730,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, current_sample); } } @@ -808,7 +787,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, current_sample); } } @@ -842,14 +821,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } } - if (handle->performance_report_enable) { - // Synchronize and print performance report - CHECK_CUDA(cudaEventRecord(grid_desc->transpose_end_event, stream)); - CHECK_CUDA(cudaDeviceSynchronize()); - 
printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T)); - - // Reset count for next report - grid_desc->alltoall_timing_count = 0; + if (handle->performance_report_enable > 0) { + // Print performance report + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T), current_sample); + advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType())); } } diff --git a/src/autotune.cc b/src/autotune.cc index b49cf0b..0ccbfc8 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -45,6 +45,7 @@ #include "internal/checks.h" #include "internal/common.h" #include "internal/halo.h" +#include "internal/performance.h" #include "internal/transpose.h" namespace cudecomp { @@ -316,6 +317,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d } } + // Reset performance samples after warmup to ensure clean measurement + resetPerformanceSamples(handle, grid_desc); + // Trials std::vector trial_times(options->n_trials); std::vector trial_times_w(options->n_trials); @@ -392,6 +396,11 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d // Clear CUDA graph cache between backend/process decomposition pairs grid_desc->graph_cache.clear(); + // Print performance report for this configuration if enabled + if (handle->performance_report_enable > 0 && !skip_case) { + printFinalPerformanceReport(handle, grid_desc); + } + auto times = processTimings(handle, trial_times); auto times_w = processTimings(handle, trial_times_w); auto xy_times = processTimings(handle, trial_xy_times); @@ -515,6 +524,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d CHECK_MPI(MPI_Barrier(handle->mpi_comm)); CHECK_CUDA(cudaFree(tmp1)); CHECK_CUDA(cudaFree(tmp2)); + + // Reset performance samples after autotuning + resetPerformanceSamples(handle, grid_desc); } void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, diff --git a/src/cudecomp.cc b/src/cudecomp.cc index a7e8e0a..6862f30 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -332,7 +332,38 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) { // Check CUDECOMP_ENABLE_PERFORMANCE_REPORTING (Performance reporting) const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORTING"); - if (performance_report_str) { handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1; } + if (performance_report_str) { + int32_t level = std::strtol(performance_report_str, nullptr, 10); + if (level >= 0 && level <= 2) { + handle->performance_report_enable = level; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_ENABLE_PERFORMANCE_REPORTING value (%d). Using default (0).\n", level); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_SAMPLES (Number of performance samples to keep) + const char* performance_samples_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES"); + if (performance_samples_str) { + int32_t samples = std::strtol(performance_samples_str, nullptr, 10); + if (samples > 0 && samples <= 1000) { // Reasonable bounds + handle->performance_report_samples = samples; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_SAMPLES value (%d). 
Using default (%d).\n", + samples, handle->performance_report_samples); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES (Number of initial samples to ignore) + const char* performance_warmup_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES"); + if (performance_warmup_str) { + int32_t warmup_samples = std::strtol(performance_warmup_str, nullptr, 10); + if (warmup_samples >= 0 && warmup_samples <= 100) { // Reasonable bounds + handle->performance_report_warmup_samples = warmup_samples; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES value (%d). Using default (%d).\n", + warmup_samples, handle->performance_report_warmup_samples); + } + } } #ifdef ENABLE_NVSHMEM @@ -604,18 +635,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming)); #endif - // Create timing events for AlltoAll operations - if (handle->performance_report_enable) { - grid_desc->alltoall_start_events.resize(handle->nranks); - grid_desc->alltoall_end_events.resize(handle->nranks); - for (int i = 0; i < handle->nranks; ++i) { - CHECK_CUDA(cudaEventCreate(&grid_desc->alltoall_start_events[i])); - CHECK_CUDA(cudaEventCreate(&grid_desc->alltoall_end_events[i])); - } - CHECK_CUDA(cudaEventCreate(&grid_desc->transpose_start_event)); - CHECK_CUDA(cudaEventCreate(&grid_desc->transpose_end_event)); - } - // Disable decompositions with empty pencils if (!autotune_pdims && (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) || @@ -740,12 +759,20 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe #endif // Destroy timing events for AlltoAll operations - if (handle->performance_report_enable) { - for (auto& event : grid_desc->alltoall_start_events) { CHECK_CUDA(cudaEventDestroy(event)); } - for (auto& event : grid_desc->alltoall_end_events) { CHECK_CUDA(cudaEventDestroy(event)); } - - CHECK_CUDA(cudaEventDestroy(grid_desc->transpose_start_event)); - CHECK_CUDA(cudaEventDestroy(grid_desc->transpose_end_event)); + if (handle->performance_report_enable > 0) { + // Print final performance report before destroying events + printFinalPerformanceReport(handle, grid_desc); + + // Destroy all performance sample events in the map + for (auto& entry : grid_desc->perf_samples_map) { + auto& collection = entry.second; + for (auto& sample : collection.samples) { + CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event)); + CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event)); + for (auto& event : sample.alltoall_start_events) { CHECK_CUDA(cudaEventDestroy(event)); } + for (auto& event : sample.alltoall_end_events) { CHECK_CUDA(cudaEventDestroy(event)); } + } + } } if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || diff --git a/src/performance.cc b/src/performance.cc new file mode 100644 index 0000000..f350d6c --- /dev/null +++ b/src/performance.cc @@ -0,0 +1,374 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "cudecomp.h" +#include "internal/checks.h" +#include "internal/performance.h" + +namespace cudecomp { + +// Helper function to create transpose configuration key (no longer template) +cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, + const int32_t input_halo_extents_ptr[], + const int32_t output_halo_extents_ptr[], + const int32_t input_padding_ptr[], + const int32_t output_padding_ptr[], + cudecompDataType_t datatype) { + std::array input_halo_extents{0, 0, 0}; + std::array output_halo_extents{0, 0, 0}; + std::array input_padding{0, 0, 0}; + std::array output_padding{0, 0, 0}; + + if (input_halo_extents_ptr) { + std::copy(input_halo_extents_ptr, input_halo_extents_ptr + 3, input_halo_extents.begin()); + } + if (output_halo_extents_ptr) { + std::copy(output_halo_extents_ptr, output_halo_extents_ptr + 3, output_halo_extents.begin()); + } + if (input_padding_ptr) { + std::copy(input_padding_ptr, input_padding_ptr + 3, input_padding.begin()); + } + if (output_padding_ptr) { + std::copy(output_padding_ptr, output_padding_ptr + 3, output_padding.begin()); + } + + bool inplace = (input == output); + bool managed_memory = isManagedPointer(input) || isManagedPointer(output); + + return std::make_tuple(ax, dir, input_halo_extents, output_halo_extents, + input_padding, output_padding, inplace, managed_memory, datatype); +} + +// Helper function to get or create performance sample collection for a configuration +cudecompPerformanceSampleCollection& getOrCreatePerformanceSamples(const cudecompHandle_t handle, + cudecompGridDesc_t grid_desc, + const cudecompTransposeConfigKey& config) { + auto& samples_map = grid_desc->perf_samples_map; + + if (samples_map.find(config) == samples_map.end()) { + // Create new sample collection for this configuration + cudecompPerformanceSampleCollection collection; + collection.samples.resize(handle->performance_report_samples); + collection.sample_idx = 0; + + // Create events for each sample + for (auto& sample : collection.samples) { + CHECK_CUDA(cudaEventCreate(&sample.transpose_start_event)); + CHECK_CUDA(cudaEventCreate(&sample.transpose_end_event)); + sample.alltoall_start_events.resize(handle->nranks); + 
sample.alltoall_end_events.resize(handle->nranks); + for (auto& event : sample.alltoall_start_events) { + CHECK_CUDA(cudaEventCreate(&event)); + } + for (auto& event : sample.alltoall_end_events) { + CHECK_CUDA(cudaEventCreate(&event)); + } + sample.valid = false; + } + + samples_map[config] = std::move(collection); + } + + return samples_map[config]; +} + +void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, + int ax, int dir, size_t alltoall_bytes, cudecompPerformanceSample* current_sample) { + // Only print per-operation reports at level 2 + if (handle->performance_report_enable != 2) return; + + // Synchronize to ensure all events are recorded + CHECK_CUDA(cudaDeviceSynchronize()); + + // Compute total timing by summing all individual timings + float alltoall_timing_ms = 0.0f; + float transpose_timing_ms = 0.0f; + + for (int i = 0; i < current_sample->alltoall_timing_count; ++i) { + float elapsed_time; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, current_sample->alltoall_start_events[i], current_sample->alltoall_end_events[i])); + alltoall_timing_ms += elapsed_time; + } + CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, current_sample->transpose_start_event, current_sample->transpose_end_event)); + + // Report on rank 0 only for now. + if (handle->rank == 0) { + std::string op_name; + if (ax == 0) { + op_name = "TransposeXY"; + } else if (ax == 1 && dir == 1) { + op_name = "TransposeYZ"; + } else if (ax == 2) { + op_name = "TransposeZY"; + } else if (ax == 1 && dir == -1) { + op_name = "TransposeYX"; + } + + float alltoall_bw = 0.0f; + if (alltoall_timing_ms > 0) { + alltoall_bw = alltoall_bytes * 1e-6 / alltoall_timing_ms; + } + printf("CUDECOMP: rank: %d, op: %s, total time: %.3f ms, alltoall time: %.3f ms, local operation time: %.3f ms, alltoall bw: %.3f GB/s\n", + handle->rank, op_name.c_str(), transpose_timing_ms, alltoall_timing_ms, transpose_timing_ms - alltoall_timing_ms, alltoall_bw); + } +} + +// Helper function to format array as compact string +std::string formatArray(const std::array& arr) { + std::ostringstream oss; + oss << "[" << arr[0] << "," << arr[1] << "," << arr[2] << "]"; + return oss.str(); +} + +// Helper function to get operation name from config +std::string getOperationName(const cudecompTransposeConfigKey& config) { + int ax = std::get<0>(config); + int dir = std::get<1>(config); + + if (ax == 0) { + return "TransposeXY"; + } else if (ax == 1 && dir == 1) { + return "TransposeYZ"; + } else if (ax == 2) { + return "TransposeZY"; + } else if (ax == 1 && dir == -1) { + return "TransposeYX"; + } + return "Unknown"; +} + +// Helper function to convert datatype to string +std::string getDatatypeString(cudecompDataType_t datatype) { + switch (datatype) { + case CUDECOMP_FLOAT: return "S"; + case CUDECOMP_DOUBLE: return "D"; + case CUDECOMP_FLOAT_COMPLEX: return "C"; + case CUDECOMP_DOUBLE_COMPLEX: return "Z"; + default: return "unknown"; + } +} + +// Helper structure for statistics +struct PerformanceStats { + std::string operation; + std::string datatype; + std::string halos; // Combined input/output halos + std::string padding; // Combined input/output padding + std::string inplace; + std::string managed; + int samples; + float total_time_avg; + float alltoall_time_avg; + float local_time_avg; + float alltoall_bw_avg; +}; + +void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) { + // Synchronize to ensure all events are recorded + CHECK_CUDA(cudaDeviceSynchronize()); 
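  // Note: the synchronization above is required because cudaEventElapsedTime
  // returns cudaErrorNotReady if either event has not completed; it guarantees
  // every recorded timing event has resolved before we read it back.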
+ + // Collect all statistics + std::vector all_stats; + + for (const auto& entry : grid_desc->perf_samples_map) { + const auto& config = entry.first; + const auto& collection = entry.second; + + // Collect valid samples + std::vector total_times, alltoall_times, local_times, alltoall_bws; + + for (const auto& sample : collection.samples) { + if (!sample.valid) continue; + + float alltoall_timing_ms = 0.0f; + for (int j = 0; j < sample.alltoall_timing_count; ++j) { + float elapsed_time; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, sample.alltoall_start_events[j], sample.alltoall_end_events[j])); + alltoall_timing_ms += elapsed_time; + } + + float transpose_timing_ms; + CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, sample.transpose_start_event, sample.transpose_end_event)); + + total_times.push_back(transpose_timing_ms); + alltoall_times.push_back(alltoall_timing_ms); + local_times.push_back(transpose_timing_ms - alltoall_timing_ms); + + float alltoall_bw = (alltoall_timing_ms > 0) ? sample.alltoall_bytes * 1e-6 / alltoall_timing_ms : 0; + alltoall_bws.push_back(alltoall_bw); + } + + if (total_times.empty()) continue; + + PerformanceStats stats; + stats.operation = getOperationName(config); + stats.datatype = getDatatypeString(std::get<8>(config)); + + // Format combined halos and padding + auto input_halos = std::get<2>(config); + auto output_halos = std::get<3>(config); + auto input_padding = std::get<4>(config); + auto output_padding = std::get<5>(config); + + stats.halos = formatArray(input_halos) + "/" + formatArray(output_halos); + stats.padding = formatArray(input_padding) + "/" + formatArray(output_padding); + stats.inplace = std::get<6>(config) ? "Y" : "N"; + stats.managed = std::get<7>(config) ? "Y" : "N"; + stats.samples = total_times.size(); + + // Compute average statistics across all ranks + stats.total_time_avg = std::accumulate(total_times.begin(), total_times.end(), 0.0f) / total_times.size(); + stats.alltoall_time_avg = std::accumulate(alltoall_times.begin(), alltoall_times.end(), 0.0f) / alltoall_times.size(); + stats.local_time_avg = std::accumulate(local_times.begin(), local_times.end(), 0.0f) / local_times.size(); + stats.alltoall_bw_avg = std::accumulate(alltoall_bws.begin(), alltoall_bws.end(), 0.0f) / alltoall_bws.size(); + + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.total_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.alltoall_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.local_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.alltoall_bw_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); + + stats.total_time_avg /= handle->nranks; + stats.alltoall_time_avg /= handle->nranks; + stats.local_time_avg /= handle->nranks; + stats.alltoall_bw_avg /= handle->nranks; + + all_stats.push_back(stats); + } + + if (handle->rank != 0) return; // Only print on rank 0 + + printf("CUDECOMP: ===== Performance Summary =====\n"); + + // Print grid descriptor configuration information + printf("CUDECOMP: Grid Configuration:\n"); + printf("CUDECOMP:\tTranspose backend: %s\n", + cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)); + printf("CUDECOMP:\tProcess grid: [%d, %d]\n", + grid_desc->config.pdims[0], grid_desc->config.pdims[1]); + printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", + grid_desc->config.gdims[0], grid_desc->config.gdims[1], grid_desc->config.gdims[2]); + + // Print 
memory ordering information + printf("CUDECOMP:\tMemory order: "); + for (int axis = 0; axis < 3; ++axis) { + printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], + grid_desc->config.transpose_mem_order[axis][1], + grid_desc->config.transpose_mem_order[axis][2]); + if (axis < 2) printf("; "); + } + printf("\n"); + + printf("CUDECOMP:\n"); + printf("CUDECOMP: Transpose Performance Data:\n"); + printf("CUDECOMP:\n"); + + if (all_stats.empty()) { + printf("CUDECOMP: No performance data collected\n"); + printf("CUDECOMP: ================================\n"); + return; + } + + // Print compact table header + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples", + "total", "A2A", "local", "A2A BW"); + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "", "", "", "", "", "", "", + "[ms]", "[ms]", "[ms]", "[GB/s]"); + printf("CUDECOMP: "); + for (int i = 0; i < 120; ++i) printf("-"); + printf("\n"); + + // Print table rows + for (const auto& stats : all_stats) { + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.halos.c_str(), + stats.padding.c_str(), + stats.inplace.c_str(), + stats.managed.c_str(), + stats.samples, + stats.total_time_avg, + stats.alltoall_time_avg, + stats.local_time_avg, + stats.alltoall_bw_avg + ); + } + + printf("CUDECOMP: ================================\n"); +} + +void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) { + if (handle->performance_report_enable == 0) return; + + // Reset all sample collections in the map + for (auto& entry : grid_desc->perf_samples_map) { + auto& collection = entry.second; + collection.sample_idx = 0; + collection.warmup_count = 0; + + // Mark all samples as invalid and reset counters + for (auto& sample : collection.samples) { + sample.valid = false; + sample.alltoall_timing_count = 0; + sample.alltoall_bytes = 0; + } + } +} + +// Helper function to advance sample index with warmup handling +void advancePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, + const cudecompTransposeConfigKey& config) { + if (handle->performance_report_enable == 0) return; + + auto& collection = getOrCreatePerformanceSamples(handle, grid_desc, config); + + // Check if we're still in warmup phase + if (collection.warmup_count < handle->performance_report_warmup_samples) { + collection.warmup_count++; + // During warmup, don't advance the circular buffer, just mark current sample as invalid + collection.samples[collection.sample_idx].valid = false; + } else { + // Past warmup, advance the circular buffer normally + collection.sample_idx = (collection.sample_idx + 1) % handle->performance_report_samples; + } +} + +} // namespace cudecomp From 4d0b8556d092f79980c6b0f59151457130a29451 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 4 Jul 2025 22:50:11 -0700 Subject: [PATCH 03/13] Replace existing per-op performance reporting with per-sample performance reporting. Enable with CUDECOMP_PERFORMANCE_REPORT_DETAIL. 
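[Editorial note, not part of the commit message: a minimal standalone sketch of
the per-configuration sampling scheme from the previous patch, mirroring
resetPerformanceSamples/advancePerformanceSample above. Names are simplified
stand-ins, not cuDecomp API.]

    #include <vector>

    // Each transpose configuration owns a fixed-size ring of samples. The
    // first `warmup` calls keep overwriting the current slot and stay invalid;
    // after that the index advances circularly, so only the most recent
    // `capacity` measurements survive into the final report.
    struct SampleRing {
      std::vector<bool> valid;
      int idx = 0;
      int warmup_seen = 0;
      explicit SampleRing(int capacity) : valid(capacity, false) {}
    };

    void advance(SampleRing& r, int warmup) {
      if (r.warmup_seen < warmup) {
        r.warmup_seen++;
        r.valid[r.idx] = false;  // discard warmup measurement
      } else {
        r.idx = (r.idx + 1) % static_cast<int>(r.valid.size());  // circular advance
      }
    }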
--- docs/env_vars.rst | 20 +- include/internal/comm_routines.h | 8 +- include/internal/common.h | 5 +- include/internal/transpose.h | 15 +- src/autotune.cc | 8 +- src/cudecomp.cc | 16 +- src/performance.cc | 313 +++++++++++++++++++------------ 7 files changed, 238 insertions(+), 147 deletions(-) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index 37f5dc5..13f1ef2 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -37,15 +37,23 @@ CUDECOMP_ENABLE_PERFORMANCE_REPORTING ------------------------------------- (since v0.5.1) -:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` controls the level of performance reporting cuDecomp prints to the console. This option requires a device synchronization after each transpose operation to capture event timings which can impact performance. +:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` controls whether cuDecomp performance reporting is enabled. + +Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. + +CUDECOMP_PERFORMANCE_REPORT_DETAIL +---------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` controls the verbosity of performance reporting when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` is enabled. This setting determines whether individual sample data is printed in addition to the aggregated performance summary. The following values are supported: -- :code:`0`: Performance reporting disabled -- :code:`1`: Final performance summary only - prints a comprehensive table with averaged performance statistics for all transpose operation configurations when the grid descriptor is destroyed -- :code:`2`: Verbose reporting - prints both per-operation performance reports and the final performance summary +- :code:`0`: Aggregated report only - prints only the summary table with averaged performance statistics (default) +- :code:`1`: Per-sample reporting on rank 0 - prints individual sample data for each transpose configuration, but only from rank 0 +- :code:`2`: Per-sample reporting on all ranks - prints individual sample data for each transpose configuration from all ranks, gathered and sorted by rank on rank 0 -Default setting is off (:code:`0`). +Default setting is :code:`0`. CUDECOMP_PERFORMANCE_REPORT_SAMPLES ----------------------------------- @@ -61,4 +69,4 @@ CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES :code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics. -Default setting is :code:`2` warmup samples. Valid range is 0-100 samples. Setting this to 0 disables warmup sample filtering. +Default setting is :code:`3` warmup samples. Valid range is 0-100 samples. Setting this to 0 disables warmup sample filtering. 
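[Editorial note: a hedged usage sketch for the knobs documented above. cuDecomp
reads these variables in getCudecompEnvVars during handle initialization, so
they must be set before the handle is created; in practice they would usually
be exported in the job launch environment instead. The init call shown is
illustrative.]

    #include <cstdlib>  // setenv (POSIX)

    // Enable per-sample reporting from all ranks, keeping 50 samples per
    // transpose configuration and discarding the first 3 as warmup.
    setenv("CUDECOMP_ENABLE_PERFORMANCE_REPORTING", "1", 1);
    setenv("CUDECOMP_PERFORMANCE_REPORT_DETAIL", "2", 1);          // 0, 1, or 2
    setenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES", "50", 1);        // valid: 1-1000
    setenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES", "3", 1);  // valid: 0-100
    // ... then create the handle, e.g. cudecompInit(&handle, MPI_COMM_WORLD);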
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 1bc98fa..6230aa7 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -158,7 +158,7 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ cudecompCommAxis comm_axis, cudaStream_t stream, cudecompPerformanceSample* current_sample = nullptr) { nvtx::rangePush("cudecompAlltoall"); - if (handle->performance_report_enable > 0) { + if (handle->performance_report_enable) { CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], stream)); } @@ -274,7 +274,7 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ } } - if (handle->performance_report_enable > 0) { + if (handle->performance_report_enable) { CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], stream)); current_sample->alltoall_timing_count++; } @@ -306,7 +306,7 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude nvtx::rangePush(os.str()); int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank; - if (handle->performance_report_enable > 0 && src_ranks[0] != self_rank) { + if (handle->performance_report_enable && src_ranks[0] != self_rank) { // Note: skipping self-copy for timing as it should be overlapped CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0)); CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], handle->pl_stream)); @@ -483,7 +483,7 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude } } - if (handle->performance_report_enable > 0 && src_ranks[0] != self_rank) { + if (handle->performance_report_enable && src_ranks[0] != self_rank) { CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream)); current_sample->alltoall_timing_count++; } diff --git a/include/internal/common.h b/include/internal/common.h index 992e296..5b137ed 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -116,9 +116,10 @@ struct cudecompHandle { bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used // Performance reporting related entries - int32_t performance_report_enable = 0; // performance reporting level: 0=off, 1=final only, 2=verbose + bool performance_report_enable = false; // flag to track if performance reporting is enabled + int32_t performance_report_detail = 0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks int32_t performance_report_samples = 20; // number of performance samples to keep for final report - int32_t performance_report_warmup_samples = 2; // number of initial warmup samples to ignore for each configuration + int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration }; // Structure with information about row/column communicator diff --git a/include/internal/transpose.h b/include/internal/transpose.h index 88f1019..2f9e806 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -253,7 +253,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } cudecompPerformanceSample* current_sample = nullptr; - if (handle->performance_report_enable > 0) { + if 
    auto& samples = getOrCreatePerformanceSamples(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
     current_sample = &samples.samples[samples.sample_idx];
     current_sample->alltoall_timing_count = 0;
@@ -273,10 +273,9 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   if (inplace) {
     if (halos_padding_equal) {
       // Single rank, in place, Pack -> Unpack: No transpose necessary.
-      if (handle->performance_report_enable > 0) {
-        // Print performance report
+      if (handle->performance_report_enable) {
+        // Record performance data
         CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-        printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T), current_sample);
         advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
       }
       return;
@@ -544,9 +543,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
   if (o1 == output) {
     // o1 is output. Return.
-    if (handle->performance_report_enable > 0) {
+    if (handle->performance_report_enable) {
       CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-      printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T), current_sample);
       advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
     }
     return;
   }
@@ -821,10 +819,9 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     }
   }
 
-  if (handle->performance_report_enable > 0) {
-    // Print performance report
+  if (handle->performance_report_enable) {
+    // Record performance data
     CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-    printPerformanceReport(handle, grid_desc, ax, dir, pinfo_a.size * sizeof(T), current_sample);
     advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
   }
diff --git a/src/autotune.cc b/src/autotune.cc
index 0ccbfc8..facb1ce 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -289,6 +289,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     if (transposeBackendRequiresNvshmem(comm)) { w = work_nvshmem; }
 #endif
 
+    // Reset performance samples
+    resetPerformanceSamples(handle, grid_desc);
+
     // Warmup
     for (int i = 0; i < options->n_warmup_trials; ++i) {
       if (options->transpose_op_weights[0] != 0.0) {
@@ -317,9 +320,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       }
     }
 
-    // Reset performance samples after warmup to ensure clean measurement
-    resetPerformanceSamples(handle, grid_desc);
-
     // Trials
     std::vector<double> trial_times(options->n_trials);
     std::vector<double> trial_times_w(options->n_trials);
@@ -397,7 +397,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     grid_desc->graph_cache.clear();
 
     // Print performance report for this configuration if enabled
-    if (handle->performance_report_enable > 0 && !skip_case) {
+    if (handle->performance_report_enable && !skip_case) {
      printFinalPerformanceReport(handle, grid_desc);
     }
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 6862f30..6d53634 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -332,12 +332,16 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
 
   // Check CUDECOMP_ENABLE_PERFORMANCE_REPORTING (Performance reporting)
   const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORTING");
-  if (performance_report_str) {
-    int32_t level = std::strtol(performance_report_str, nullptr, 10);
-    if (level >= 0 && level <= 2) {
-      handle->performance_report_enable = level;
+  if (performance_report_str) { handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1; }
+
+  // Check CUDECOMP_PERFORMANCE_REPORT_DETAIL (Performance report detail level)
+  const char* performance_detail_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_DETAIL");
+  if (performance_detail_str) {
+    int32_t detail = std::strtol(performance_detail_str, nullptr, 10);
+    if (detail >= 0 && detail <= 2) {
+      handle->performance_report_detail = detail;
     } else if (handle->rank == 0) {
-      printf("CUDECOMP:WARN: Invalid CUDECOMP_ENABLE_PERFORMANCE_REPORTING value (%d). Using default (0).\n", level);
+      printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_DETAIL value (%d). Using default (0).\n", detail);
     }
   }
 
@@ -759,7 +763,7 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
 #endif
 
   // Destroy timing events for AlltoAll operations
-  if (handle->performance_report_enable > 0) {
+  if (handle->performance_report_enable) {
     // Print final performance report before destroying events
     printFinalPerformanceReport(handle, grid_desc);
 
diff --git a/src/performance.cc b/src/performance.cc
index f350d6c..0402526 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -108,47 +108,6 @@ cudecompPerformanceSampleCollection& getOrCreatePerformanceSamples(const cudecom
   return samples_map[config];
 }
 
-void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
-                            int ax, int dir, size_t alltoall_bytes, cudecompPerformanceSample* current_sample) {
-  // Only print per-operation reports at level 2
-  if (handle->performance_report_enable != 2) return;
-
-  // Synchronize to ensure all events are recorded
-  CHECK_CUDA(cudaDeviceSynchronize());
-
-  // Compute total timing by summing all individual timings
-  float alltoall_timing_ms = 0.0f;
-  float transpose_timing_ms = 0.0f;
-
-  for (int i = 0; i < current_sample->alltoall_timing_count; ++i) {
-    float elapsed_time;
-    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, current_sample->alltoall_start_events[i], current_sample->alltoall_end_events[i]));
-    alltoall_timing_ms += elapsed_time;
-  }
-  CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, current_sample->transpose_start_event, current_sample->transpose_end_event));
-
-  // Report on rank 0 only for now.
-  if (handle->rank == 0) {
-    std::string op_name;
-    if (ax == 0) {
-      op_name = "TransposeXY";
-    } else if (ax == 1 && dir == 1) {
-      op_name = "TransposeYZ";
-    } else if (ax == 2) {
-      op_name = "TransposeZY";
-    } else if (ax == 1 && dir == -1) {
-      op_name = "TransposeYX";
-    }
-
-    float alltoall_bw = 0.0f;
-    if (alltoall_timing_ms > 0) {
-      alltoall_bw = alltoall_bytes * 1e-6 / alltoall_timing_ms;
-    }
-    printf("CUDECOMP: rank: %d, op: %s, total time: %.3f ms, alltoall time: %.3f ms, local operation time: %.3f ms, alltoall bw: %.3f GB/s\n",
-           handle->rank, op_name.c_str(), transpose_timing_ms, alltoall_timing_ms, transpose_timing_ms - alltoall_timing_ms, alltoall_bw);
-  }
-}
-
 // Helper function to format array as compact string
 std::string formatArray(const std::array<int32_t, 3>& arr) {
   std::ostringstream oss;
@@ -199,21 +158,32 @@ struct PerformanceStats {
   float alltoall_bw_avg;
 };
 
+// Helper structure to hold pre-computed timing data
+struct ConfigTimingData {
+  PerformanceStats stats;
+  std::vector<float> total_times;
+  std::vector<float> alltoall_times;
+  std::vector<float> local_times;
+  std::vector<float> alltoall_bws;
+  std::vector<int> sample_indices;
+};
+
 void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) {
   // Synchronize to ensure all events are recorded
   CHECK_CUDA(cudaDeviceSynchronize());
 
-  // Collect all statistics
-  std::vector<PerformanceStats> all_stats;
+  // Collect all statistics and timing data
+  std::vector<ConfigTimingData> all_config_data;
 
   for (const auto& entry : grid_desc->perf_samples_map) {
     const auto& config = entry.first;
     const auto& collection = entry.second;
 
-    // Collect valid samples
-    std::vector<float> total_times, alltoall_times, local_times, alltoall_bws;
+    ConfigTimingData config_data;
 
-    for (const auto& sample : collection.samples) {
+    // Collect valid samples and compute elapsed times once
+    for (int i = 0; i < collection.samples.size(); ++i) {
+      const auto& sample = collection.samples[i];
       if (!sample.valid) continue;
 
       float alltoall_timing_ms = 0.0f;
@@ -226,17 +196,19 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
       float transpose_timing_ms;
       CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, sample.transpose_start_event, sample.transpose_end_event));
 
-      total_times.push_back(transpose_timing_ms);
-      alltoall_times.push_back(alltoall_timing_ms);
-      local_times.push_back(transpose_timing_ms - alltoall_timing_ms);
+      config_data.total_times.push_back(transpose_timing_ms);
+      config_data.alltoall_times.push_back(alltoall_timing_ms);
+      config_data.local_times.push_back(transpose_timing_ms - alltoall_timing_ms);
 
       float alltoall_bw = (alltoall_timing_ms > 0) ? sample.alltoall_bytes * 1e-6 / alltoall_timing_ms : 0;
-      alltoall_bws.push_back(alltoall_bw);
+      config_data.alltoall_bws.push_back(alltoall_bw);
+      config_data.sample_indices.push_back(i);
     }
 
-    if (total_times.empty()) continue;
+    if (config_data.total_times.empty()) continue;
 
-    PerformanceStats stats;
+    // Prepare aggregated statistics
+    PerformanceStats& stats = config_data.stats;
     stats.operation = getOperationName(config);
     stats.datatype = getDatatypeString(std::get<8>(config));
 
@@ -250,13 +222,13 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
     stats.padding = formatArray(input_padding) + "/" + formatArray(output_padding);
     stats.inplace = std::get<6>(config) ? "Y" : "N";
     stats.managed = std::get<7>(config) ? "Y" : "N";
"Y" : "N"; - stats.samples = total_times.size(); + stats.samples = config_data.total_times.size(); // Compute average statistics across all ranks - stats.total_time_avg = std::accumulate(total_times.begin(), total_times.end(), 0.0f) / total_times.size(); - stats.alltoall_time_avg = std::accumulate(alltoall_times.begin(), alltoall_times.end(), 0.0f) / alltoall_times.size(); - stats.local_time_avg = std::accumulate(local_times.begin(), local_times.end(), 0.0f) / local_times.size(); - stats.alltoall_bw_avg = std::accumulate(alltoall_bws.begin(), alltoall_bws.end(), 0.0f) / alltoall_bws.size(); + stats.total_time_avg = std::accumulate(config_data.total_times.begin(), config_data.total_times.end(), 0.0f) / config_data.total_times.size(); + stats.alltoall_time_avg = std::accumulate(config_data.alltoall_times.begin(), config_data.alltoall_times.end(), 0.0f) / config_data.alltoall_times.size(); + stats.local_time_avg = std::accumulate(config_data.local_times.begin(), config_data.local_times.end(), 0.0f) / config_data.local_times.size(); + stats.alltoall_bw_avg = std::accumulate(config_data.alltoall_bws.begin(), config_data.alltoall_bws.end(), 0.0f) / config_data.alltoall_bws.size(); CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.total_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.alltoall_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); @@ -268,75 +240,184 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr stats.local_time_avg /= handle->nranks; stats.alltoall_bw_avg /= handle->nranks; - all_stats.push_back(stats); + all_config_data.push_back(std::move(config_data)); } - if (handle->rank != 0) return; // Only print on rank 0 - - printf("CUDECOMP: ===== Performance Summary =====\n"); - - // Print grid descriptor configuration information - printf("CUDECOMP: Grid Configuration:\n"); - printf("CUDECOMP:\tTranspose backend: %s\n", - cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)); - printf("CUDECOMP:\tProcess grid: [%d, %d]\n", - grid_desc->config.pdims[0], grid_desc->config.pdims[1]); - printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", - grid_desc->config.gdims[0], grid_desc->config.gdims[1], grid_desc->config.gdims[2]); - - // Print memory ordering information - printf("CUDECOMP:\tMemory order: "); - for (int axis = 0; axis < 3; ++axis) { - printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], - grid_desc->config.transpose_mem_order[axis][1], - grid_desc->config.transpose_mem_order[axis][2]); - if (axis < 2) printf("; "); - } - printf("\n"); + // Print summary information on rank 0 only + if (handle->rank == 0) { + printf("CUDECOMP: ===== Performance Summary =====\n"); + + // Print grid descriptor configuration information + printf("CUDECOMP: Grid Configuration:\n"); + printf("CUDECOMP:\tTranspose backend: %s\n", + cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)); + printf("CUDECOMP:\tProcess grid: [%d, %d]\n", + grid_desc->config.pdims[0], grid_desc->config.pdims[1]); + printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", + grid_desc->config.gdims[0], grid_desc->config.gdims[1], grid_desc->config.gdims[2]); + + // Print memory ordering information + printf("CUDECOMP:\tMemory order: "); + for (int axis = 0; axis < 3; ++axis) { + printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], + grid_desc->config.transpose_mem_order[axis][1], + grid_desc->config.transpose_mem_order[axis][2]); + if (axis < 2) printf("; "); + } + 
printf("\n"); - printf("CUDECOMP:\n"); - printf("CUDECOMP: Transpose Performance Data:\n"); - printf("CUDECOMP:\n"); + printf("CUDECOMP:\n"); + printf("CUDECOMP: Transpose Performance Data:\n"); + printf("CUDECOMP:\n"); - if (all_stats.empty()) { - printf("CUDECOMP: No performance data collected\n"); - printf("CUDECOMP: ================================\n"); - return; + if (all_config_data.empty()) { + printf("CUDECOMP: No performance data collected\n"); + printf("CUDECOMP: ================================\n"); + return; + } + + // Print compact table header + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples", + "total", "A2A", "local", "A2A BW"); + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "", "", "", "", "", "", "", + "[ms]", "[ms]", "[ms]", "[GB/s]"); + printf("CUDECOMP: "); + for (int i = 0; i < 120; ++i) printf("-"); + printf("\n"); + + // Print table rows + for (const auto& config_data : all_config_data) { + const auto& stats = config_data.stats; + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.halos.c_str(), + stats.padding.c_str(), + stats.inplace.c_str(), + stats.managed.c_str(), + stats.samples, + stats.total_time_avg, + stats.alltoall_time_avg, + stats.local_time_avg, + stats.alltoall_bw_avg + ); + } } - // Print compact table header - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples", - "total", "A2A", "local", "A2A BW"); - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "", "", "", "", "", "", "", - "[ms]", "[ms]", "[ms]", "[GB/s]"); - printf("CUDECOMP: "); - for (int i = 0; i < 120; ++i) printf("-"); - printf("\n"); - - // Print table rows - for (const auto& stats : all_stats) { - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", - stats.operation.c_str(), - stats.datatype.c_str(), - stats.halos.c_str(), - stats.padding.c_str(), - stats.inplace.c_str(), - stats.managed.c_str(), - stats.samples, - stats.total_time_avg, - stats.alltoall_time_avg, - stats.local_time_avg, - stats.alltoall_bw_avg - ); + // Print per-sample data if detail level > 0 + if (handle->performance_report_detail > 0) { + if (handle->rank == 0) { + printf("CUDECOMP:\n"); + printf("CUDECOMP: Per-Sample Details:\n"); + printf("CUDECOMP:\n"); + } + + for (const auto& config_data : all_config_data) { + const auto& stats = config_data.stats; + + // Print configuration header on rank 0 + if (handle->rank == 0) { + printf("CUDECOMP: %s (dtype=%s, halos=%s, padding=%s, inplace=%s, managed=%s) samples:\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.halos.c_str(), + stats.padding.c_str(), + stats.inplace.c_str(), + stats.managed.c_str()); + } + + const auto& total_times = config_data.total_times; + const auto& alltoall_times = config_data.alltoall_times; + const auto& local_times = config_data.local_times; + const auto& alltoall_bws = config_data.alltoall_bws; + const auto& sample_indices = config_data.sample_indices; + + if (total_times.empty()) continue; + + if (handle->performance_report_detail == 1) { + // Print per-sample data for rank 0 only + if (handle->rank == 0) { + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "rank", "sample", "total", "A2A", 
"local", "A2A BW"); + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); + + for (int i = 0; i < total_times.size(); ++i) { + printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", + handle->rank, sample_indices[i], total_times[i], alltoall_times[i], + local_times[i], alltoall_bws[i]); + } + } + } else if (handle->performance_report_detail == 2) { + // Gather data from all ranks to rank 0 + // Note: We assume all entries have the same number of samples per rank + int num_samples = total_times.size(); + + if (handle->rank == 0) { + // Use MPI_Gather instead of MPI_Gatherv since all ranks have the same number of samples + std::vector all_total_times(num_samples * handle->nranks); + std::vector all_alltoall_times(num_samples * handle->nranks); + std::vector all_local_times(num_samples * handle->nranks); + std::vector all_alltoall_bws(num_samples * handle->nranks); + std::vector all_sample_indices(num_samples * handle->nranks); + + CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, + all_total_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(alltoall_times.data(), num_samples, MPI_FLOAT, + all_alltoall_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT, + all_local_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(alltoall_bws.data(), num_samples, MPI_FLOAT, + all_alltoall_bws.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT, + all_sample_indices.data(), num_samples, MPI_INT, 0, handle->mpi_comm)); + + // Print header + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "rank", "sample", "total", "A2A", "local", "A2A BW"); + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); + + // Print data sorted by rank + for (int r = 0; r < handle->nranks; ++r) { + for (int s = 0; s < num_samples; ++s) { + int idx = r * num_samples + s; + printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", + r, all_sample_indices[idx], all_total_times[idx], + all_alltoall_times[idx], all_local_times[idx], + all_alltoall_bws[idx]); + } + } + } else { + // Non-rank-0 processes just send their data + CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, + nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(alltoall_times.data(), num_samples, MPI_FLOAT, + nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT, + nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(alltoall_bws.data(), num_samples, MPI_FLOAT, + nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT, + nullptr, num_samples, MPI_INT, 0, handle->mpi_comm)); + } + } + + if (handle->rank == 0) { + printf("CUDECOMP:\n"); + } + } } - printf("CUDECOMP: ================================\n"); + if (handle->rank == 0) { + printf("CUDECOMP: ================================\n"); + } } void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) { - if (handle->performance_report_enable == 0) return; + if (!handle->performance_report_enable) return; // Reset all sample collections in the map for (auto& entry : grid_desc->perf_samples_map) { @@ -356,7 +437,7 @@ void resetPerformanceSamples(const cudecompHandle_t 
 
 // Helper function to advance sample index with warmup handling
 void advancePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
                               const cudecompTransposeConfigKey& config) {
-  if (handle->performance_report_enable == 0) return;
+  if (!handle->performance_report_enable) return;
 
   auto& collection = getOrCreatePerformanceSamples(handle, grid_desc, config);
 

From f20b03ced8d8c53e0bd35538e68d70333ec79e3b Mon Sep 17 00:00:00 2001
From: Josh Romero
Date: Mon, 7 Jul 2025 11:44:32 -0700
Subject: [PATCH 04/13] Add performance reporting for halo operations. Update
 env var names and handling.

---
 docs/env_vars.rst                |  20 +-
 include/internal/comm_routines.h |  16 +-
 include/internal/common.h        |  31 ++-
 include/internal/halo.h          |  44 ++-
 include/internal/performance.h   |  35 ++-
 include/internal/transpose.h     |  10 +-
 src/autotune.cc                  |  12 +
 src/cudecomp.cc                  |  23 +-
 src/performance.cc               | 445 +++++++++++++++++++++++++++----
 tests/cc/halo_test.cc            |  15 ++
 tests/cc/transpose_test.cc       |  15 ++
 tests/fortran/halo_test.f90      |  11 +
 tests/fortran/transpose_test.f90 |  11 +
 13 files changed, 594 insertions(+), 94 deletions(-)

diff --git a/docs/env_vars.rst b/docs/env_vars.rst
index 13f1ef2..6620711 100644
--- a/docs/env_vars.rst
+++ b/docs/env_vars.rst
@@ -33,11 +33,11 @@ and communication overlap of packing kernels in large scale cases.
 
 Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
 
-CUDECOMP_ENABLE_PERFORMANCE_REPORTING
--------------------------------------
+CUDECOMP_ENABLE_PERFORMANCE_REPORT
+------------------------------------
 (since v0.5.1)
 
-:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` controls whether cuDecomp performance reporting is enabled.
+:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` controls whether cuDecomp performance reporting is enabled.
 
 Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.
 
@@ -45,13 +45,13 @@ CUDECOMP_PERFORMANCE_REPORT_DETAIL
 ----------------------------------
 (since v0.5.1)
 
-:code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` controls the verbosity of performance reporting when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORTING` is enabled. This setting determines whether individual sample data is printed in addition to the aggregated performance summary.
+:code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` controls the verbosity of performance reporting when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. This setting determines whether individual sample data is printed in addition to the aggregated performance summary.
 
 The following values are supported:
 
 - :code:`0`: Aggregated report only - prints only the summary table with averaged performance statistics (default)
-- :code:`1`: Per-sample reporting on rank 0 - prints individual sample data for each transpose configuration, but only from rank 0
-- :code:`2`: Per-sample reporting on all ranks - prints individual sample data for each transpose configuration from all ranks, gathered and sorted by rank on rank 0
+- :code:`1`: Per-sample reporting on rank 0 - prints individual sample data for each transpose/halo configuration, but only from rank 0
+- :code:`2`: Per-sample reporting on all ranks - prints individual sample data for each transpose/halo configuration from all ranks, gathered and sorted by rank on rank 0
 
 Default setting is :code:`0`.
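(The sample and warmup settings in this file feed fixed-size, per-configuration sample buffers. The following is a simplified C++ sketch of the advance policy they control, based on the advanceTransposePerformanceSample()/advanceHaloPerformanceSample() logic in src/performance.cc from this patch; the Slot and Collection types are illustrative stand-ins, not cuDecomp types.)

    #include <vector>

    struct Slot { bool valid = false; };

    struct Collection {
      std::vector<Slot> samples;  // circular buffer, sized by CUDECOMP_PERFORMANCE_REPORT_SAMPLES
      int sample_idx = 0;
      int warmup_count = 0;
      explicit Collection(int capacity) : samples(capacity) {}
    };

    void advanceSample(Collection& c, int warmup) {
      if (c.warmup_count < warmup) {
        // Warmup call: stay at the current slot and mark it invalid so it is
        // skipped by the final report.
        c.warmup_count++;
        c.samples[c.sample_idx].valid = false;
      } else {
        // Past warmup: advance the circular buffer normally.
        c.sample_idx = (c.sample_idx + 1) % static_cast<int>(c.samples.size());
      }
    }

    int main() {
      Collection c(4); // e.g. CUDECOMP_PERFORMANCE_REPORT_SAMPLES=4
      for (int call = 0; call < 10; ++call) advanceSample(c, /*warmup=*/3);
      return c.sample_idx == 3 ? 0 : 1; // 7 post-warmup advances: 7 % 4 == 3
    }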
@@ -59,14 +59,14 @@ CUDECOMP_PERFORMANCE_REPORT_SAMPLES
 -----------------------------------
 (since v0.5.1)
 
-:code:`CUDECOMP_PERFORMANCE_REPORT_SAMPLES` controls the number of performance samples to keep for the final performance report. This setting determines the size of the circular buffer used to store timing measurements for each transpose configuration.
+:code:`CUDECOMP_PERFORMANCE_REPORT_SAMPLES` controls the number of performance samples to keep for the final performance report. This setting determines the size of the circular buffer used to store timing measurements for each transpose/halo configuration.
 
-Default setting is :code:`20` samples. Valid range is 1-1000 samples.
+Default setting is :code:`20` samples.
 
 CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES
 ------------------------------------------
 (since v0.5.1)
 
-:code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics.
+:code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose/halo configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics.
 
-Default setting is :code:`3` warmup samples. Valid range is 0-100 samples. Setting this to 0 disables warmup sample filtering.
+Default setting is :code:`3` warmup samples. Setting this to 0 disables warmup sample filtering.
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index 6230aa7..c0a0879 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -155,7 +155,7 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
                  const std::vector& send_counts, const std::vector& send_offsets, T* recv_buff,
                  const std::vector& recv_counts, const std::vector& recv_offsets,
                  const std::vector& recv_offsets_nvshmem,
-                 cudecompCommAxis comm_axis, cudaStream_t stream, cudecompPerformanceSample* current_sample = nullptr) {
+                 cudecompCommAxis comm_axis, cudaStream_t stream, cudecompTransposePerformanceSample* current_sample = nullptr) {
   nvtx::rangePush("cudecompAlltoall");
 
   if (handle->performance_report_enable) {
@@ -290,7 +290,7 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude
                                       const std::vector& recv_offsets,
                                       const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
                                       const std::vector& src_ranks, const std::vector& dst_ranks,
-                                      cudaStream_t stream, bool& synced, cudecompPerformanceSample* current_sample = nullptr) {
+                                      cudaStream_t stream, bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) {
 
   // If there are no transfers to complete, quick return
   if (send_counts.size() == 0 && recv_counts.size() == 0) {
@@ -496,9 +496,14 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
                                  const std::array& send_counts, const std::array& send_offsets,
                                  T* recv_buff, const std::array& recv_counts,
-                                 const std::array& recv_offsets, cudaStream_t stream = 0) {
+                                 const std::array& recv_offsets, cudaStream_t stream = 0,
+                                 cudecompHaloPerformanceSample* current_sample = nullptr) {
   nvtx::rangePush("cudecompSendRecvPair");
 
+  if (handle->performance_report_enable && current_sample) {
+    CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_start_event, stream));
+  }
+
 #ifdef ENABLE_NVSHMEM
  if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued &&
       haloBackendRequiresMpi(grid_desc->config.halo_comm_backend) &&
@@ -620,6 +625,11 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
       break;
     }
   }
 
+  if (handle->performance_report_enable && current_sample) {
+    CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_end_event, stream));
+  }
+
   nvtx::rangePop();
 }
diff --git a/include/internal/common.h b/include/internal/common.h
index 5b137ed..5f8aa2e 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -137,7 +137,7 @@ struct cudecompCommInfo {
 };
 
 // Structure to contain data for transpose performance sample
-struct cudecompPerformanceSample {
+struct cudecompTransposePerformanceSample {
   cudaEvent_t transpose_start_event;
   cudaEvent_t transpose_end_event;
   std::vector<cudaEvent_t> alltoall_start_events;
@@ -147,9 +147,26 @@
   bool valid = false;
 };
 
-// Collection of performance samples for a specific configuration
-struct cudecompPerformanceSampleCollection {
-  std::vector<cudecompPerformanceSample> samples;
+// Collection of transpose performance samples for a specific configuration
+struct cudecompTransposePerformanceSampleCollection {
+  std::vector<cudecompTransposePerformanceSample> samples;
+  int32_t sample_idx = 0;
+  int32_t warmup_count = 0;
+};
+
+// Structure to contain data for halo performance sample
+struct cudecompHaloPerformanceSample {
+  cudaEvent_t halo_start_event;
+  cudaEvent_t halo_end_event;
+  cudaEvent_t sendrecv_start_event;
+  cudaEvent_t sendrecv_end_event;
+  size_t sendrecv_bytes = 0;
+  bool valid = false;
+};
+
+// Collection of halo performance samples for a specific configuration
+struct cudecompHaloPerformanceSampleCollection {
+  std::vector<cudecompHaloPerformanceSample> samples;
   int32_t sample_idx = 0;
   int32_t warmup_count = 0;
 };
@@ -184,7 +201,11 @@ struct cudecompGridDesc {
   std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
                                 std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
-                     cudecompPerformanceSampleCollection> perf_samples_map;
+                     cudecompTransposePerformanceSampleCollection> transpose_perf_samples_map;
+
+  std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<bool, 3>,
+                                std::array<int32_t, 3>, bool, cudecompDataType_t>,
+                     cudecompHaloPerformanceSampleCollection> halo_perf_samples_map;
 
   bool initialized = false;
 };
diff --git a/include/internal/halo.h b/include/internal/halo.h
index 096292e..9a6c58f 100644
--- a/include/internal/halo.h
+++ b/include/internal/halo.h
@@ -41,6 +41,8 @@
 #include "internal/comm_routines.h"
 #include "internal/cudecomp_kernels.h"
 #include "internal/nvtx.h"
+#include "internal/performance.h"
+#include "internal/utils.h"
 
 namespace cudecomp {
 
@@ -73,6 +75,17 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
   // Quick return if no halos
   if (halo_extents[dim] == 0) { return; }
 
+  cudecompHaloPerformanceSample* current_sample = nullptr;
+  if (handle->performance_report_enable) {
+    auto& samples = getOrCreateHaloPerformanceSamples(handle, grid_desc, createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), padding.data(), getCudecompDataType<T>()));
+    current_sample = &samples.samples[samples.sample_idx];
+    current_sample->sendrecv_bytes = 0;
+    current_sample->valid = true;
+
+    // Record start event
+    CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream));
+  }
+
   // Check if halos include more than one process (unsupported currently).
  int count = 0;
   for (int i = 0; i < 3; ++i) {
@@ -120,6 +133,11 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     c = 0;
   } else if (neighbors[0] == -1 && neighbors[1] == -1) {
     // Single rank in this dimension and not periodic. Return.
+    if (handle->performance_report_enable && current_sample) {
+      // Record end event and advance sample even for early return
+      CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream));
+      advanceHaloPerformanceSample(handle, grid_desc, createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), padding.data(), getCudecompDataType<T>()));
+    }
     return;
   }
 
@@ -204,7 +222,15 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     std::array offsets{};
     offsets[1] = halo_size;
 
-    cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream);
+    if (handle->performance_report_enable && current_sample) {
+      current_sample->sendrecv_bytes = 0;
+      for (int i = 0; i < 2; ++i) {
+        if (neighbors[i] != -1) {
+          current_sample->sendrecv_bytes += halo_size * sizeof(T);
+        }
+      }
+    }
+    cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream, current_sample);
 
     // Unpack
     // Left
@@ -261,10 +287,24 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
       lx[dim] = shape_g_h_p[dim] - halo_extents[dim];
       recv_offsets[1] = getPencilPtrOffset(pinfo_h, lx);
 
+      if (handle->performance_report_enable && current_sample) {
+        current_sample->sendrecv_bytes = 0;
+        for (int i = 0; i < 2; ++i) {
+          if (neighbors[i] != -1) {
+            current_sample->sendrecv_bytes += halo_size * sizeof(T);
+          }
+        }
+      }
       cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets,
-                           stream);
+                           stream, current_sample);
     }
     break;
   }
+
+  if (handle->performance_report_enable && current_sample) {
+    // Record end event
+    CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream));
+    advanceHaloPerformanceSample(handle, grid_desc, createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), padding.data(), getCudecompDataType<T>()));
+  }
 }
 
 template <typename T>
diff --git a/include/internal/performance.h b/include/internal/performance.h
index a45699c..201d569 100644
--- a/include/internal/performance.h
+++ b/include/internal/performance.h
@@ -53,19 +53,33 @@ using cudecompTransposeConfigKey = std::tuple<
   cudecompDataType_t // datatype
 >;
 
-void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
-                            int ax, int dir, size_t alltoall_bytes, cudecompPerformanceSample* current_sample);
+using cudecompHaloConfigKey = std::tuple<
+  int32_t,                // ax (axis)
+  int32_t,                // dim (dimension)
+  std::array<int32_t, 3>, // halo_extents
+  std::array<bool, 3>,    // halo_periods
+  std::array<int32_t, 3>, // padding
+  bool,                   // managed_memory
+  cudecompDataType_t      // datatype
+>;
 
 void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc);
 
 void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc);
 
-void advancePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
-                              const cudecompTransposeConfigKey& config);
+void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config);
 
-cudecompPerformanceSampleCollection& getOrCreatePerformanceSamples(const cudecompHandle_t handle,
-                                                                   cudecompGridDesc_t grid_desc,
-                                                                   const cudecompTransposeConfigKey& config);
+cudecompTransposePerformanceSampleCollection& getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle,
+                                                                                     cudecompGridDesc_t grid_desc,
+                                                                                     const cudecompTransposeConfigKey& config);
+
+void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                  const cudecompHaloConfigKey& config);
+
+cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const cudecompHandle_t handle,
+                                                                           cudecompGridDesc_t grid_desc,
+                                                                           const cudecompHaloConfigKey& config);
 
 // Helper function to create transpose configuration key
 cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output,
@@ -75,6 +89,13 @@ cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, v
                                                  const int32_t output_padding_ptr[],
                                                  cudecompDataType_t datatype);
 
+// Helper function to create halo configuration key
+cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input,
+                                       const int32_t halo_extents_ptr[],
+                                       const bool halo_periods_ptr[],
+                                       const int32_t padding_ptr[],
+                                       cudecompDataType_t datatype);
+
 } // namespace cudecomp
 
 #endif // CUDECOMP_PERFORMANCE_H
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index 2f9e806..2f45ec4 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -252,9 +252,9 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream));
   }
 
-  cudecompPerformanceSample* current_sample = nullptr;
+  cudecompTransposePerformanceSample* current_sample = nullptr;
   if (handle->performance_report_enable) {
-    auto& samples = getOrCreatePerformanceSamples(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+    auto& samples = getOrCreateTransposePerformanceSamples(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
     current_sample = &samples.samples[samples.sample_idx];
     current_sample->alltoall_timing_count = 0;
     current_sample->alltoall_bytes = pinfo_a.size * sizeof(T);
@@ -276,7 +276,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
       if (handle->performance_report_enable) {
         // Record performance data
         CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-        advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+        advanceTransposePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
       }
       return;
     }
@@ -545,7 +545,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     // o1 is output. Return.
    if (handle->performance_report_enable) {
       CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-      advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+      advanceTransposePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
     }
     return;
   }
@@ -822,7 +822,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   if (handle->performance_report_enable) {
     // Record performance data
     CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-    advancePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+    advanceTransposePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
   }
 }
diff --git a/src/autotune.cc b/src/autotune.cc
index facb1ce..2470ebe 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -694,6 +694,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
     if (haloBackendRequiresNvshmem(comm)) { w = work_nvshmem; }
 #endif
 
+    // Reset performance samples
+    resetPerformanceSamples(handle, grid_desc);
+
     // Warmup
     for (int i = 0; i < options->n_warmup_trials; ++i) {
       for (int dim = 0; dim < 3; ++dim) {
@@ -754,6 +757,12 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
         }
       }
     }
+
+    // Print performance report for this configuration if enabled
+    if (handle->performance_report_enable && !skip_case) {
+      printFinalPerformanceReport(handle, grid_desc);
+    }
+
     auto times = processTimings(handle, trial_times, 1000.);
 
     if (handle->rank == 0) {
@@ -834,6 +843,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   CHECK_MPI(MPI_Barrier(handle->mpi_comm));
   double t_end = MPI_Wtime();
   if (handle->rank == 0) printf("CUDECOMP: halo autotuning time [s]: %f\n", t_end - t_start);
+
+  // Reset performance samples after autotuning
+  resetPerformanceSamples(handle, grid_desc);
 }
 
 } // namespace cudecomp
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 6d53634..8ed03f2 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -330,8 +330,8 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
 #endif
   }
 
-  // Check CUDECOMP_ENABLE_PERFORMANCE_REPORTING (Performance reporting)
-  const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORTING");
+  // Check CUDECOMP_ENABLE_PERFORMANCE_REPORT (Performance reporting)
+  const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORT");
   if (performance_report_str) { handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1; }
 
   // Check CUDECOMP_PERFORMANCE_REPORT_DETAIL (Performance report detail level)
@@ -349,7 +349,7 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
   const char* performance_samples_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES");
   if (performance_samples_str) {
     int32_t samples = std::strtol(performance_samples_str, nullptr, 10);
-    if (samples > 0 && samples <= 1000) { // Reasonable bounds
+    if (samples > 0) { // Only require positive values
      handle->performance_report_samples = samples;
     } else if (handle->rank == 0) {
       printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_SAMPLES value (%d). Using default (%d).\n",
@@ -361,7 +361,7 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
   const char* performance_warmup_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES");
   if (performance_warmup_str) {
     int32_t warmup_samples = std::strtol(performance_warmup_str, nullptr, 10);
-    if (warmup_samples >= 0 && warmup_samples <= 100) { // Reasonable bounds
+    if (warmup_samples >= 0) { // Only require non-negative values
       handle->performance_report_warmup_samples = warmup_samples;
     } else if (handle->rank == 0) {
       printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES value (%d). Using default (%d).\n",
@@ -767,8 +767,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
     // Print final performance report before destroying events
     printFinalPerformanceReport(handle, grid_desc);
 
-    // Destroy all performance sample events in the map
-    for (auto& entry : grid_desc->perf_samples_map) {
+    // Destroy all transpose performance sample events in the map
+    for (auto& entry : grid_desc->transpose_perf_samples_map) {
       auto& collection = entry.second;
       for (auto& sample : collection.samples) {
         CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event));
@@ -777,6 +777,17 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
         for (auto& event : sample.alltoall_end_events) { CHECK_CUDA(cudaEventDestroy(event)); }
       }
     }
+
+    // Destroy all halo performance sample events in the map
+    for (auto& entry : grid_desc->halo_perf_samples_map) {
+      auto& collection = entry.second;
+      for (auto& sample : collection.samples) {
+        CHECK_CUDA(cudaEventDestroy(sample.halo_start_event));
+        CHECK_CUDA(cudaEventDestroy(sample.halo_end_event));
+        CHECK_CUDA(cudaEventDestroy(sample.sendrecv_start_event));
+        CHECK_CUDA(cudaEventDestroy(sample.sendrecv_end_event));
+      }
+    }
   }
 
   if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
diff --git a/src/performance.cc b/src/performance.cc
index 0402526..848a92d 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -75,15 +75,40 @@ cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, v
                          input_padding, output_padding, inplace, managed_memory, datatype);
 }
 
-// Helper function to get or create performance sample collection for a configuration
-cudecompPerformanceSampleCollection& getOrCreatePerformanceSamples(const cudecompHandle_t handle,
-                                                                   cudecompGridDesc_t grid_desc,
-                                                                   const cudecompTransposeConfigKey& config) {
-  auto& samples_map = grid_desc->perf_samples_map;
+// Helper function to create halo configuration key
+cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input,
+                                       const int32_t halo_extents_ptr[],
+                                       const bool halo_periods_ptr[],
+                                       const int32_t padding_ptr[],
+                                       cudecompDataType_t datatype) {
+  std::array<int32_t, 3> halo_extents{0, 0, 0};
+  std::array<bool, 3> halo_periods{false, false, false};
+  std::array<int32_t, 3> padding{0, 0, 0};
+
+  if (halo_extents_ptr) {
+    std::copy(halo_extents_ptr, halo_extents_ptr + 3, halo_extents.begin());
+  }
+  if (halo_periods_ptr) {
+    std::copy(halo_periods_ptr, halo_periods_ptr + 3, halo_periods.begin());
+  }
+  if (padding_ptr) {
+    std::copy(padding_ptr, padding_ptr + 3, padding.begin());
+  }
+
+  bool managed_memory = isManagedPointer(input);
+
+  return std::make_tuple(ax, dim, halo_extents, halo_periods, padding, managed_memory, datatype);
+}
+
+// Helper function to get or create transpose performance sample collection for a configuration
+cudecompTransposePerformanceSampleCollection& getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle,
+                                                                                     cudecompGridDesc_t grid_desc,
+                                                                                     const cudecompTransposeConfigKey& config) {
+  auto& samples_map = grid_desc->transpose_perf_samples_map;
 
   if (samples_map.find(config) == samples_map.end()) {
     // Create new sample collection for this configuration
-    cudecompPerformanceSampleCollection collection;
+    cudecompTransposePerformanceSampleCollection collection;
     collection.samples.resize(handle->performance_report_samples);
     collection.sample_idx = 0;
 
@@ -108,6 +133,33 @@ cudecompPerformanceSampleCollection& getOrCreatePerformanceSamples(const cudecom
   return samples_map[config];
 }
 
+// Helper function to get or create halo performance sample collection for a configuration
+cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const cudecompHandle_t handle,
+                                                                           cudecompGridDesc_t grid_desc,
+                                                                           const cudecompHaloConfigKey& config) {
+  auto& samples_map = grid_desc->halo_perf_samples_map;
+
+  if (samples_map.find(config) == samples_map.end()) {
+    // Create new sample collection for this configuration
+    cudecompHaloPerformanceSampleCollection collection;
+    collection.samples.resize(handle->performance_report_samples);
+    collection.sample_idx = 0;
+
+    // Create events for each sample
+    for (auto& sample : collection.samples) {
+      CHECK_CUDA(cudaEventCreate(&sample.halo_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.halo_end_event));
+      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_end_event));
+      sample.valid = false;
+    }
+
+    samples_map[config] = std::move(collection);
+  }
+
+  return samples_map[config];
+}
+
 // Helper function to format array as compact string
 std::string formatArray(const std::array<int32_t, 3>& arr) {
   std::ostringstream oss;
@@ -115,8 +167,8 @@ std::string formatArray(const std::array<int32_t, 3>& arr) {
   return oss.str();
 }
 
-// Helper function to get operation name from config
-std::string getOperationName(const cudecompTransposeConfigKey& config) {
+// Helper function to get operation name from transpose config
+std::string getTransposeOperationName(const cudecompTransposeConfigKey& config) {
   int ax = std::get<0>(config);
   int dir = std::get<1>(config);
 
@@ -132,6 +184,20 @@ std::string getOperationName(const cudecompTransposeConfigKey& config) {
   return "Unknown";
 }
 
+// Helper function to get operation name from halo config
+std::string getHaloOperationName(const cudecompHaloConfigKey& config) {
+  int ax = std::get<0>(config);
+
+  if (ax == 0) {
+    return "HaloX";
+  } else if (ax == 1) {
+    return "HaloY";
+  } else if (ax == 2) {
+    return "HaloZ";
+  }
+  return "Unknown";
+}
+
 // Helper function to convert datatype to string
 std::string getDatatypeString(cudecompDataType_t datatype) {
   switch (datatype) {
@@ -143,8 +209,8 @@ std::string getDatatypeString(cudecompDataType_t datatype) {
   }
 }
 
-// Helper structure for statistics
-struct PerformanceStats {
+// Helper structure for transpose statistics
+struct TransposePerformanceStats {
   std::string operation;
   std::string datatype;
   std::string halos; // Combined input/output halos
@@ -158,9 +224,25 @@ struct PerformanceStats {
   float alltoall_bw_avg;
 };
 
-// Helper structure to hold pre-computed timing data
-struct ConfigTimingData {
-  PerformanceStats stats;
+// Helper structure for halo statistics
+struct HaloPerformanceStats {
+  std::string operation;
+  std::string datatype;
+  int dim;
+  std::string halos;
+  std::string periods;
+  std::string padding;
+  std::string managed;
+  int samples;
+  float total_time_avg;
+  float sendrecv_time_avg;
+  float local_time_avg;
+  float sendrecv_bw_avg;
+};
+
+// Helper structure to hold pre-computed transpose timing data
+struct TransposeConfigTimingData {
+  TransposePerformanceStats stats;
   std::vector<float> total_times;
   std::vector<float> alltoall_times;
   std::vector<float> local_times;
@@ -168,18 +250,28 @@ struct ConfigTimingData {
   std::vector<int> sample_indices;
 };
 
+// Helper structure to hold pre-computed halo timing data
+struct HaloConfigTimingData {
+  HaloPerformanceStats stats;
+  std::vector<float> total_times;
+  std::vector<float> sendrecv_times;
+  std::vector<float> local_times;
+  std::vector<float> sendrecv_bws;
+  std::vector<int> sample_indices;
+};
+
 void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) {
   // Synchronize to ensure all events are recorded
   CHECK_CUDA(cudaDeviceSynchronize());
 
-  // Collect all statistics and timing data
-  std::vector<ConfigTimingData> all_config_data;
+  // Collect all transpose statistics and timing data
+  std::vector<TransposeConfigTimingData> all_transpose_config_data;
 
-  for (const auto& entry : grid_desc->perf_samples_map) {
+  for (const auto& entry : grid_desc->transpose_perf_samples_map) {
     const auto& config = entry.first;
     const auto& collection = entry.second;
 
-    ConfigTimingData config_data;
+    TransposeConfigTimingData config_data;
 
     // Collect valid samples and compute elapsed times once
     for (int i = 0; i < collection.samples.size(); ++i) {
@@ -208,8 +300,8 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
     if (config_data.total_times.empty()) continue;
 
     // Prepare aggregated statistics
-    PerformanceStats& stats = config_data.stats;
-    stats.operation = getOperationName(config);
+    TransposePerformanceStats& stats = config_data.stats;
+    stats.operation = getTransposeOperationName(config);
     stats.datatype = getDatatypeString(std::get<8>(config));
 
     // Format combined halos and padding
@@ -240,7 +332,76 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
     stats.local_time_avg /= handle->nranks;
     stats.alltoall_bw_avg /= handle->nranks;
 
-    all_config_data.push_back(std::move(config_data));
+    all_transpose_config_data.push_back(std::move(config_data));
+  }
+
+  // Collect all halo statistics and timing data
+  std::vector<HaloConfigTimingData> all_halo_config_data;
+
+  for (const auto& entry : grid_desc->halo_perf_samples_map) {
+    const auto& config = entry.first;
+    const auto& collection = entry.second;
+
+    HaloConfigTimingData config_data;
+
+    // Collect valid samples and compute elapsed times once
+    for (int i = 0; i < collection.samples.size(); ++i) {
+      const auto& sample = collection.samples[i];
+      if (!sample.valid) continue;
+
+      float sendrecv_timing_ms = 0.0f;
+      if (sample.sendrecv_bytes > 0) {
+        CHECK_CUDA(cudaEventElapsedTime(&sendrecv_timing_ms, sample.sendrecv_start_event, sample.sendrecv_end_event));
+      }
+
+      float halo_timing_ms;
+      CHECK_CUDA(cudaEventElapsedTime(&halo_timing_ms, sample.halo_start_event, sample.halo_end_event));
+
+      config_data.total_times.push_back(halo_timing_ms);
+      config_data.sendrecv_times.push_back(sendrecv_timing_ms);
+      config_data.local_times.push_back(halo_timing_ms - sendrecv_timing_ms);
+
+      float sendrecv_bw = (sendrecv_timing_ms > 0) ? sample.sendrecv_bytes * 1e-6 / sendrecv_timing_ms : 0;
+      config_data.sendrecv_bws.push_back(sendrecv_bw);
+      config_data.sample_indices.push_back(i);
+    }
+
+    if (config_data.total_times.empty()) continue;
+
+    // Prepare aggregated statistics
+    HaloPerformanceStats& stats = config_data.stats;
+    stats.operation = getHaloOperationName(config);
+    stats.datatype = getDatatypeString(std::get<6>(config));
+    stats.dim = std::get<1>(config);
+
+    // Format halo extents, periods, and padding
+    auto halo_extents = std::get<2>(config);
+    auto halo_periods = std::get<3>(config);
+    auto padding = std::get<4>(config);
+
+    stats.halos = formatArray(halo_extents);
+    stats.periods = "[" + std::to_string(halo_periods[0]) + "," + std::to_string(halo_periods[1]) + "," + std::to_string(halo_periods[2]) + "]";
+    stats.padding = formatArray(padding);
+    stats.managed = std::get<5>(config) ? "Y" : "N";
+    stats.samples = config_data.total_times.size();
+
+    // Compute average statistics across all ranks
+    stats.total_time_avg = std::accumulate(config_data.total_times.begin(), config_data.total_times.end(), 0.0f) / config_data.total_times.size();
+    stats.sendrecv_time_avg = std::accumulate(config_data.sendrecv_times.begin(), config_data.sendrecv_times.end(), 0.0f) / config_data.sendrecv_times.size();
+    stats.local_time_avg = std::accumulate(config_data.local_times.begin(), config_data.local_times.end(), 0.0f) / config_data.local_times.size();
+    stats.sendrecv_bw_avg = std::accumulate(config_data.sendrecv_bws.begin(), config_data.sendrecv_bws.end(), 0.0f) / config_data.sendrecv_bws.size();
+
+    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.total_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm));
+    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.sendrecv_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm));
+    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.local_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm));
+    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.sendrecv_bw_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm));
+
+    stats.total_time_avg /= handle->nranks;
+    stats.sendrecv_time_avg /= handle->nranks;
+    stats.local_time_avg /= handle->nranks;
+    stats.sendrecv_bw_avg /= handle->nranks;
+
+    all_halo_config_data.push_back(std::move(config_data));
   }
 
   // Print summary information on rank 0 only
@@ -251,6 +412,8 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
     printf("CUDECOMP: Grid Configuration:\n");
     printf("CUDECOMP:\tTranspose backend: %s\n",
            cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend));
+    printf("CUDECOMP:\tHalo backend: %s\n",
+           cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend));
     printf("CUDECOMP:\tProcess grid: [%d, %d]\n",
            grid_desc->config.pdims[0], grid_desc->config.pdims[1]);
     printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n",
@@ -266,43 +429,84 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
     }
     printf("\n");
 
-    printf("CUDECOMP:\n");
-    printf("CUDECOMP: Transpose Performance Data:\n");
     printf("CUDECOMP:\n");
 
-    if (all_config_data.empty()) {
+    if (all_transpose_config_data.empty() && all_halo_config_data.empty()) {
       printf("CUDECOMP: No performance data collected\n");
       printf("CUDECOMP: ================================\n");
       return;
     }
 
-    // Print compact table header
-    printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-           "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples",
-           "total", "A2A", "local", "A2A BW");
-    printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-           "", "", "", "", "", "", "",
-           "[ms]", "[ms]", "[ms]", "[GB/s]");
-    printf("CUDECOMP: ");
-    for (int i = 0; i < 120; ++i) printf("-");
-    printf("\n");
+    // Print transpose performance data
+    if (!all_transpose_config_data.empty()) {
+      printf("CUDECOMP: Transpose Performance Data:\n");
+      printf("CUDECOMP:\n");
 
-    // Print table rows
-    for (const auto& config_data : all_config_data) {
-      const auto& stats = config_data.stats;
-      printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
-             stats.operation.c_str(),
-             stats.datatype.c_str(),
-             stats.halos.c_str(),
-             stats.padding.c_str(),
-             stats.inplace.c_str(),
-             stats.managed.c_str(),
-             stats.samples,
-             stats.total_time_avg,
-             stats.alltoall_time_avg,
-             stats.local_time_avg,
-             stats.alltoall_bw_avg
-      );
+      // Print compact table header
+      printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
+             "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples",
+             "total", "A2A", "local", "A2A BW");
+      printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
+             "", "", "", "", "", "", "",
+             "[ms]", "[ms]", "[ms]", "[GB/s]");
+      printf("CUDECOMP: ");
+      for (int i = 0; i < 120; ++i) printf("-");
+      printf("\n");
+
+      // Print table rows
+      for (const auto& config_data : all_transpose_config_data) {
+        const auto& stats = config_data.stats;
+        printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
+               stats.operation.c_str(),
+               stats.datatype.c_str(),
+               stats.halos.c_str(),
+               stats.padding.c_str(),
+               stats.inplace.c_str(),
+               stats.managed.c_str(),
+               stats.samples,
+               stats.total_time_avg,
+               stats.alltoall_time_avg,
+               stats.local_time_avg,
+               stats.alltoall_bw_avg
+        );
+      }
+    }
+
+    // Print halo performance data
+    if (!all_halo_config_data.empty()) {
+      printf("CUDECOMP:\n");
+      printf("CUDECOMP: Halo Performance Data:\n");
+      printf("CUDECOMP:\n");
+
+      // Print compact table header
+      printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n",
+             "operation", "dtype", "dim", "halo extent", "periods", "padding", "managed", "samples",
+             "total", "SR", "local", "SR BW");
+      printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n",
+             "", "", "", "", "", "", "", "",
+             "[ms]", "[ms]", "[ms]", "[GB/s]");
+      printf("CUDECOMP: ");
+      for (int i = 0; i < 125; ++i) printf("-");
+      printf("\n");
+
+      // Print table rows
+      for (const auto& config_data : all_halo_config_data) {
+        const auto& stats = config_data.stats;
+        printf("CUDECOMP: %-12s %-6s %-5d %-12s %-12s %-12s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
+               stats.operation.c_str(),
+               stats.datatype.c_str(),
+               stats.dim,
+               stats.halos.c_str(),
+               stats.periods.c_str(),
+               stats.padding.c_str(),
+               stats.managed.c_str(),
+               stats.samples,
+               stats.total_time_avg,
+               stats.sendrecv_time_avg,
+               stats.local_time_avg,
+               stats.sendrecv_bw_avg
+        );
+      }
     }
   }
 
@@ -314,7 +518,7 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
       printf("CUDECOMP:\n");
     }
 
-    for (const auto& config_data : all_config_data) {
+    for (const auto& config_data : all_transpose_config_data) {
       const auto& stats = config_data.stats;
 
       // Print configuration header on rank 0
@@ -409,6 +613,104 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr
       printf("CUDECOMP:\n");
     }
   }
 
+  // Print halo per-sample details
+  for (const auto& config_data : all_halo_config_data) {
+    const auto& stats = config_data.stats;
config_data.stats; + + // Print configuration header on rank 0 + if (handle->rank == 0) { + printf("CUDECOMP: %s (dtype=%s, dim=%d, halos=%s, periods=%s, padding=%s, managed=%s) samples:\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.dim, + stats.halos.c_str(), + stats.periods.c_str(), + stats.padding.c_str(), + stats.managed.c_str()); + } + + const auto& total_times = config_data.total_times; + const auto& sendrecv_times = config_data.sendrecv_times; + const auto& local_times = config_data.local_times; + const auto& sendrecv_bws = config_data.sendrecv_bws; + const auto& sample_indices = config_data.sample_indices; + + if (total_times.empty()) continue; + + if (handle->performance_report_detail == 1) { + // Print per-sample data for rank 0 only + if (handle->rank == 0) { + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "rank", "sample", "total", "SR", "local", "SR BW"); + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); + + for (int i = 0; i < total_times.size(); ++i) { + printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", + handle->rank, sample_indices[i], total_times[i], sendrecv_times[i], + local_times[i], sendrecv_bws[i]); + } + } + } else if (handle->performance_report_detail == 2) { + // Gather data from all ranks to rank 0 + // Note: We assume all entries have the same number of samples per rank + int num_samples = total_times.size(); + + if (handle->rank == 0) { + // Use MPI_Gather instead of MPI_Gatherv since all ranks have the same number of samples + std::vector all_total_times(num_samples * handle->nranks); + std::vector all_sendrecv_times(num_samples * handle->nranks); + std::vector all_local_times(num_samples * handle->nranks); + std::vector all_sendrecv_bws(num_samples * handle->nranks); + std::vector all_sample_indices(num_samples * handle->nranks); + + CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, + all_total_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sendrecv_times.data(), num_samples, MPI_FLOAT, + all_sendrecv_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT, + all_local_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sendrecv_bws.data(), num_samples, MPI_FLOAT, + all_sendrecv_bws.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT, + all_sample_indices.data(), num_samples, MPI_INT, 0, handle->mpi_comm)); + + // Print header + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "rank", "sample", "total", "SR", "local", "SR BW"); + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); + + // Print data sorted by rank + for (int r = 0; r < handle->nranks; ++r) { + for (int s = 0; s < num_samples; ++s) { + int idx = r * num_samples + s; + printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", + r, all_sample_indices[idx], all_total_times[idx], + all_sendrecv_times[idx], all_local_times[idx], + all_sendrecv_bws[idx]); + } + } + } else { + // Non-rank-0 processes just send their data + CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, + nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sendrecv_times.data(), num_samples, MPI_FLOAT, + nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT, + 
nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sendrecv_bws.data(), num_samples, MPI_FLOAT, + nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); + CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT, + nullptr, num_samples, MPI_INT, 0, handle->mpi_comm)); + } + } + + if (handle->rank == 0) { + printf("CUDECOMP:\n"); + } + } } if (handle->rank == 0) { @@ -419,8 +721,8 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) { if (!handle->performance_report_enable) return; - // Reset all sample collections in the map - for (auto& entry : grid_desc->perf_samples_map) { + // Reset all transpose sample collections in the map + for (auto& entry : grid_desc->transpose_perf_samples_map) { auto& collection = entry.second; collection.sample_idx = 0; collection.warmup_count = 0; @@ -432,14 +734,45 @@ void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t g sample.alltoall_bytes = 0; } } + + // Reset all halo sample collections in the map + for (auto& entry : grid_desc->halo_perf_samples_map) { + auto& collection = entry.second; + collection.sample_idx = 0; + collection.warmup_count = 0; + + // Mark all samples as invalid and reset counters + for (auto& sample : collection.samples) { + sample.valid = false; + sample.sendrecv_bytes = 0; + } + } +} + +// Helper function to advance transpose sample index with warmup handling +void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, + const cudecompTransposeConfigKey& config) { + if (!handle->performance_report_enable) return; + + auto& collection = getOrCreateTransposePerformanceSamples(handle, grid_desc, config); + + // Check if we're still in warmup phase + if (collection.warmup_count < handle->performance_report_warmup_samples) { + collection.warmup_count++; + // During warmup, don't advance the circular buffer, just mark current sample as invalid + collection.samples[collection.sample_idx].valid = false; + } else { + // Past warmup, advance the circular buffer normally + collection.sample_idx = (collection.sample_idx + 1) % handle->performance_report_samples; + } } -// Helper function to advance sample index with warmup handling -void advancePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, - const cudecompTransposeConfigKey& config) { +// Helper function to advance halo sample index with warmup handling +void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, + const cudecompHaloConfigKey& config) { if (!handle->performance_report_enable) return; - auto& collection = getOrCreatePerformanceSamples(handle, grid_desc, config); + auto& collection = getOrCreateHaloPerformanceSamples(handle, grid_desc, config); // Check if we're still in warmup phase if (collection.warmup_count < handle->performance_report_warmup_samples) { diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc index 69e9dbc..895fe86 100644 --- a/tests/cc/halo_test.cc +++ b/tests/cc/halo_test.cc @@ -587,6 +587,21 @@ int main(int argc, char** argv) { } // Finalize + // Free grid descriptors + for (auto& entry : grid_desc_cache) { + auto& backend = entry.first; + auto& grid_desc = entry.second; + // If backend matches workspace, free workspace + if (std::get<0>(workspace) == static_cast(backend)) { + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, std::get<1>(workspace))); + 
std::get<0>(workspace) = -1; + std::get<1>(workspace) = nullptr; + std::get<2>(workspace) = 0; + } + CHECK_CUDECOMP(cudecompGridDescDestroy(handle, grid_desc)); + } + grid_desc_cache.clear(); + CHECK_CUDECOMP_EXIT(cudecompFinalize(handle)); CHECK_MPI_EXIT(MPI_Finalize()); diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc index f9ad62c..c217a3c 100644 --- a/tests/cc/transpose_test.cc +++ b/tests/cc/transpose_test.cc @@ -651,6 +651,21 @@ int main(int argc, char** argv) { } // Finalize + // Free grid descriptors + for (auto& entry : grid_desc_cache) { + auto& backend = entry.first; + auto& grid_desc = entry.second; + // Free workspace using correct grid descriptor + if (std::get<0>(workspace) == static_cast(backend)) { + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, std::get<1>(workspace))); + std::get<0>(workspace) = -1; + std::get<1>(workspace) = nullptr; + std::get<2>(workspace) = 0; + } + CHECK_CUDECOMP(cudecompGridDescDestroy(handle, grid_desc)); + } + grid_desc_cache.clear(); + CHECK_CUDECOMP_EXIT(cudecompFinalize(handle)); CHECK_MPI_EXIT(MPI_Finalize()); diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90 index 9e67e1c..ac816c3 100644 --- a/tests/fortran/halo_test.f90 +++ b/tests/fortran/halo_test.f90 @@ -630,6 +630,17 @@ program main if (nfailed /= 0) retcode = 1; endif + ! Free grid descriptors + do i = 1, 5 + if (grid_desc_cache_set(i)) then + ! Free workspace with correct grid descriptor + if (work_backend == i) then + CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc_cache(i), work_d)) + endif + CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc_cache(i))) + endif + end do + CHECK_CUDECOMP_EXIT(cudecompFinalize(handle)) call MPI_Finalize(ierr) diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index 02810df..57caf7c 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -654,6 +654,17 @@ program main if (nfailed /= 0) retcode = 1; endif + ! Free grid descriptors + do i = 1, 7 + if (grid_desc_cache_set(i)) then + ! Free workspace with correct grid descriptor + if (work_backend == i) then + CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc_cache(i), work_d)) + endif + CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc_cache(i))) + endif + end do + CHECK_CUDECOMP_EXIT(cudecompFinalize(handle)) call MPI_Finalize(ierr) From aab4b539d4cdc9d4a0681765852c5c154f38f3d5 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 8 Jul 2025 16:01:53 -0700 Subject: [PATCH 05/13] Some formatting changes. 
--- include/internal/performance.h | 2 +- src/autotune.cc | 20 ++++++++++---------- src/cudecomp.cc | 4 ++-- src/performance.cc | 5 ++++- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/include/internal/performance.h b/include/internal/performance.h index 201d569..0972ee0 100644 --- a/include/internal/performance.h +++ b/include/internal/performance.h @@ -63,7 +63,7 @@ using cudecompHaloConfigKey = std::tuple< cudecompDataType_t // datatype >; -void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc); +void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc); void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc); diff --git a/src/autotune.cc b/src/autotune.cc index 2470ebe..d59f33b 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -396,11 +396,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d // Clear CUDA graph cache between backend/process decomposition pairs grid_desc->graph_cache.clear(); - // Print performance report for this configuration if enabled - if (handle->performance_report_enable && !skip_case) { - printFinalPerformanceReport(handle, grid_desc); - } - auto times = processTimings(handle, trial_times); auto times_w = processTimings(handle, trial_times_w); auto xy_times = processTimings(handle, trial_xy_times); @@ -440,6 +435,11 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d } } + // Print performance report for this configuration if enabled + if (handle->performance_report_enable && !skip_case) { + printPerformanceReport(handle, grid_desc); + } + if (skip_case) continue; if (times_w[2] < t_best) { @@ -758,11 +758,6 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, } } - // Print performance report for this configuration if enabled - if (handle->performance_report_enable && !skip_case) { - printFinalPerformanceReport(handle, grid_desc); - } - auto times = processTimings(handle, trial_times, 1000.); if (handle->rank == 0) { @@ -780,6 +775,11 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, } } + // Print performance report for this configuration if enabled + if (handle->performance_report_enable && !skip_case) { + printPerformanceReport(handle, grid_desc); + } + if (skip_case) continue; if (times[2] < t_best) { diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 8ed03f2..814b19a 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -764,8 +764,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe // Destroy timing events for AlltoAll operations if (handle->performance_report_enable) { - // Print final performance report before destroying events - printFinalPerformanceReport(handle, grid_desc); + // Print performance report before destroying events + printPerformanceReport(handle, grid_desc); // Destroy all transpose performance sample events in the map for (auto& entry : grid_desc->transpose_perf_samples_map) { diff --git a/src/performance.cc b/src/performance.cc index 848a92d..bfd9f16 100644 --- a/src/performance.cc +++ b/src/performance.cc @@ -260,7 +260,7 @@ struct HaloConfigTimingData { std::vector sample_indices; }; -void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) { +void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) { // Synchronize to ensure all events are recorded 
CHECK_CUDA(cudaDeviceSynchronize()); @@ -406,6 +406,7 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr // Print summary information on rank 0 only if (handle->rank == 0) { + printf("CUDECOMP:\n"); printf("CUDECOMP: ===== Performance Summary =====\n"); // Print grid descriptor configuration information @@ -434,6 +435,7 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr if (all_transpose_config_data.empty() && all_halo_config_data.empty()) { printf("CUDECOMP: No performance data collected\n"); printf("CUDECOMP: ================================\n"); + printf("CUDECOMP:\n"); return; } @@ -715,6 +717,7 @@ void printFinalPerformanceReport(const cudecompHandle_t handle, const cudecompGr if (handle->rank == 0) { printf("CUDECOMP: ================================\n"); + printf("CUDECOMP:\n"); } } From 4c340d0fadf88a238a6f8c8a1c8c38b229d7e566 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 8 Jul 2025 22:34:15 -0700 Subject: [PATCH 06/13] Apply fixed sorting to performance report entries. --- src/performance.cc | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/performance.cc b/src/performance.cc index bfd9f16..20606b1 100644 --- a/src/performance.cc +++ b/src/performance.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -260,6 +261,71 @@ struct HaloConfigTimingData { std::vector sample_indices; }; +// Custom comparison functions for consistent ordering +bool compareTransposeConfigData(const TransposeConfigTimingData& a, + const TransposeConfigTimingData& b) { + static const std::map op_priority = { + {"TransposeXY", 0}, + {"TransposeYZ", 1}, + {"TransposeZY", 2}, + {"TransposeYX", 3} + }; + + static const std::map dtype_priority = { + {"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3} + }; + + if (a.stats.operation != b.stats.operation) { + return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation); + } + if (a.stats.datatype != b.stats.datatype) { + return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype); + } + if (a.stats.halos != b.stats.halos) { + return a.stats.halos < b.stats.halos; + } + if (a.stats.padding != b.stats.padding) { + return a.stats.padding < b.stats.padding; + } + if (a.stats.inplace != b.stats.inplace) { + return a.stats.inplace < b.stats.inplace; + } + return a.stats.managed < b.stats.managed; +} + +bool compareHaloConfigData(const HaloConfigTimingData& a, + const HaloConfigTimingData& b) { + static const std::map op_priority = { + {"HaloX", 0}, + {"HaloY", 1}, + {"HaloZ", 2} + }; + + static const std::map dtype_priority = { + {"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3} + }; + + if (a.stats.operation != b.stats.operation) { + return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation); + } + if (a.stats.dim != b.stats.dim) { + return a.stats.dim < b.stats.dim; + } + if (a.stats.datatype != b.stats.datatype) { + return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype); + } + if (a.stats.halos != b.stats.halos) { + return a.stats.halos < b.stats.halos; + } + if (a.stats.periods != b.stats.periods) { + return a.stats.periods < b.stats.periods; + } + if (a.stats.padding != b.stats.padding) { + return a.stats.padding < b.stats.padding; + } + return a.stats.managed < b.stats.managed; +} + void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) { // Synchronize to ensure all events are recorded 
CHECK_CUDA(cudaDeviceSynchronize()); @@ -335,6 +401,9 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes all_transpose_config_data.push_back(std::move(config_data)); } + // Sort transpose configuration data for consistent ordering + std::sort(all_transpose_config_data.begin(), all_transpose_config_data.end(), compareTransposeConfigData); + // Collect all halo statistics and timing data std::vector all_halo_config_data; @@ -404,6 +473,9 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes all_halo_config_data.push_back(std::move(config_data)); } + // Sort halo configuration data for consistent ordering + std::sort(all_halo_config_data.begin(), all_halo_config_data.end(), compareHaloConfigData); + // Print summary information on rank 0 only if (handle->rank == 0) { printf("CUDECOMP:\n"); From 43cefd50960dc983adca1156eca877759916992b Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 9 Jul 2025 09:59:52 -0700 Subject: [PATCH 07/13] Refactoring and cleanup. --- include/internal/performance.h | 75 ++- src/cudecomp.cc | 1 - src/performance.cc | 855 ++++++++++++++++----------------- 3 files changed, 470 insertions(+), 461 deletions(-) diff --git a/include/internal/performance.h b/include/internal/performance.h index 0972ee0..fa4c1f7 100644 --- a/include/internal/performance.h +++ b/include/internal/performance.h @@ -63,16 +63,67 @@ using cudecompHaloConfigKey = std::tuple< cudecompDataType_t // datatype >; +// Helper structure for transpose statistics +struct TransposePerformanceStats { + std::string operation; + std::string datatype; + std::string halos; + std::string padding; + std::string inplace; + std::string managed; + int samples; + float total_time_avg; + float alltoall_time_avg; + float local_time_avg; + float alltoall_bw_avg; +}; + +// Helper structure for halo statistics +struct HaloPerformanceStats { + std::string operation; + std::string datatype; + int dim; + std::string halos; + std::string periods; + std::string padding; + std::string managed; + int samples; + float total_time_avg; + float sendrecv_time_avg; + float local_time_avg; + float sendrecv_bw_avg; +}; + +// Helper structure to hold pre-computed transpose timing data +struct TransposeConfigTimingData { + TransposePerformanceStats stats; + std::vector total_times; + std::vector alltoall_times; + std::vector local_times; + std::vector alltoall_bws; + std::vector sample_indices; +}; + +// Helper structure to hold pre-computed halo timing data +struct HaloConfigTimingData { + HaloPerformanceStats stats; + std::vector total_times; + std::vector sendrecv_times; + std::vector local_times; + std::vector sendrecv_bws; + std::vector sample_indices; +}; + void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc); void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc); void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, - const cudecompTransposeConfigKey& config); + const cudecompTransposeConfigKey& config); cudecompTransposePerformanceSampleCollection& getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, - cudecompGridDesc_t grid_desc, - const cudecompTransposeConfigKey& config); + cudecompGridDesc_t grid_desc, + const cudecompTransposeConfigKey& config); void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, const cudecompHaloConfigKey& config); @@ -83,18 +134,18 @@ 
cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const // Helper function to create transpose configuration key cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, - const int32_t input_halo_extents_ptr[], - const int32_t output_halo_extents_ptr[], - const int32_t input_padding_ptr[], - const int32_t output_padding_ptr[], - cudecompDataType_t datatype); + const int32_t input_halo_extents_ptr[], + const int32_t output_halo_extents_ptr[], + const int32_t input_padding_ptr[], + const int32_t output_padding_ptr[], + cudecompDataType_t datatype); // Helper function to create halo configuration key cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, - const int32_t halo_extents_ptr[], - const bool halo_periods_ptr[], - const int32_t padding_ptr[], - cudecompDataType_t datatype); + const int32_t halo_extents_ptr[], + const bool halo_periods_ptr[], + const int32_t padding_ptr[], + cudecompDataType_t datatype); } // namespace cudecomp diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 814b19a..fa428e8 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -762,7 +762,6 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); } #endif - // Destroy timing events for AlltoAll operations if (handle->performance_report_enable) { // Print performance report before destroying events printPerformanceReport(handle, grid_desc); diff --git a/src/performance.cc b/src/performance.cc index 20606b1..99eab36 100644 --- a/src/performance.cc +++ b/src/performance.cc @@ -44,7 +44,7 @@ namespace cudecomp { -// Helper function to create transpose configuration key (no longer template) +// Helper function to create transpose configuration key cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, const int32_t input_halo_extents_ptr[], const int32_t output_halo_extents_ptr[], @@ -103,8 +103,8 @@ cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, // Helper function to get or create transpose performance sample collection for a configuration cudecompTransposePerformanceSampleCollection& getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, - cudecompGridDesc_t grid_desc, - const cudecompTransposeConfigKey& config) { + cudecompGridDesc_t grid_desc, + const cudecompTransposeConfigKey& config) { auto& samples_map = grid_desc->transpose_perf_samples_map; if (samples_map.find(config) == samples_map.end()) { @@ -161,13 +161,20 @@ cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const return samples_map[config]; } -// Helper function to format array as compact string +// Helper function to format integer array as compact string std::string formatArray(const std::array& arr) { std::ostringstream oss; oss << "[" << arr[0] << "," << arr[1] << "," << arr[2] << "]"; return oss.str(); } +// Helper function to format boolean array as compact string +std::string formatArray(const std::array& arr) { + std::ostringstream oss; + oss << "[" << (arr[0] ? "1" : "0") << "," << (arr[1] ? "1" : "0") << "," << (arr[2] ? 
"1" : "0") << "]"; + return oss.str(); +} + // Helper function to get operation name from transpose config std::string getTransposeOperationName(const cudecompTransposeConfigKey& config) { int ax = std::get<0>(config); @@ -210,58 +217,8 @@ std::string getDatatypeString(cudecompDataType_t datatype) { } } -// Helper structure for transpose statistics -struct TransposePerformanceStats { - std::string operation; - std::string datatype; - std::string halos; // Combined input/output halos - std::string padding; // Combined input/output padding - std::string inplace; - std::string managed; - int samples; - float total_time_avg; - float alltoall_time_avg; - float local_time_avg; - float alltoall_bw_avg; -}; - -// Helper structure for halo statistics -struct HaloPerformanceStats { - std::string operation; - std::string datatype; - int dim; - std::string halos; - std::string periods; - std::string padding; - std::string managed; - int samples; - float total_time_avg; - float sendrecv_time_avg; - float local_time_avg; - float sendrecv_bw_avg; -}; - -// Helper structure to hold pre-computed transpose timing data -struct TransposeConfigTimingData { - TransposePerformanceStats stats; - std::vector total_times; - std::vector alltoall_times; - std::vector local_times; - std::vector alltoall_bws; - std::vector sample_indices; -}; - -// Helper structure to hold pre-computed halo timing data -struct HaloConfigTimingData { - HaloPerformanceStats stats; - std::vector total_times; - std::vector sendrecv_times; - std::vector local_times; - std::vector sendrecv_bws; - std::vector sample_indices; -}; - -// Custom comparison functions for consistent ordering +// Comparison function to order transpose configurations. +// Ordering is (operation, datatype, halos, padding, inplace, managed). bool compareTransposeConfigData(const TransposeConfigTimingData& a, const TransposeConfigTimingData& b) { static const std::map op_priority = { @@ -293,6 +250,8 @@ bool compareTransposeConfigData(const TransposeConfigTimingData& a, return a.stats.managed < b.stats.managed; } +// Comparison function to order halo configurations. +// Ordering is (operation, datatype, dim, halos, periods, padding, managed). 
bool compareHaloConfigData(const HaloConfigTimingData& a, const HaloConfigTimingData& b) { static const std::map op_priority = { @@ -326,184 +285,434 @@ bool compareHaloConfigData(const HaloConfigTimingData& a, return a.stats.managed < b.stats.managed; } -void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) { - // Synchronize to ensure all events are recorded - CHECK_CUDA(cudaDeviceSynchronize()); +// Function to compute average across ranks +float computeGlobalAverage(const std::vector& values, const cudecompHandle_t handle) { + float value = std::accumulate(values.begin(), values.end(), 0.0f); + value /= values.size(); + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &value, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); + value /= handle->nranks; + return value; +} - // Collect all transpose statistics and timing data - std::vector all_transpose_config_data; +// Function to gather data from all ranks +void gatherSampleData(const std::vector& local_data, std::vector& all_data, + const cudecompHandle_t handle) { + int num_samples = local_data.size(); + if (handle->rank == 0) { + all_data.resize(num_samples * handle->nranks); + } - for (const auto& entry : grid_desc->transpose_perf_samples_map) { - const auto& config = entry.first; - const auto& collection = entry.second; + CHECK_MPI(MPI_Gather(local_data.data(), num_samples, MPI_FLOAT, + all_data.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); +} - TransposeConfigTimingData config_data; +// Process transpose timing data from sample collections +TransposeConfigTimingData processTransposeConfig(const cudecompTransposeConfigKey& config, + const cudecompTransposePerformanceSampleCollection& collection, + const cudecompHandle_t handle) { + TransposeConfigTimingData config_data; + + config_data.total_times.reserve(collection.samples.size()); + config_data.alltoall_times.reserve(collection.samples.size()); + config_data.local_times.reserve(collection.samples.size()); + config_data.alltoall_bws.reserve(collection.samples.size()); + + // Collect valid samples and compute elapsed times + for (int i = 0; i < collection.samples.size(); ++i) { + const auto& sample = collection.samples[i]; + if (!sample.valid) continue; + + float alltoall_timing_ms = 0.0f; + for (int j = 0; j < sample.alltoall_timing_count; ++j) { + float elapsed_time; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, sample.alltoall_start_events[j], sample.alltoall_end_events[j])); + alltoall_timing_ms += elapsed_time; + } - // Collect valid samples and compute elapsed times once - for (int i = 0; i < collection.samples.size(); ++i) { - const auto& sample = collection.samples[i]; - if (!sample.valid) continue; + float transpose_timing_ms; + CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, sample.transpose_start_event, sample.transpose_end_event)); - float alltoall_timing_ms = 0.0f; - for (int j = 0; j < sample.alltoall_timing_count; ++j) { - float elapsed_time; - CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, sample.alltoall_start_events[j], sample.alltoall_end_events[j])); - alltoall_timing_ms += elapsed_time; - } + config_data.total_times.push_back(transpose_timing_ms); + config_data.alltoall_times.push_back(alltoall_timing_ms); + config_data.local_times.push_back(transpose_timing_ms - alltoall_timing_ms); - float transpose_timing_ms; - CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, sample.transpose_start_event, sample.transpose_end_event)); + float alltoall_bw = (alltoall_timing_ms > 0) ? 
sample.alltoall_bytes * 1e-6 / alltoall_timing_ms : 0; + config_data.alltoall_bws.push_back(alltoall_bw); + } - config_data.total_times.push_back(transpose_timing_ms); - config_data.alltoall_times.push_back(alltoall_timing_ms); - config_data.local_times.push_back(transpose_timing_ms - alltoall_timing_ms); + if (config_data.total_times.empty()) { + return config_data; + } - float alltoall_bw = (alltoall_timing_ms > 0) ? sample.alltoall_bytes * 1e-6 / alltoall_timing_ms : 0; - config_data.alltoall_bws.push_back(alltoall_bw); - config_data.sample_indices.push_back(i); - } + // Prepare aggregated statistics + TransposePerformanceStats& stats = config_data.stats; + stats.operation = getTransposeOperationName(config); + stats.datatype = getDatatypeString(std::get<8>(config)); - if (config_data.total_times.empty()) continue; + // Format combined halos and padding + auto input_halos = std::get<2>(config); + auto output_halos = std::get<3>(config); + auto input_padding = std::get<4>(config); + auto output_padding = std::get<5>(config); - // Prepare aggregated statistics - TransposePerformanceStats& stats = config_data.stats; - stats.operation = getTransposeOperationName(config); - stats.datatype = getDatatypeString(std::get<8>(config)); + stats.halos = formatArray(input_halos) + "/" + formatArray(output_halos); + stats.padding = formatArray(input_padding) + "/" + formatArray(output_padding); + stats.inplace = std::get<6>(config) ? "Y" : "N"; + stats.managed = std::get<7>(config) ? "Y" : "N"; + stats.samples = config_data.total_times.size(); - // Format combined halos and padding - auto input_halos = std::get<2>(config); - auto output_halos = std::get<3>(config); - auto input_padding = std::get<4>(config); - auto output_padding = std::get<5>(config); + // Compute average statistics and reduce across all ranks + stats.total_time_avg = computeGlobalAverage(config_data.total_times, handle); + stats.alltoall_time_avg = computeGlobalAverage(config_data.alltoall_times, handle); + stats.local_time_avg = computeGlobalAverage(config_data.local_times, handle); + stats.alltoall_bw_avg = computeGlobalAverage(config_data.alltoall_bws, handle); - stats.halos = formatArray(input_halos) + "/" + formatArray(output_halos); - stats.padding = formatArray(input_padding) + "/" + formatArray(output_padding); - stats.inplace = std::get<6>(config) ? "Y" : "N"; - stats.managed = std::get<7>(config) ? 
"Y" : "N"; - stats.samples = config_data.total_times.size(); + return config_data; +} - // Compute average statistics across all ranks - stats.total_time_avg = std::accumulate(config_data.total_times.begin(), config_data.total_times.end(), 0.0f) / config_data.total_times.size(); - stats.alltoall_time_avg = std::accumulate(config_data.alltoall_times.begin(), config_data.alltoall_times.end(), 0.0f) / config_data.alltoall_times.size(); - stats.local_time_avg = std::accumulate(config_data.local_times.begin(), config_data.local_times.end(), 0.0f) / config_data.local_times.size(); - stats.alltoall_bw_avg = std::accumulate(config_data.alltoall_bws.begin(), config_data.alltoall_bws.end(), 0.0f) / config_data.alltoall_bws.size(); +// Process halo timing data from sample collections +HaloConfigTimingData processHaloConfig(const cudecompHaloConfigKey& config, + const cudecompHaloPerformanceSampleCollection& collection, + const cudecompHandle_t handle) { + HaloConfigTimingData config_data; + + config_data.total_times.reserve(collection.samples.size()); + config_data.sendrecv_times.reserve(collection.samples.size()); + config_data.local_times.reserve(collection.samples.size()); + config_data.sendrecv_bws.reserve(collection.samples.size()); + + // Collect valid samples and compute elapsed times + for (int i = 0; i < collection.samples.size(); ++i) { + const auto& sample = collection.samples[i]; + if (!sample.valid) continue; + + float sendrecv_timing_ms = 0.0f; + if (sample.sendrecv_bytes > 0) { + CHECK_CUDA(cudaEventElapsedTime(&sendrecv_timing_ms, sample.sendrecv_start_event, sample.sendrecv_end_event)); + } - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.total_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.alltoall_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.local_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.alltoall_bw_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); + float halo_timing_ms; + CHECK_CUDA(cudaEventElapsedTime(&halo_timing_ms, sample.halo_start_event, sample.halo_end_event)); - stats.total_time_avg /= handle->nranks; - stats.alltoall_time_avg /= handle->nranks; - stats.local_time_avg /= handle->nranks; - stats.alltoall_bw_avg /= handle->nranks; + config_data.total_times.push_back(halo_timing_ms); + config_data.sendrecv_times.push_back(sendrecv_timing_ms); + config_data.local_times.push_back(halo_timing_ms - sendrecv_timing_ms); - all_transpose_config_data.push_back(std::move(config_data)); + float sendrecv_bw = (sendrecv_timing_ms > 0) ? 
sample.sendrecv_bytes * 1e-6 / sendrecv_timing_ms : 0; + config_data.sendrecv_bws.push_back(sendrecv_bw); } - // Sort transpose configuration data for consistent ordering - std::sort(all_transpose_config_data.begin(), all_transpose_config_data.end(), compareTransposeConfigData); + if (config_data.total_times.empty()) { + return config_data; + } - // Collect all halo statistics and timing data - std::vector all_halo_config_data; + // Prepare aggregated statistics + HaloPerformanceStats& stats = config_data.stats; + stats.operation = getHaloOperationName(config); + stats.datatype = getDatatypeString(std::get<6>(config)); + stats.dim = std::get<1>(config); - for (const auto& entry : grid_desc->halo_perf_samples_map) { - const auto& config = entry.first; - const auto& collection = entry.second; + // Format halo extents, periods, and padding + auto halo_extents = std::get<2>(config); + auto halo_periods = std::get<3>(config); + auto padding = std::get<4>(config); - HaloConfigTimingData config_data; + stats.halos = formatArray(halo_extents); + stats.periods = formatArray(halo_periods); + stats.padding = formatArray(padding); + stats.managed = std::get<5>(config) ? "Y" : "N"; + stats.samples = config_data.total_times.size(); - // Collect valid samples and compute elapsed times once - for (int i = 0; i < collection.samples.size(); ++i) { - const auto& sample = collection.samples[i]; - if (!sample.valid) continue; + // Compute average statistics across all ranks + stats.total_time_avg = computeGlobalAverage(config_data.total_times, handle); + stats.sendrecv_time_avg = computeGlobalAverage(config_data.sendrecv_times, handle); + stats.local_time_avg = computeGlobalAverage(config_data.local_times, handle); + stats.sendrecv_bw_avg = computeGlobalAverage(config_data.sendrecv_bws, handle); - float sendrecv_timing_ms = 0.0f; - if (sample.sendrecv_bytes > 0) { - CHECK_CUDA(cudaEventElapsedTime(&sendrecv_timing_ms, sample.sendrecv_start_event, sample.sendrecv_end_event)); - } + return config_data; +} + +// Print grid configuration information +void printGridConfiguration(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) { + if (handle->rank != 0) return; + + printf("CUDECOMP:\n"); + printf("CUDECOMP: ===== Performance Summary =====\n"); + printf("CUDECOMP: Grid Configuration:\n"); + printf("CUDECOMP:\tTranspose backend: %s\n", + cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)); + printf("CUDECOMP:\tHalo backend: %s\n", + cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend)); + printf("CUDECOMP:\tProcess grid: [%d, %d]\n", + grid_desc->config.pdims[0], grid_desc->config.pdims[1]); + printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", + grid_desc->config.gdims[0], grid_desc->config.gdims[1], grid_desc->config.gdims[2]); + + // Print memory ordering information + printf("CUDECOMP:\tMemory order: "); + for (int axis = 0; axis < 3; ++axis) { + printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], + grid_desc->config.transpose_mem_order[axis][1], + grid_desc->config.transpose_mem_order[axis][2]); + if (axis < 2) printf("; "); + } + printf("\n"); + printf("CUDECOMP:\n"); +} + +// Print transpose performance table +void printTransposePerformanceTable(const std::vector& all_transpose_config_data) { + printf("CUDECOMP: Transpose Performance Data:\n"); + printf("CUDECOMP:\n"); + + // Print compact table header + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "operation", "dtype", "halo extents", 
"padding", "inplace", "managed", "samples", + "total", "A2A", "local", "A2A BW"); + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "", "", "", "", "", "", "", + "[ms]", "[ms]", "[ms]", "[GB/s]"); + printf("CUDECOMP: "); + for (int i = 0; i < 120; ++i) printf("-"); + printf("\n"); + + // Print table rows + for (const auto& config_data : all_transpose_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.halos.c_str(), + stats.padding.c_str(), + stats.inplace.c_str(), + stats.managed.c_str(), + stats.samples, + stats.total_time_avg, + stats.alltoall_time_avg, + stats.local_time_avg, + stats.alltoall_bw_avg); + } + } +} - float halo_timing_ms; - CHECK_CUDA(cudaEventElapsedTime(&halo_timing_ms, sample.halo_start_event, sample.halo_end_event)); +// Print halo performance table +void printHaloPerformanceTable(const std::vector& all_halo_config_data) { + printf("CUDECOMP:\n"); + printf("CUDECOMP: Halo Performance Data:\n"); + printf("CUDECOMP:\n"); + + // Print compact table header + printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "operation", "dtype", "dim", "halo extent", "periods", "padding", "managed", "samples", + "total", "SR", "local", "SR BW"); + printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "", "", "", "", "", "", "", "", + "[ms]", "[ms]", "[ms]", "[GB/s]"); + printf("CUDECOMP: "); + for (int i = 0; i < 125; ++i) printf("-"); + printf("\n"); + + // Print table rows + for (const auto& config_data : all_halo_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + printf("CUDECOMP: %-12s %-6s %-5d %-12s %-12s %-12s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.dim, + stats.halos.c_str(), + stats.periods.c_str(), + stats.padding.c_str(), + stats.managed.c_str(), + stats.samples, + stats.total_time_avg, + stats.sendrecv_time_avg, + stats.local_time_avg, + stats.sendrecv_bw_avg); + } + } +} - config_data.total_times.push_back(halo_timing_ms); - config_data.sendrecv_times.push_back(sendrecv_timing_ms); - config_data.local_times.push_back(halo_timing_ms - sendrecv_timing_ms); +// Print per-sample transpose data for a single configuration +void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& config_data, + const cudecompHandle_t handle, int detail_level) { + const auto& stats = config_data.stats; + const auto& total_times = config_data.total_times; + const auto& alltoall_times = config_data.alltoall_times; + const auto& local_times = config_data.local_times; + const auto& alltoall_bws = config_data.alltoall_bws; + + if (total_times.empty()) return; + + int num_samples = total_times.size(); + std::vector all_total_times, all_alltoall_times, all_local_times, all_alltoall_bws; + + if (detail_level == 1) { + // Detail level 1: Only rank 0 prints its own data + if (handle->rank != 0) return; + + all_total_times = total_times; + all_alltoall_times = alltoall_times; + all_local_times = local_times; + all_alltoall_bws = alltoall_bws; + + } else if (detail_level == 2) { + // Detail level 2: Gather data from all ranks + gatherSampleData(total_times, all_total_times, handle); + gatherSampleData(alltoall_times, all_alltoall_times, handle); + gatherSampleData(local_times, all_local_times, handle); + 
gatherSampleData(alltoall_bws, all_alltoall_bws, handle); + + if (handle->rank != 0) return; + } + + printf("CUDECOMP: %s (dtype=%s, halos=%s, padding=%s, inplace=%s, managed=%s) samples:\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.halos.c_str(), + stats.padding.c_str(), + stats.inplace.c_str(), + stats.managed.c_str()); + + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "rank", "sample", "total", "A2A", "local", "A2A BW"); + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); + + for (int r = 0; r < (detail_level == 1 ? 1 : handle->nranks); ++r) { + for (int s = 0; s < num_samples; ++s) { + int idx = r * num_samples + s; + printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", + r, s, all_total_times[idx], + all_alltoall_times[idx], all_local_times[idx], + all_alltoall_bws[idx]); + } + } + printf("CUDECOMP:\n"); +} - float sendrecv_bw = (sendrecv_timing_ms > 0) ? sample.sendrecv_bytes * 1e-6 / sendrecv_timing_ms : 0; - config_data.sendrecv_bws.push_back(sendrecv_bw); - config_data.sample_indices.push_back(i); +// Print per-sample halo data for a single configuration +void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data, + const cudecompHandle_t handle, int detail_level) { + const auto& stats = config_data.stats; + const auto& total_times = config_data.total_times; + const auto& sendrecv_times = config_data.sendrecv_times; + const auto& local_times = config_data.local_times; + const auto& sendrecv_bws = config_data.sendrecv_bws; + + if (total_times.empty()) return; + + int num_samples = total_times.size(); + std::vector all_total_times, all_sendrecv_times, all_local_times, all_sendrecv_bws; + + if (detail_level == 1) { + // Detail level 1: Only rank 0 prints its own data + if (handle->rank != 0) return; + + all_total_times = total_times; + all_sendrecv_times = sendrecv_times; + all_local_times = local_times; + all_sendrecv_bws = sendrecv_bws; + + } else if (detail_level == 2) { + // Detail level 2: Gather data from all ranks + gatherSampleData(total_times, all_total_times, handle); + gatherSampleData(sendrecv_times, all_sendrecv_times, handle); + gatherSampleData(local_times, all_local_times, handle); + gatherSampleData(sendrecv_bws, all_sendrecv_bws, handle); + + if (handle->rank != 0) return; + } + + printf("CUDECOMP: %s (dtype=%s, dim=%d, halos=%s, periods=%s, padding=%s, managed=%s) samples:\n", + stats.operation.c_str(), + stats.datatype.c_str(), + stats.dim, + stats.halos.c_str(), + stats.periods.c_str(), + stats.padding.c_str(), + stats.managed.c_str()); + + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "rank", "sample", "total", "SR", "local", "SR BW"); + printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", + "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); + + for (int r = 0; r < (detail_level == 1 ? 
1 : handle->nranks); ++r) { + for (int s = 0; s < num_samples; ++s) { + int idx = r * num_samples + s; + printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", + r, s, all_total_times[idx], + all_sendrecv_times[idx], all_local_times[idx], + all_sendrecv_bws[idx]); } + } + printf("CUDECOMP:\n"); +} + +// Print per-sample details for transpose configurations +void printTransposePerSampleDetails(const std::vector& all_transpose_config_data, + const cudecompHandle_t handle) { + for (const auto& config_data : all_transpose_config_data) { + if (config_data.stats.samples == 0) continue; + + printTransposePerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail); + } +} + +// Print per-sample details for halo configurations +void printHaloPerSampleDetails(const std::vector& all_halo_config_data, + const cudecompHandle_t handle) { + for (const auto& config_data : all_halo_config_data) { + if (config_data.stats.samples == 0) continue; + + printHaloPerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail); + } +} - if (config_data.total_times.empty()) continue; +void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) { + // Synchronize to ensure all events are recorded + CHECK_CUDA(cudaDeviceSynchronize()); - // Prepare aggregated statistics - HaloPerformanceStats& stats = config_data.stats; - stats.operation = getHaloOperationName(config); - stats.datatype = getDatatypeString(std::get<6>(config)); - stats.dim = std::get<1>(config); + // Collect all transpose statistics and timing data + std::vector all_transpose_config_data; + all_transpose_config_data.reserve(grid_desc->transpose_perf_samples_map.size()); - // Format halo extents, periods, and padding - auto halo_extents = std::get<2>(config); - auto halo_periods = std::get<3>(config); - auto padding = std::get<4>(config); + for (const auto& entry : grid_desc->transpose_perf_samples_map) { + const auto& config = entry.first; + const auto& collection = entry.second; - stats.halos = formatArray(halo_extents); - stats.periods = "[" + std::to_string(halo_periods[0]) + "," + std::to_string(halo_periods[1]) + "," + std::to_string(halo_periods[2]) + "]"; - stats.padding = formatArray(padding); - stats.managed = std::get<5>(config) ? 
"Y" : "N"; - stats.samples = config_data.total_times.size(); + TransposeConfigTimingData config_data = processTransposeConfig(config, collection, handle); + if (config_data.stats.samples > 0) { + all_transpose_config_data.emplace_back(std::move(config_data)); + } + } - // Compute average statistics across all ranks - stats.total_time_avg = std::accumulate(config_data.total_times.begin(), config_data.total_times.end(), 0.0f) / config_data.total_times.size(); - stats.sendrecv_time_avg = std::accumulate(config_data.sendrecv_times.begin(), config_data.sendrecv_times.end(), 0.0f) / config_data.sendrecv_times.size(); - stats.local_time_avg = std::accumulate(config_data.local_times.begin(), config_data.local_times.end(), 0.0f) / config_data.local_times.size(); - stats.sendrecv_bw_avg = std::accumulate(config_data.sendrecv_bws.begin(), config_data.sendrecv_bws.end(), 0.0f) / config_data.sendrecv_bws.size(); + // Sort transpose configuration data for consistent ordering + std::sort(all_transpose_config_data.begin(), all_transpose_config_data.end(), compareTransposeConfigData); - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.total_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.sendrecv_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.local_time_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &stats.sendrecv_bw_avg, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm)); + // Collect all halo statistics and timing data with performance optimizations + std::vector all_halo_config_data; + all_halo_config_data.reserve(grid_desc->halo_perf_samples_map.size()); - stats.total_time_avg /= handle->nranks; - stats.sendrecv_time_avg /= handle->nranks; - stats.local_time_avg /= handle->nranks; - stats.sendrecv_bw_avg /= handle->nranks; + for (const auto& entry : grid_desc->halo_perf_samples_map) { + const auto& config = entry.first; + const auto& collection = entry.second; - all_halo_config_data.push_back(std::move(config_data)); + HaloConfigTimingData config_data = processHaloConfig(config, collection, handle); + if (config_data.stats.samples > 0) { + all_halo_config_data.emplace_back(std::move(config_data)); + } } // Sort halo configuration data for consistent ordering std::sort(all_halo_config_data.begin(), all_halo_config_data.end(), compareHaloConfigData); - // Print summary information on rank 0 only - if (handle->rank == 0) { - printf("CUDECOMP:\n"); - printf("CUDECOMP: ===== Performance Summary =====\n"); - - // Print grid descriptor configuration information - printf("CUDECOMP: Grid Configuration:\n"); - printf("CUDECOMP:\tTranspose backend: %s\n", - cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)); - printf("CUDECOMP:\tHalo backend: %s\n", - cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend)); - printf("CUDECOMP:\tProcess grid: [%d, %d]\n", - grid_desc->config.pdims[0], grid_desc->config.pdims[1]); - printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", - grid_desc->config.gdims[0], grid_desc->config.gdims[1], grid_desc->config.gdims[2]); - - // Print memory ordering information - printf("CUDECOMP:\tMemory order: "); - for (int axis = 0; axis < 3; ++axis) { - printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], - grid_desc->config.transpose_mem_order[axis][1], - grid_desc->config.transpose_mem_order[axis][2]); - if (axis < 2) printf("; "); - } - printf("\n"); - - printf("CUDECOMP:\n"); + // Print grid 
configuration information + printGridConfiguration(handle, grid_desc); + if (handle->rank == 0) { if (all_transpose_config_data.empty() && all_halo_config_data.empty()) { printf("CUDECOMP: No performance data collected\n"); printf("CUDECOMP: ================================\n"); @@ -513,74 +722,12 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes // Print transpose performance data if (!all_transpose_config_data.empty()) { - printf("CUDECOMP: Transpose Performance Data:\n"); - printf("CUDECOMP:\n"); - - // Print compact table header - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples", - "total", "A2A", "local", "A2A BW"); - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "", "", "", "", "", "", "", - "[ms]", "[ms]", "[ms]", "[GB/s]"); - printf("CUDECOMP: "); - for (int i = 0; i < 120; ++i) printf("-"); - printf("\n"); - - // Print table rows - for (const auto& config_data : all_transpose_config_data) { - const auto& stats = config_data.stats; - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", - stats.operation.c_str(), - stats.datatype.c_str(), - stats.halos.c_str(), - stats.padding.c_str(), - stats.inplace.c_str(), - stats.managed.c_str(), - stats.samples, - stats.total_time_avg, - stats.alltoall_time_avg, - stats.local_time_avg, - stats.alltoall_bw_avg - ); - } + printTransposePerformanceTable(all_transpose_config_data); } // Print halo performance data if (!all_halo_config_data.empty()) { - printf("CUDECOMP:\n"); - printf("CUDECOMP: Halo Performance Data:\n"); - printf("CUDECOMP:\n"); - - // Print compact table header - printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "operation", "dtype", "dim", "halo extent", "periods", "padding", "managed", "samples", - "total", "SR", "local", "SR BW"); - printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "", "", "", "", "", "", "", "", - "[ms]", "[ms]", "[ms]", "[GB/s]"); - printf("CUDECOMP: "); - for (int i = 0; i < 125; ++i) printf("-"); - printf("\n"); - - // Print table rows - for (const auto& config_data : all_halo_config_data) { - const auto& stats = config_data.stats; - printf("CUDECOMP: %-12s %-6s %-5d %-12s %-12s %-12s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", - stats.operation.c_str(), - stats.datatype.c_str(), - stats.dim, - stats.halos.c_str(), - stats.periods.c_str(), - stats.padding.c_str(), - stats.managed.c_str(), - stats.samples, - stats.total_time_avg, - stats.sendrecv_time_avg, - stats.local_time_avg, - stats.sendrecv_bw_avg - ); - } + printHaloPerformanceTable(all_halo_config_data); } } @@ -592,199 +739,11 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes printf("CUDECOMP:\n"); } - for (const auto& config_data : all_transpose_config_data) { - const auto& stats = config_data.stats; - - // Print configuration header on rank 0 - if (handle->rank == 0) { - printf("CUDECOMP: %s (dtype=%s, halos=%s, padding=%s, inplace=%s, managed=%s) samples:\n", - stats.operation.c_str(), - stats.datatype.c_str(), - stats.halos.c_str(), - stats.padding.c_str(), - stats.inplace.c_str(), - stats.managed.c_str()); - } - - const auto& total_times = config_data.total_times; - const auto& alltoall_times = config_data.alltoall_times; - const auto& local_times = config_data.local_times; - const auto& alltoall_bws = 
config_data.alltoall_bws; - const auto& sample_indices = config_data.sample_indices; - - if (total_times.empty()) continue; - - if (handle->performance_report_detail == 1) { - // Print per-sample data for rank 0 only - if (handle->rank == 0) { - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "rank", "sample", "total", "A2A", "local", "A2A BW"); - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); - - for (int i = 0; i < total_times.size(); ++i) { - printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", - handle->rank, sample_indices[i], total_times[i], alltoall_times[i], - local_times[i], alltoall_bws[i]); - } - } - } else if (handle->performance_report_detail == 2) { - // Gather data from all ranks to rank 0 - // Note: We assume all entries have the same number of samples per rank - int num_samples = total_times.size(); - - if (handle->rank == 0) { - // Use MPI_Gather instead of MPI_Gatherv since all ranks have the same number of samples - std::vector all_total_times(num_samples * handle->nranks); - std::vector all_alltoall_times(num_samples * handle->nranks); - std::vector all_local_times(num_samples * handle->nranks); - std::vector all_alltoall_bws(num_samples * handle->nranks); - std::vector all_sample_indices(num_samples * handle->nranks); - - CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, - all_total_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(alltoall_times.data(), num_samples, MPI_FLOAT, - all_alltoall_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT, - all_local_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(alltoall_bws.data(), num_samples, MPI_FLOAT, - all_alltoall_bws.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT, - all_sample_indices.data(), num_samples, MPI_INT, 0, handle->mpi_comm)); - - // Print header - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "rank", "sample", "total", "A2A", "local", "A2A BW"); - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); - - // Print data sorted by rank - for (int r = 0; r < handle->nranks; ++r) { - for (int s = 0; s < num_samples; ++s) { - int idx = r * num_samples + s; - printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", - r, all_sample_indices[idx], all_total_times[idx], - all_alltoall_times[idx], all_local_times[idx], - all_alltoall_bws[idx]); - } - } - } else { - // Non-rank-0 processes just send their data - CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, - nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(alltoall_times.data(), num_samples, MPI_FLOAT, - nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT, - nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(alltoall_bws.data(), num_samples, MPI_FLOAT, - nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT, - nullptr, num_samples, MPI_INT, 0, handle->mpi_comm)); - } - } - - if (handle->rank == 0) { - printf("CUDECOMP:\n"); - } - } + // Print transpose per-sample details + printTransposePerSampleDetails(all_transpose_config_data, handle); // Print halo per-sample details - for (const auto& config_data : 
all_halo_config_data) { - const auto& stats = config_data.stats; - - // Print configuration header on rank 0 - if (handle->rank == 0) { - printf("CUDECOMP: %s (dtype=%s, dim=%d, halos=%s, periods=%s, padding=%s, managed=%s) samples:\n", - stats.operation.c_str(), - stats.datatype.c_str(), - stats.dim, - stats.halos.c_str(), - stats.periods.c_str(), - stats.padding.c_str(), - stats.managed.c_str()); - } - - const auto& total_times = config_data.total_times; - const auto& sendrecv_times = config_data.sendrecv_times; - const auto& local_times = config_data.local_times; - const auto& sendrecv_bws = config_data.sendrecv_bws; - const auto& sample_indices = config_data.sample_indices; - - if (total_times.empty()) continue; - - if (handle->performance_report_detail == 1) { - // Print per-sample data for rank 0 only - if (handle->rank == 0) { - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "rank", "sample", "total", "SR", "local", "SR BW"); - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); - - for (int i = 0; i < total_times.size(); ++i) { - printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", - handle->rank, sample_indices[i], total_times[i], sendrecv_times[i], - local_times[i], sendrecv_bws[i]); - } - } - } else if (handle->performance_report_detail == 2) { - // Gather data from all ranks to rank 0 - // Note: We assume all entries have the same number of samples per rank - int num_samples = total_times.size(); - - if (handle->rank == 0) { - // Use MPI_Gather instead of MPI_Gatherv since all ranks have the same number of samples - std::vector all_total_times(num_samples * handle->nranks); - std::vector all_sendrecv_times(num_samples * handle->nranks); - std::vector all_local_times(num_samples * handle->nranks); - std::vector all_sendrecv_bws(num_samples * handle->nranks); - std::vector all_sample_indices(num_samples * handle->nranks); - - CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, - all_total_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(sendrecv_times.data(), num_samples, MPI_FLOAT, - all_sendrecv_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT, - all_local_times.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(sendrecv_bws.data(), num_samples, MPI_FLOAT, - all_sendrecv_bws.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT, - all_sample_indices.data(), num_samples, MPI_INT, 0, handle->mpi_comm)); - - // Print header - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "rank", "sample", "total", "SR", "local", "SR BW"); - printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", - "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); - - // Print data sorted by rank - for (int r = 0; r < handle->nranks; ++r) { - for (int s = 0; s < num_samples; ++s) { - int idx = r * num_samples + s; - printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", - r, all_sample_indices[idx], all_total_times[idx], - all_sendrecv_times[idx], all_local_times[idx], - all_sendrecv_bws[idx]); - } - } - } else { - // Non-rank-0 processes just send their data - CHECK_MPI(MPI_Gather(total_times.data(), num_samples, MPI_FLOAT, - nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - CHECK_MPI(MPI_Gather(sendrecv_times.data(), num_samples, MPI_FLOAT, - nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm)); - 
CHECK_MPI(MPI_Gather(local_times.data(), num_samples, MPI_FLOAT,
-                           nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm));
-      CHECK_MPI(MPI_Gather(sendrecv_bws.data(), num_samples, MPI_FLOAT,
-                           nullptr, num_samples, MPI_FLOAT, 0, handle->mpi_comm));
-      CHECK_MPI(MPI_Gather(sample_indices.data(), num_samples, MPI_INT,
-                           nullptr, num_samples, MPI_INT, 0, handle->mpi_comm));
-      }
-    }
-
-    if (handle->rank == 0) {
-      printf("CUDECOMP:\n");
-    }
-  }
+  printHaloPerSampleDetails(all_halo_config_data, handle);
 }
 
 if (handle->rank == 0) {

From 13dc563b26c10fc0c749acfd55f72ae6c29c2cf3 Mon Sep 17 00:00:00 2001
From: Josh Romero
Date: Wed, 9 Jul 2025 10:37:17 -0700
Subject: [PATCH 08/13] Remove erroneous quick return.

---
 src/performance.cc | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/performance.cc b/src/performance.cc
index 99eab36..1478c80 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -287,6 +287,8 @@ bool compareHaloConfigData(const HaloConfigTimingData& a,
 
 // Function to compute average across ranks
 float computeGlobalAverage(const std::vector<float>& values, const cudecompHandle_t handle) {
+  if (values.size() == 0) { return 0.0f; }
+
   float value = std::accumulate(values.begin(), values.end(), 0.0f);
   value /= values.size();
   CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &value, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm));
@@ -340,10 +342,6 @@ TransposeConfigTimingData processTransposeConfig(const cudecompTransposeConfigKe
     config_data.alltoall_bws.push_back(alltoall_bw);
   }
 
-  if (config_data.total_times.empty()) {
-    return config_data;
-  }
-
   // Prepare aggregated statistics
   TransposePerformanceStats& stats = config_data.stats;
   stats.operation = getTransposeOperationName(config);
@@ -402,10 +400,6 @@ HaloConfigTimingData processHaloConfig(const cudecompHaloConfigKey& config,
     config_data.sendrecv_bws.push_back(sendrecv_bw);
   }
 
-  if (config_data.total_times.empty()) {
-    return config_data;
-  }
-
   // Prepare aggregated statistics
   HaloPerformanceStats& stats = config_data.stats;
   stats.operation = getHaloOperationName(config);

From 02b4087ad9aeddfc576c501ec6a3b57f85841db0 Mon Sep 17 00:00:00 2001
From: Josh Romero
Date: Wed, 9 Jul 2025 12:04:03 -0700
Subject: [PATCH 09/13] wip

---
 src/performance.cc | 242 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 232 insertions(+), 10 deletions(-)

diff --git a/src/performance.cc b/src/performance.cc
index 1478c80..3d54eff 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -35,6 +35,8 @@
 #include
 #include
 #include
+#include <fstream>
+#include <iomanip>
 
 #include
 
@@ -44,6 +46,135 @@
 
 namespace cudecomp {
 
+// Helper function to get CSV write directory from environment variable
+std::string getPerformanceReportWriteDir() {
+  const char* write_dir = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR");
+  return write_dir ?
std::string(write_dir) : std::string(); +} + +// Helper function to create file name with grid descriptor information +std::string createPerformanceReportFileName(const std::string& write_dir, + const std::string& table_type, + const cudecompGridDesc_t grid_desc) { + std::ostringstream filename; + filename << write_dir; + if (!write_dir.empty() && write_dir.back() != '/') { + filename << "/"; + } + + filename << "cudecomp-perf-report-" << table_type << "-"; + filename << "tcomm_" << grid_desc->config.transpose_comm_backend << "-"; + filename << "hcomm_" << grid_desc->config.halo_comm_backend << "-"; + filename << "pdims_" << grid_desc->config.pdims[0] << "x" << grid_desc->config.pdims[1] << "-"; + filename << "gdims_" << grid_desc->config.gdims[0] << "x" << grid_desc->config.gdims[1] << "x" << grid_desc->config.gdims[2] << "-"; + filename << "memorder_"; + for (int axis = 0; axis < 3; ++axis) { + filename << grid_desc->config.transpose_mem_order[axis][0] << grid_desc->config.transpose_mem_order[axis][1] << grid_desc->config.transpose_mem_order[axis][2]; + } + filename << ".csv"; + + return filename.str(); +} + +// Helper function to write CSV header with grid configuration information +void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) { + file << "# CUDECOMP Performance Report\n"; + file << "# Grid Configuration:\n"; + file << "# Transpose backend: " << cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend) << "\n"; + file << "# Halo backend: " << cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend) << "\n"; + file << "# Process grid: [" << grid_desc->config.pdims[0] << ", " << grid_desc->config.pdims[1] << "]\n"; + file << "# Global dimensions: [" << grid_desc->config.gdims[0] << ", " << grid_desc->config.gdims[1] << ", " << grid_desc->config.gdims[2] << "]\n"; + file << "# Memory order: "; + for (int axis = 0; axis < 3; ++axis) { + file << "[" << grid_desc->config.transpose_mem_order[axis][0] << "," + << grid_desc->config.transpose_mem_order[axis][1] << "," + << grid_desc->config.transpose_mem_order[axis][2] << "]"; + if (axis < 2) file << "; "; + } + file << "\n#\n"; +} + +// Write transpose performance table to CSV +void writeTransposePerformanceTableCSV(const std::vector& all_transpose_config_data, + const cudecompGridDesc_t grid_desc, + const std::string& write_dir) { + if (all_transpose_config_data.empty()) return; + + std::string filename = createPerformanceReportFileName(write_dir, "transpose", grid_desc); + std::ofstream file(filename); + if (!file.is_open()) { + printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + return; + } + + writeCSVHeader(file, grid_desc); + + // Write CSV header + file << "operation,dtype,halo_extents,padding,inplace,managed,samples,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; + + // Write CSV data rows + for (const auto& config_data : all_transpose_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + file << stats.operation << "," + << stats.datatype << "," + << "\"" << stats.halos << "\"," + << "\"" << stats.padding << "\"," + << stats.inplace << "," + << stats.managed << "," + << stats.samples << "," + << std::fixed << std::setprecision(3) << stats.total_time_avg << "," + << std::fixed << std::setprecision(3) << stats.alltoall_time_avg << "," + << std::fixed << std::setprecision(3) << stats.local_time_avg << "," + << std::fixed << std::setprecision(3) << stats.alltoall_bw_avg << "\n"; + } + } + + file.close(); + 
printf("CUDECOMP: Wrote transpose performance data to %s\n", filename.c_str()); +} + +// Write halo performance table to CSV +void writeHaloPerformanceTableCSV(const std::vector& all_halo_config_data, + const cudecompGridDesc_t grid_desc, + const std::string& write_dir) { + if (all_halo_config_data.empty()) return; + + std::string filename = createPerformanceReportFileName(write_dir, "halo", grid_desc); + std::ofstream file(filename); + if (!file.is_open()) { + printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + return; + } + + writeCSVHeader(file, grid_desc); + + // Write CSV header + file << "operation,dtype,dim,halo_extent,periods,padding,managed,samples,total_ms,SR_ms,local_ms,SR_BW_GBps\n"; + + // Write CSV data rows + for (const auto& config_data : all_halo_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + file << stats.operation << "," + << stats.datatype << "," + << stats.dim << "," + << "\"" << stats.halos << "\"," + << "\"" << stats.periods << "\"," + << "\"" << stats.padding << "\"," + << stats.managed << "," + << stats.samples << "," + << std::fixed << std::setprecision(3) << stats.total_time_avg << "," + << std::fixed << std::setprecision(3) << stats.sendrecv_time_avg << "," + << std::fixed << std::setprecision(3) << stats.local_time_avg << "," + << std::fixed << std::setprecision(3) << stats.sendrecv_bw_avg << "\n"; + } + } + + file.close(); + printf("CUDECOMP: Wrote halo performance data to %s\n", filename.c_str()); +} + // Helper function to create transpose configuration key cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, const int32_t input_halo_extents_ptr[], @@ -530,7 +661,8 @@ void printHaloPerformanceTable(const std::vector& all_halo // Print per-sample transpose data for a single configuration void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& config_data, - const cudecompHandle_t handle, int detail_level) { + const cudecompHandle_t handle, int detail_level, + std::ofstream* csv_file = nullptr) { const auto& stats = config_data.stats; const auto& total_times = config_data.total_times; const auto& alltoall_times = config_data.alltoall_times; @@ -581,6 +713,22 @@ void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& co r, s, all_total_times[idx], all_alltoall_times[idx], all_local_times[idx], all_alltoall_bws[idx]); + + // Write to CSV if file is provided + if (csv_file && csv_file->is_open()) { + *csv_file << stats.operation << "," + << stats.datatype << "," + << "\"" << stats.halos << "\"," + << "\"" << stats.padding << "\"," + << stats.inplace << "," + << stats.managed << "," + << r << "," + << s << "," + << std::fixed << std::setprecision(3) << all_total_times[idx] << "," + << std::fixed << std::setprecision(3) << all_alltoall_times[idx] << "," + << std::fixed << std::setprecision(3) << all_local_times[idx] << "," + << std::fixed << std::setprecision(3) << all_alltoall_bws[idx] << "\n"; + } } } printf("CUDECOMP:\n"); @@ -588,7 +736,8 @@ void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& co // Print per-sample halo data for a single configuration void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data, - const cudecompHandle_t handle, int detail_level) { + const cudecompHandle_t handle, int detail_level, + std::ofstream* csv_file = nullptr) { const auto& stats = config_data.stats; const auto& total_times = config_data.total_times; const auto& 
sendrecv_times = config_data.sendrecv_times; @@ -640,6 +789,23 @@ void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data, r, s, all_total_times[idx], all_sendrecv_times[idx], all_local_times[idx], all_sendrecv_bws[idx]); + + // Write to CSV if file is provided + if (csv_file && csv_file->is_open()) { + *csv_file << stats.operation << "," + << stats.datatype << "," + << stats.dim << "," + << "\"" << stats.halos << "\"," + << "\"" << stats.periods << "\"," + << "\"" << stats.padding << "\"," + << stats.managed << "," + << r << "," + << s << "," + << std::fixed << std::setprecision(3) << all_total_times[idx] << "," + << std::fixed << std::setprecision(3) << all_sendrecv_times[idx] << "," + << std::fixed << std::setprecision(3) << all_local_times[idx] << "," + << std::fixed << std::setprecision(3) << all_sendrecv_bws[idx] << "\n"; + } } } printf("CUDECOMP:\n"); @@ -647,21 +813,63 @@ void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data, // Print per-sample details for transpose configurations void printTransposePerSampleDetails(const std::vector& all_transpose_config_data, - const cudecompHandle_t handle) { + const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, + const std::string& write_dir = "") { + std::ofstream csv_file; + bool csv_enabled = !write_dir.empty(); + + if (csv_enabled && handle->rank == 0) { + std::string filename = createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc); + csv_file.open(filename); + if (csv_file.is_open()) { + writeCSVHeader(csv_file, grid_desc); + csv_file << "operation,dtype,halo_extents,padding,inplace,managed,rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; + } else { + printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + } + } + for (const auto& config_data : all_transpose_config_data) { if (config_data.stats.samples == 0) continue; - printTransposePerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail); + printTransposePerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail, + csv_enabled ? &csv_file : nullptr); + } + + if (csv_file.is_open()) { + csv_file.close(); + printf("CUDECOMP: Wrote transpose per-sample data to %s\n", createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).c_str()); } } // Print per-sample details for halo configurations void printHaloPerSampleDetails(const std::vector& all_halo_config_data, - const cudecompHandle_t handle) { + const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, + const std::string& write_dir = "") { + std::ofstream csv_file; + bool csv_enabled = !write_dir.empty(); + + if (csv_enabled && handle->rank == 0) { + std::string filename = createPerformanceReportFileName(write_dir, "halo-samples", grid_desc); + csv_file.open(filename); + if (csv_file.is_open()) { + writeCSVHeader(csv_file, grid_desc); + csv_file << "operation,dtype,dim,halo_extent,periods,padding,managed,rank,sample,total_ms,SR_ms,local_ms,SR_BW_GBps\n"; + } else { + printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + } + } + for (const auto& config_data : all_halo_config_data) { if (config_data.stats.samples == 0) continue; - printHaloPerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail); + printHaloPerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail, + csv_enabled ? 
&csv_file : nullptr); + } + + if (csv_file.is_open()) { + csv_file.close(); + printf("CUDECOMP: Wrote halo per-sample data to %s\n", createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).c_str()); } } @@ -703,6 +911,10 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes // Sort halo configuration data for consistent ordering std::sort(all_halo_config_data.begin(), all_halo_config_data.end(), compareHaloConfigData); + // Check if CSV writing is enabled + std::string write_dir = getPerformanceReportWriteDir(); + bool csv_enabled = !write_dir.empty(); + // Print grid configuration information printGridConfiguration(handle, grid_desc); @@ -717,11 +929,21 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes // Print transpose performance data if (!all_transpose_config_data.empty()) { printTransposePerformanceTable(all_transpose_config_data); + + // Write transpose performance data to CSV if enabled + if (csv_enabled) { + writeTransposePerformanceTableCSV(all_transpose_config_data, grid_desc, write_dir); + } } // Print halo performance data if (!all_halo_config_data.empty()) { printHaloPerformanceTable(all_halo_config_data); + + // Write halo performance data to CSV if enabled + if (csv_enabled) { + writeHaloPerformanceTableCSV(all_halo_config_data, grid_desc, write_dir); + } } } @@ -733,11 +955,11 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes printf("CUDECOMP:\n"); } - // Print transpose per-sample details - printTransposePerSampleDetails(all_transpose_config_data, handle); + // Print transpose per-sample details (and write CSV if enabled) + printTransposePerSampleDetails(all_transpose_config_data, handle, grid_desc, csv_enabled ? write_dir : ""); - // Print halo per-sample details - printHaloPerSampleDetails(all_halo_config_data, handle); + // Print halo per-sample details (and write CSV if enabled) + printHaloPerSampleDetails(all_halo_config_data, handle, grid_desc, csv_enabled ? write_dir : ""); } if (handle->rank == 0) { From 1ee01c2bb34354bfbb0cf173ffd62335ed3138c2 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 9 Jul 2025 12:35:38 -0700 Subject: [PATCH 10/13] wip --- docs/env_vars.rst | 20 +++++++++ include/internal/common.h | 1 + src/cudecomp.cc | 6 +++ src/performance.cc | 86 +++++++++++++++++++++------------------ 4 files changed, 73 insertions(+), 40 deletions(-) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index 6620711..dda3913 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -70,3 +70,23 @@ CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES :code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose/halo configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics. Default setting is :code:`3` warmup samples. Setting this to 0 disables warmup sample filtering. + +CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR +------------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR` controls the directory where CSV performance reports are written when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. When this variable is set, cuDecomp will write performance data to CSV files in the specified directory. 
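+
+For example, a run that enables reporting and CSV output might be launched as
+follows (the launcher and application binary names here are illustrative)::
+
+    export CUDECOMP_ENABLE_PERFORMANCE_REPORT=1
+    export CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR=/path/to/reports
+    mpirun -np 4 ./my_app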
+
+CSV files are created with descriptive names encoding the grid configuration, for example:
+:code:`cudecomp-perf-report-transpose-tcomm_1-hcomm_1-pdims_2x2-gdims_256x256x256-memorder_012012012.csv`
+
+The following CSV files are generated:
+
+- Aggregated transpose performance data
+- Aggregated halo performance data
+- Per-sample transpose data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0)
+- Per-sample halo data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0)
+
+Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format.
+
+Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output.
diff --git a/include/internal/common.h b/include/internal/common.h
index 5f8aa2e..e65b7d6 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -120,6 +120,7 @@ struct cudecompHandle {
   int32_t performance_report_detail = 0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks
   int32_t performance_report_samples = 20; // number of performance samples to keep for final report
   int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration
+  std::string performance_report_write_dir = ""; // directory to write CSV performance reports, empty means no file writing
 };
 
 // Structure with information about row/column communicator
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index fa428e8..de653e9 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -368,6 +368,12 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
                     warmup_samples, handle->performance_report_warmup_samples);
     }
   }
+
+  // Check CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR (Directory for CSV performance reports)
+  const char* performance_write_dir_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR");
+  if (performance_write_dir_str) {
+    handle->performance_report_write_dir = std::string(performance_write_dir_str);
+  }
 }
 
 #ifdef ENABLE_NVSHMEM
diff --git a/src/performance.cc b/src/performance.cc
index 3d54eff..7da7820 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include <filesystem>
 
 #include
 
@@ -46,22 +47,11 @@
 
 namespace cudecomp {
 
-// Helper function to get CSV write directory from environment variable
-std::string getPerformanceReportWriteDir() {
-  const char* write_dir = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR");
-  return write_dir ?
std::string(write_dir) : std::string(); -} - // Helper function to create file name with grid descriptor information -std::string createPerformanceReportFileName(const std::string& write_dir, - const std::string& table_type, - const cudecompGridDesc_t grid_desc) { +std::filesystem::path createPerformanceReportFileName(const std::string& write_dir, + const std::string& table_type, + const cudecompGridDesc_t grid_desc) { std::ostringstream filename; - filename << write_dir; - if (!write_dir.empty() && write_dir.back() != '/') { - filename << "/"; - } - filename << "cudecomp-perf-report-" << table_type << "-"; filename << "tcomm_" << grid_desc->config.transpose_comm_backend << "-"; filename << "hcomm_" << grid_desc->config.halo_comm_backend << "-"; @@ -73,13 +63,11 @@ std::string createPerformanceReportFileName(const std::string& write_dir, } filename << ".csv"; - return filename.str(); + return std::filesystem::path(write_dir) / filename.str(); } // Helper function to write CSV header with grid configuration information void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) { - file << "# CUDECOMP Performance Report\n"; - file << "# Grid Configuration:\n"; file << "# Transpose backend: " << cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend) << "\n"; file << "# Halo backend: " << cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend) << "\n"; file << "# Process grid: [" << grid_desc->config.pdims[0] << ", " << grid_desc->config.pdims[1] << "]\n"; @@ -96,23 +84,23 @@ void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) { // Write transpose performance table to CSV void writeTransposePerformanceTableCSV(const std::vector& all_transpose_config_data, - const cudecompGridDesc_t grid_desc, - const std::string& write_dir) { + const cudecompGridDesc_t grid_desc, + const std::string& write_dir) { if (all_transpose_config_data.empty()) return; - std::string filename = createPerformanceReportFileName(write_dir, "transpose", grid_desc); + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose", grid_desc); std::ofstream file(filename); if (!file.is_open()) { - printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + printf("CUDECOMP:WARN: Could not open file %s for writing\n", filename.string().c_str()); return; } writeCSVHeader(file, grid_desc); - // Write CSV header + // Write table header file << "operation,dtype,halo_extents,padding,inplace,managed,samples,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; - // Write CSV data rows + // Write table data rows for (const auto& config_data : all_transpose_config_data) { const auto& stats = config_data.stats; if (stats.samples > 0) { @@ -131,7 +119,7 @@ void writeTransposePerformanceTableCSV(const std::vector& all_h const std::string& write_dir) { if (all_halo_config_data.empty()) return; - std::string filename = createPerformanceReportFileName(write_dir, "halo", grid_desc); + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo", grid_desc); std::ofstream file(filename); if (!file.is_open()) { - printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + printf("CUDECOMP:WARN: Could not open file %s for writing\n", filename.string().c_str()); return; } writeCSVHeader(file, grid_desc); - // Write CSV header + // Write table header file << "operation,dtype,dim,halo_extent,periods,padding,managed,samples,total_ms,SR_ms,local_ms,SR_BW_GBps\n"; - // Write CSV data 
rows + // Write table data rows for (const auto& config_data : all_halo_config_data) { const auto& stats = config_data.stats; if (stats.samples > 0) { @@ -172,7 +160,7 @@ void writeHaloPerformanceTableCSV(const std::vector& all_h } file.close(); - printf("CUDECOMP: Wrote halo performance data to %s\n", filename.c_str()); + printf("CUDECOMP: Wrote halo performance data to %s\n", filename.string().c_str()); } // Helper function to create transpose configuration key @@ -818,14 +806,23 @@ void printTransposePerSampleDetails(const std::vector std::ofstream csv_file; bool csv_enabled = !write_dir.empty(); - if (csv_enabled && handle->rank == 0) { - std::string filename = createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc); + // Check if there are any samples before creating CSV file + bool has_samples = false; + for (const auto& config_data : all_transpose_config_data) { + if (config_data.stats.samples > 0) { + has_samples = true; + break; + } + } + + if (csv_enabled && handle->rank == 0 && has_samples) { + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc); csv_file.open(filename); if (csv_file.is_open()) { writeCSVHeader(csv_file, grid_desc); csv_file << "operation,dtype,halo_extents,padding,inplace,managed,rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; } else { - printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str()); } } @@ -833,12 +830,12 @@ void printTransposePerSampleDetails(const std::vector if (config_data.stats.samples == 0) continue; printTransposePerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail, - csv_enabled ? &csv_file : nullptr); + csv_enabled && has_samples ? &csv_file : nullptr); } if (csv_file.is_open()) { csv_file.close(); - printf("CUDECOMP: Wrote transpose per-sample data to %s\n", createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).c_str()); + printf("CUDECOMP: Wrote transpose per-sample data to %s\n", createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).string().c_str()); } } @@ -849,14 +846,23 @@ void printHaloPerSampleDetails(const std::vector& all_halo std::ofstream csv_file; bool csv_enabled = !write_dir.empty(); - if (csv_enabled && handle->rank == 0) { - std::string filename = createPerformanceReportFileName(write_dir, "halo-samples", grid_desc); + // Check if there are any samples before creating CSV file + bool has_samples = false; + for (const auto& config_data : all_halo_config_data) { + if (config_data.stats.samples > 0) { + has_samples = true; + break; + } + } + + if (csv_enabled && handle->rank == 0 && has_samples) { + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-samples", grid_desc); csv_file.open(filename); if (csv_file.is_open()) { writeCSVHeader(csv_file, grid_desc); csv_file << "operation,dtype,dim,halo_extent,periods,padding,managed,rank,sample,total_ms,SR_ms,local_ms,SR_BW_GBps\n"; } else { - printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.c_str()); + printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str()); } } @@ -864,12 +870,12 @@ void printHaloPerSampleDetails(const std::vector& all_halo if (config_data.stats.samples == 0) continue; printHaloPerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail, - csv_enabled ? 
&csv_file : nullptr); + csv_enabled && has_samples ? &csv_file : nullptr); } if (csv_file.is_open()) { csv_file.close(); - printf("CUDECOMP: Wrote halo per-sample data to %s\n", createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).c_str()); + printf("CUDECOMP: Wrote halo per-sample data to %s\n", createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).string().c_str()); } } @@ -912,7 +918,7 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes std::sort(all_halo_config_data.begin(), all_halo_config_data.end(), compareHaloConfigData); // Check if CSV writing is enabled - std::string write_dir = getPerformanceReportWriteDir(); + const std::string& write_dir = handle->performance_report_write_dir; bool csv_enabled = !write_dir.empty(); // Print grid configuration information From 6e8307f616f66ef565defb1d91ec05bbc074ed7d Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 9 Jul 2025 13:59:15 -0700 Subject: [PATCH 11/13] Adding CSV file writing option. More cleanup. --- docs/env_vars.rst | 2 +- include/internal/performance.h | 6 +- src/performance.cc | 114 +++++++++++++++++++-------------- 3 files changed, 72 insertions(+), 50 deletions(-) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index dda3913..2123b57 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -78,7 +78,7 @@ CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR :code:`CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR` controls the directory where CSV performance reports are written when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. When this variable is set, cuDecomp will write performance data to CSV files in the specified directory. CSV files are created with descriptive names encoding the grid configuration, for example: -:code:`cudecomp-perf-report-transpose-tcomm_1-hcomm_1-pdims_2x2-gdims_256x256x256-memorder_012012012.csv` +:code:`cudecomp-perf-report-transpose-aggregated-tcomm_1-hcomm_1-pdims_2x2-gdims_256x256x256-memorder_012012012.csv` The following CSV files are generated: diff --git a/include/internal/performance.h b/include/internal/performance.h index fa4c1f7..5ea6bb5 100644 --- a/include/internal/performance.h +++ b/include/internal/performance.h @@ -67,8 +67,10 @@ using cudecompHaloConfigKey = std::tuple< struct TransposePerformanceStats { std::string operation; std::string datatype; - std::string halos; - std::string padding; + std::string input_halos; + std::string output_halos; + std::string input_padding; + std::string output_padding; std::string inplace; std::string managed; int samples; diff --git a/src/performance.cc b/src/performance.cc index 7da7820..a19c0e1 100644 --- a/src/performance.cc +++ b/src/performance.cc @@ -66,7 +66,7 @@ std::filesystem::path createPerformanceReportFileName(const std::string& write_d return std::filesystem::path(write_dir) / filename.str(); } -// Helper function to write CSV header with grid configuration information +// Helper function to write CSV header comment with grid configuration information void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) { file << "# Transpose backend: " << cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend) << "\n"; file << "# Halo backend: " << cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend) << "\n"; @@ -88,7 +88,7 @@ void writeTransposePerformanceTableCSV(const std::vector 0) { file << stats.operation << "," << stats.datatype << "," - << "\"" << stats.halos << "\"," - << "\"" << stats.padding << "\"," + << "\"" << 
stats.input_halos << "\"," + << "\"" << stats.output_halos << "\"," + << "\"" << stats.input_padding << "\"," + << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << stats.samples << "," @@ -119,16 +121,17 @@ void writeTransposePerformanceTableCSV(const std::vector& all_halo_config_data, - const cudecompGridDesc_t grid_desc, - const std::string& write_dir) { + const cudecompGridDesc_t grid_desc, + const std::string& write_dir) { if (all_halo_config_data.empty()) return; - std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo", grid_desc); + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-aggregated", grid_desc); std::ofstream file(filename); if (!file.is_open()) { printf("CUDECOMP:WARN: Could not open file %s for writing\n", filename.string().c_str()); @@ -160,16 +163,17 @@ void writeHaloPerformanceTableCSV(const std::vector& all_h } file.close(); + printf("CUDECOMP:\n"); printf("CUDECOMP: Wrote halo performance data to %s\n", filename.string().c_str()); } // Helper function to create transpose configuration key cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, - const int32_t input_halo_extents_ptr[], - const int32_t output_halo_extents_ptr[], - const int32_t input_padding_ptr[], - const int32_t output_padding_ptr[], - cudecompDataType_t datatype) { + const int32_t input_halo_extents_ptr[], + const int32_t output_halo_extents_ptr[], + const int32_t input_padding_ptr[], + const int32_t output_padding_ptr[], + cudecompDataType_t datatype) { std::array input_halo_extents{0, 0, 0}; std::array output_halo_extents{0, 0, 0}; std::array input_padding{0, 0, 0}; @@ -197,10 +201,10 @@ cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, v // Helper function to create halo configuration key cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, - const int32_t halo_extents_ptr[], - const bool halo_periods_ptr[], - const int32_t padding_ptr[], - cudecompDataType_t datatype) { + const int32_t halo_extents_ptr[], + const bool halo_periods_ptr[], + const int32_t padding_ptr[], + cudecompDataType_t datatype) { std::array halo_extents{0, 0, 0}; std::array halo_periods{false, false, false}; std::array padding{0, 0, 0}; @@ -357,11 +361,17 @@ bool compareTransposeConfigData(const TransposeConfigTimingData& a, if (a.stats.datatype != b.stats.datatype) { return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype); } - if (a.stats.halos != b.stats.halos) { - return a.stats.halos < b.stats.halos; + if (a.stats.input_halos != b.stats.input_halos) { + return a.stats.input_halos < b.stats.input_halos; } - if (a.stats.padding != b.stats.padding) { - return a.stats.padding < b.stats.padding; + if (a.stats.output_halos != b.stats.output_halos) { + return a.stats.output_halos < b.stats.output_halos; + } + if (a.stats.input_padding != b.stats.input_padding) { + return a.stats.input_padding < b.stats.input_padding; + } + if (a.stats.output_padding != b.stats.output_padding) { + return a.stats.output_padding < b.stats.output_padding; } if (a.stats.inplace != b.stats.inplace) { return a.stats.inplace < b.stats.inplace; @@ -466,16 +476,18 @@ TransposeConfigTimingData processTransposeConfig(const cudecompTransposeConfigKe stats.operation = getTransposeOperationName(config); stats.datatype = getDatatypeString(std::get<8>(config)); - // Format combined halos and padding + // Format separate halos and padding auto input_halos = 
std::get<2>(config); auto output_halos = std::get<3>(config); auto input_padding = std::get<4>(config); auto output_padding = std::get<5>(config); - stats.halos = formatArray(input_halos) + "/" + formatArray(output_halos); - stats.padding = formatArray(input_padding) + "/" + formatArray(output_padding); - stats.inplace = std::get<6>(config) ? "Y" : "N"; - stats.managed = std::get<7>(config) ? "Y" : "N"; + stats.input_halos = formatArray(input_halos); + stats.output_halos = formatArray(output_halos); + stats.input_padding = formatArray(input_padding); + stats.output_padding = formatArray(output_padding); + stats.inplace = std::get<6>(config) ? "T" : "F"; + stats.managed = std::get<7>(config) ? "T" : "F"; stats.samples = config_data.total_times.size(); // Compute average statistics and reduce across all ranks @@ -533,7 +545,7 @@ HaloConfigTimingData processHaloConfig(const cudecompHaloConfigKey& config, stats.halos = formatArray(halo_extents); stats.periods = formatArray(halo_periods); stats.padding = formatArray(padding); - stats.managed = std::get<5>(config) ? "Y" : "N"; + stats.managed = std::get<5>(config) ? "T" : "F"; stats.samples = config_data.total_times.size(); // Compute average statistics across all ranks @@ -579,25 +591,27 @@ void printTransposePerformanceTable(const std::vector printf("CUDECOMP:\n"); // Print compact table header - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples", + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "operation", "dtype", "input halos", "output halos", "input padding", "output padding", "inplace", "managed", "samples", "total", "A2A", "local", "A2A BW"); - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", - "", "", "", "", "", "", "", + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", + "", "", "", "", "", "", "", "", "", "[ms]", "[ms]", "[ms]", "[GB/s]"); printf("CUDECOMP: "); - for (int i = 0; i < 120; ++i) printf("-"); + for (int i = 0; i < 152; ++i) printf("-"); printf("\n"); // Print table rows for (const auto& config_data : all_transpose_config_data) { const auto& stats = config_data.stats; if (stats.samples > 0) { - printf("CUDECOMP: %-12s %-6s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", + printf("CUDECOMP: %-12s %-6s %-16s %-16s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n", stats.operation.c_str(), stats.datatype.c_str(), - stats.halos.c_str(), - stats.padding.c_str(), + stats.input_halos.c_str(), + stats.output_halos.c_str(), + stats.input_padding.c_str(), + stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str(), stats.samples, @@ -681,11 +695,13 @@ void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& co if (handle->rank != 0) return; } - printf("CUDECOMP: %s (dtype=%s, halos=%s, padding=%s, inplace=%s, managed=%s) samples:\n", + printf("CUDECOMP: %s (dtype=%s, input_halos=%s, output_halos=%s, input_padding=%s, output_padding=%s, inplace=%s, managed=%s) samples:\n", stats.operation.c_str(), stats.datatype.c_str(), - stats.halos.c_str(), - stats.padding.c_str(), + stats.input_halos.c_str(), + stats.output_halos.c_str(), + stats.input_padding.c_str(), + stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str()); @@ -706,8 +722,10 @@ void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& co if (csv_file 
&& csv_file->is_open()) { *csv_file << stats.operation << "," << stats.datatype << "," - << "\"" << stats.halos << "\"," - << "\"" << stats.padding << "\"," + << "\"" << stats.input_halos << "\"," + << "\"" << stats.output_halos << "\"," + << "\"" << stats.input_padding << "\"," + << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << r << "," @@ -820,7 +838,7 @@ void printTransposePerSampleDetails(const std::vector csv_file.open(filename); if (csv_file.is_open()) { writeCSVHeader(csv_file, grid_desc); - csv_file << "operation,dtype,halo_extents,padding,inplace,managed,rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; + csv_file << "operation,dtype,input_halo_extents,output_halo_extents,input_padding,output_padding,inplace,managed,rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; } else { printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str()); } @@ -835,14 +853,15 @@ void printTransposePerSampleDetails(const std::vector if (csv_file.is_open()) { csv_file.close(); - printf("CUDECOMP: Wrote transpose per-sample data to %s\n", createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).string().c_str()); + printf("CUDECOMP:\n"); + printf("CUDECOMP: Wrote per-sample transpose data to %s\n", createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).string().c_str()); } } // Print per-sample details for halo configurations void printHaloPerSampleDetails(const std::vector& all_halo_config_data, - const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, - const std::string& write_dir = "") { + const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, + const std::string& write_dir = "") { std::ofstream csv_file; bool csv_enabled = !write_dir.empty(); @@ -875,7 +894,8 @@ void printHaloPerSampleDetails(const std::vector& all_halo if (csv_file.is_open()) { csv_file.close(); - printf("CUDECOMP: Wrote halo per-sample data to %s\n", createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).string().c_str()); + printf("CUDECOMP:\n"); + printf("CUDECOMP: Wrote per-sample halo data to %s\n", createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).string().c_str()); } } @@ -1007,7 +1027,7 @@ void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t g // Helper function to advance transpose sample index with warmup handling void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, - const cudecompTransposeConfigKey& config) { + const cudecompTransposeConfigKey& config) { if (!handle->performance_report_enable) return; auto& collection = getOrCreateTransposePerformanceSamples(handle, grid_desc, config); @@ -1025,7 +1045,7 @@ void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGr // Helper function to advance halo sample index with warmup handling void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc, - const cudecompHaloConfigKey& config) { + const cudecompHaloConfigKey& config) { if (!handle->performance_report_enable) return; auto& collection = getOrCreateHaloPerformanceSamples(handle, grid_desc, config); From 6fe39b4a68a02cc4134232f51389af3582c9aabb Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 9 Jul 2025 15:02:22 -0700 Subject: [PATCH 12/13] Update transpose table halo/padding columns. 
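
With this change, the per-configuration transpose tables print input/output
halo extents and padding as combined "in/out" columns (using a "%-7s/%-7s"
format). A sketch of the resulting layout, with illustrative values rather
than output from a real run:

  operation      dtype  halo extents  padding      inplace  managed  ...
  TransposeXToY  float  0,0,0/0,0,0   0,0,0/0,0,0  F        F        ...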
---
 src/performance.cc | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/performance.cc b/src/performance.cc
index a19c0e1..08f3a06 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -476,7 +476,6 @@ TransposeConfigTimingData processTransposeConfig(const cudecompTransposeConfigKe
   stats.operation = getTransposeOperationName(config);
   stats.datatype = getDatatypeString(std::get<8>(config));
 
-  // Format separate halos and padding
   auto input_halos = std::get<2>(config);
   auto output_halos = std::get<3>(config);
   auto input_padding = std::get<4>(config);
@@ -591,21 +590,21 @@ void printTransposePerformanceTable(const std::vector
   printf("CUDECOMP:\n");
 
   // Print compact table header
-  printf("CUDECOMP: %-12s %-6s %-16s %-16s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-         "operation", "dtype", "input halos", "output halos", "input padding", "output padding", "inplace", "managed", "samples",
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
+         "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples",
          "total", "A2A", "local", "A2A BW");
-  printf("CUDECOMP: %-12s %-6s %-16s %-16s %-16s %-16s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-         "", "", "", "", "", "", "", "", "",
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
+         "", "", "", "", "", "", "",
          "[ms]", "[ms]", "[ms]", "[GB/s]");
   printf("CUDECOMP: ");
-  for (int i = 0; i < 152; ++i) printf("-");
+  for (int i = 0; i < 120; ++i) printf("-");
   printf("\n");
 
   // Print table rows
   for (const auto& config_data : all_transpose_config_data) {
     const auto& stats = config_data.stats;
     if (stats.samples > 0) {
-      printf("CUDECOMP: %-12s %-6s %-16s %-16s %-16s %-16s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
+      printf("CUDECOMP: %-12s %-6s %-7s/%-7s %-7s/%-7s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
              stats.operation.c_str(),
              stats.datatype.c_str(),
              stats.input_halos.c_str(),
@@ -695,7 +694,7 @@ void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& co
     if (handle->rank != 0) return;
   }
 
-  printf("CUDECOMP: %s (dtype=%s, input_halos=%s, output_halos=%s, input_padding=%s, output_padding=%s, inplace=%s, managed=%s) samples:\n",
+  printf("CUDECOMP: %s (dtype=%s, halo extents=%s/%s, padding=%s/%s, inplace=%s, managed=%s) samples:\n",
          stats.operation.c_str(),
          stats.datatype.c_str(),
          stats.input_halos.c_str(),

From c98eecb4e324f7c1d2fecfa0e875216ad215eec5 Mon Sep 17 00:00:00 2001
From: Josh Romero
Date: Fri, 18 Jul 2025 10:19:07 -0700
Subject: [PATCH 13/13] Run clang-format.
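
This is a formatting-only pass over the performance reporting changes; no
functional changes intended. For reference, a pass like this can be produced
with an invocation along these lines (file list abbreviated):

  clang-format -i include/internal/*.h src/*.cc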
--- include/internal/comm_routines.h | 39 +-- include/internal/common.h | 33 +-- include/internal/halo.h | 28 ++- include/internal/performance.h | 57 ++--- include/internal/transpose.h | 31 ++- src/autotune.cc | 8 +- src/cudecomp.cc | 24 +- src/performance.cc | 403 +++++++++++-------------------- 8 files changed, 264 insertions(+), 359 deletions(-) diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index c0a0879..b14b0fe 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -150,12 +150,13 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ #endif template -static void -cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, - const std::vector& send_counts, const std::vector& send_offsets, - T* recv_buff, const std::vector& recv_counts, - const std::vector& recv_offsets, const std::vector& recv_offsets_nvshmem, - cudecompCommAxis comm_axis, cudaStream_t stream, cudecompTransposePerformanceSample* current_sample = nullptr) { +static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, + const std::vector& send_counts, + const std::vector& send_offsets, T* recv_buff, + const std::vector& recv_counts, + const std::vector& recv_offsets, + const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, + cudaStream_t stream, cudecompTransposePerformanceSample* current_sample = nullptr) { nvtx::rangePush("cudecompAlltoall"); if (handle->performance_report_enable) { @@ -283,19 +284,17 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ } template -static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, - const std::vector& send_counts, - const std::vector& send_offsets, T* recv_buff, - const std::vector& recv_counts, - const std::vector& recv_offsets, - const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, - const std::vector& src_ranks, const std::vector& dst_ranks, - cudaStream_t stream, bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) { +static void +cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, + const std::vector& send_counts, const std::vector& send_offsets, + T* recv_buff, const std::vector& recv_counts, + const std::vector& recv_offsets, + const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, + const std::vector& src_ranks, const std::vector& dst_ranks, cudaStream_t stream, + bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) { // If there are no transfers to complete, quick return - if (send_counts.size() == 0 && recv_counts.size() == 0) { - return; - } + if (send_counts.size() == 0 && recv_counts.size() == 0) { return; } std::ostringstream os; os << "cudecompAlltoallPipelined_"; @@ -309,7 +308,8 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude if (handle->performance_report_enable && src_ranks[0] != self_rank) { // Note: skipping self-copy for timing as it should be overlapped CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0)); - CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], handle->pl_stream)); + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], + handle->pl_stream)); } 
#ifdef ENABLE_NVSHMEM @@ -484,7 +484,8 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude } if (handle->performance_report_enable && src_ranks[0] != self_rank) { - CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream)); + CHECK_CUDA( + cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream)); current_sample->alltoall_timing_count++; } nvtx::rangePop(); diff --git a/include/internal/common.h b/include/internal/common.h index e65b7d6..6738af5 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -116,11 +116,13 @@ struct cudecompHandle { bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used // Performance reporting related entries - bool performance_report_enable = false; // flag to track if performance reporting is enabled - int32_t performance_report_detail = 0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks - int32_t performance_report_samples = 20; // number of performance samples to keep for final report - int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration - std::string performance_report_write_dir = ""; // directory to write CSV performance reports, empty means no file writing + bool performance_report_enable = false; // flag to track if performance reporting is enabled + int32_t performance_report_detail = + 0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks + int32_t performance_report_samples = 20; // number of performance samples to keep for final report + int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration + std::string performance_report_write_dir = + ""; // directory to write CSV performance reports, empty means no file writing }; // Structure with information about row/column communicator @@ -172,7 +174,6 @@ struct cudecompHaloPerformanceSampleCollection { int32_t warmup_count = 0; }; - // cuDecomp grid descriptor containing grid-specific information struct cudecompGridDesc { cudecompGridDescConfig_t config; // configuration struct @@ -194,19 +195,21 @@ struct cudecompGridDesc { nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle // Performance reporting related entries - std::vector alltoall_start_events; // events for alltoall timing - std::vector alltoall_end_events; // events for alltoall timing - int32_t alltoall_timing_count = 0; // count of alltoall timing events pairs (for pipelined alltoall) - cudaEvent_t transpose_start_event; // event for transpose timing - cudaEvent_t transpose_end_event; // event for transpose timing + std::vector alltoall_start_events; // events for alltoall timing + std::vector alltoall_end_events; // events for alltoall timing + int32_t alltoall_timing_count = 0; // count of alltoall timing events pairs (for pipelined alltoall) + cudaEvent_t transpose_start_event; // event for transpose timing + cudaEvent_t transpose_end_event; // event for transpose timing std::unordered_map, std::array, std::array, std::array, bool, bool, cudecompDataType_t>, - cudecompTransposePerformanceSampleCollection> transpose_perf_samples_map; + cudecompTransposePerformanceSampleCollection> + transpose_perf_samples_map; - std::unordered_map, std::array, - std::array, bool, cudecompDataType_t>, - 
cudecompHaloPerformanceSampleCollection> halo_perf_samples_map; + std::unordered_map, std::array, std::array, + bool, cudecompDataType_t>, + cudecompHaloPerformanceSampleCollection> + halo_perf_samples_map; bool initialized = false; }; diff --git a/include/internal/halo.h b/include/internal/halo.h index 9a6c58f..55a4a8f 100644 --- a/include/internal/halo.h +++ b/include/internal/halo.h @@ -77,7 +77,10 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG cudecompHaloPerformanceSample* current_sample = nullptr; if (handle->performance_report_enable) { - auto& samples = getOrCreateHaloPerformanceSamples(handle, grid_desc, createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), padding.data(), getCudecompDataType())); + auto& samples = + getOrCreateHaloPerformanceSamples(handle, grid_desc, + createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), + padding.data(), getCudecompDataType())); current_sample = &samples.samples[samples.sample_idx]; current_sample->sendrecv_bytes = 0; current_sample->valid = true; @@ -136,7 +139,9 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG if (handle->performance_report_enable && current_sample) { // Record end event and advance sample even for early return CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream)); - advanceHaloPerformanceSample(handle, grid_desc, createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), padding.data(), getCudecompDataType())); + advanceHaloPerformanceSample(handle, grid_desc, + createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), + padding.data(), getCudecompDataType())); } return; } @@ -225,12 +230,11 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG if (handle->performance_report_enable && current_sample) { current_sample->sendrecv_bytes = 0; for (int i = 0; i < 2; ++i) { - if (neighbors[i] != -1) { - current_sample->sendrecv_bytes += halo_size * sizeof(T); - } + if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); } } } - cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream, current_sample); + cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream, + current_sample); // Unpack // Left @@ -290,20 +294,20 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG if (handle->performance_report_enable && current_sample) { current_sample->sendrecv_bytes = 0; for (int i = 0; i < 2; ++i) { - if (neighbors[i] != -1) { - current_sample->sendrecv_bytes += halo_size * sizeof(T); - } + if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); } } } - cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, - stream, current_sample); + cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, stream, + current_sample); } break; } if (handle->performance_report_enable && current_sample) { // Record end event CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream)); - advanceHaloPerformanceSample(handle, grid_desc, createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), padding.data(), getCudecompDataType())); + advanceHaloPerformanceSample(handle, grid_desc, + createHaloConfig(ax, dim, input, halo_extents.data(), 
diff --git a/include/internal/performance.h b/include/internal/performance.h
index 5ea6bb5..82f54e2 100644
--- a/include/internal/performance.h
+++ b/include/internal/performance.h
@@ -41,27 +41,25 @@
 
 namespace cudecomp {
 
-using cudecompTransposeConfigKey = std::tuple<
-    int32_t, // ax (axis)
-    int32_t, // dir (direction)
-    std::array<int32_t, 3>, // input_halo_extents
-    std::array<int32_t, 3>, // output_halo_extents
-    std::array<int32_t, 3>, // input_padding
-    std::array<int32_t, 3>, // output_padding
-    bool, // inplace
-    bool, // managed_memory
-    cudecompDataType_t // datatype
->;
-
-using cudecompHaloConfigKey = std::tuple<
-    int32_t, // ax (axis)
-    int32_t, // dim (dimension)
-    std::array<int32_t, 3>, // halo_extents
-    std::array<bool, 3>, // halo_periods
-    std::array<int32_t, 3>, // padding
-    bool, // managed_memory
-    cudecompDataType_t // datatype
->;
+using cudecompTransposeConfigKey = std::tuple<int32_t,                // ax (axis)
+                                              int32_t,                // dir (direction)
+                                              std::array<int32_t, 3>, // input_halo_extents
+                                              std::array<int32_t, 3>, // output_halo_extents
+                                              std::array<int32_t, 3>, // input_padding
+                                              std::array<int32_t, 3>, // output_padding
+                                              bool,                   // inplace
+                                              bool,                   // managed_memory
+                                              cudecompDataType_t      // datatype
+                                              >;
+
+using cudecompHaloConfigKey = std::tuple<int32_t,                // ax (axis)
+                                         int32_t,                // dim (dimension)
+                                         std::array<int32_t, 3>, // halo_extents
+                                         std::array<bool, 3>,    // halo_periods
+                                         std::array<int32_t, 3>, // padding
+                                         bool,                   // managed_memory
+                                         cudecompDataType_t      // datatype
+                                         >;
 
 // Helper structure for transpose statistics
 struct TransposePerformanceStats {
@@ -123,12 +121,12 @@ void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t g
 void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
                                        const cudecompTransposeConfigKey& config);
 
-cudecompTransposePerformanceSampleCollection& getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle,
-                                                                                     cudecompGridDesc_t grid_desc,
-                                                                                     const cudecompTransposeConfigKey& config);
+cudecompTransposePerformanceSampleCollection&
+getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config);
 
 void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
-                                   const cudecompHaloConfigKey& config);
+                                  const cudecompHaloConfigKey& config);
 
 cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const cudecompHandle_t handle,
                                                                            cudecompGridDesc_t grid_desc,
@@ -138,15 +136,12 @@ cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const
 cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output,
                                                  const int32_t input_halo_extents_ptr[],
                                                  const int32_t output_halo_extents_ptr[],
-                                                 const int32_t input_padding_ptr[],
-                                                 const int32_t output_padding_ptr[],
+                                                 const int32_t input_padding_ptr[], const int32_t output_padding_ptr[],
                                                  cudecompDataType_t datatype);
 
 // Helper function to create halo configuration key
-cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input,
-                                       const int32_t halo_extents_ptr[],
-                                       const bool halo_periods_ptr[],
-                                       const int32_t padding_ptr[],
+cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, const int32_t halo_extents_ptr[],
+                                       const bool halo_periods_ptr[], const int32_t padding_ptr[],
                                        cudecompDataType_t datatype);
 
 } // namespace cudecomp
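
Because the config keys are plain std::tuple aliases, equality and ordering come for free, field by field, which is what lets distinct call configurations map to distinct sample collections. A small self-contained illustration (FakeDataType and HaloKey are stand-ins, not library types):

    #include <array>
    #include <cstdint>
    #include <iostream>
    #include <tuple>

    enum FakeDataType { FLOAT = 0, DOUBLE = 1 };
    using HaloKey = std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<bool, 3>,
                               std::array<int32_t, 3>, bool, FakeDataType>;

    int main() {
      HaloKey a{0, 1, {1, 1, 0}, {false, false, true}, {0, 0, 0}, false, FLOAT};
      HaloKey b = a;
      // std::tuple compares element by element, so two keys are equal only
      // when every configuration field matches.
      std::cout << std::boolalpha << (a == b) << "\n"; // true
      std::get<6>(b) = DOUBLE;
      std::cout << (a == b) << "\n"; // false: datatype differs
      return 0;
    }
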
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index 2f45ec4..afb6480 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -254,7 +254,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
   cudecompTransposePerformanceSample* current_sample = nullptr;
   if (handle->performance_report_enable) {
-    auto& samples = getOrCreateTransposePerformanceSamples(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+    auto& samples =
+        getOrCreateTransposePerformanceSamples(handle, grid_desc,
+                                               createTransposeConfig(ax, dir, input, output, input_halo_extents.data(),
+                                                                     output_halo_extents.data(), input_padding.data(),
+                                                                     output_padding.data(), getCudecompDataType<T>()));
     current_sample = &samples.samples[samples.sample_idx];
     current_sample->alltoall_timing_count = 0;
     current_sample->alltoall_bytes = pinfo_a.size * sizeof(T);
@@ -276,7 +280,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     if (handle->performance_report_enable) {
       // Record performance data
       CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-      advanceTransposePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+      advanceTransposePerformanceSample(handle, grid_desc,
+                                        createTransposeConfig(ax, dir, input, output, input_halo_extents.data(),
+                                                              output_halo_extents.data(), input_padding.data(),
+                                                              output_padding.data(), getCudecompDataType<T>()));
     }
     return;
   }
@@ -545,7 +552,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     // o1 is output. Return.
     if (handle->performance_report_enable) {
      CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-      advanceTransposePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+      advanceTransposePerformanceSample(handle, grid_desc,
+                                        createTransposeConfig(ax, dir, input, output, input_halo_extents.data(),
+                                                              output_halo_extents.data(), input_padding.data(),
+                                                              output_padding.data(), getCudecompDataType<T>()));
     }
     return;
   }
@@ -633,7 +643,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
     if (o2 != o1) {
       cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
-                                recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, current_sample);
+                                recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
+                                current_sample);
     }
 
     if (o2 != o3) {
@@ -728,7 +739,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
     if (o2 != o1) {
       cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
-                                recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, current_sample);
+                                recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
+                                current_sample);
     }
   }
 
@@ -785,7 +797,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
     if (o2 != o1) {
       cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
-                                recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, current_sample);
+                                recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
+                                current_sample);
     }
   }
 
@@ -822,9 +835,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   if (handle->performance_report_enable) {
     // Record performance data
    CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream));
-    advanceTransposePerformanceSample(handle, grid_desc, createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), output_halo_extents.data(), input_padding.data(), output_padding.data(), getCudecompDataType<T>()));
+    advanceTransposePerformanceSample(handle, grid_desc,
+                                      createTransposeConfig(ax, dir, input, output, input_halo_extents.data(),
+                                                            output_halo_extents.data(), input_padding.data(),
+                                                            output_padding.data(), getCudecompDataType<T>()));
   }
-
 }
 
 template <typename T>
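
The alltoall_bytes recorded above are later combined with the event timings into the report's bandwidth column. A plausible form of that arithmetic, guarding the zero-time case the way the report code does; the 1e6 conversion factor (bytes/ms to GB/s with GB = 1e9 bytes) is an assumption, since the formula is truncated in the patch context:

    #include <cstdint>
    #include <cstdio>

    // Converts a byte count and a millisecond timing into GB/s.
    static float bandwidth_gb_per_s(int64_t bytes, float elapsed_ms) {
      return (elapsed_ms > 0) ? static_cast<float>(bytes) / (elapsed_ms * 1.0e6f) : 0.0f;
    }

    int main() {
      // 1 GiB moved in 125 ms is roughly 8.6 GB/s.
      printf("%.3f GB/s\n", bandwidth_gb_per_s(int64_t(1) << 30, 125.0f));
      return 0;
    }
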
diff --git a/src/autotune.cc b/src/autotune.cc
index d59f33b..4cb0dbe 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -436,9 +436,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     }
 
     // Print performance report for this configuration if enabled
-    if (handle->performance_report_enable && !skip_case) {
-      printPerformanceReport(handle, grid_desc);
-    }
+    if (handle->performance_report_enable && !skip_case) { printPerformanceReport(handle, grid_desc); }
 
     if (skip_case) continue;
 
@@ -776,9 +774,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
     }
 
     // Print performance report for this configuration if enabled
-    if (handle->performance_report_enable && !skip_case) {
-      printPerformanceReport(handle, grid_desc);
-    }
+    if (handle->performance_report_enable && !skip_case) { printPerformanceReport(handle, grid_desc); }
 
     if (skip_case) continue;
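
The environment-variable handling exercised by these reports follows a read-validate-default pattern; a condensed sketch of the same idiom (getEnvInt is a hypothetical helper, not part of cuDecomp):

    #include <cstdio>
    #include <cstdlib>

    // Reads an integer environment variable, warning and falling back to a
    // default when the value fails a minimum-value check, mirroring the
    // behavior in getCudecompEnvVars.
    static int getEnvInt(const char* name, int default_value, int min_value) {
      const char* str = std::getenv(name);
      if (!str) return default_value;
      int value = static_cast<int>(std::strtol(str, nullptr, 10));
      if (value < min_value) {
        printf("WARN: invalid %s value (%d); using default (%d)\n", name, value, default_value);
        return default_value;
      }
      return value;
    }

    int main() {
      int samples = getEnvInt("CUDECOMP_PERFORMANCE_REPORT_SAMPLES", 20, 1);
      int warmup = getEnvInt("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES", 3, 0);
      printf("samples=%d warmup=%d\n", samples, warmup);
      return 0;
    }
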
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index de653e9..d0852f7 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -332,7 +332,9 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
 
   // Check CUDECOMP_ENABLE_PERFORMANCE_REPORT (Performance reporting)
   const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORT");
-  if (performance_report_str) { handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1; }
+  if (performance_report_str) {
+    handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1;
+  }
 
   // Check CUDECOMP_PERFORMANCE_REPORT_DETAIL (Performance report detail level)
   const char* performance_detail_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_DETAIL");
@@ -349,11 +351,11 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
   const char* performance_samples_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES");
   if (performance_samples_str) {
     int32_t samples = std::strtol(performance_samples_str, nullptr, 10);
-    if (samples > 0) { // Only require positive values
+    if (samples > 0) {  // Only require positive values
       handle->performance_report_samples = samples;
     } else if (handle->rank == 0) {
-      printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_SAMPLES value (%d). Using default (%d).\n",
-             samples, handle->performance_report_samples);
+      printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_SAMPLES value (%d). Using default (%d).\n", samples,
+             handle->performance_report_samples);
     }
   }
 
@@ -361,7 +363,7 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
   const char* performance_warmup_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES");
   if (performance_warmup_str) {
     int32_t warmup_samples = std::strtol(performance_warmup_str, nullptr, 10);
-    if (warmup_samples >= 0) { // Only require non-negative values
+    if (warmup_samples >= 0) {  // Only require non-negative values
       handle->performance_report_warmup_samples = warmup_samples;
     } else if (handle->rank == 0) {
       printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES value (%d). Using default (%d).\n",
@@ -371,9 +373,7 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
 
   // Check CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR (Directory for CSV performance reports)
   const char* performance_write_dir_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR");
-  if (performance_write_dir_str) {
-    handle->performance_report_write_dir = std::string(performance_write_dir_str);
-  }
+  if (performance_write_dir_str) { handle->performance_report_write_dir = std::string(performance_write_dir_str); }
 }
 
 #ifdef ENABLE_NVSHMEM
@@ -778,8 +778,12 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
       for (auto& sample : collection.samples) {
        CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event));
        CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event));
-        for (auto& event : sample.alltoall_start_events) { CHECK_CUDA(cudaEventDestroy(event)); }
-        for (auto& event : sample.alltoall_end_events) { CHECK_CUDA(cudaEventDestroy(event)); }
+        for (auto& event : sample.alltoall_start_events) {
+          CHECK_CUDA(cudaEventDestroy(event));
+        }
+        for (auto& event : sample.alltoall_end_events) {
+          CHECK_CUDA(cudaEventDestroy(event));
+        }
       }
     }
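
Destroying the per-sample event pools mirrors their creation; a reduced sketch of that lifecycle, with error checking omitted and an arbitrary pool size standing in for the real per-stage count:

    #include <vector>
    #include <cuda_runtime.h>

    int main() {
      // One start/end event pair per pipelined alltoall stage.
      std::vector<cudaEvent_t> starts(8), ends(8);
      for (size_t i = 0; i < starts.size(); ++i) {
        cudaEventCreate(&starts[i]);
        cudaEventCreate(&ends[i]);
      }

      // ... record starts[i]/ends[i] around each communication stage ...

      // Cleanup follows the same loop shape as cudecompGridDescDestroy.
      for (auto& event : starts) { cudaEventDestroy(event); }
      for (auto& event : ends) { cudaEventDestroy(event); }
      return 0;
    }
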
diff --git a/src/performance.cc b/src/performance.cc
index 08f3a06..1910e9d 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -29,15 +29,15 @@
  */
 
 #include
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
 #include
 #include
 #include
 #include
 #include
 #include
-#include <filesystem>
-#include <fstream>
-#include <iomanip>
 #include
 
@@ -48,35 +48,38 @@
 namespace cudecomp {
 
 // Helper function to create file name with grid descriptor information
-std::filesystem::path createPerformanceReportFileName(const std::string& write_dir,
-                                                      const std::string& table_type,
+std::filesystem::path createPerformanceReportFileName(const std::string& write_dir, const std::string& table_type,
                                                       const cudecompGridDesc_t grid_desc) {
   std::ostringstream filename;
   filename << "cudecomp-perf-report-" << table_type << "-";
   filename << "tcomm_" << grid_desc->config.transpose_comm_backend << "-";
   filename << "hcomm_" << grid_desc->config.halo_comm_backend << "-";
   filename << "pdims_" << grid_desc->config.pdims[0] << "x" << grid_desc->config.pdims[1] << "-";
-  filename << "gdims_" << grid_desc->config.gdims[0] << "x" << grid_desc->config.gdims[1] << "x" << grid_desc->config.gdims[2] << "-";
+  filename << "gdims_" << grid_desc->config.gdims[0] << "x" << grid_desc->config.gdims[1] << "x"
+           << grid_desc->config.gdims[2] << "-";
   filename << "memorder_";
   for (int axis = 0; axis < 3; ++axis) {
-    filename << grid_desc->config.transpose_mem_order[axis][0] << grid_desc->config.transpose_mem_order[axis][1] << grid_desc->config.transpose_mem_order[axis][2];
+    filename << grid_desc->config.transpose_mem_order[axis][0] << grid_desc->config.transpose_mem_order[axis][1]
+             << grid_desc->config.transpose_mem_order[axis][2];
   }
   filename << ".csv";
-  
+
   return std::filesystem::path(write_dir) / filename.str();
 }
 
 // Helper function to write CSV header comment with grid configuration information
 void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) {
-  file << "# Transpose backend: " << cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend) << "\n";
+  file << "# Transpose backend: " << cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)
+       << "\n";
   file << "# Halo backend: " << cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend) << "\n";
   file << "# Process grid: [" << grid_desc->config.pdims[0] << ", " << grid_desc->config.pdims[1] << "]\n";
-  file << "# Global dimensions: [" << grid_desc->config.gdims[0] << ", " << grid_desc->config.gdims[1] << ", " << grid_desc->config.gdims[2] << "]\n";
+  file << "# Global dimensions: [" << grid_desc->config.gdims[0] << ", " << grid_desc->config.gdims[1] << ", "
+       << grid_desc->config.gdims[2] << "]\n";
   file << "# Memory order: ";
   for (int axis = 0; axis < 3; ++axis) {
     file << "[" << grid_desc->config.transpose_mem_order[axis][0] << ","
-         << grid_desc->config.transpose_mem_order[axis][1] << ","
-         << grid_desc->config.transpose_mem_order[axis][2] << "]";
+         << grid_desc->config.transpose_mem_order[axis][1] << "," << grid_desc->config.transpose_mem_order[axis][2]
+         << "]";
    if (axis < 2) file << "; ";
   }
   file << "\n#\n";
 }
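
The report files are assembled with std::ostringstream and joined to the output directory with std::filesystem::path, as above. A compact, standalone version of the same pattern, with illustrative paths and values:

    #include <filesystem>
    #include <fstream>
    #include <iomanip>
    #include <sstream>

    int main() {
      std::ostringstream filename;
      filename << "cudecomp-perf-report-" << "transpose-aggregated" << ".csv";
      // operator/ handles the separator between directory and file name.
      std::filesystem::path out = std::filesystem::path("/tmp") / filename.str();

      std::ofstream file(out);
      file << "# Process grid: [2, 4]\n"; // comment header, as writeCSVHeader emits
      file << "operation,total_ms\n";     // column header row
      file << "TransposeXY," << std::fixed << std::setprecision(3) << 1.234f << "\n";
      return 0;
    }
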
@@ -84,8 +87,7 @@ void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) {
 
 // Write transpose performance table to CSV
 void writeTransposePerformanceTableCSV(const std::vector<TransposeConfigTimingData>& all_transpose_config_data,
-                                       const cudecompGridDesc_t grid_desc,
-                                       const std::string& write_dir) {
+                                       const cudecompGridDesc_t grid_desc, const std::string& write_dir) {
   if (all_transpose_config_data.empty()) return;
 
   std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose-aggregated", grid_desc);
@@ -98,25 +100,21 @@ void writeTransposePerformanceTableCSV(const std::vector<TransposeConfigTimingDa
   for (const auto& config_data : all_transpose_config_data) {
     const auto& stats = config_data.stats;
     if (stats.samples > 0) {
-      file << stats.operation << ","
-           << stats.datatype << ","
+      file << stats.operation << "," << stats.datatype << ","
           << "\"" << stats.input_halos << "\","
           << "\"" << stats.output_halos << "\","
           << "\"" << stats.input_padding << "\","
-          << "\"" << stats.output_padding << "\","
-          << stats.inplace << ","
-          << stats.managed << ","
-          << stats.samples << ","
-          << std::fixed << std::setprecision(3) << stats.total_time_avg << ","
-          << std::fixed << std::setprecision(3) << stats.alltoall_time_avg << ","
-          << std::fixed << std::setprecision(3) << stats.local_time_avg << ","
-          << std::fixed << std::setprecision(3) << stats.alltoall_bw_avg << "\n";
+          << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << stats.samples
+          << "," << std::fixed << std::setprecision(3) << stats.total_time_avg << "," << std::fixed
+          << std::setprecision(3) << stats.alltoall_time_avg << "," << std::fixed << std::setprecision(3)
+          << stats.local_time_avg << "," << std::fixed << std::setprecision(3) << stats.alltoall_bw_avg << "\n";
     }
   }
 
@@ -127,8 +125,7 @@ void writeTransposePerformanceTableCSV(const std::vector<TransposeConfigTimingDa
 
 // Write halo performance table to CSV
 void writeHaloPerformanceTableCSV(const std::vector<HaloConfigTimingData>& all_halo_config_data,
-                                  const cudecompGridDesc_t grid_desc,
-                                  const std::string& write_dir) {
+                                  const cudecompGridDesc_t grid_desc, const std::string& write_dir) {
   if (all_halo_config_data.empty()) return;
 
   std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-aggregated", grid_desc);
@@ -147,17 +144,12 @@ void writeHaloPerformanceTableCSV(const std::vector<HaloConfigTimingData>& all_h
   for (const auto& config_data : all_halo_config_data) {
     const auto& stats = config_data.stats;
     if (stats.samples > 0) {
-      file << stats.operation << ","
-           << stats.datatype << ","
-           << stats.dim << ","
+      file << stats.operation << "," << stats.datatype << "," << stats.dim << ","
           << "\"" << stats.halos << "\","
           << "\"" << stats.periods << "\","
-          << "\"" << stats.padding << "\","
-          << stats.managed << ","
-          << stats.samples << ","
-          << std::fixed << std::setprecision(3) << stats.total_time_avg << ","
-          << std::fixed << std::setprecision(3) << stats.sendrecv_time_avg << ","
-          << std::fixed << std::setprecision(3) << stats.local_time_avg << ","
+          << "\"" << stats.padding << "\"," << stats.managed << "," << stats.samples << "," << std::fixed
+          << std::setprecision(3) << stats.total_time_avg << "," << std::fixed << std::setprecision(3)
+          << stats.sendrecv_time_avg << "," << std::fixed << std::setprecision(3) << stats.local_time_avg << ","
           << std::fixed << std::setprecision(3) << stats.sendrecv_bw_avg << "\n";
     }
   }
 
@@ -171,8 +163,7 @@ void writeHaloPerformanceTableCSV(const std::vector<HaloConfigTimingData>& all_h
 cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output,
                                                  const int32_t input_halo_extents_ptr[],
                                                  const int32_t output_halo_extents_ptr[],
-                                                 const int32_t input_padding_ptr[],
-                                                 const int32_t output_padding_ptr[],
+                                                 const int32_t input_padding_ptr[], const int32_t output_padding_ptr[],
                                                  cudecompDataType_t datatype) {
   std::array<int32_t, 3> input_halo_extents{0, 0, 0};
   std::array<int32_t, 3> output_halo_extents{0, 0, 0};
@@ -185,39 +176,27 @@ cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, v
   if (output_halo_extents_ptr) {
     std::copy(output_halo_extents_ptr, output_halo_extents_ptr + 3, output_halo_extents.begin());
   }
-  if (input_padding_ptr) {
-    std::copy(input_padding_ptr, input_padding_ptr + 3, input_padding.begin());
-  }
-  if (output_padding_ptr) {
-    std::copy(output_padding_ptr, output_padding_ptr + 3, output_padding.begin());
-  }
+  if (input_padding_ptr) { std::copy(input_padding_ptr, input_padding_ptr + 3, input_padding.begin()); }
+  if (output_padding_ptr) { std::copy(output_padding_ptr, output_padding_ptr + 3, output_padding.begin()); }
 
   bool inplace = (input == output);
   bool managed_memory = isManagedPointer(input) || isManagedPointer(output);
 
-  return std::make_tuple(ax, dir, input_halo_extents, output_halo_extents,
-                         input_padding, output_padding, inplace, managed_memory, datatype);
+  return std::make_tuple(ax, dir, input_halo_extents, output_halo_extents, input_padding, output_padding, inplace,
+                         managed_memory, datatype);
 }
 
 // Helper function to create halo configuration key
-cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input,
-                                       const int32_t halo_extents_ptr[],
-                                       const bool halo_periods_ptr[],
-                                       const int32_t padding_ptr[],
+cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, const int32_t halo_extents_ptr[],
+                                       const bool halo_periods_ptr[], const int32_t padding_ptr[],
                                        cudecompDataType_t datatype) {
   std::array<int32_t, 3> halo_extents{0, 0, 0};
   std::array<bool, 3> halo_periods{false, false, false};
   std::array<int32_t, 3> padding{0, 0, 0};
 
-  if (halo_extents_ptr) {
-    std::copy(halo_extents_ptr, halo_extents_ptr + 3, halo_extents.begin());
-  }
-  if (halo_periods_ptr) {
-    std::copy(halo_periods_ptr, halo_periods_ptr + 3, halo_periods.begin());
-  }
-  if (padding_ptr) {
-    std::copy(padding_ptr, padding_ptr + 3, padding.begin());
-  }
+  if (halo_extents_ptr) { std::copy(halo_extents_ptr, halo_extents_ptr + 3, halo_extents.begin()); }
+  if (halo_periods_ptr) { std::copy(halo_periods_ptr, halo_periods_ptr + 3, halo_periods.begin()); }
+  if (padding_ptr) { std::copy(padding_ptr, padding_ptr + 3, padding.begin()); }
 
   bool managed_memory = isManagedPointer(input);
@@ -225,9 +204,9 @@ cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input,
 }
 
 // Helper function to get or create transpose performance sample collection for a configuration
-cudecompTransposePerformanceSampleCollection& getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle,
-                                                                                     cudecompGridDesc_t grid_desc,
-                                                                                     const cudecompTransposeConfigKey& config) {
+cudecompTransposePerformanceSampleCollection&
+getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config) {
   auto& samples_map = grid_desc->transpose_perf_samples_map;
 
   if (samples_map.find(config) == samples_map.end()) {
@@ -332,28 +311,23 @@ std::string getHaloOperationName(const cudecompHaloConfigKey& config) {
 // Helper function to convert datatype to string
 std::string getDatatypeString(cudecompDataType_t datatype) {
   switch (datatype) {
-    case CUDECOMP_FLOAT: return "S";
-    case CUDECOMP_DOUBLE: return "D";
-    case CUDECOMP_FLOAT_COMPLEX: return "C";
-    case CUDECOMP_DOUBLE_COMPLEX: return "Z";
-    default: return "unknown";
+  case CUDECOMP_FLOAT: return "S";
+  case CUDECOMP_DOUBLE: return "D";
+  case CUDECOMP_FLOAT_COMPLEX: return "C";
+  case CUDECOMP_DOUBLE_COMPLEX: return "Z";
+  default: return "unknown";
   }
 }
 
 // Comparison function to order transpose configurations.
 // Ordering is (operation, datatype, halos, padding, inplace, managed).
-bool compareTransposeConfigData(const TransposeConfigTimingData& a,
-                                const TransposeConfigTimingData& b) {
-  static const std::map<std::string, int> op_priority = {
-    {"TransposeXY", 0},
-    {"TransposeYZ", 1},
-    {"TransposeZY", 2},
-    {"TransposeYX", 3}
-  };
-
-  static const std::map<std::string, int> dtype_priority = {
-    {"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}
-  };
+bool compareTransposeConfigData(const TransposeConfigTimingData& a, const TransposeConfigTimingData& b) {
+  static const std::map<std::string, int> op_priority = {{"TransposeXY", 0},
+                                                         {"TransposeYZ", 1},
+                                                         {"TransposeZY", 2},
+                                                         {"TransposeYX", 3}};
+
+  static const std::map<std::string, int> dtype_priority = {{"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}};
 
   if (a.stats.operation != b.stats.operation) {
     return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation);
@@ -361,56 +335,31 @@ bool compareTransposeConfigData(const TransposeConfigTimingData& a,
   if (a.stats.datatype != b.stats.datatype) {
     return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype);
   }
-  if (a.stats.input_halos != b.stats.input_halos) {
-    return a.stats.input_halos < b.stats.input_halos;
-  }
-  if (a.stats.output_halos != b.stats.output_halos) {
-    return a.stats.output_halos < b.stats.output_halos;
-  }
-  if (a.stats.input_padding != b.stats.input_padding) {
-    return a.stats.input_padding < b.stats.input_padding;
-  }
-  if (a.stats.output_padding != b.stats.output_padding) {
-    return a.stats.output_padding < b.stats.output_padding;
-  }
-  if (a.stats.inplace != b.stats.inplace) {
-    return a.stats.inplace < b.stats.inplace;
-  }
+  if (a.stats.input_halos != b.stats.input_halos) { return a.stats.input_halos < b.stats.input_halos; }
+  if (a.stats.output_halos != b.stats.output_halos) { return a.stats.output_halos < b.stats.output_halos; }
+  if (a.stats.input_padding != b.stats.input_padding) { return a.stats.input_padding < b.stats.input_padding; }
+  if (a.stats.output_padding != b.stats.output_padding) { return a.stats.output_padding < b.stats.output_padding; }
+  if (a.stats.inplace != b.stats.inplace) { return a.stats.inplace < b.stats.inplace; }
 
   return a.stats.managed < b.stats.managed;
 }
 
 // Comparison function to order halo configurations.
 // Ordering is (operation, datatype, dim, halos, periods, padding, managed).
-bool compareHaloConfigData(const HaloConfigTimingData& a,
-                           const HaloConfigTimingData& b) {
-  static const std::map<std::string, int> op_priority = {
-    {"HaloX", 0},
-    {"HaloY", 1},
-    {"HaloZ", 2}
-  };
-
-  static const std::map<std::string, int> dtype_priority = {
-    {"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}
-  };
+bool compareHaloConfigData(const HaloConfigTimingData& a, const HaloConfigTimingData& b) {
+  static const std::map<std::string, int> op_priority = {{"HaloX", 0}, {"HaloY", 1}, {"HaloZ", 2}};
+
+  static const std::map<std::string, int> dtype_priority = {{"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}};
 
   if (a.stats.operation != b.stats.operation) {
     return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation);
   }
-  if (a.stats.dim != b.stats.dim) {
-    return a.stats.dim < b.stats.dim;
-  }
+  if (a.stats.dim != b.stats.dim) { return a.stats.dim < b.stats.dim; }
   if (a.stats.datatype != b.stats.datatype) {
     return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype);
   }
-  if (a.stats.halos != b.stats.halos) {
-    return a.stats.halos < b.stats.halos;
-  }
-  if (a.stats.periods != b.stats.periods) {
-    return a.stats.periods < b.stats.periods;
-  }
-  if (a.stats.padding != b.stats.padding) {
-    return a.stats.padding < b.stats.padding;
-  }
+  if (a.stats.halos != b.stats.halos) { return a.stats.halos < b.stats.halos; }
+  if (a.stats.periods != b.stats.periods) { return a.stats.periods < b.stats.periods; }
+  if (a.stats.padding != b.stats.padding) { return a.stats.padding < b.stats.padding; }
 
   return a.stats.managed < b.stats.managed;
 }
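
These comparators implement a strict weak ordering over (operation, datatype, ...) so that std::sort yields a deterministic report layout: each tie-breaker is consulted only when all earlier fields compare equal. A minimal standalone version of the idiom (Row and its fields are hypothetical):

    #include <algorithm>
    #include <vector>

    struct Row {
      int op_priority;
      int dtype_priority;
    };

    // Same shape as compareTransposeConfigData: compare by the first field,
    // and fall through to the next only on a tie.
    static bool compareRows(const Row& a, const Row& b) {
      if (a.op_priority != b.op_priority) { return a.op_priority < b.op_priority; }
      return a.dtype_priority < b.dtype_priority;
    }

    int main() {
      std::vector<Row> rows = {{3, 1}, {0, 2}, {0, 0}};
      std::sort(rows.begin(), rows.end(), compareRows); // {0,0}, {0,2}, {3,1}
      return 0;
    }
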
@@ -429,12 +378,10 @@ float computeGlobalAverage(const std::vector<float>& values, const cudecompHandl
 void gatherSampleData(const std::vector<float>& local_data, std::vector<float>& all_data,
                       const cudecompHandle_t handle) {
   int num_samples = local_data.size();
-  if (handle->rank == 0) {
-    all_data.resize(num_samples * handle->nranks);
-  }
+  if (handle->rank == 0) { all_data.resize(num_samples * handle->nranks); }
 
-  CHECK_MPI(MPI_Gather(local_data.data(), num_samples, MPI_FLOAT,
-                       all_data.data(), num_samples, MPI_FLOAT, 0, handle->mpi_comm));
+  CHECK_MPI(MPI_Gather(local_data.data(), num_samples, MPI_FLOAT, all_data.data(), num_samples, MPI_FLOAT, 0,
+                       handle->mpi_comm));
 }
 
 // Process transpose timing data from sample collections
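
gatherSampleData relies on every rank contributing an identical number of samples, which is what makes a plain MPI_Gather with matching send and receive counts valid here; a self-contained sketch:

    #include <mpi.h>
    #include <vector>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank, nranks;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &nranks);

      const int num_samples = 4;
      std::vector<float> local(num_samples, static_cast<float>(rank)); // fake timings
      std::vector<float> all;
      if (rank == 0) all.resize(num_samples * nranks); // recv buffer only matters on root

      // Rank 0 receives rank-major data: all[r * num_samples + s].
      MPI_Gather(local.data(), num_samples, MPI_FLOAT, all.data(), num_samples, MPI_FLOAT, 0, MPI_COMM_WORLD);

      MPI_Finalize();
      return 0;
    }

The rank-major layout of the gathered buffer is exactly the idx = r * num_samples + s indexing used by the per-sample printing below.
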
@@ -565,19 +512,16 @@ void printGridConfiguration(const cudecompHandle_t handle, const cudecompGridDes
   printf("CUDECOMP: Grid Configuration:\n");
   printf("CUDECOMP:\tTranspose backend: %s\n",
          cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend));
-  printf("CUDECOMP:\tHalo backend: %s\n",
-         cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend));
-  printf("CUDECOMP:\tProcess grid: [%d, %d]\n",
-         grid_desc->config.pdims[0], grid_desc->config.pdims[1]);
-  printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n",
-         grid_desc->config.gdims[0], grid_desc->config.gdims[1], grid_desc->config.gdims[2]);
+  printf("CUDECOMP:\tHalo backend: %s\n", cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend));
+  printf("CUDECOMP:\tProcess grid: [%d, %d]\n", grid_desc->config.pdims[0], grid_desc->config.pdims[1]);
+  printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", grid_desc->config.gdims[0], grid_desc->config.gdims[1],
+         grid_desc->config.gdims[2]);
 
   // Print memory ordering information
   printf("CUDECOMP:\tMemory order: ");
   for (int axis = 0; axis < 3; ++axis) {
-    printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0],
-           grid_desc->config.transpose_mem_order[axis][1],
-           grid_desc->config.transpose_mem_order[axis][2]);
+    printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], grid_desc->config.transpose_mem_order[axis][1],
+           grid_desc->config.transpose_mem_order[axis][2]);
     if (axis < 2) printf("; ");
   }
   printf("\n");
@@ -590,14 +534,13 @@ void printTransposePerformanceTable(const std::vector<TransposeConfigTimingData
   printf("CUDECOMP:\n");
 
   // Print compact table header
-  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-         "operation", "dtype", "halo extents", "padding", "inplace", "managed", "samples",
-         "total", "A2A", "local", "A2A BW");
-  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-         "", "", "", "", "", "", "",
-         "[ms]", "[ms]", "[ms]", "[GB/s]");
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", "operation", "dtype", "halo extents",
+         "padding", "inplace", "managed", "samples", "total", "A2A", "local", "A2A BW");
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", "", "", "", "", "", "", "", "[ms]",
+         "[ms]", "[ms]", "[GB/s]");
   printf("CUDECOMP: ");
-  for (int i = 0; i < 120; ++i) printf("-");
+  for (int i = 0; i < 120; ++i)
+    printf("-");
   printf("\n");
 
   // Print table rows
@@ -605,19 +548,9 @@ void printTransposePerformanceTable(const std::vector<TransposeConfigTimingData
     const auto& stats = config_data.stats;
     if (stats.samples > 0) {
       printf("CUDECOMP: %-12s %-6s %-7s/%-7s %-7s/%-7s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
-             stats.operation.c_str(),
-             stats.datatype.c_str(),
-             stats.input_halos.c_str(),
-             stats.output_halos.c_str(),
-             stats.input_padding.c_str(),
-             stats.output_padding.c_str(),
-             stats.inplace.c_str(),
-             stats.managed.c_str(),
-             stats.samples,
-             stats.total_time_avg,
-             stats.alltoall_time_avg,
-             stats.local_time_avg,
-             stats.alltoall_bw_avg);
+             stats.operation.c_str(), stats.datatype.c_str(), stats.input_halos.c_str(), stats.output_halos.c_str(),
+             stats.input_padding.c_str(), stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str(),
+             stats.samples, stats.total_time_avg, stats.alltoall_time_avg, stats.local_time_avg, stats.alltoall_bw_avg);
     }
   }
 }
@@ -629,14 +562,13 @@ void printHaloPerformanceTable(const std::vector<HaloConfigTimingData>& all_halo
   printf("CUDECOMP:\n");
 
   // Print compact table header
-  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-         "operation", "dtype", "dim", "halo extent", "periods", "padding", "managed", "samples",
-         "total", "SR", "local", "SR BW");
-  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n",
-         "", "", "", "", "", "", "", "",
+  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", "operation", "dtype", "dim",
+         "halo extent", "periods", "padding", "managed", "samples", "total", "SR", "local", "SR BW");
+  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", "", "", "", "", "", "", "", "",
          "[ms]", "[ms]", "[ms]", "[GB/s]");
   printf("CUDECOMP: ");
-  for (int i = 0; i < 125; ++i) printf("-");
+  for (int i = 0; i < 125; ++i)
+    printf("-");
   printf("\n");
 
   // Print table rows
@@ -644,26 +576,17 @@ void printHaloPerformanceTable(const std::vector<HaloConfigTimingData>& all_halo
     const auto& stats = config_data.stats;
     if (stats.samples > 0) {
       printf("CUDECOMP: %-12s %-6s %-5d %-12s %-12s %-12s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
-             stats.operation.c_str(),
-             stats.datatype.c_str(),
-             stats.dim,
-             stats.halos.c_str(),
-             stats.periods.c_str(),
-             stats.padding.c_str(),
-             stats.managed.c_str(),
-             stats.samples,
-             stats.total_time_avg,
-             stats.sendrecv_time_avg,
-             stats.local_time_avg,
-             stats.sendrecv_bw_avg);
+             stats.operation.c_str(), stats.datatype.c_str(), stats.dim, stats.halos.c_str(), stats.periods.c_str(),
+             stats.padding.c_str(), stats.managed.c_str(), stats.samples, stats.total_time_avg, stats.sendrecv_time_avg,
+             stats.local_time_avg, stats.sendrecv_bw_avg);
     }
   }
 }
 
 // Print per-sample transpose data for a single configuration
 void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& config_data,
-                                            const cudecompHandle_t handle, int detail_level,
-                                            std::ofstream* csv_file = nullptr) {
+                                             const cudecompHandle_t handle, int detail_level,
+                                             std::ofstream* csv_file = nullptr) {
   const auto& stats = config_data.stats;
   const auto& total_times = config_data.total_times;
   const auto& alltoall_times = config_data.alltoall_times;
@@ -695,44 +618,28 @@ void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& co
   }
 
   printf("CUDECOMP: %s (dtype=%s, halo extents=%s/%s, padding=%s/%s, inplace=%s, managed=%s) samples:\n",
-         stats.operation.c_str(),
-         stats.datatype.c_str(),
-         stats.input_halos.c_str(),
-         stats.output_halos.c_str(),
-         stats.input_padding.c_str(),
-         stats.output_padding.c_str(),
-         stats.inplace.c_str(),
-         stats.managed.c_str());
-
-  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n",
-         "rank", "sample", "total", "A2A", "local", "A2A BW");
-  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n",
-         "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
+         stats.operation.c_str(), stats.datatype.c_str(), stats.input_halos.c_str(), stats.output_halos.c_str(),
+         stats.input_padding.c_str(), stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str());
+
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "rank", "sample", "total", "A2A", "local", "A2A BW");
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
 
   for (int r = 0; r < (detail_level == 1 ? 1 : handle->nranks); ++r) {
     for (int s = 0; s < num_samples; ++s) {
       int idx = r * num_samples + s;
-      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n",
-             r, s, all_total_times[idx],
-             all_alltoall_times[idx], all_local_times[idx],
-             all_alltoall_bws[idx]);
-      
+      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", r, s, all_total_times[idx], all_alltoall_times[idx],
+             all_local_times[idx], all_alltoall_bws[idx]);
+
       // Write to CSV if file is provided
       if (csv_file && csv_file->is_open()) {
-        *csv_file << stats.operation << ","
-                  << stats.datatype << ","
+        *csv_file << stats.operation << "," << stats.datatype << ","
                   << "\"" << stats.input_halos << "\","
                   << "\"" << stats.output_halos << "\","
                   << "\"" << stats.input_padding << "\","
-                  << "\"" << stats.output_padding << "\","
-                  << stats.inplace << ","
-                  << stats.managed << ","
-                  << r << ","
-                  << s << ","
-                  << std::fixed << std::setprecision(3) << all_total_times[idx] << ","
-                  << std::fixed << std::setprecision(3) << all_alltoall_times[idx] << ","
-                  << std::fixed << std::setprecision(3) << all_local_times[idx] << ","
-                  << std::fixed << std::setprecision(3) << all_alltoall_bws[idx] << "\n";
+                  << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << r << ","
+                  << s << "," << std::fixed << std::setprecision(3) << all_total_times[idx] << "," << std::fixed
+                  << std::setprecision(3) << all_alltoall_times[idx] << "," << std::fixed << std::setprecision(3)
+                  << all_local_times[idx] << "," << std::fixed << std::setprecision(3) << all_alltoall_bws[idx] << "\n";
       }
     }
   }
@@ -740,9 +647,8 @@ void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& co
 }
 
 // Print per-sample halo data for a single configuration
-void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data,
-                                        const cudecompHandle_t handle, int detail_level,
-                                        std::ofstream* csv_file = nullptr) {
+void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data, const cudecompHandle_t handle,
+                                        int detail_level, std::ofstream* csv_file = nullptr) {
   const auto& stats = config_data.stats;
   const auto& total_times = config_data.total_times;
   const auto& sendrecv_times = config_data.sendrecv_times;
@@ -774,41 +680,26 @@ void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data,
   }
 
   printf("CUDECOMP: %s (dtype=%s, dim=%d, halos=%s, periods=%s, padding=%s, managed=%s) samples:\n",
-         stats.operation.c_str(),
-         stats.datatype.c_str(),
-         stats.dim,
-         stats.halos.c_str(),
-         stats.periods.c_str(),
-         stats.padding.c_str(),
-         stats.managed.c_str());
-
-  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n",
-         "rank", "sample", "total", "SR", "local", "SR BW");
-  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n",
-         "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
+         stats.operation.c_str(), stats.datatype.c_str(), stats.dim, stats.halos.c_str(), stats.periods.c_str(),
+         stats.padding.c_str(), stats.managed.c_str());
+
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "rank", "sample", "total", "SR", "local", "SR BW");
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
 
   for (int r = 0; r < (detail_level == 1 ? 1 : handle->nranks); ++r) {
     for (int s = 0; s < num_samples; ++s) {
       int idx = r * num_samples + s;
-      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n",
-             r, s, all_total_times[idx],
-             all_sendrecv_times[idx], all_local_times[idx],
-             all_sendrecv_bws[idx]);
-      
+      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", r, s, all_total_times[idx], all_sendrecv_times[idx],
+             all_local_times[idx], all_sendrecv_bws[idx]);
+
       // Write to CSV if file is provided
      if (csv_file && csv_file->is_open()) {
-        *csv_file << stats.operation << ","
-                  << stats.datatype << ","
-                  << stats.dim << ","
+        *csv_file << stats.operation << "," << stats.datatype << "," << stats.dim << ","
                   << "\"" << stats.halos << "\","
                   << "\"" << stats.periods << "\","
-                  << "\"" << stats.padding << "\","
-                  << stats.managed << ","
-                  << r << ","
-                  << s << ","
-                  << std::fixed << std::setprecision(3) << all_total_times[idx] << ","
-                  << std::fixed << std::setprecision(3) << all_sendrecv_times[idx] << ","
-                  << std::fixed << std::setprecision(3) << all_local_times[idx] << ","
+                  << "\"" << stats.padding << "\"," << stats.managed << "," << r << "," << s << "," << std::fixed
+                  << std::setprecision(3) << all_total_times[idx] << "," << std::fixed << std::setprecision(3)
+                  << all_sendrecv_times[idx] << "," << std::fixed << std::setprecision(3) << all_local_times[idx] << ","
                   << std::fixed << std::setprecision(3) << all_sendrecv_bws[idx] << "\n";
       }
     }
   }
 }
@@ -818,11 +709,11 @@ void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data,
 
 // Print per-sample details for transpose configurations
 void printTransposePerSampleDetails(const std::vector<TransposeConfigTimingData>& all_transpose_config_data,
-                                    const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
-                                    const std::string& write_dir = "") {
+                                   const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
+                                   const std::string& write_dir = "") {
   std::ofstream csv_file;
   bool csv_enabled = !write_dir.empty();
-  
+
   // Check if there are any samples before creating CSV file
   bool has_samples = false;
   for (const auto& config_data : all_transpose_config_data) {
@@ -831,13 +722,14 @@ void printTransposePerSampleDetails(const std::vector<TransposeConfigTimingData
       break;
    }
   }
-  
+
   if (csv_enabled && handle->rank == 0 && has_samples) {
     std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc);
     csv_file.open(filename);
     if (csv_file.is_open()) {
       writeCSVHeader(csv_file, grid_desc);
-      csv_file << "operation,dtype,input_halo_extents,output_halo_extents,input_padding,output_padding,inplace,managed,rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n";
+      csv_file << "operation,dtype,input_halo_extents,output_halo_extents,input_padding,output_padding,inplace,managed,"
+                  "rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n";
     } else {
       printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str());
     }
@@ -847,13 +739,14 @@ void printTransposePerSampleDetails(const std::vector<TransposeConfigTimingData
     if (config_data.stats.samples == 0) continue;
 
     printTransposePerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail,
-                                             csv_enabled && has_samples ? &csv_file : nullptr);
+                                            csv_enabled && has_samples ? &csv_file : nullptr);
   }
-  
+
   if (csv_file.is_open()) {
     csv_file.close();
     printf("CUDECOMP:\n");
-    printf("CUDECOMP: Wrote per-sample transpose data to %s\n", createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).string().c_str());
+    printf("CUDECOMP: Wrote per-sample transpose data to %s\n",
+           createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).string().c_str());
   }
 }
 
@@ -863,7 +756,7 @@ void printHaloPerSampleDetails(const std::vector<HaloConfigTimingData>& all_halo
                                const std::string& write_dir = "") {
   std::ofstream csv_file;
   bool csv_enabled = !write_dir.empty();
-  
+
   // Check if there are any samples before creating CSV file
   bool has_samples = false;
   for (const auto& config_data : all_halo_config_data) {
@@ -872,13 +765,14 @@ void printHaloPerSampleDetails(const std::vector<HaloConfigTimingData>& all_halo
      break;
    }
   }
-  
+
   if (csv_enabled && handle->rank == 0 && has_samples) {
     std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-samples", grid_desc);
     csv_file.open(filename);
     if (csv_file.is_open()) {
       writeCSVHeader(csv_file, grid_desc);
-      csv_file << "operation,dtype,dim,halo_extent,periods,padding,managed,rank,sample,total_ms,SR_ms,local_ms,SR_BW_GBps\n";
+      csv_file
+          << "operation,dtype,dim,halo_extent,periods,padding,managed,rank,sample,total_ms,SR_ms,local_ms,SR_BW_GBps\n";
     } else {
       printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str());
     }
@@ -888,13 +782,14 @@ void printHaloPerSampleDetails(const std::vector<HaloConfigTimingData>& all_halo
     if (config_data.stats.samples == 0) continue;
 
     printHaloPerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail,
-                                        csv_enabled && has_samples ? &csv_file : nullptr);
+                                       csv_enabled && has_samples ? &csv_file : nullptr);
   }
-  
+
   if (csv_file.is_open()) {
     csv_file.close();
     printf("CUDECOMP:\n");
-    printf("CUDECOMP: Wrote per-sample halo data to %s\n", createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).string().c_str());
+    printf("CUDECOMP: Wrote per-sample halo data to %s\n",
+           createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).string().c_str());
   }
 }
@@ -911,9 +806,7 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes
     const auto& collection = entry.second;
 
     TransposeConfigTimingData config_data = processTransposeConfig(config, collection, handle);
-    if (config_data.stats.samples > 0) {
-      all_transpose_config_data.emplace_back(std::move(config_data));
-    }
+    if (config_data.stats.samples > 0) { all_transpose_config_data.emplace_back(std::move(config_data)); }
   }
 
   // Sort transpose configuration data for consistent ordering
@@ -928,9 +821,7 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes
     const auto& collection = entry.second;
 
     HaloConfigTimingData config_data = processHaloConfig(config, collection, handle);
-    if (config_data.stats.samples > 0) {
-      all_halo_config_data.emplace_back(std::move(config_data));
-    }
+    if (config_data.stats.samples > 0) { all_halo_config_data.emplace_back(std::move(config_data)); }
   }
 
   // Sort halo configuration data for consistent ordering
@@ -954,21 +845,17 @@ void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDes
   // Print transpose performance data
   if (!all_transpose_config_data.empty()) {
     printTransposePerformanceTable(all_transpose_config_data);
-    
+
     // Write transpose performance data to CSV if enabled
-    if (csv_enabled) {
-      writeTransposePerformanceTableCSV(all_transpose_config_data, grid_desc, write_dir);
-    }
+    if (csv_enabled) { writeTransposePerformanceTableCSV(all_transpose_config_data, grid_desc, write_dir); }
   }
 
   // Print halo performance data
  if (!all_halo_config_data.empty()) {
     printHaloPerformanceTable(all_halo_config_data);
-    
+
    // Write halo performance data to CSV if enabled
-    if (csv_enabled) {
-      writeHaloPerformanceTableCSV(all_halo_config_data, grid_desc, write_dir);
-    }
+    if (csv_enabled) { writeHaloPerformanceTableCSV(all_halo_config_data, grid_desc, write_dir); }
  }
 }