1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -163,6 +163,7 @@ target_sources(cudecomp
${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc
${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc
${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc
${CMAKE_CURRENT_SOURCE_DIR}/src/performance.cc
)

set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu PROPERTIES COMPILE_FLAGS -rdc=true)
58 changes: 58 additions & 0 deletions docs/env_vars.rst
@@ -32,3 +32,61 @@ CUDECOMP_ENABLE_CUDA_GRAPHS
and communication overlap of packing kernels in large scale cases.

Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.

CUDECOMP_ENABLE_PERFORMANCE_REPORT
------------------------------------
(since v0.5.1)

:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` controls whether cuDecomp performance reporting is enabled.

Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.

CUDECOMP_PERFORMANCE_REPORT_DETAIL
----------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` controls the verbosity of performance reporting when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. This setting determines whether individual sample data is printed in addition to the aggregated performance summary.

The following values are supported:

- :code:`0`: Aggregated report only - prints only the summary table with averaged performance statistics (default)
- :code:`1`: Per-sample reporting on rank 0 - prints individual sample data for each transpose/halo configuration, but only from rank 0
- :code:`2`: Per-sample reporting on all ranks - prints individual sample data for each transpose/halo configuration from all ranks, gathered and sorted by rank on rank 0

Default setting is :code:`0`.

CUDECOMP_PERFORMANCE_REPORT_SAMPLES
-----------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_SAMPLES` controls the number of performance samples to keep for the final performance report. This setting determines the size of the circular buffer used to store timing measurements for each transpose/halo configuration.

Default setting is :code:`20` samples.

CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES
------------------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose/halo configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics.

Default setting is :code:`3` warmup samples. Setting this variable to :code:`0` disables warmup sample filtering.

CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR
-------------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR` controls the directory where CSV performance reports are written when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. When this variable is set, cuDecomp will write performance data to CSV files in the specified directory.

CSV files are created with descriptive names encoding the grid configuration, for example:
:code:`cudecomp-perf-report-transpose-aggregated-tcomm_1-hcomm_1-pdims_2x2-gdims_256x256x256-memorder_012012012.csv`

The following CSV files are generated:

- Aggregated transpose performance data
- Aggregated halo performance data
- Per-sample transpose data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0)
- Per-sample halo data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0)

Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format.

Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output.
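As a quick illustration of how these variables fit together, here is a minimal sketch that drives them from application code instead of the shell. The program skeleton and the `perf-reports` directory name are illustrative assumptions; only the variable names and the `cudecompInit`/`cudecompFinalize` entry points come from the cuDecomp API.

```cpp
// Minimal sketch (assumptions noted inline): the variables are read during
// cudecompInit, so they must be set before that call. Error checking is
// omitted for brevity.
#include <cstdlib>
#include <mpi.h>
#include <cudecomp.h>

int main(int argc, char** argv) {
  setenv("CUDECOMP_ENABLE_PERFORMANCE_REPORT", "1", 1);
  setenv("CUDECOMP_PERFORMANCE_REPORT_DETAIL", "1", 1);         // per-sample data on rank 0
  setenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES", "50", 1);       // 50-sample circular buffer
  setenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES", "5", 1); // skip first 5 samples
  setenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR", "perf-reports", 1); // hypothetical path

  MPI_Init(&argc, &argv);
  cudecompHandle_t handle;
  cudecompInit(&handle, MPI_COMM_WORLD);
  // ... create grid descriptors, run transposes and halo exchanges ...
  cudecompFinalize(handle); // report output is produced during teardown (assumption)
  MPI_Finalize();
  return 0;
}
```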
69 changes: 54 additions & 15 deletions include/internal/comm_routines.h
@@ -150,14 +150,19 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
#endif

template <typename T>
static void
cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts, const std::vector<comm_count_t>& send_offsets,
T* recv_buff, const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets, const std::vector<comm_count_t>& recv_offsets_nvshmem,
cudecompCommAxis comm_axis, cudaStream_t stream) {
static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts,
const std::vector<comm_count_t>& send_offsets, T* recv_buff,
const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets,
const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
cudaStream_t stream, cudecompTransposePerformanceSample* current_sample = nullptr) {
nvtx::rangePush("cudecompAlltoall");

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], stream));
}

#ifdef ENABLE_NVSHMEM
if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued &&
transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) &&
@@ -269,18 +274,28 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
break;
}
}

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], stream));
current_sample->alltoall_timing_count++;
}

nvtx::rangePop();
}
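Worth noting for readers of this hunk: each collective call (or pipelined stage, below) contributes one start/end event pair, indexed by `alltoall_timing_count`. A hedged sketch of how such pairs reduce to a per-sample time; the helper name is hypothetical, and the real accounting lives in `src/performance.cc`:

```cpp
// Hypothetical helper, not the shipped src/performance.cc code: sums GPU time
// across the alltoall event pairs recorded for one transpose sample.
// cudaEventElapsedTime requires both events to have completed, so callers must
// synchronize first (e.g. cudaEventSynchronize on the last end event).
static float alltoallElapsedMs(const cudecompTransposePerformanceSample& sample) {
  float total_ms = 0.0f;
  for (int32_t i = 0; i < sample.alltoall_timing_count; ++i) {
    float ms = 0.0f;
    CHECK_CUDA(cudaEventElapsedTime(&ms, sample.alltoall_start_events[i],
                                    sample.alltoall_end_events[i]));
    total_ms += ms;
  }
  return total_ms;
}
```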

template <typename T>
static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts,
const std::vector<comm_count_t>& send_offsets, T* recv_buff,
const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets,
const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
const std::vector<int>& src_ranks, const std::vector<int>& dst_ranks,
cudaStream_t stream, bool& synced) {
static void
cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts, const std::vector<comm_count_t>& send_offsets,
T* recv_buff, const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets,
const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
const std::vector<int>& src_ranks, const std::vector<int>& dst_ranks, cudaStream_t stream,
bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) {

// If there are no transfers to complete, quick return
if (send_counts.size() == 0 && recv_counts.size() == 0) { return; }

std::ostringstream os;
os << "cudecompAlltoallPipelined_";
for (int i = 0; i < src_ranks.size(); ++i) {
@@ -289,6 +304,14 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude
}
nvtx::rangePush(os.str());

int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank;
if (handle->performance_report_enable && current_sample && src_ranks[0] != self_rank) {
// Note: skipping self-copy for timing as it should be overlapped
CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0));
CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count],
handle->pl_stream));
}

#ifdef ENABLE_NVSHMEM
if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued &&
transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) &&
@@ -459,6 +482,12 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude
break;
}
}

if (handle->performance_report_enable && current_sample && src_ranks[0] != self_rank) {
CHECK_CUDA(
cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream));
current_sample->alltoall_timing_count++;
}
nvtx::rangePop();
}

@@ -468,9 +497,14 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
const std::array<comm_count_t, 2>& send_counts,
const std::array<size_t, 2>& send_offsets, T* recv_buff,
const std::array<comm_count_t, 2>& recv_counts,
const std::array<size_t, 2>& recv_offsets, cudaStream_t stream = 0) {
const std::array<size_t, 2>& recv_offsets, cudaStream_t stream = 0,
cudecompHaloPerformanceSample* current_sample = nullptr) {
nvtx::rangePush("cudecompSendRecvPair");

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_start_event, stream));
}

#ifdef ENABLE_NVSHMEM
if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued &&
haloBackendRequiresMpi(grid_desc->config.halo_comm_backend) &&
@@ -592,6 +626,11 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
break;
}
}

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_end_event, stream));
}

nvtx::rangePop();
}

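The halo path records a single `sendrecv_start_event`/`sendrecv_end_event` pair here, and `cudecompUpdateHalos_` (in `include/internal/halo.h` below) fills in `sendrecv_bytes`; together these suffice to derive an effective exchange bandwidth. An illustrative sketch under those assumptions, not the library's actual reporting code:

```cpp
// Illustrative only: effective halo exchange bandwidth in GB/s for one sample.
// Assumes both events have completed (e.g. after cudaEventSynchronize).
static double haloBandwidthGBs(const cudecompHaloPerformanceSample& sample) {
  float ms = 0.0f;
  CHECK_CUDA(cudaEventElapsedTime(&ms, sample.sendrecv_start_event,
                                  sample.sendrecv_end_event));
  if (ms <= 0.0f) { return 0.0; }
  return (static_cast<double>(sample.sendrecv_bytes) / 1e9) / (ms / 1e3);
}
```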
67 changes: 67 additions & 0 deletions include/internal/common.h
@@ -33,8 +33,10 @@

#include <array>
#include <complex>
#include <functional>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
@@ -43,6 +45,10 @@
#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>
#ifdef ENABLE_NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
#endif

#include "cudecomp.h"
#include "internal/checks.h"
@@ -108,6 +114,15 @@ struct cudecompHandle {

// CUDA graphs
bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used

// Performance reporting related entries
bool performance_report_enable = false; // flag to track if performance reporting is enabled
int32_t performance_report_detail =
0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks
int32_t performance_report_samples = 20; // number of performance samples to keep for final report
int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration
std::string performance_report_write_dir =
""; // directory to write CSV performance reports, empty means no file writing
};

// Structure with information about row/column communicator
@@ -124,6 +139,41 @@
#endif
};

// Structure containing data for a transpose performance sample
struct cudecompTransposePerformanceSample {
cudaEvent_t transpose_start_event;
cudaEvent_t transpose_end_event;
std::vector<cudaEvent_t> alltoall_start_events;
std::vector<cudaEvent_t> alltoall_end_events;
int32_t alltoall_timing_count = 0;
size_t alltoall_bytes = 0;
bool valid = false;
};

// Collection of transpose performance samples for a specific configuration
struct cudecompTransposePerformanceSampleCollection {
std::vector<cudecompTransposePerformanceSample> samples;
int32_t sample_idx = 0;
int32_t warmup_count = 0;
};

// Structure containing data for a halo performance sample
struct cudecompHaloPerformanceSample {
cudaEvent_t halo_start_event;
cudaEvent_t halo_end_event;
cudaEvent_t sendrecv_start_event;
cudaEvent_t sendrecv_end_event;
size_t sendrecv_bytes = 0;
bool valid = false;
};

// Collection of halo performance samples for a specific configuration
struct cudecompHaloPerformanceSampleCollection {
std::vector<cudecompHaloPerformanceSample> samples;
int32_t sample_idx = 0;
int32_t warmup_count = 0;
};

// cuDecomp grid descriptor containing grid-specific information
struct cudecompGridDesc {
cudecompGridDescConfig_t config; // configuration struct
@@ -144,6 +194,23 @@
cudecomp::ncclComm
nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle

// Performance reporting related entries
std::vector<cudaEvent_t> alltoall_start_events; // events for alltoall timing
std::vector<cudaEvent_t> alltoall_end_events; // events for alltoall timing
int32_t alltoall_timing_count = 0; // count of alltoall timing event pairs (for pipelined alltoall)
cudaEvent_t transpose_start_event; // event for transpose timing
cudaEvent_t transpose_end_event; // event for transpose timing

std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
cudecompTransposePerformanceSampleCollection>
transpose_perf_samples_map;

std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<bool, 3>, std::array<int32_t, 3>,
bool, cudecompDataType_t>,
cudecompHaloPerformanceSampleCollection>
halo_perf_samples_map;

bool initialized = false;
};

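One detail worth flagging in the two maps above: `std::unordered_map` provides no hash for `std::tuple` or `std::array` keys, so a custom hasher has to exist somewhere (the new `<functional>` include is consistent with that). A sketch of the standard hash-combine pattern such a functor would follow; cuDecomp's actual implementation may differ:

```cpp
#include <array>
#include <cstddef>
#include <functional>
#include <tuple>

// Boost-style combiner: folds one field's hash into a running seed.
template <typename T>
static void hashCombine(std::size_t& seed, const T& v) {
  seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// std::array fields have no std::hash specialization; combine element-wise.
template <typename T, std::size_t N>
static void hashCombine(std::size_t& seed, const std::array<T, N>& a) {
  for (const auto& v : a) { hashCombine(seed, v); }
}

// Hash functor usable as the third template argument of the sample maps.
struct PerfSampleKeyHash {
  template <typename... Ts>
  std::size_t operator()(const std::tuple<Ts...>& t) const {
    std::size_t seed = 0;
    std::apply([&seed](const auto&... fields) { (hashCombine(seed, fields), ...); }, t);
    return seed;
  }
};
```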
50 changes: 47 additions & 3 deletions include/internal/halo.h
@@ -41,6 +41,8 @@
#include "internal/comm_routines.h"
#include "internal/cudecomp_kernels.h"
#include "internal/nvtx.h"
#include "internal/performance.h"
#include "internal/utils.h"

namespace cudecomp {

@@ -73,6 +75,20 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
// Quick return if no halos
if (halo_extents[dim] == 0) { return; }

cudecompHaloPerformanceSample* current_sample = nullptr;
if (handle->performance_report_enable) {
auto& samples =
getOrCreateHaloPerformanceSamples(handle, grid_desc,
createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
padding.data(), getCudecompDataType<T>()));
current_sample = &samples.samples[samples.sample_idx];
current_sample->sendrecv_bytes = 0;
current_sample->valid = true;

// Record start event
CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream));
}

// Check if halos include more than one process (unsupported currently).
int count = 0;
for (int i = 0; i < 3; ++i) {
@@ -120,6 +136,13 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
c = 0;
} else if (neighbors[0] == -1 && neighbors[1] == -1) {
// Single rank in this dimension and not periodic. Return.
if (handle->performance_report_enable && current_sample) {
// Record end event and advance sample even for early return
CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream));
advanceHaloPerformanceSample(handle, grid_desc,
createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
padding.data(), getCudecompDataType<T>()));
}
return;
}

@@ -204,7 +227,14 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
std::array<size_t, 2> offsets{};
offsets[1] = halo_size;

cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream);
if (handle->performance_report_enable && current_sample) {
current_sample->sendrecv_bytes = 0;
for (int i = 0; i < 2; ++i) {
if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); }
}
}
cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream,
current_sample);

// Unpack
// Left
@@ -261,10 +291,24 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
lx[dim] = shape_g_h_p[dim] - halo_extents[dim];
recv_offsets[1] = getPencilPtrOffset(pinfo_h, lx);

cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets,
stream);
if (handle->performance_report_enable && current_sample) {
current_sample->sendrecv_bytes = 0;
for (int i = 0; i < 2; ++i) {
if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); }
}
}
cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, stream,
current_sample);
} break;
}

if (handle->performance_report_enable && current_sample) {
// Record end event
CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream));
advanceHaloPerformanceSample(handle, grid_desc,
createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
padding.data(), getCudecompDataType<T>()));
}
}

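Finally, the `getOrCreateHaloPerformanceSamples`/`advanceHaloPerformanceSample` helpers used in this file are declared in `internal/performance.h`, which this view does not show. A plausible sketch of the circular-buffer bookkeeping implied by the `sample_idx` and `warmup_count` fields in `common.h`; the shipped logic in `src/performance.cc` may differ:

```cpp
// Plausible sketch, not the shipped implementation: advance the circular
// sample buffer after a halo exchange has recorded its events.
static void advanceHaloSampleSketch(cudecompHaloPerformanceSampleCollection& coll,
                                    int32_t warmup_samples) {
  if (coll.warmup_count < warmup_samples) {
    // Warmup phase: drop the measurement and reuse the slot
    // (CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES sets the threshold).
    coll.samples[coll.sample_idx].valid = false;
    coll.warmup_count++;
    return;
  }
  // Steady state: step to the next slot, wrapping so the oldest sample is
  // overwritten once CUDECOMP_PERFORMANCE_REPORT_SAMPLES slots have filled.
  coll.sample_idx = (coll.sample_idx + 1) % static_cast<int32_t>(coll.samples.size());
}
```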