diff --git a/CMakeLists.txt b/CMakeLists.txt index 241ed51..e4b39a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ target_sources(cudecomp ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/src/performance.cc ) set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu PROPERTIES COMPILE_FLAGS -rdc=true) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index a7b607d..2123b57 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -32,3 +32,61 @@ CUDECOMP_ENABLE_CUDA_GRAPHS and communication overlap of packing kernels in large scale cases. Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. + +CUDECOMP_ENABLE_PERFORMANCE_REPORT +------------------------------------ +(since v0.5.1) + +:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` controls whether cuDecomp performance reporting is enabled. + +Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature. + +CUDECOMP_PERFORMANCE_REPORT_DETAIL +---------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` controls the verbosity of performance reporting when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. This setting determines whether individual sample data is printed in addition to the aggregated performance summary. + +The following values are supported: + +- :code:`0`: Aggregated report only - prints only the summary table with averaged performance statistics (default) +- :code:`1`: Per-sample reporting on rank 0 - prints individual sample data for each transpose/halo configuration, but only from rank 0 +- :code:`2`: Per-sample reporting on all ranks - prints individual sample data for each transpose/halo configuration from all ranks, gathered and sorted by rank on rank 0 + +Default setting is :code:`0`. + +CUDECOMP_PERFORMANCE_REPORT_SAMPLES +----------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_SAMPLES` controls the number of performance samples to keep for the final performance report. This setting determines the size of the circular buffer used to store timing measurements for each transpose/halo configuration. + +Default setting is :code:`20` samples. + +CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES +------------------------------------------ +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose/halo configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics. + +Default setting is :code:`3` warmup samples. Setting this to 0 disables warmup sample filtering. + +CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR +------------------------------------- +(since v0.5.1) + +:code:`CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR` controls the directory where CSV performance reports are written when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. When this variable is set, cuDecomp will write performance data to CSV files in the specified directory. 
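+
+For example, to enable performance reporting with per-sample output from rank 0
+and CSV files written to a chosen directory, settings like the following could be
+used (the directory path below is only illustrative):
+
+.. code-block:: bash
+
+   export CUDECOMP_ENABLE_PERFORMANCE_REPORT=1
+   export CUDECOMP_PERFORMANCE_REPORT_DETAIL=1
+   export CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR=./cudecomp-perf-reports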
+ +CSV files are created with descriptive names encoding the grid configuration, for example: +:code:`cudecomp-perf-report-transpose-aggregated-tcomm_1-hcomm_1-pdims_2x2-gdims_256x256x256-memorder_012012012.csv` + +The following CSV files are generated: + +- Aggregated transpose performance data +- Aggregated halo performance data +- Per-sample transpose data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0) +- Per-sample halo data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0) + +Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format. + +Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output. diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 789ba87..b14b0fe 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -150,14 +150,19 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ #endif template -static void -cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, - const std::vector& send_counts, const std::vector& send_offsets, - T* recv_buff, const std::vector& recv_counts, - const std::vector& recv_offsets, const std::vector& recv_offsets_nvshmem, - cudecompCommAxis comm_axis, cudaStream_t stream) { +static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, + const std::vector& send_counts, + const std::vector& send_offsets, T* recv_buff, + const std::vector& recv_counts, + const std::vector& recv_offsets, + const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, + cudaStream_t stream, cudecompTransposePerformanceSample* current_sample = nullptr) { nvtx::rangePush("cudecompAlltoall"); + if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) && @@ -269,18 +274,28 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ break; } } + + if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], stream)); + current_sample->alltoall_timing_count++; + } + nvtx::rangePop(); } template -static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, - const std::vector& send_counts, - const std::vector& send_offsets, T* recv_buff, - const std::vector& recv_counts, - const std::vector& recv_offsets, - const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, - const std::vector& src_ranks, const std::vector& dst_ranks, - cudaStream_t stream, bool& synced) { +static void +cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, + const std::vector& send_counts, const std::vector& send_offsets, + T* recv_buff, const std::vector& recv_counts, + const std::vector& recv_offsets, + const std::vector& recv_offsets_nvshmem, cudecompCommAxis comm_axis, + const std::vector& src_ranks, const std::vector& dst_ranks, cudaStream_t stream, + bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) { + + 
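+  // Performance reporting note: when enabled, the alltoall below is bracketed with CUDA
+  // events recorded on the pipeline stream; self-copy chunks are skipped for timing since
+  // they are expected to overlap with other work.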
// If there are no transfers to complete, quick return + if (send_counts.size() == 0 && recv_counts.size() == 0) { return; } + std::ostringstream os; os << "cudecompAlltoallPipelined_"; for (int i = 0; i < src_ranks.size(); ++i) { @@ -289,6 +304,14 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude } nvtx::rangePush(os.str()); + int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank; + if (handle->performance_report_enable && src_ranks[0] != self_rank) { + // Note: skipping self-copy for timing as it should be overlapped + CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0)); + CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], + handle->pl_stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) && @@ -459,6 +482,12 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude break; } } + + if (handle->performance_report_enable && src_ranks[0] != self_rank) { + CHECK_CUDA( + cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream)); + current_sample->alltoall_timing_count++; + } nvtx::rangePop(); } @@ -468,9 +497,14 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG const std::array& send_counts, const std::array& send_offsets, T* recv_buff, const std::array& recv_counts, - const std::array& recv_offsets, cudaStream_t stream = 0) { + const std::array& recv_offsets, cudaStream_t stream = 0, + cudecompHaloPerformanceSample* current_sample = nullptr) { nvtx::rangePush("cudecompSendRecvPair"); + if (handle->performance_report_enable && current_sample) { + CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_start_event, stream)); + } + #ifdef ENABLE_NVSHMEM if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued && haloBackendRequiresMpi(grid_desc->config.halo_comm_backend) && @@ -592,6 +626,11 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG break; } } + + if (handle->performance_report_enable && current_sample) { + CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_end_event, stream)); + } + nvtx::rangePop(); } diff --git a/include/internal/common.h b/include/internal/common.h index a299271..6738af5 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -33,8 +33,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -43,6 +45,10 @@ #include #include #include +#ifdef ENABLE_NVSHMEM +#include +#include +#endif #include "cudecomp.h" #include "internal/checks.h" @@ -108,6 +114,15 @@ struct cudecompHandle { // CUDA graphs bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used + + // Performance reporting related entries + bool performance_report_enable = false; // flag to track if performance reporting is enabled + int32_t performance_report_detail = + 0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks + int32_t performance_report_samples = 20; // number of performance samples to keep for final report + int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration + std::string 
performance_report_write_dir =
+      ""; // directory to write CSV performance reports, empty means no file writing
 };
 
 // Structure with information about row/column communicator
@@ -124,6 +139,41 @@ struct cudecompCommInfo {
 #endif
 };
 
+// Structure to contain data for transpose performance sample
+struct cudecompTransposePerformanceSample {
+  cudaEvent_t transpose_start_event;
+  cudaEvent_t transpose_end_event;
+  std::vector<cudaEvent_t> alltoall_start_events;
+  std::vector<cudaEvent_t> alltoall_end_events;
+  int32_t alltoall_timing_count = 0;
+  size_t alltoall_bytes = 0;
+  bool valid = false;
+};
+
+// Collection of transpose performance samples for a specific configuration
+struct cudecompTransposePerformanceSampleCollection {
+  std::vector<cudecompTransposePerformanceSample> samples;
+  int32_t sample_idx = 0;
+  int32_t warmup_count = 0;
+};
+
+// Structure to contain data for halo performance sample
+struct cudecompHaloPerformanceSample {
+  cudaEvent_t halo_start_event;
+  cudaEvent_t halo_end_event;
+  cudaEvent_t sendrecv_start_event;
+  cudaEvent_t sendrecv_end_event;
+  size_t sendrecv_bytes = 0;
+  bool valid = false;
+};
+
+// Collection of halo performance samples for a specific configuration
+struct cudecompHaloPerformanceSampleCollection {
+  std::vector<cudecompHaloPerformanceSample> samples;
+  int32_t sample_idx = 0;
+  int32_t warmup_count = 0;
+};
+
 // cuDecomp grid descriptor containing grid-specific information
 struct cudecompGridDesc {
   cudecompGridDescConfig_t config; // configuration struct
@@ -144,6 +194,23 @@ struct cudecompGridDesc {
   cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from
                                       // handle
 
+  // Performance reporting related entries
+  std::vector<cudaEvent_t> alltoall_start_events; // events for alltoall timing
+  std::vector<cudaEvent_t> alltoall_end_events;   // events for alltoall timing
+  int32_t alltoall_timing_count = 0; // count of alltoall timing event pairs (for pipelined alltoall)
+  cudaEvent_t transpose_start_event; // event for transpose timing
+  cudaEvent_t transpose_end_event;   // event for transpose timing
+
+  std::unordered_map<std::tuple<int, int, std::array<int32_t, 3>, std::array<int32_t, 3>,
+                                std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
+                     cudecompTransposePerformanceSampleCollection>
+      transpose_perf_samples_map;
+
+  std::unordered_map<std::tuple<int, int, std::array<int32_t, 3>, std::array<bool, 3>, std::array<int32_t, 3>,
+                                bool, cudecompDataType_t>,
+                     cudecompHaloPerformanceSampleCollection>
+      halo_perf_samples_map;
+
   bool initialized = false;
 };
 
diff --git a/include/internal/halo.h b/include/internal/halo.h
index 096292e..55a4a8f 100644
--- a/include/internal/halo.h
+++ b/include/internal/halo.h
@@ -41,6 +41,8 @@
 #include "internal/comm_routines.h"
 #include "internal/cudecomp_kernels.h"
 #include "internal/nvtx.h"
+#include "internal/performance.h"
+#include "internal/utils.h"
 
 namespace cudecomp {
 
@@ -73,6 +75,20 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
   // Quick return if no halos
   if (halo_extents[dim] == 0) { return; }
 
+  cudecompHaloPerformanceSample* current_sample = nullptr;
+  if (handle->performance_report_enable) {
+    auto& samples =
+        getOrCreateHaloPerformanceSamples(handle, grid_desc,
+                                          createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
+                                                           padding.data(), getCudecompDataType<T>()));
+    current_sample = &samples.samples[samples.sample_idx];
+    current_sample->sendrecv_bytes = 0;
+    current_sample->valid = true;
+
+    // Record start event
+    CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream));
+  }
+
   // Check if halos include more than one process (unsupported currently).
int count = 0; for (int i = 0; i < 3; ++i) { @@ -120,6 +136,13 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG c = 0; } else if (neighbors[0] == -1 && neighbors[1] == -1) { // Single rank in this dimension and not periodic. Return. + if (handle->performance_report_enable && current_sample) { + // Record end event and advance sample even for early return + CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream)); + advanceHaloPerformanceSample(handle, grid_desc, + createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), + padding.data(), getCudecompDataType())); + } return; } @@ -204,7 +227,14 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG std::array offsets{}; offsets[1] = halo_size; - cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream); + if (handle->performance_report_enable && current_sample) { + current_sample->sendrecv_bytes = 0; + for (int i = 0; i < 2; ++i) { + if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); } + } + } + cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream, + current_sample); // Unpack // Left @@ -261,10 +291,24 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG lx[dim] = shape_g_h_p[dim] - halo_extents[dim]; recv_offsets[1] = getPencilPtrOffset(pinfo_h, lx); - cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, - stream); + if (handle->performance_report_enable && current_sample) { + current_sample->sendrecv_bytes = 0; + for (int i = 0; i < 2; ++i) { + if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); } + } + } + cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, stream, + current_sample); } break; } + + if (handle->performance_report_enable && current_sample) { + // Record end event + CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream)); + advanceHaloPerformanceSample(handle, grid_desc, + createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(), + padding.data(), getCudecompDataType())); + } } template diff --git a/include/internal/performance.h b/include/internal/performance.h new file mode 100644 index 0000000..82f54e2 --- /dev/null +++ b/include/internal/performance.h @@ -0,0 +1,149 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CUDECOMP_PERFORMANCE_H
+#define CUDECOMP_PERFORMANCE_H
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "internal/common.h"
+
+namespace cudecomp {
+
+using cudecompTransposeConfigKey = std::tuple<int,                    // ax
+                                              int,                    // dir
+                                              std::array<int32_t, 3>, // input_halo_extents
+                                              std::array<int32_t, 3>, // output_halo_extents
+                                              std::array<int32_t, 3>, // input_padding
+                                              std::array<int32_t, 3>, // output_padding
+                                              bool,                   // inplace
+                                              bool,                   // managed_memory
+                                              cudecompDataType_t      // datatype
+                                              >;
+
+using cudecompHaloConfigKey = std::tuple<int,                    // ax
+                                         int,                    // dim
+                                         std::array<int32_t, 3>, // halo_extents
+                                         std::array<bool, 3>,    // halo_periods
+                                         std::array<int32_t, 3>, // padding
+                                         bool,                   // managed_memory
+                                         cudecompDataType_t      // datatype
+                                         >;
+
+// Helper structure for transpose statistics
+struct TransposePerformanceStats {
+  std::string operation;
+  std::string datatype;
+  std::string input_halos;
+  std::string output_halos;
+  std::string input_padding;
+  std::string output_padding;
+  std::string inplace;
+  std::string managed;
+  int samples;
+  float total_time_avg;
+  float alltoall_time_avg;
+  float local_time_avg;
+  float alltoall_bw_avg;
+};
+
+// Helper structure for halo statistics
+struct HaloPerformanceStats {
+  std::string operation;
+  std::string datatype;
+  int dim;
+  std::string halos;
+  std::string periods;
+  std::string padding;
+  std::string managed;
+  int samples;
+  float total_time_avg;
+  float sendrecv_time_avg;
+  float local_time_avg;
+  float sendrecv_bw_avg;
+};
+
+// Helper structure to hold pre-computed transpose timing data
+struct TransposeConfigTimingData {
+  TransposePerformanceStats stats;
+  std::vector<float> total_times;
+  std::vector<float> alltoall_times;
+  std::vector<float> local_times;
+  std::vector<float> alltoall_bws;
+  std::vector<int> sample_indices;
+};
+
+// Helper structure to hold pre-computed halo timing data
+struct HaloConfigTimingData {
+  HaloPerformanceStats stats;
+  std::vector<float> total_times;
+  std::vector<float> sendrecv_times;
+  std::vector<float> local_times;
+  std::vector<float> sendrecv_bws;
+  std::vector<int> sample_indices;
+};
+
+void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc);
+
+void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc);
+
+void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config);
+
+cudecompTransposePerformanceSampleCollection&
+getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config);
+
+void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                  const cudecompHaloConfigKey& config);
+
+cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const cudecompHandle_t handle,
+                                                                           cudecompGridDesc_t grid_desc,
+                                                                           const 
cudecompHaloConfigKey& config); + +// Helper function to create transpose configuration key +cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output, + const int32_t input_halo_extents_ptr[], + const int32_t output_halo_extents_ptr[], + const int32_t input_padding_ptr[], const int32_t output_padding_ptr[], + cudecompDataType_t datatype); + +// Helper function to create halo configuration key +cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, const int32_t halo_extents_ptr[], + const bool halo_periods_ptr[], const int32_t padding_ptr[], + cudecompDataType_t datatype); + +} // namespace cudecomp + +#endif // CUDECOMP_PERFORMANCE_H diff --git a/include/internal/transpose.h b/include/internal/transpose.h index fb47c76..afb6480 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -44,6 +44,8 @@ #include "internal/comm_routines.h" #include "internal/cudecomp_kernels.h" #include "internal/nvtx.h" +#include "internal/performance.h" +#include "internal/utils.h" namespace cudecomp { @@ -250,6 +252,22 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream)); } + cudecompTransposePerformanceSample* current_sample = nullptr; + if (handle->performance_report_enable) { + auto& samples = + getOrCreateTransposePerformanceSamples(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + current_sample = &samples.samples[samples.sample_idx]; + current_sample->alltoall_timing_count = 0; + current_sample->alltoall_bytes = pinfo_a.size * sizeof(T); + current_sample->valid = true; + + // Record start event + CHECK_CUDA(cudaEventRecord(current_sample->transpose_start_event, stream)); + } + // Adjust pointers to handle special cases bool direct_pack = false; bool direct_transpose = false; @@ -259,6 +277,14 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (inplace) { if (halos_padding_equal) { // Single rank, in place, Pack -> Unpack: No transpose necessary. + if (handle->performance_report_enable) { + // Record performance data + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + advanceTransposePerformanceSample(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + } return; } } else { @@ -333,6 +359,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } bool data_transposed = false; + if (o1 != i1) { if (pinfo_b.order[2] == ax_a && !orders_equal) { // Transpose/Pack @@ -523,6 +550,13 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o1 == output) { // o1 is output. Return. 
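+      // Close out the timing sample before this early return so it is still
+      // counted toward the performance report.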
+ if (handle->performance_report_enable) { + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + advanceTransposePerformanceSample(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + } return; } } else { @@ -539,7 +573,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (splits_a.size() > 1) { if (!pipelined) { cudecompAlltoall(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, stream); + recv_offsets_nvshmem, comm_axis, stream, current_sample); } } else { o2 = o1; @@ -609,7 +643,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, + current_sample); } if (o2 != o3) { @@ -704,7 +739,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, + current_sample); } } @@ -761,7 +797,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (o2 != o1) { cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets, - recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced); + recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced, + current_sample); } } @@ -794,6 +831,15 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } } } + + if (handle->performance_report_enable) { + // Record performance data + CHECK_CUDA(cudaEventRecord(current_sample->transpose_end_event, stream)); + advanceTransposePerformanceSample(handle, grid_desc, + createTransposeConfig(ax, dir, input, output, input_halo_extents.data(), + output_halo_extents.data(), input_padding.data(), + output_padding.data(), getCudecompDataType())); + } } template diff --git a/src/autotune.cc b/src/autotune.cc index b49cf0b..4cb0dbe 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -45,6 +45,7 @@ #include "internal/checks.h" #include "internal/common.h" #include "internal/halo.h" +#include "internal/performance.h" #include "internal/transpose.h" namespace cudecomp { @@ -288,6 +289,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d if (transposeBackendRequiresNvshmem(comm)) { w = work_nvshmem; } #endif + // Reset performance samples + resetPerformanceSamples(handle, grid_desc); + // Warmup for (int i = 0; i < options->n_warmup_trials; ++i) { if (options->transpose_op_weights[0] != 0.0) { @@ -431,6 +435,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d } } + // Print performance report for this configuration if enabled + if (handle->performance_report_enable && !skip_case) { printPerformanceReport(handle, grid_desc); } + if (skip_case) continue; if (times_w[2] < t_best) { @@ -515,6 +522,9 @@ void 
autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d CHECK_MPI(MPI_Barrier(handle->mpi_comm)); CHECK_CUDA(cudaFree(tmp1)); CHECK_CUDA(cudaFree(tmp2)); + + // Reset performance samples after autotuning + resetPerformanceSamples(handle, grid_desc); } void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, @@ -682,6 +692,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, if (haloBackendRequiresNvshmem(comm)) { w = work_nvshmem; } #endif + // Reset performance samples + resetPerformanceSamples(handle, grid_desc); + // Warmup for (int i = 0; i < options->n_warmup_trials; ++i) { for (int dim = 0; dim < 3; ++dim) { @@ -742,6 +755,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, } } } + auto times = processTimings(handle, trial_times, 1000.); if (handle->rank == 0) { @@ -759,6 +773,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, } } + // Print performance report for this configuration if enabled + if (handle->performance_report_enable && !skip_case) { printPerformanceReport(handle, grid_desc); } + if (skip_case) continue; if (times[2] < t_best) { @@ -822,6 +839,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, CHECK_MPI(MPI_Barrier(handle->mpi_comm)); double t_end = MPI_Wtime(); if (handle->rank == 0) printf("CUDECOMP: halo autotuning time [s]: %f\n", t_end - t_start); + + // Reset performance samples after autotuning + resetPerformanceSamples(handle, grid_desc); } } // namespace cudecomp diff --git a/src/cudecomp.cc b/src/cudecomp.cc index bd4338e..d0852f7 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -329,6 +329,51 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) { handle->cuda_graphs_enable = false; #endif } + + // Check CUDECOMP_ENABLE_PERFORMANCE_REPORT (Performance reporting) + const char* performance_report_str = std::getenv("CUDECOMP_ENABLE_PERFORMANCE_REPORT"); + if (performance_report_str) { + handle->performance_report_enable = std::strtol(performance_report_str, nullptr, 10) == 1; + } + + // Check CUDECOMP_PERFORMANCE_REPORT_DETAIL (Performance report detail level) + const char* performance_detail_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_DETAIL"); + if (performance_detail_str) { + int32_t detail = std::strtol(performance_detail_str, nullptr, 10); + if (detail >= 0 && detail <= 2) { + handle->performance_report_detail = detail; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_DETAIL value (%d). Using default (0).\n", detail); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_SAMPLES (Number of performance samples to keep) + const char* performance_samples_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES"); + if (performance_samples_str) { + int32_t samples = std::strtol(performance_samples_str, nullptr, 10); + if (samples > 0) { // Only require positive values + handle->performance_report_samples = samples; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_SAMPLES value (%d). 
Using default (%d).\n", samples, + handle->performance_report_samples); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES (Number of initial samples to ignore) + const char* performance_warmup_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES"); + if (performance_warmup_str) { + int32_t warmup_samples = std::strtol(performance_warmup_str, nullptr, 10); + if (warmup_samples >= 0) { // Only require non-negative values + handle->performance_report_warmup_samples = warmup_samples; + } else if (handle->rank == 0) { + printf("CUDECOMP:WARN: Invalid CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES value (%d). Using default (%d).\n", + warmup_samples, handle->performance_report_warmup_samples); + } + } + + // Check CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR (Directory for CSV performance reports) + const char* performance_write_dir_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR"); + if (performance_write_dir_str) { handle->performance_report_write_dir = std::string(performance_write_dir_str); } } #ifdef ENABLE_NVSHMEM @@ -582,14 +627,15 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes handle->nvshmem_initialized = true; handle->nvshmem_allocation_size = 0; } - if (!handle->pl_stream) { - int greatest_priority; - CHECK_CUDA(cudaDeviceGetStreamPriorityRange(nullptr, &greatest_priority)); - CHECK_CUDA(cudaStreamCreateWithPriority(&handle->pl_stream, cudaStreamNonBlocking, greatest_priority)); - } #endif } + if (!handle->pl_stream) { + int greatest_priority; + CHECK_CUDA(cudaDeviceGetStreamPriorityRange(nullptr, &greatest_priority)); + CHECK_CUDA(cudaStreamCreateWithPriority(&handle->pl_stream, cudaStreamNonBlocking, greatest_priority)); + } + // Create CUDA events for scheduling grid_desc->events.resize(handle->nranks); for (auto& event : grid_desc->events) { @@ -722,6 +768,37 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); } #endif + if (handle->performance_report_enable) { + // Print performance report before destroying events + printPerformanceReport(handle, grid_desc); + + // Destroy all transpose performance sample events in the map + for (auto& entry : grid_desc->transpose_perf_samples_map) { + auto& collection = entry.second; + for (auto& sample : collection.samples) { + CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event)); + CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event)); + for (auto& event : sample.alltoall_start_events) { + CHECK_CUDA(cudaEventDestroy(event)); + } + for (auto& event : sample.alltoall_end_events) { + CHECK_CUDA(cudaEventDestroy(event)); + } + } + } + + // Destroy all halo performance sample events in the map + for (auto& entry : grid_desc->halo_perf_samples_map) { + auto& collection = entry.second; + for (auto& sample : collection.samples) { + CHECK_CUDA(cudaEventDestroy(sample.halo_start_event)); + CHECK_CUDA(cudaEventDestroy(sample.halo_end_event)); + CHECK_CUDA(cudaEventDestroy(sample.sendrecv_start_event)); + CHECK_CUDA(cudaEventDestroy(sample.sendrecv_end_event)); + } + } + } + if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { // Release grid descriptor references to NCCL communicators diff --git a/src/performance.cc b/src/performance.cc new file mode 100644 index 0000000..1910e9d --- /dev/null +++ b/src/performance.cc @@ -0,0 +1,950 @@ +/* + * SPDX-FileCopyrightText: 
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <algorithm>
+#include <cstdio>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <map>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <mpi.h>
+
+#include "cudecomp.h"
+#include "internal/checks.h"
+#include "internal/performance.h"
+
+namespace cudecomp {
+
+// Helper function to create file name with grid descriptor information
+std::filesystem::path createPerformanceReportFileName(const std::string& write_dir, const std::string& table_type,
+                                                      const cudecompGridDesc_t grid_desc) {
+  std::ostringstream filename;
+  filename << "cudecomp-perf-report-" << table_type << "-";
+  filename << "tcomm_" << grid_desc->config.transpose_comm_backend << "-";
+  filename << "hcomm_" << grid_desc->config.halo_comm_backend << "-";
+  filename << "pdims_" << grid_desc->config.pdims[0] << "x" << grid_desc->config.pdims[1] << "-";
+  filename << "gdims_" << grid_desc->config.gdims[0] << "x" << grid_desc->config.gdims[1] << "x"
+           << grid_desc->config.gdims[2] << "-";
+  filename << "memorder_";
+  for (int axis = 0; axis < 3; ++axis) {
+    filename << grid_desc->config.transpose_mem_order[axis][0] << grid_desc->config.transpose_mem_order[axis][1]
+             << grid_desc->config.transpose_mem_order[axis][2];
+  }
+  filename << ".csv";
+
+  return std::filesystem::path(write_dir) / filename.str();
+}
+
+// Helper function to write CSV header comment with grid configuration information
+void writeCSVHeader(std::ofstream& file, const cudecompGridDesc_t grid_desc) {
+  file << "# Transpose backend: " << cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend)
+       << "\n";
+  file << "# Halo backend: " << cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend) << "\n";
+  file << "# Process grid: [" << grid_desc->config.pdims[0] << ", " << grid_desc->config.pdims[1] << "]\n";
+  file << "# Global dimensions: [" << grid_desc->config.gdims[0] << ", " << grid_desc->config.gdims[1] << ", "
+       << grid_desc->config.gdims[2] << 
"]\n"; + file << "# Memory order: "; + for (int axis = 0; axis < 3; ++axis) { + file << "[" << grid_desc->config.transpose_mem_order[axis][0] << "," + << grid_desc->config.transpose_mem_order[axis][1] << "," << grid_desc->config.transpose_mem_order[axis][2] + << "]"; + if (axis < 2) file << "; "; + } + file << "\n#\n"; +} + +// Write transpose performance table to CSV +void writeTransposePerformanceTableCSV(const std::vector& all_transpose_config_data, + const cudecompGridDesc_t grid_desc, const std::string& write_dir) { + if (all_transpose_config_data.empty()) return; + + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose-aggregated", grid_desc); + std::ofstream file(filename); + if (!file.is_open()) { + printf("CUDECOMP:WARN: Could not open file %s for writing\n", filename.string().c_str()); + return; + } + + writeCSVHeader(file, grid_desc); + + // Write table header + file << "operation,dtype,input_halo_extents,output_halo_extents,input_padding,output_padding,inplace,managed,samples," + "total_ms,A2A_ms,local_ms,A2A_BW_GBps\n"; + + // Write table data rows + for (const auto& config_data : all_transpose_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + file << stats.operation << "," << stats.datatype << "," + << "\"" << stats.input_halos << "\"," + << "\"" << stats.output_halos << "\"," + << "\"" << stats.input_padding << "\"," + << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << stats.samples + << "," << std::fixed << std::setprecision(3) << stats.total_time_avg << "," << std::fixed + << std::setprecision(3) << stats.alltoall_time_avg << "," << std::fixed << std::setprecision(3) + << stats.local_time_avg << "," << std::fixed << std::setprecision(3) << stats.alltoall_bw_avg << "\n"; + } + } + + file.close(); + printf("CUDECOMP:\n"); + printf("CUDECOMP: Wrote transpose performance data to %s\n", filename.string().c_str()); +} + +// Write halo performance table to CSV +void writeHaloPerformanceTableCSV(const std::vector& all_halo_config_data, + const cudecompGridDesc_t grid_desc, const std::string& write_dir) { + if (all_halo_config_data.empty()) return; + + std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-aggregated", grid_desc); + std::ofstream file(filename); + if (!file.is_open()) { + printf("CUDECOMP:WARN: Could not open file %s for writing\n", filename.string().c_str()); + return; + } + + writeCSVHeader(file, grid_desc); + + // Write table header + file << "operation,dtype,dim,halo_extent,periods,padding,managed,samples,total_ms,SR_ms,local_ms,SR_BW_GBps\n"; + + // Write table data rows + for (const auto& config_data : all_halo_config_data) { + const auto& stats = config_data.stats; + if (stats.samples > 0) { + file << stats.operation << "," << stats.datatype << "," << stats.dim << "," + << "\"" << stats.halos << "\"," + << "\"" << stats.periods << "\"," + << "\"" << stats.padding << "\"," << stats.managed << "," << stats.samples << "," << std::fixed + << std::setprecision(3) << stats.total_time_avg << "," << std::fixed << std::setprecision(3) + << stats.sendrecv_time_avg << "," << std::fixed << std::setprecision(3) << stats.local_time_avg << "," + << std::fixed << std::setprecision(3) << stats.sendrecv_bw_avg << "\n"; + } + } + + file.close(); + printf("CUDECOMP:\n"); + printf("CUDECOMP: Wrote halo performance data to %s\n", filename.string().c_str()); +} + +// Helper function to create transpose configuration key 
+cudecompTransposeConfigKey createTransposeConfig(int ax, int dir, void* input, void* output,
+                                                 const int32_t input_halo_extents_ptr[],
+                                                 const int32_t output_halo_extents_ptr[],
+                                                 const int32_t input_padding_ptr[], const int32_t output_padding_ptr[],
+                                                 cudecompDataType_t datatype) {
+  std::array<int32_t, 3> input_halo_extents{0, 0, 0};
+  std::array<int32_t, 3> output_halo_extents{0, 0, 0};
+  std::array<int32_t, 3> input_padding{0, 0, 0};
+  std::array<int32_t, 3> output_padding{0, 0, 0};
+
+  if (input_halo_extents_ptr) {
+    std::copy(input_halo_extents_ptr, input_halo_extents_ptr + 3, input_halo_extents.begin());
+  }
+  if (output_halo_extents_ptr) {
+    std::copy(output_halo_extents_ptr, output_halo_extents_ptr + 3, output_halo_extents.begin());
+  }
+  if (input_padding_ptr) { std::copy(input_padding_ptr, input_padding_ptr + 3, input_padding.begin()); }
+  if (output_padding_ptr) { std::copy(output_padding_ptr, output_padding_ptr + 3, output_padding.begin()); }
+
+  bool inplace = (input == output);
+  bool managed_memory = isManagedPointer(input) || isManagedPointer(output);
+
+  return std::make_tuple(ax, dir, input_halo_extents, output_halo_extents, input_padding, output_padding, inplace,
+                         managed_memory, datatype);
+}
+
+// Helper function to create halo configuration key
+cudecompHaloConfigKey createHaloConfig(int ax, int dim, void* input, const int32_t halo_extents_ptr[],
+                                       const bool halo_periods_ptr[], const int32_t padding_ptr[],
+                                       cudecompDataType_t datatype) {
+  std::array<int32_t, 3> halo_extents{0, 0, 0};
+  std::array<bool, 3> halo_periods{false, false, false};
+  std::array<int32_t, 3> padding{0, 0, 0};
+
+  if (halo_extents_ptr) { std::copy(halo_extents_ptr, halo_extents_ptr + 3, halo_extents.begin()); }
+  if (halo_periods_ptr) { std::copy(halo_periods_ptr, halo_periods_ptr + 3, halo_periods.begin()); }
+  if (padding_ptr) { std::copy(padding_ptr, padding_ptr + 3, padding.begin()); }
+
+  bool managed_memory = isManagedPointer(input);
+
+  return std::make_tuple(ax, dim, halo_extents, halo_periods, padding, managed_memory, datatype);
+}
+
+// Helper function to get or create transpose performance sample collection for a configuration
+cudecompTransposePerformanceSampleCollection&
+getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config) {
+  auto& samples_map = grid_desc->transpose_perf_samples_map;
+
+  if (samples_map.find(config) == samples_map.end()) {
+    // Create new sample collection for this configuration
+    cudecompTransposePerformanceSampleCollection collection;
+    collection.samples.resize(handle->performance_report_samples);
+    collection.sample_idx = 0;
+
+    // Create events for each sample
+    for (auto& sample : collection.samples) {
+      CHECK_CUDA(cudaEventCreate(&sample.transpose_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.transpose_end_event));
+      sample.alltoall_start_events.resize(handle->nranks);
+      sample.alltoall_end_events.resize(handle->nranks);
+      for (auto& event : sample.alltoall_start_events) {
+        CHECK_CUDA(cudaEventCreate(&event));
+      }
+      for (auto& event : sample.alltoall_end_events) {
+        CHECK_CUDA(cudaEventCreate(&event));
+      }
+      sample.valid = false;
+    }
+
+    samples_map[config] = std::move(collection);
+  }
+
+  return samples_map[config];
+}
+
+// Helper function to get or create halo performance sample collection for a configuration
+cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const cudecompHandle_t handle,
+                                                                           cudecompGridDesc_t grid_desc,
+                                                                           const cudecompHaloConfigKey& config) {
+  auto& samples_map = 
grid_desc->halo_perf_samples_map;
+
+  if (samples_map.find(config) == samples_map.end()) {
+    // Create new sample collection for this configuration
+    cudecompHaloPerformanceSampleCollection collection;
+    collection.samples.resize(handle->performance_report_samples);
+    collection.sample_idx = 0;
+
+    // Create events for each sample
+    for (auto& sample : collection.samples) {
+      CHECK_CUDA(cudaEventCreate(&sample.halo_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.halo_end_event));
+      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_start_event));
+      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_end_event));
+      sample.valid = false;
+    }
+
+    samples_map[config] = std::move(collection);
+  }
+
+  return samples_map[config];
+}
+
+// Helper function to format integer array as compact string
+std::string formatArray(const std::array<int32_t, 3>& arr) {
+  std::ostringstream oss;
+  oss << "[" << arr[0] << "," << arr[1] << "," << arr[2] << "]";
+  return oss.str();
+}
+
+// Helper function to format boolean array as compact string
+std::string formatArray(const std::array<bool, 3>& arr) {
+  std::ostringstream oss;
+  oss << "[" << (arr[0] ? "1" : "0") << "," << (arr[1] ? "1" : "0") << "," << (arr[2] ? "1" : "0") << "]";
+  return oss.str();
+}
+
+// Helper function to get operation name from transpose config
+std::string getTransposeOperationName(const cudecompTransposeConfigKey& config) {
+  int ax = std::get<0>(config);
+  int dir = std::get<1>(config);
+
+  if (ax == 0) {
+    return "TransposeXY";
+  } else if (ax == 1 && dir == 1) {
+    return "TransposeYZ";
+  } else if (ax == 2) {
+    return "TransposeZY";
+  } else if (ax == 1 && dir == -1) {
+    return "TransposeYX";
+  }
+  return "Unknown";
+}
+
+// Helper function to get operation name from halo config
+std::string getHaloOperationName(const cudecompHaloConfigKey& config) {
+  int ax = std::get<0>(config);
+
+  if (ax == 0) {
+    return "HaloX";
+  } else if (ax == 1) {
+    return "HaloY";
+  } else if (ax == 2) {
+    return "HaloZ";
+  }
+  return "Unknown";
+}
+
+// Helper function to convert datatype to string
+std::string getDatatypeString(cudecompDataType_t datatype) {
+  switch (datatype) {
+  case CUDECOMP_FLOAT: return "S";
+  case CUDECOMP_DOUBLE: return "D";
+  case CUDECOMP_FLOAT_COMPLEX: return "C";
+  case CUDECOMP_DOUBLE_COMPLEX: return "Z";
+  default: return "unknown";
+  }
+}
+
+// Comparison function to order transpose configurations.
+// Ordering is (operation, datatype, halos, padding, inplace, managed).
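+// Sorting on fixed operation/datatype priorities keeps report rows in a stable,
+// predictable order across runs.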
+bool compareTransposeConfigData(const TransposeConfigTimingData& a, const TransposeConfigTimingData& b) {
+  static const std::map<std::string, int> op_priority = {{"TransposeXY", 0},
+                                                         {"TransposeYZ", 1},
+                                                         {"TransposeZY", 2},
+                                                         {"TransposeYX", 3}};
+
+  static const std::map<std::string, int> dtype_priority = {{"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}};
+
+  if (a.stats.operation != b.stats.operation) {
+    return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation);
+  }
+  if (a.stats.datatype != b.stats.datatype) {
+    return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype);
+  }
+  if (a.stats.input_halos != b.stats.input_halos) { return a.stats.input_halos < b.stats.input_halos; }
+  if (a.stats.output_halos != b.stats.output_halos) { return a.stats.output_halos < b.stats.output_halos; }
+  if (a.stats.input_padding != b.stats.input_padding) { return a.stats.input_padding < b.stats.input_padding; }
+  if (a.stats.output_padding != b.stats.output_padding) { return a.stats.output_padding < b.stats.output_padding; }
+  if (a.stats.inplace != b.stats.inplace) { return a.stats.inplace < b.stats.inplace; }
+  return a.stats.managed < b.stats.managed;
+}
+
+// Comparison function to order halo configurations.
+// Ordering is (operation, dim, datatype, halos, periods, padding, managed).
+bool compareHaloConfigData(const HaloConfigTimingData& a, const HaloConfigTimingData& b) {
+  static const std::map<std::string, int> op_priority = {{"HaloX", 0}, {"HaloY", 1}, {"HaloZ", 2}};
+
+  static const std::map<std::string, int> dtype_priority = {{"S", 0}, {"D", 1}, {"C", 2}, {"Z", 3}};
+
+  if (a.stats.operation != b.stats.operation) {
+    return op_priority.at(a.stats.operation) < op_priority.at(b.stats.operation);
+  }
+  if (a.stats.dim != b.stats.dim) { return a.stats.dim < b.stats.dim; }
+  if (a.stats.datatype != b.stats.datatype) {
+    return dtype_priority.at(a.stats.datatype) < dtype_priority.at(b.stats.datatype);
+  }
+  if (a.stats.halos != b.stats.halos) { return a.stats.halos < b.stats.halos; }
+  if (a.stats.periods != b.stats.periods) { return a.stats.periods < b.stats.periods; }
+  if (a.stats.padding != b.stats.padding) { return a.stats.padding < b.stats.padding; }
+  return a.stats.managed < b.stats.managed;
+}
+
+// Function to compute average across ranks
+float computeGlobalAverage(const std::vector<float>& values, const cudecompHandle_t handle) {
+  if (values.size() == 0) { return 0.0f; }
+
+  float value = std::accumulate(values.begin(), values.end(), 0.0f);
+  value /= values.size();
+  CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &value, 1, MPI_FLOAT, MPI_SUM, handle->mpi_comm));
+  value /= handle->nranks;
+  return value;
+}
+
+// Function to gather data from all ranks
+void gatherSampleData(const std::vector<float>& local_data, std::vector<float>& all_data,
+                      const cudecompHandle_t handle) {
+  int num_samples = local_data.size();
+  if (handle->rank == 0) { all_data.resize(num_samples * handle->nranks); }
+
+  CHECK_MPI(MPI_Gather(local_data.data(), num_samples, MPI_FLOAT, all_data.data(), num_samples, MPI_FLOAT, 0,
+                       handle->mpi_comm));
+}
+
+// Process transpose timing data from sample collections
+TransposeConfigTimingData processTransposeConfig(const cudecompTransposeConfigKey& config,
+                                                 const cudecompTransposePerformanceSampleCollection& collection,
+                                                 const cudecompHandle_t handle) {
+  TransposeConfigTimingData config_data;
+
+  config_data.total_times.reserve(collection.samples.size());
+  config_data.alltoall_times.reserve(collection.samples.size());
+  config_data.local_times.reserve(collection.samples.size());
+  
config_data.alltoall_bws.reserve(collection.samples.size()); + + // Collect valid samples and compute elapsed times + for (int i = 0; i < collection.samples.size(); ++i) { + const auto& sample = collection.samples[i]; + if (!sample.valid) continue; + + float alltoall_timing_ms = 0.0f; + for (int j = 0; j < sample.alltoall_timing_count; ++j) { + float elapsed_time; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, sample.alltoall_start_events[j], sample.alltoall_end_events[j])); + alltoall_timing_ms += elapsed_time; + } + + float transpose_timing_ms; + CHECK_CUDA(cudaEventElapsedTime(&transpose_timing_ms, sample.transpose_start_event, sample.transpose_end_event)); + + config_data.total_times.push_back(transpose_timing_ms); + config_data.alltoall_times.push_back(alltoall_timing_ms); + config_data.local_times.push_back(transpose_timing_ms - alltoall_timing_ms); + + float alltoall_bw = (alltoall_timing_ms > 0) ? sample.alltoall_bytes * 1e-6 / alltoall_timing_ms : 0; + config_data.alltoall_bws.push_back(alltoall_bw); + } + + // Prepare aggregated statistics + TransposePerformanceStats& stats = config_data.stats; + stats.operation = getTransposeOperationName(config); + stats.datatype = getDatatypeString(std::get<8>(config)); + + auto input_halos = std::get<2>(config); + auto output_halos = std::get<3>(config); + auto input_padding = std::get<4>(config); + auto output_padding = std::get<5>(config); + + stats.input_halos = formatArray(input_halos); + stats.output_halos = formatArray(output_halos); + stats.input_padding = formatArray(input_padding); + stats.output_padding = formatArray(output_padding); + stats.inplace = std::get<6>(config) ? "T" : "F"; + stats.managed = std::get<7>(config) ? "T" : "F"; + stats.samples = config_data.total_times.size(); + + // Compute average statistics and reduce across all ranks + stats.total_time_avg = computeGlobalAverage(config_data.total_times, handle); + stats.alltoall_time_avg = computeGlobalAverage(config_data.alltoall_times, handle); + stats.local_time_avg = computeGlobalAverage(config_data.local_times, handle); + stats.alltoall_bw_avg = computeGlobalAverage(config_data.alltoall_bws, handle); + + return config_data; +} + +// Process halo timing data from sample collections +HaloConfigTimingData processHaloConfig(const cudecompHaloConfigKey& config, + const cudecompHaloPerformanceSampleCollection& collection, + const cudecompHandle_t handle) { + HaloConfigTimingData config_data; + + config_data.total_times.reserve(collection.samples.size()); + config_data.sendrecv_times.reserve(collection.samples.size()); + config_data.local_times.reserve(collection.samples.size()); + config_data.sendrecv_bws.reserve(collection.samples.size()); + + // Collect valid samples and compute elapsed times + for (int i = 0; i < collection.samples.size(); ++i) { + const auto& sample = collection.samples[i]; + if (!sample.valid) continue; + + float sendrecv_timing_ms = 0.0f; + if (sample.sendrecv_bytes > 0) { + CHECK_CUDA(cudaEventElapsedTime(&sendrecv_timing_ms, sample.sendrecv_start_event, sample.sendrecv_end_event)); + } + + float halo_timing_ms; + CHECK_CUDA(cudaEventElapsedTime(&halo_timing_ms, sample.halo_start_event, sample.halo_end_event)); + + config_data.total_times.push_back(halo_timing_ms); + config_data.sendrecv_times.push_back(sendrecv_timing_ms); + config_data.local_times.push_back(halo_timing_ms - sendrecv_timing_ms); + + float sendrecv_bw = (sendrecv_timing_ms > 0) ? 
sample.sendrecv_bytes * 1e-6 / sendrecv_timing_ms : 0;
+    config_data.sendrecv_bws.push_back(sendrecv_bw);
+  }
+
+  // Prepare aggregated statistics
+  HaloPerformanceStats& stats = config_data.stats;
+  stats.operation = getHaloOperationName(config);
+  stats.datatype = getDatatypeString(std::get<6>(config));
+  stats.dim = std::get<1>(config);
+
+  // Format halo extents, periods, and padding
+  auto halo_extents = std::get<2>(config);
+  auto halo_periods = std::get<3>(config);
+  auto padding = std::get<4>(config);
+
+  stats.halos = formatArray(halo_extents);
+  stats.periods = formatArray(halo_periods);
+  stats.padding = formatArray(padding);
+  stats.managed = std::get<5>(config) ? "T" : "F";
+  stats.samples = config_data.total_times.size();
+
+  // Compute average statistics across all ranks
+  stats.total_time_avg = computeGlobalAverage(config_data.total_times, handle);
+  stats.sendrecv_time_avg = computeGlobalAverage(config_data.sendrecv_times, handle);
+  stats.local_time_avg = computeGlobalAverage(config_data.local_times, handle);
+  stats.sendrecv_bw_avg = computeGlobalAverage(config_data.sendrecv_bws, handle);
+
+  return config_data;
+}
+
+// Print grid configuration information
+void printGridConfiguration(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) {
+  if (handle->rank != 0) return;
+
+  printf("CUDECOMP:\n");
+  printf("CUDECOMP: ===== Performance Summary =====\n");
+  printf("CUDECOMP: Grid Configuration:\n");
+  printf("CUDECOMP:\tTranspose backend: %s\n",
+         cudecompTransposeCommBackendToString(grid_desc->config.transpose_comm_backend));
+  printf("CUDECOMP:\tHalo backend: %s\n", cudecompHaloCommBackendToString(grid_desc->config.halo_comm_backend));
+  printf("CUDECOMP:\tProcess grid: [%d, %d]\n", grid_desc->config.pdims[0], grid_desc->config.pdims[1]);
+  printf("CUDECOMP:\tGlobal dimensions: [%d, %d, %d]\n", grid_desc->config.gdims[0], grid_desc->config.gdims[1],
+         grid_desc->config.gdims[2]);
+
+  // Print memory ordering information
+  printf("CUDECOMP:\tMemory order: ");
+  for (int axis = 0; axis < 3; ++axis) {
+    printf("[%d,%d,%d]", grid_desc->config.transpose_mem_order[axis][0], grid_desc->config.transpose_mem_order[axis][1],
+           grid_desc->config.transpose_mem_order[axis][2]);
+    if (axis < 2) printf("; ");
+  }
+  printf("\n");
+  printf("CUDECOMP:\n");
+}
+
+// Print transpose performance table
+void printTransposePerformanceTable(const std::vector<TransposeConfigTimingData>& all_transpose_config_data) {
+  printf("CUDECOMP: Transpose Performance Data:\n");
+  printf("CUDECOMP:\n");
+
+  // Print compact table header
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", "operation", "dtype", "halo extents",
+         "padding", "inplace", "managed", "samples", "total", "A2A", "local", "A2A BW");
+  printf("CUDECOMP: %-12s %-6s %-15s %-15s %-8s %-8s %-8s %-9s %-9s %-9s %-9s\n", "", "", "", "", "", "", "", "[ms]",
+         "[ms]", "[ms]", "[GB/s]");
+  printf("CUDECOMP: ");
+  for (int i = 0; i < 120; ++i)
+    printf("-");
+  printf("\n");
+
+  // Print table rows
+  for (const auto& config_data : all_transpose_config_data) {
+    const auto& stats = config_data.stats;
+    if (stats.samples > 0) {
+      printf("CUDECOMP: %-12s %-6s %-7s/%-7s %-7s/%-7s %-8s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
+             stats.operation.c_str(), stats.datatype.c_str(), stats.input_halos.c_str(), stats.output_halos.c_str(),
+             stats.input_padding.c_str(), stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str(),
+             stats.samples, stats.total_time_avg, stats.alltoall_time_avg, stats.local_time_avg, 
+
+// Print halo performance table
+void printHaloPerformanceTable(const std::vector<HaloConfigTimingData>& all_halo_config_data) {
+  printf("CUDECOMP:\n");
+  printf("CUDECOMP: Halo Performance Data:\n");
+  printf("CUDECOMP:\n");
+
+  // Print compact table header
+  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", "operation", "dtype", "dim",
+         "halo extent", "periods", "padding", "managed", "samples", "total", "SR", "local", "SR BW");
+  printf("CUDECOMP: %-12s %-6s %-5s %-12s %-12s %-12s %-8s %-8s %-9s %-9s %-9s %-9s\n", "", "", "", "", "", "", "", "",
+         "[ms]", "[ms]", "[ms]", "[GB/s]");
+  printf("CUDECOMP: ");
+  for (int i = 0; i < 125; ++i)
+    printf("-");
+  printf("\n");
+
+  // Print table rows
+  for (const auto& config_data : all_halo_config_data) {
+    const auto& stats = config_data.stats;
+    if (stats.samples > 0) {
+      printf("CUDECOMP: %-12s %-6s %-5d %-12s %-12s %-12s %-8s %-8d %-9.3f %-9.3f %-9.3f %-9.3f\n",
+             stats.operation.c_str(), stats.datatype.c_str(), stats.dim, stats.halos.c_str(), stats.periods.c_str(),
+             stats.padding.c_str(), stats.managed.c_str(), stats.samples, stats.total_time_avg, stats.sendrecv_time_avg,
+             stats.local_time_avg, stats.sendrecv_bw_avg);
+    }
+  }
+}
+
+// Print per-sample transpose data for a single configuration
+void printTransposePerSampleDetailsForConfig(const TransposeConfigTimingData& config_data,
+                                             const cudecompHandle_t handle, int detail_level,
+                                             std::ofstream* csv_file = nullptr) {
+  const auto& stats = config_data.stats;
+  const auto& total_times = config_data.total_times;
+  const auto& alltoall_times = config_data.alltoall_times;
+  const auto& local_times = config_data.local_times;
+  const auto& alltoall_bws = config_data.alltoall_bws;
+
+  if (total_times.empty()) return;
+
+  int num_samples = total_times.size();
+  std::vector<float> all_total_times, all_alltoall_times, all_local_times, all_alltoall_bws;
+
+  if (detail_level == 1) {
+    // Detail level 1: Only rank 0 prints its own data
+    if (handle->rank != 0) return;
+
+    all_total_times = total_times;
+    all_alltoall_times = alltoall_times;
+    all_local_times = local_times;
+    all_alltoall_bws = alltoall_bws;
+
+  } else if (detail_level == 2) {
+    // Detail level 2: Gather data from all ranks
+    gatherSampleData(total_times, all_total_times, handle);
+    gatherSampleData(alltoall_times, all_alltoall_times, handle);
+    gatherSampleData(local_times, all_local_times, handle);
+    gatherSampleData(alltoall_bws, all_alltoall_bws, handle);
+
+    if (handle->rank != 0) return;
+  }
+
+  printf("CUDECOMP: %s (dtype=%s, halo extents=%s/%s, padding=%s/%s, inplace=%s, managed=%s) samples:\n",
+         stats.operation.c_str(), stats.datatype.c_str(), stats.input_halos.c_str(), stats.output_halos.c_str(),
+         stats.input_padding.c_str(), stats.output_padding.c_str(), stats.inplace.c_str(), stats.managed.c_str());
+
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "rank", "sample", "total", "A2A", "local", "A2A BW");
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
+
+  for (int r = 0; r < (detail_level == 1 ? 1 : handle->nranks); ++r) {
+    for (int s = 0; s < num_samples; ++s) {
+      int idx = r * num_samples + s;
+      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", r, s, all_total_times[idx], all_alltoall_times[idx],
+             all_local_times[idx], all_alltoall_bws[idx]);
+
+      // Write to CSV if file is provided
+      if (csv_file && csv_file->is_open()) {
+        *csv_file << stats.operation << "," << stats.datatype << ","
+                  << "\"" << stats.input_halos << "\","
+                  << "\"" << stats.output_halos << "\","
+                  << "\"" << stats.input_padding << "\","
+                  << "\"" << stats.output_padding << "\"," << stats.inplace << "," << stats.managed << "," << r << ","
+                  << s << "," << std::fixed << std::setprecision(3) << all_total_times[idx] << "," << std::fixed
+                  << std::setprecision(3) << all_alltoall_times[idx] << "," << std::fixed << std::setprecision(3)
+                  << all_local_times[idx] << "," << std::fixed << std::setprecision(3) << all_alltoall_bws[idx] << "\n";
+      }
+    }
+  }
+  printf("CUDECOMP:\n");
+}
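+
+// Editorial note: after gathering, the per-sample vectors are laid out
+// rank-major, so the row index above (and in the halo variant below) is
+// idx = r * num_samples + s; with 20 samples per configuration, rank 1 /
+// sample 2 lands at idx 22. This indexing assumes every rank recorded the
+// same number of valid samples.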
+
+// Print per-sample halo data for a single configuration
+void printHaloPerSampleDetailsForConfig(const HaloConfigTimingData& config_data, const cudecompHandle_t handle,
+                                        int detail_level, std::ofstream* csv_file = nullptr) {
+  const auto& stats = config_data.stats;
+  const auto& total_times = config_data.total_times;
+  const auto& sendrecv_times = config_data.sendrecv_times;
+  const auto& local_times = config_data.local_times;
+  const auto& sendrecv_bws = config_data.sendrecv_bws;
+
+  if (total_times.empty()) return;
+
+  int num_samples = total_times.size();
+  std::vector<float> all_total_times, all_sendrecv_times, all_local_times, all_sendrecv_bws;
+
+  if (detail_level == 1) {
+    // Detail level 1: Only rank 0 prints its own data
+    if (handle->rank != 0) return;
+
+    all_total_times = total_times;
+    all_sendrecv_times = sendrecv_times;
+    all_local_times = local_times;
+    all_sendrecv_bws = sendrecv_bws;
+
+  } else if (detail_level == 2) {
+    // Detail level 2: Gather data from all ranks
+    gatherSampleData(total_times, all_total_times, handle);
+    gatherSampleData(sendrecv_times, all_sendrecv_times, handle);
+    gatherSampleData(local_times, all_local_times, handle);
+    gatherSampleData(sendrecv_bws, all_sendrecv_bws, handle);
+
+    if (handle->rank != 0) return;
+  }
+
+  printf("CUDECOMP: %s (dtype=%s, dim=%d, halos=%s, periods=%s, padding=%s, managed=%s) samples:\n",
+         stats.operation.c_str(), stats.datatype.c_str(), stats.dim, stats.halos.c_str(), stats.periods.c_str(),
+         stats.padding.c_str(), stats.managed.c_str());
+
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "rank", "sample", "total", "SR", "local", "SR BW");
+  printf("CUDECOMP: %-6s %-12s %-9s %-9s %-9s %-9s\n", "", "", "[ms]", "[ms]", "[ms]", "[GB/s]");
+
+  for (int r = 0; r < (detail_level == 1 ? 1 : handle->nranks); ++r) {
+    for (int s = 0; s < num_samples; ++s) {
+      int idx = r * num_samples + s;
+      printf("CUDECOMP: %-6d %-12d %-9.3f %-9.3f %-9.3f %-9.3f\n", r, s, all_total_times[idx], all_sendrecv_times[idx],
+             all_local_times[idx], all_sendrecv_bws[idx]);
+
+      // Write to CSV if file is provided
+      if (csv_file && csv_file->is_open()) {
+        *csv_file << stats.operation << "," << stats.datatype << "," << stats.dim << ","
+                  << "\"" << stats.halos << "\","
+                  << "\"" << stats.periods << "\","
+                  << "\"" << stats.padding << "\"," << stats.managed << "," << r << "," << s << "," << std::fixed
+                  << std::setprecision(3) << all_total_times[idx] << "," << std::fixed << std::setprecision(3)
+                  << all_sendrecv_times[idx] << "," << std::fixed << std::setprecision(3) << all_local_times[idx] << ","
+                  << std::fixed << std::setprecision(3) << all_sendrecv_bws[idx] << "\n";
+      }
+    }
+  }
+  printf("CUDECOMP:\n");
+}
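+
+// Editorial note: at detail level 2, the rank != 0 early return in the two
+// functions above is deliberately placed after the gatherSampleData calls;
+// those gathers appear to be collective operations, so returning earlier on
+// non-root ranks would leave rank 0 blocked in the gather.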
+
+// Print per-sample details for transpose configurations
+void printTransposePerSampleDetails(const std::vector<TransposeConfigTimingData>& all_transpose_config_data,
+                                    const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
+                                    const std::string& write_dir = "") {
+  std::ofstream csv_file;
+  bool csv_enabled = !write_dir.empty();
+
+  // Check if there are any samples before creating CSV file
+  bool has_samples = false;
+  for (const auto& config_data : all_transpose_config_data) {
+    if (config_data.stats.samples > 0) {
+      has_samples = true;
+      break;
+    }
+  }
+
+  if (csv_enabled && handle->rank == 0 && has_samples) {
+    std::filesystem::path filename = createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc);
+    csv_file.open(filename);
+    if (csv_file.is_open()) {
+      writeCSVHeader(csv_file, grid_desc);
+      csv_file << "operation,dtype,input_halo_extents,output_halo_extents,input_padding,output_padding,inplace,managed,"
+                  "rank,sample,total_ms,A2A_ms,local_ms,A2A_BW_GBps\n";
+    } else {
+      printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str());
+    }
+  }
+
+  for (const auto& config_data : all_transpose_config_data) {
+    if (config_data.stats.samples == 0) continue;
+
+    printTransposePerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail,
+                                            csv_enabled && has_samples ? &csv_file : nullptr);
+  }
+
+  if (csv_file.is_open()) {
+    csv_file.close();
+    printf("CUDECOMP:\n");
+    printf("CUDECOMP: Wrote per-sample transpose data to %s\n",
+           createPerformanceReportFileName(write_dir, "transpose-samples", grid_desc).string().c_str());
+  }
+}
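+
+// Editorial note: the has_samples scan above keeps rank 0 from creating an
+// empty CSV file when no configuration has recorded post-warmup samples; the
+// halo variant below applies the same guard.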
+
+// Print per-sample details for halo configurations
+void printHaloPerSampleDetails(const std::vector<HaloConfigTimingData>& all_halo_config_data,
+                               const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
+                               const std::string& write_dir = "") {
+  std::ofstream csv_file;
+  bool csv_enabled = !write_dir.empty();
+
+  // Check if there are any samples before creating CSV file
+  bool has_samples = false;
+  for (const auto& config_data : all_halo_config_data) {
+    if (config_data.stats.samples > 0) {
+      has_samples = true;
+      break;
+    }
+  }
+
+  if (csv_enabled && handle->rank == 0 && has_samples) {
+    std::filesystem::path filename = createPerformanceReportFileName(write_dir, "halo-samples", grid_desc);
+    csv_file.open(filename);
+    if (csv_file.is_open()) {
+      writeCSVHeader(csv_file, grid_desc);
+      csv_file
+          << "operation,dtype,dim,halo_extent,periods,padding,managed,rank,sample,total_ms,SR_ms,local_ms,SR_BW_GBps\n";
+    } else {
+      printf("CUDECOMP: Warning: Could not open file %s for writing\n", filename.string().c_str());
+    }
+  }
+
+  for (const auto& config_data : all_halo_config_data) {
+    if (config_data.stats.samples == 0) continue;
+
+    printHaloPerSampleDetailsForConfig(config_data, handle, handle->performance_report_detail,
+                                       csv_enabled && has_samples ? &csv_file : nullptr);
+  }
+
+  if (csv_file.is_open()) {
+    csv_file.close();
+    printf("CUDECOMP:\n");
+    printf("CUDECOMP: Wrote per-sample halo data to %s\n",
+           createPerformanceReportFileName(write_dir, "halo-samples", grid_desc).string().c_str());
+  }
+}
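+
+// Usage sketch (editorial, not part of the library): with reporting enabled
+// through the environment variables documented in docs/env_vars.rst, e.g.
+//   CUDECOMP_ENABLE_PERFORMANCE_REPORT=1 \
+//   CUDECOMP_PERFORMANCE_REPORT_DETAIL=2 \
+//   CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR=./perf \
+//   mpirun -np 4 ./transpose_test ...
+// printPerformanceReport below prints the summary tables on rank 0, adds
+// per-sample detail when the detail level is above 0, and writes CSV files
+// into the given directory when one is set.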
+
+void printPerformanceReport(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc) {
+  // Synchronize to ensure all events are recorded
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  // Collect all transpose statistics and timing data
+  std::vector<TransposeConfigTimingData> all_transpose_config_data;
+  all_transpose_config_data.reserve(grid_desc->transpose_perf_samples_map.size());
+
+  for (const auto& entry : grid_desc->transpose_perf_samples_map) {
+    const auto& config = entry.first;
+    const auto& collection = entry.second;
+
+    TransposeConfigTimingData config_data = processTransposeConfig(config, collection, handle);
+    if (config_data.stats.samples > 0) { all_transpose_config_data.emplace_back(std::move(config_data)); }
+  }
+
+  // Sort transpose configuration data for consistent ordering
+  std::sort(all_transpose_config_data.begin(), all_transpose_config_data.end(), compareTransposeConfigData);
+
+  // Collect all halo statistics and timing data
+  std::vector<HaloConfigTimingData> all_halo_config_data;
+  all_halo_config_data.reserve(grid_desc->halo_perf_samples_map.size());
+
+  for (const auto& entry : grid_desc->halo_perf_samples_map) {
+    const auto& config = entry.first;
+    const auto& collection = entry.second;
+
+    HaloConfigTimingData config_data = processHaloConfig(config, collection, handle);
+    if (config_data.stats.samples > 0) { all_halo_config_data.emplace_back(std::move(config_data)); }
+  }
+
+  // Sort halo configuration data for consistent ordering
+  std::sort(all_halo_config_data.begin(), all_halo_config_data.end(), compareHaloConfigData);
+
+  // Check if CSV writing is enabled
+  const std::string& write_dir = handle->performance_report_write_dir;
+  bool csv_enabled = !write_dir.empty();
+
+  // Print grid configuration information
+  printGridConfiguration(handle, grid_desc);
+
+  if (handle->rank == 0) {
+    if (all_transpose_config_data.empty() && all_halo_config_data.empty()) {
+      printf("CUDECOMP: No performance data collected\n");
+      printf("CUDECOMP: ================================\n");
+      printf("CUDECOMP:\n");
+      return;
+    }
+
+    // Print transpose performance data
+    if (!all_transpose_config_data.empty()) {
+      printTransposePerformanceTable(all_transpose_config_data);
+
+      // Write transpose performance data to CSV if enabled
+      if (csv_enabled) { writeTransposePerformanceTableCSV(all_transpose_config_data, grid_desc, write_dir); }
+    }
+
+    // Print halo performance data
+    if (!all_halo_config_data.empty()) {
+      printHaloPerformanceTable(all_halo_config_data);
+
+      // Write halo performance data to CSV if enabled
+      if (csv_enabled) { writeHaloPerformanceTableCSV(all_halo_config_data, grid_desc, write_dir); }
+    }
+  }
+
+  // Print per-sample data if detail level > 0
+  if (handle->performance_report_detail > 0) {
+    if (handle->rank == 0) {
+      printf("CUDECOMP:\n");
+      printf("CUDECOMP: Per-Sample Details:\n");
+      printf("CUDECOMP:\n");
+    }
+
+    // Print transpose per-sample details (and write CSV if enabled)
+    printTransposePerSampleDetails(all_transpose_config_data, handle, grid_desc, csv_enabled ? write_dir : "");
+
+    // Print halo per-sample details (and write CSV if enabled)
+    printHaloPerSampleDetails(all_halo_config_data, handle, grid_desc, csv_enabled ? write_dir : "");
+  }
+
+  if (handle->rank == 0) {
+    printf("CUDECOMP: ================================\n");
+    printf("CUDECOMP:\n");
+  }
+}
+
+void resetPerformanceSamples(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) {
+  if (!handle->performance_report_enable) return;
+
+  // Reset all transpose sample collections in the map
+  for (auto& entry : grid_desc->transpose_perf_samples_map) {
+    auto& collection = entry.second;
+    collection.sample_idx = 0;
+    collection.warmup_count = 0;
+
+    // Mark all samples as invalid and reset counters
+    for (auto& sample : collection.samples) {
+      sample.valid = false;
+      sample.alltoall_timing_count = 0;
+      sample.alltoall_bytes = 0;
+    }
+  }
+
+  // Reset all halo sample collections in the map
+  for (auto& entry : grid_desc->halo_perf_samples_map) {
+    auto& collection = entry.second;
+    collection.sample_idx = 0;
+    collection.warmup_count = 0;
+
+    // Mark all samples as invalid and reset counters
+    for (auto& sample : collection.samples) {
+      sample.valid = false;
+      sample.sendrecv_bytes = 0;
+    }
+  }
+}
+
+// Helper function to advance transpose sample index with warmup handling
+void advanceTransposePerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                       const cudecompTransposeConfigKey& config) {
+  if (!handle->performance_report_enable) return;
+
+  auto& collection = getOrCreateTransposePerformanceSamples(handle, grid_desc, config);
+
+  // Check if we're still in warmup phase
+  if (collection.warmup_count < handle->performance_report_warmup_samples) {
+    collection.warmup_count++;
+    // During warmup, don't advance the circular buffer, just mark current sample as invalid
+    collection.samples[collection.sample_idx].valid = false;
+  } else {
+    // Past warmup, advance the circular buffer normally
+    collection.sample_idx = (collection.sample_idx + 1) % handle->performance_report_samples;
+  }
+}
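+
+// Worked example (editorial, assuming the advance helpers run after each
+// measured operation): with the documented defaults of 3 warmup samples and
+// 20 report samples, the first three measurements of a configuration land in
+// slot 0 and are immediately invalidated; from the fourth measurement on,
+// sample_idx advances as (sample_idx + 1) % 20, so the circular buffer
+// retains the 20 most recent post-warmup samples.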
+
+// Helper function to advance halo sample index with warmup handling
+void advanceHaloPerformanceSample(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                                  const cudecompHaloConfigKey& config) {
+  if (!handle->performance_report_enable) return;
+
+  auto& collection = getOrCreateHaloPerformanceSamples(handle, grid_desc, config);
+
+  // Check if we're still in warmup phase
+  if (collection.warmup_count < handle->performance_report_warmup_samples) {
+    collection.warmup_count++;
+    // During warmup, don't advance the circular buffer, just mark current sample as invalid
+    collection.samples[collection.sample_idx].valid = false;
+  } else {
+    // Past warmup, advance the circular buffer normally
+    collection.sample_idx = (collection.sample_idx + 1) % handle->performance_report_samples;
+  }
+}
+
+} // namespace cudecomp
diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc
index 69e9dbc..895fe86 100644
--- a/tests/cc/halo_test.cc
+++ b/tests/cc/halo_test.cc
@@ -587,6 +587,21 @@ int main(int argc, char** argv) {
   }
 
   // Finalize
+  // Free grid descriptors
+  for (auto& entry : grid_desc_cache) {
+    auto& backend = entry.first;
+    auto& grid_desc = entry.second;
+    // If backend matches workspace, free workspace
+    if (std::get<0>(workspace) == static_cast<int>(backend)) {
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc, std::get<1>(workspace)));
+      std::get<0>(workspace) = -1;
+      std::get<1>(workspace) = nullptr;
+      std::get<2>(workspace) = 0;
+    }
+    CHECK_CUDECOMP(cudecompGridDescDestroy(handle, grid_desc));
+  }
+  grid_desc_cache.clear();
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle));
 
   CHECK_MPI_EXIT(MPI_Finalize());
diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc
index f9ad62c..c217a3c 100644
--- a/tests/cc/transpose_test.cc
+++ b/tests/cc/transpose_test.cc
@@ -651,6 +651,21 @@ int main(int argc, char** argv) {
   }
 
   // Finalize
+  // Free grid descriptors
+  for (auto& entry : grid_desc_cache) {
+    auto& backend = entry.first;
+    auto& grid_desc = entry.second;
+    // Free workspace using correct grid descriptor
+    if (std::get<0>(workspace) == static_cast<int>(backend)) {
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc, std::get<1>(workspace)));
+      std::get<0>(workspace) = -1;
+      std::get<1>(workspace) = nullptr;
+      std::get<2>(workspace) = 0;
+    }
+    CHECK_CUDECOMP(cudecompGridDescDestroy(handle, grid_desc));
+  }
+  grid_desc_cache.clear();
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle));
 
   CHECK_MPI_EXIT(MPI_Finalize());
diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90
index 9e67e1c..ac816c3 100644
--- a/tests/fortran/halo_test.f90
+++ b/tests/fortran/halo_test.f90
@@ -630,6 +630,17 @@ program main
     if (nfailed /= 0) retcode = 1;
   endif
 
+  ! Free grid descriptors
+  do i = 1, 5
+    if (grid_desc_cache_set(i)) then
+      ! Free workspace with correct grid descriptor
+      if (work_backend == i) then
+        CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc_cache(i), work_d))
+      endif
+      CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc_cache(i)))
+    endif
+  end do
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle))
 
   call MPI_Finalize(ierr)
diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90
index 02810df..57caf7c 100644
--- a/tests/fortran/transpose_test.f90
+++ b/tests/fortran/transpose_test.f90
@@ -654,6 +654,17 @@ program main
     if (nfailed /= 0) retcode = 1;
   endif
 
+  ! Free grid descriptors
+  do i = 1, 7
+    if (grid_desc_cache_set(i)) then
+      ! Free workspace with correct grid descriptor
+      if (work_backend == i) then
+        CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc_cache(i), work_d))
+      endif
+      CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc_cache(i)))
+    endif
+  end do
+
   CHECK_CUDECOMP_EXIT(cudecompFinalize(handle))
 
   call MPI_Finalize(ierr)