1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -163,6 +163,7 @@ target_sources(cudecomp
${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc
${CMAKE_CURRENT_SOURCE_DIR}/src/graph.cc
${CMAKE_CURRENT_SOURCE_DIR}/src/nvml_wrap.cc
${CMAKE_CURRENT_SOURCE_DIR}/src/performance.cc
)

set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels_rdc.cu PROPERTIES COMPILE_FLAGS -rdc=true)
58 changes: 58 additions & 0 deletions docs/env_vars.rst
@@ -32,3 +32,61 @@ CUDECOMP_ENABLE_CUDA_GRAPHS
and communication overlap of packing kernels in large scale cases.

Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.

CUDECOMP_ENABLE_PERFORMANCE_REPORT
------------------------------------
(since v0.5.1)

:code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` controls whether cuDecomp performance reporting is enabled.

Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.

CUDECOMP_PERFORMANCE_REPORT_DETAIL
----------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` controls the verbosity of performance reporting when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. This setting determines whether individual sample data is printed in addition to the aggregated performance summary.

The following values are supported:

- :code:`0`: Aggregated report only - prints only the summary table with averaged performance statistics (default)
- :code:`1`: Per-sample reporting on rank 0 - prints individual sample data for each transpose/halo configuration, but only from rank 0
- :code:`2`: Per-sample reporting on all ranks - prints individual sample data for each transpose/halo configuration from all ranks, gathered and sorted by rank on rank 0

Default setting is :code:`0`.

CUDECOMP_PERFORMANCE_REPORT_SAMPLES
-----------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_SAMPLES` controls the number of performance samples to keep for the final performance report. This setting determines the size of the circular buffer used to store timing measurements for each transpose/halo configuration.

Default setting is :code:`20` samples.

CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES
------------------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES` controls the number of initial samples to ignore for each transpose/halo configuration. This helps exclude outliers from GPU warmup, memory allocation, and other initialization effects from the final performance statistics.

Default setting is :code:`3` warmup samples. Setting this variable to :code:`0` disables warmup sample filtering.

CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR
-------------------------------------
(since v0.5.1)

:code:`CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR` controls the directory where CSV performance reports are written when :code:`CUDECOMP_ENABLE_PERFORMANCE_REPORT` is enabled. When this variable is set, cuDecomp will write performance data to CSV files in the specified directory.

CSV files are created with descriptive names encoding the grid configuration, for example:
:code:`cudecomp-perf-report-transpose-aggregated-tcomm_1-hcomm_1-pdims_2x2-gdims_256x256x256-memorder_012012012.csv`

The following CSV files are generated:

- Aggregated transpose performance data
- Aggregated halo performance data
- Per-sample transpose data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0)
- Per-sample halo data (when :code:`CUDECOMP_PERFORMANCE_REPORT_DETAIL` > 0)

Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format.

Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output.
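As a quick illustration of how these variables fit together, here is a minimal sketch that drives them from application code instead of the shell. The program skeleton and the `perf-reports` directory name are illustrative assumptions; only the variable names and the `cudecompInit`/`cudecompFinalize` entry points come from the cuDecomp API.

```cpp
// Minimal sketch (assumptions noted inline): the variables are read during
// cudecompInit, so they must be set before that call. Error checking is
// omitted for brevity.
#include <cstdlib>
#include <mpi.h>
#include <cudecomp.h>

int main(int argc, char** argv) {
  setenv("CUDECOMP_ENABLE_PERFORMANCE_REPORT", "1", 1);
  setenv("CUDECOMP_PERFORMANCE_REPORT_DETAIL", "1", 1);         // per-sample data on rank 0
  setenv("CUDECOMP_PERFORMANCE_REPORT_SAMPLES", "50", 1);       // 50-sample circular buffer
  setenv("CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES", "5", 1); // skip first 5 samples
  setenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR", "perf-reports", 1); // hypothetical path

  MPI_Init(&argc, &argv);
  cudecompHandle_t handle;
  cudecompInit(&handle, MPI_COMM_WORLD);
  // ... create grid descriptors, run transposes and halo exchanges ...
  cudecompFinalize(handle); // report output is produced during teardown (assumption)
  MPI_Finalize();
  return 0;
}
```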
69 changes: 54 additions & 15 deletions include/internal/comm_routines.h
@@ -150,14 +150,19 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
#endif

template <typename T>
static void
cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts, const std::vector<comm_count_t>& send_offsets,
T* recv_buff, const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets, const std::vector<comm_count_t>& recv_offsets_nvshmem,
cudecompCommAxis comm_axis, cudaStream_t stream) {
static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts,
const std::vector<comm_count_t>& send_offsets, T* recv_buff,
const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets,
const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
cudaStream_t stream, cudecompTransposePerformanceSample* current_sample = nullptr) {
nvtx::rangePush("cudecompAlltoall");

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count], stream));
}

#ifdef ENABLE_NVSHMEM
if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued &&
transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) &&
@@ -269,18 +274,28 @@ cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
break;
}
}

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], stream));
current_sample->alltoall_timing_count++;
}

nvtx::rangePop();
}
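Worth noting for readers of this hunk: each collective call (or pipelined stage, below) contributes one start/end event pair, indexed by `alltoall_timing_count`. A hedged sketch of how such pairs reduce to a per-sample time; the helper name is hypothetical, and the real accounting lives in `src/performance.cc`:

```cpp
// Hypothetical helper, not the shipped src/performance.cc code: sums GPU time
// across the alltoall event pairs recorded for one transpose sample.
// cudaEventElapsedTime requires both events to have completed, so callers must
// synchronize first (e.g. cudaEventSynchronize on the last end event).
static float alltoallElapsedMs(const cudecompTransposePerformanceSample& sample) {
  float total_ms = 0.0f;
  for (int32_t i = 0; i < sample.alltoall_timing_count; ++i) {
    float ms = 0.0f;
    CHECK_CUDA(cudaEventElapsedTime(&ms, sample.alltoall_start_events[i],
                                    sample.alltoall_end_events[i]));
    total_ms += ms;
  }
  return total_ms;
}
```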

template <typename T>
static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts,
const std::vector<comm_count_t>& send_offsets, T* recv_buff,
const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets,
const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
const std::vector<int>& src_ranks, const std::vector<int>& dst_ranks,
cudaStream_t stream, bool& synced) {
static void
cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
const std::vector<comm_count_t>& send_counts, const std::vector<comm_count_t>& send_offsets,
T* recv_buff, const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets,
const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
const std::vector<int>& src_ranks, const std::vector<int>& dst_ranks, cudaStream_t stream,
bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) {

// If there are no transfers to complete, quick return
if (send_counts.size() == 0 && recv_counts.size() == 0) { return; }

std::ostringstream os;
os << "cudecompAlltoallPipelined_";
for (int i = 0; i < src_ranks.size(); ++i) {
@@ -289,6 +304,14 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude
}
nvtx::rangePush(os.str());

int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank;
if (handle->performance_report_enable && current_sample && src_ranks[0] != self_rank) {
// Note: skipping self-copy for timing as it should be overlapped
CHECK_CUDA(cudaStreamWaitEvent(handle->pl_stream, grid_desc->events[dst_ranks[0]], 0));
CHECK_CUDA(cudaEventRecord(current_sample->alltoall_start_events[current_sample->alltoall_timing_count],
handle->pl_stream));
}

#ifdef ENABLE_NVSHMEM
if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued &&
transposeBackendRequiresMpi(grid_desc->config.transpose_comm_backend) &&
@@ -459,6 +482,12 @@ static void cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cude
break;
}
}

if (handle->performance_report_enable && current_sample && src_ranks[0] != self_rank) {
CHECK_CUDA(
cudaEventRecord(current_sample->alltoall_end_events[current_sample->alltoall_timing_count], handle->pl_stream));
current_sample->alltoall_timing_count++;
}
nvtx::rangePop();
}

@@ -468,9 +497,14 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
const std::array<comm_count_t, 2>& send_counts,
const std::array<size_t, 2>& send_offsets, T* recv_buff,
const std::array<comm_count_t, 2>& recv_counts,
const std::array<size_t, 2>& recv_offsets, cudaStream_t stream = 0) {
const std::array<size_t, 2>& recv_offsets, cudaStream_t stream = 0,
cudecompHaloPerformanceSample* current_sample = nullptr) {
nvtx::rangePush("cudecompSendRecvPair");

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_start_event, stream));
}

#ifdef ENABLE_NVSHMEM
if (handle->rank == 0 && handle->nvshmem_initialized && !handle->nvshmem_mixed_buffer_warning_issued &&
haloBackendRequiresMpi(grid_desc->config.halo_comm_backend) &&
@@ -592,6 +626,11 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
break;
}
}

if (handle->performance_report_enable && current_sample) {
CHECK_CUDA(cudaEventRecord(current_sample->sendrecv_end_event, stream));
}

nvtx::rangePop();
}

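The halo path records a single `sendrecv_start_event`/`sendrecv_end_event` pair here, and `cudecompUpdateHalos_` (in `include/internal/halo.h` below) fills in `sendrecv_bytes`; together these suffice to derive an effective exchange bandwidth. An illustrative sketch under those assumptions, not the library's actual reporting code:

```cpp
// Illustrative only: effective halo exchange bandwidth in GB/s for one sample.
// Assumes both events have completed (e.g. after cudaEventSynchronize).
static double haloBandwidthGBs(const cudecompHaloPerformanceSample& sample) {
  float ms = 0.0f;
  CHECK_CUDA(cudaEventElapsedTime(&ms, sample.sendrecv_start_event,
                                  sample.sendrecv_end_event));
  if (ms <= 0.0f) { return 0.0; }
  return (static_cast<double>(sample.sendrecv_bytes) / 1e9) / (ms / 1e3);
}
```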
67 changes: 67 additions & 0 deletions include/internal/common.h
@@ -33,8 +33,10 @@

#include <array>
#include <complex>
#include <functional>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
@@ -43,6 +45,10 @@
#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>
#ifdef ENABLE_NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
#endif

#include "cudecomp.h"
#include "internal/checks.h"
@@ -108,6 +114,15 @@ struct cudecompHandle {

// CUDA graphs
bool cuda_graphs_enable = false; // Flag to control whether CUDA graphs are used

// Performance reporting related entries
bool performance_report_enable = false; // flag to track if performance reporting is enabled
int32_t performance_report_detail =
0; // performance report detail level: 0=aggregated, 1=per-sample rank 0, 2=per-sample all ranks
int32_t performance_report_samples = 20; // number of performance samples to keep for final report
int32_t performance_report_warmup_samples = 3; // number of initial warmup samples to ignore for each configuration
std::string performance_report_write_dir =
""; // directory to write CSV performance reports, empty means no file writing
};

// Structure with information about row/column communicator
@@ -124,6 +139,41 @@
#endif
};

// Structure containing data for a transpose performance sample
struct cudecompTransposePerformanceSample {
cudaEvent_t transpose_start_event;
cudaEvent_t transpose_end_event;
std::vector<cudaEvent_t> alltoall_start_events;
std::vector<cudaEvent_t> alltoall_end_events;
int32_t alltoall_timing_count = 0;
size_t alltoall_bytes = 0;
bool valid = false;
};

// Collection of transpose performance samples for a specific configuration
struct cudecompTransposePerformanceSampleCollection {
std::vector<cudecompTransposePerformanceSample> samples;
int32_t sample_idx = 0;
int32_t warmup_count = 0;
};

// Structure containing data for a halo performance sample
struct cudecompHaloPerformanceSample {
cudaEvent_t halo_start_event;
cudaEvent_t halo_end_event;
cudaEvent_t sendrecv_start_event;
cudaEvent_t sendrecv_end_event;
size_t sendrecv_bytes = 0;
bool valid = false;
};

// Collection of halo performance samples for a specific configuration
struct cudecompHaloPerformanceSampleCollection {
std::vector<cudecompHaloPerformanceSample> samples;
int32_t sample_idx = 0;
int32_t warmup_count = 0;
};

// cuDecomp grid descriptor containing grid-specific information
struct cudecompGridDesc {
cudecompGridDescConfig_t config; // configuration struct
@@ -144,6 +194,23 @@
cudecomp::ncclComm
nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle

// Performance reporting related entries
std::vector<cudaEvent_t> alltoall_start_events; // events for alltoall timing
std::vector<cudaEvent_t> alltoall_end_events; // events for alltoall timing
int32_t alltoall_timing_count = 0; // count of alltoall timing event pairs (for pipelined alltoall)
cudaEvent_t transpose_start_event; // event for transpose timing
cudaEvent_t transpose_end_event; // event for transpose timing

std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
cudecompTransposePerformanceSampleCollection>
transpose_perf_samples_map;

std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<bool, 3>, std::array<int32_t, 3>,
bool, cudecompDataType_t>,
cudecompHaloPerformanceSampleCollection>
halo_perf_samples_map;

bool initialized = false;
};

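One detail worth flagging in the two maps above: `std::unordered_map` provides no hash for `std::tuple` or `std::array` keys, so a custom hasher has to exist somewhere (the new `<functional>` include is consistent with that). A sketch of the standard hash-combine pattern such a functor would follow; cuDecomp's actual implementation may differ:

```cpp
#include <array>
#include <cstddef>
#include <functional>
#include <tuple>

// Boost-style combiner: folds one field's hash into a running seed.
template <typename T>
static void hashCombine(std::size_t& seed, const T& v) {
  seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// std::array fields have no std::hash specialization; combine element-wise.
template <typename T, std::size_t N>
static void hashCombine(std::size_t& seed, const std::array<T, N>& a) {
  for (const auto& v : a) { hashCombine(seed, v); }
}

// Hash functor usable as the third template argument of the sample maps.
struct PerfSampleKeyHash {
  template <typename... Ts>
  std::size_t operator()(const std::tuple<Ts...>& t) const {
    std::size_t seed = 0;
    std::apply([&seed](const auto&... fields) { (hashCombine(seed, fields), ...); }, t);
    return seed;
  }
};
```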
50 changes: 47 additions & 3 deletions include/internal/halo.h
@@ -41,6 +41,8 @@
#include "internal/comm_routines.h"
#include "internal/cudecomp_kernels.h"
#include "internal/nvtx.h"
#include "internal/performance.h"
#include "internal/utils.h"

namespace cudecomp {

@@ -73,6 +75,20 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
// Quick return if no halos
if (halo_extents[dim] == 0) { return; }

cudecompHaloPerformanceSample* current_sample = nullptr;
if (handle->performance_report_enable) {
auto& samples =
getOrCreateHaloPerformanceSamples(handle, grid_desc,
createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
padding.data(), getCudecompDataType<T>()));
current_sample = &samples.samples[samples.sample_idx];
current_sample->sendrecv_bytes = 0;
current_sample->valid = true;

// Record start event
CHECK_CUDA(cudaEventRecord(current_sample->halo_start_event, stream));
}

// Check if halos include more than one process (unsupported currently).
int count = 0;
for (int i = 0; i < 3; ++i) {
@@ -120,6 +136,13 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
c = 0;
} else if (neighbors[0] == -1 && neighbors[1] == -1) {
// Single rank in this dimension and not periodic. Return.
if (handle->performance_report_enable && current_sample) {
// Record end event and advance sample even for early return
CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream));
advanceHaloPerformanceSample(handle, grid_desc,
createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
padding.data(), getCudecompDataType<T>()));
}
return;
}

@@ -204,7 +227,14 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
std::array<size_t, 2> offsets{};
offsets[1] = halo_size;

cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream);
if (handle->performance_report_enable && current_sample) {
current_sample->sendrecv_bytes = 0;
for (int i = 0; i < 2; ++i) {
if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); }
}
}
cudecompSendRecvPair(handle, grid_desc, neighbors, send_buff, counts, offsets, recv_buff, counts, offsets, stream,
current_sample);

// Unpack
// Left
@@ -261,10 +291,24 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
lx[dim] = shape_g_h_p[dim] - halo_extents[dim];
recv_offsets[1] = getPencilPtrOffset(pinfo_h, lx);

cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets,
stream);
if (handle->performance_report_enable && current_sample) {
current_sample->sendrecv_bytes = 0;
for (int i = 0; i < 2; ++i) {
if (neighbors[i] != -1) { current_sample->sendrecv_bytes += halo_size * sizeof(T); }
}
}
cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, stream,
current_sample);
} break;
}

if (handle->performance_report_enable && current_sample) {
// Record end event
CHECK_CUDA(cudaEventRecord(current_sample->halo_end_event, stream));
advanceHaloPerformanceSample(handle, grid_desc,
createHaloConfig(ax, dim, input, halo_extents.data(), halo_periods.data(),
padding.data(), getCudecompDataType<T>()));
}
}

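Finally, the `getOrCreateHaloPerformanceSamples`/`advanceHaloPerformanceSample` helpers used in this file are declared in `internal/performance.h`, which this view does not show. A plausible sketch of the circular-buffer bookkeeping implied by the `sample_idx` and `warmup_count` fields in `common.h`; the shipped logic in `src/performance.cc` may differ:

```cpp
// Plausible sketch, not the shipped implementation: advance the circular
// sample buffer after a halo exchange has recorded its events.
static void advanceHaloSampleSketch(cudecompHaloPerformanceSampleCollection& coll,
                                    int32_t warmup_samples) {
  if (coll.warmup_count < warmup_samples) {
    // Warmup phase: drop the measurement and reuse the slot
    // (CUDECOMP_PERFORMANCE_REPORT_WARMUP_SAMPLES sets the threshold).
    coll.samples[coll.sample_idx].valid = false;
    coll.warmup_count++;
    return;
  }
  // Steady state: step to the next slot, wrapping so the oldest sample is
  // overwritten once CUDECOMP_PERFORMANCE_REPORT_SAMPLES slots have filled.
  coll.sample_idx = (coll.sample_idx + 1) % static_cast<int32_t>(coll.samples.size());
}
```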