From d6558e614d1826419087cfa50f19d0ab922591bd Mon Sep 17 00:00:00 2001 From: hayk Date: Mon, 28 Apr 2025 06:58:56 -0400 Subject: [PATCH] gpu-aware-mpi flag added cmake + comms --- CMakeLists.txt | 21 ++ cmake/defaults.cmake | 12 + cmake/report.cmake | 37 ++- src/engines/engine_printer.cpp | 73 +++++- src/framework/domain/comm_mpi.hpp | 393 ++++++++++++++++++------------ 5 files changed, 377 insertions(+), 159 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c328e8b8..8738afbe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,10 @@ set(mpi ${default_mpi} CACHE BOOL "Use MPI") +set(gpu_aware_mpi + ${default_gpu_aware_mpi} + CACHE BOOL "Enable GPU-aware MPI") + # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -90,10 +94,18 @@ include_directories(${plog_SRC}/include) set_precision(${precision}) if("${Kokkos_DEVICES}" MATCHES "CUDA") add_compile_options("-D CUDA_ENABLED") + set(DEVICE_ENABLED ON) + add_compile_options("-D DEVICE_ENABLED") elseif("${Kokkos_DEVICES}" MATCHES "HIP") add_compile_options("-D HIP_ENABLED") + set(DEVICE_ENABLED ON) + add_compile_options("-D DEVICE_ENABLED") elseif("${Kokkos_DEVICES}" MATCHES "SYCL") add_compile_options("-D SYCL_ENABLED") + set(DEVICE_ENABLED ON) + add_compile_options("-D DEVICE_ENABLED") +else() + set(DEVICE_ENABLED OFF) endif() # MPI @@ -102,6 +114,15 @@ if(${mpi}) include_directories(${MPI_CXX_INCLUDE_PATH}) add_compile_options("-D MPI_ENABLED") set(DEPENDENCIES ${DEPENDENCIES} MPI::MPI_CXX) + if(${DEVICE_ENABLED}) + if(${gpu_aware_mpi}) + add_compile_options("-D GPU_AWARE_MPI") + endif() + else() + set(gpu_aware_mpi + OFF + CACHE BOOL "Use explicit copy when using MPI + GPU") + endif() endif() # Output diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index 30e605a5..7070644e 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -64,3 +64,15 @@ else() endif() set_property(CACHE default_mpi PROPERTY TYPE BOOL) + +if(DEFINED ENV{Entity_ENABLE_GPU_AWARE_MPI}) + set(default_gpu_aware_mpi + $ENV{Entity_ENABLE_GPU_AWARE_MPI} + CACHE INTERNAL "Default flag for GPU-aware MPI") +else() + set(default_gpu_aware_mpi + ON + CACHE INTERNAL "Default flag for GPU-aware MPI") +endif() + +set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL) diff --git a/cmake/report.cmake b/cmake/report.cmake index 626de15a..5cf6961a 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -36,7 +36,7 @@ printchoices( ${default_precision} "${Blue}" PRECISION_REPORT - 36) + 46) printchoices( "Output" "output" @@ -45,7 +45,7 @@ printchoices( ${default_output} "${Green}" OUTPUT_REPORT - 36) + 46) printchoices( "MPI" "mpi" @@ -54,7 +54,20 @@ printchoices( OFF "${Green}" MPI_REPORT - 36) + 46) + +if(${mpi} AND ${DEVICE_ENABLED}) + printchoices( + "GPU-aware MPI" + "gpu_aware_mpi" + "${ON_OFF_VALUES}" + ${gpu_aware_mpi} + OFF + "${Green}" + GPU_AWARE_MPI_REPORT + 46) +endif() + printchoices( "Debug mode" "DEBUG" @@ -63,7 +76,7 @@ printchoices( OFF "${Green}" DEBUG_REPORT - 36) + 46) if(NOT ${PROJECT_VERSION_TWEAK} EQUAL 0) set(VERSION_SYMBOL "v${PROJECT_VERSION_MAJOR}." "${PROJECT_VERSION_MINOR}.") @@ -111,13 +124,23 @@ string(REPLACE ";" "+" Kokkos_DEVICES "${Kokkos_DEVICES}") string( APPEND REPORT_TEXT - " - ARCH [${Magenta}Kokkos_ARCH_***${ColorReset}]: ${Kokkos_ARCH}" + " - ARCH [${Magenta}Kokkos_ARCH_***${ColorReset}]: " + "${Kokkos_ARCH}" "\n" - " - DEVICES [${Magenta}Kokkos_ENABLE_***${ColorReset}]: ${Kokkos_DEVICES}" + " - DEVICES [${Magenta}Kokkos_ENABLE_***${ColorReset}]: " + "${Kokkos_DEVICES}" "\n" " " ${MPI_REPORT} - "\n" + "\n") + +if(${mpi} AND ${DEVICE_ENABLED}) + string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n") +endif() + +string( + APPEND + REPORT_TEXT " " ${DEBUG_REPORT} "\n" diff --git a/src/engines/engine_printer.cpp b/src/engines/engine_printer.cpp index 66725fd5..9d1f74a7 100644 --- a/src/engines/engine_printer.cpp +++ b/src/engines/engine_printer.cpp @@ -106,8 +106,8 @@ namespace ntt { color::RESET); } - auto bytes_to_human_readable( - std::size_t bytes) -> std::pair { + auto bytes_to_human_readable(std::size_t bytes) + -> std::pair { const std::vector units { "B", "KB", "MB", "GB", "TB" }; idx_t unit_idx = 0; auto size = static_cast(bytes); @@ -214,6 +214,10 @@ namespace ntt { report += "\n\n"; add_header(report, { entity_version }, { color::BRIGHT_GREEN }); report += "\n"; + + /* + * Backend + */ add_category(report, 4, "Backend"); add_param(report, 4, "Build hash", "%s", hash.c_str()); add_param(report, 4, "CXX", "%s [%s]", ccx.c_str(), cpp_standard.c_str()); @@ -223,11 +227,76 @@ namespace ntt { add_param(report, 4, "HIP", "%s", hip_version.c_str()); #endif add_param(report, 4, "MPI", "%s", mpi_version.c_str()); +#if defined(MPI_ENABLED) && defined(DEVICE_ENABLED) + #if defined(GPU_AWARE_MPI) + const std::string gpu_aware_mpi = "ON"; + #else + const std::string gpu_aware_mpi = "OFF"; + #endif + add_param(report, 4, "GPU-aware MPI", "%s", gpu_aware_mpi.c_str()); +#endif add_param(report, 4, "Kokkos", "%s", kokkos_version.c_str()); add_param(report, 4, "ADIOS2", "%s", adios2_version.c_str()); add_param(report, 4, "Precision", "%s", precision); add_param(report, 4, "Debug", "%s", dbg.c_str()); report += "\n"; + + /* + * Compilation flags + */ + add_category(report, 4, "Compilation flags"); +#if defined(SINGLE_PRECISION) + add_param(report, 4, "SINGLE_PRECISION", "%s", "ON"); +#else + add_param(report, 4, "SINGLE_PRECISION", "%s", "OFF"); +#endif + +#if defined(OUTPUT_ENABLED) + add_param(report, 4, "OUTPUT_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "OUTPUT_ENABLED", "%s", "OFF"); +#endif + +#if defined(DEBUG) + add_param(report, 4, "DEBUG", "%s", "ON"); +#else + add_param(report, 4, "DEBUG", "%s", "OFF"); +#endif + +#if defined(CUDA_ENABLED) + add_param(report, 4, "CUDA_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "CUDA_ENABLED", "%s", "OFF"); +#endif + +#if defined(HIP_ENABLED) + add_param(report, 4, "HIP_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "HIP_ENABLED", "%s", "OFF"); +#endif + +#if defined(DEVICE_ENABLED) + add_param(report, 4, "DEVICE_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "DEVICE_ENABLED", "%s", "OFF"); +#endif + +#if defined(MPI_ENABLED) + add_param(report, 4, "MPI_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "MPI_ENABLED", "%s", "OFF"); +#endif + +#if defined(GPU_AWARE_MPI) + add_param(report, 4, "GPU_AWARE_MPI", "%s", "ON"); +#else + add_param(report, 4, "GPU_AWARE_MPI", "%s", "OFF"); +#endif + report += "\n"; + + /* + * Simulation configs + */ add_category(report, 4, "Configuration"); add_param(report, 4, diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index fbeb7ecd..e0d0cb4b 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -33,6 +33,214 @@ namespace comm { using namespace ntt; + namespace flds { + template + void send_recv(ndarray_t& send_arr, + ndarray_t& recv_arr, + int send_rank, + int recv_rank, + ncells_t nsend, + ncells_t nrecv) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Sendrecv(send_arr.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_arr.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#else + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + auto recv_arr_h = Kokkos::create_mirror_view(recv_arr); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Sendrecv(send_arr_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_arr_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_arr, recv_arr_h); +#endif + } + + template + void send(ndarray_t& send_arr, int send_rank, ncells_t nsend) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#else + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Send(send_arr_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#endif + } + + template + void recv(ndarray_t& recv_arr, int recv_rank, ncells_t nrecv) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Recv(recv_arr.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#else + auto recv_arr_h = Kokkos::create_mirror_view(recv_arr); + MPI_Recv(recv_arr_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_arr, recv_arr_h); +#endif + } + + template + void communicate(ndarray_t& send_arr, + ndarray_t& recv_arr, + int send_rank, + int recv_rank, + ncells_t nsend, + ncells_t nrecv) { + if (send_rank >= 0 and recv_rank >= 0 and nsend > 0 and nrecv > 0) { + send_recv(send_arr, recv_arr, send_rank, recv_rank, nsend, nrecv); + } else if (send_rank >= 0 and nsend > 0) { + send(send_arr, send_rank, nsend); + } else if (recv_rank >= 0 and nrecv > 0) { + recv(recv_arr, recv_rank, nrecv); + } + } + + } // namespace flds + + namespace prtls { + template + void send_recv(array_t& send_arr, + array_t& recv_arr, + int send_rank, + int recv_rank, + npart_t nsend, + npart_t nrecv, + npart_t offset) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Sendrecv(send_arr.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_arr.data() + offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#else + const auto slice = std::make_pair(offset, offset + nrecv); + + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + auto recv_arr_h = Kokkos::create_mirror_view( + Kokkos::subview(recv_arr, slice)); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Sendrecv(send_arr_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_arr_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(Kokkos::subview(recv_arr, slice), recv_arr_h); +#endif + } + + template + void send(array_t& send_arr, int send_rank, npart_t nsend) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#else + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Send(send_arr_h.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#endif + } + + template + void recv(array_t& recv_arr, int recv_rank, npart_t nrecv, npart_t offset) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Recv(recv_arr.data() + offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#else + const auto slice = std::make_pair(offset, offset + nrecv); + + auto recv_arr_h = Kokkos::create_mirror_view( + Kokkos::subview(recv_arr, slice)); + MPI_Recv(recv_arr_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(Kokkos::subview(recv_arr, slice), recv_arr_h); +#endif + } + + template + void communicate(array_t& send_arr, + array_t& recv_arr, + int send_rank, + int recv_rank, + npart_t nsend, + npart_t nrecv, + npart_t offset) { + if (send_rank >= 0 && recv_rank >= 0) { + raise::ErrorIf( + nrecv + offset > recv_arr.extent(0), + "recv_arr is not large enough to hold the received particles", + HERE); + send_recv(send_arr, recv_arr, send_rank, recv_rank, nsend, nrecv, offset); + } else if (send_rank >= 0) { + send(send_arr, send_rank, nsend); + } else if (recv_rank >= 0) { + raise::ErrorIf( + nrecv + offset > recv_arr.extent(0), + "recv_arr is not large enough to hold the received particles", + HERE); + recv(recv_arr, recv_rank, nrecv, offset); + } else { + raise::Error("CommunicateParticles called with negative ranks", HERE); + } + } + } // namespace prtls + template inline void CommunicateField(unsigned int idx, ndfield_t& fld, @@ -185,38 +393,12 @@ namespace comm { } } - if (send_rank >= 0 && recv_rank >= 0) { - MPI_Sendrecv(send_fld.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - recv_fld.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } else if (send_rank >= 0) { - MPI_Send(send_fld.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - - } else if (recv_rank >= 0) { - MPI_Recv(recv_fld.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } else { - raise::Error("CommunicateField called with negative ranks", HERE); - } + flds::communicate(D) + 1>(send_fld, + recv_fld, + send_rank, + recv_rank, + nsend, + nrecv); if (recv_rank >= 0) { @@ -398,124 +580,35 @@ namespace comm { const auto recv_offset_prtldx = current_received * NPRTLDX; const auto recv_offset_pld = current_received * NPLDS; - if ((send_rank >= 0) and (recv_rank >= 0) and (npart_send_in > 0) and - (npart_recv_in > 0)) { - raise::ErrorIf(recv_offset_int + npart_recv_in * NINTS > - recv_buff_int.extent(0), - "incorrect # of recv particles", - HERE); - MPI_Sendrecv(send_buff_int.data(), - npart_send_in * NINTS, - mpi::get_type(), - send_rank, - 0, - recv_buff_int.data() + recv_offset_int, - npart_recv_in * NINTS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Sendrecv(send_buff_real.data(), - npart_send_in * NREALS, - mpi::get_type(), - send_rank, - 0, - recv_buff_real.data() + recv_offset_real, - npart_recv_in * NREALS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Sendrecv(send_buff_prtldx.data(), - npart_send_in * NPRTLDX, - mpi::get_type(), - send_rank, - 0, - recv_buff_prtldx.data() + recv_offset_prtldx, - npart_recv_in * NPRTLDX, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - if (NPLDS > 0) { - MPI_Sendrecv(send_buff_pld.data(), - npart_send_in * NPLDS, - mpi::get_type(), - send_rank, - 0, - recv_buff_pld.data() + recv_offset_pld, - npart_recv_in * NPLDS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } - } else if ((send_rank >= 0) and (npart_send_in > 0)) { - MPI_Send(send_buff_int.data(), - npart_send_in * NINTS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - MPI_Send(send_buff_real.data(), - npart_send_in * NREALS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - MPI_Send(send_buff_prtldx.data(), - npart_send_in * NPRTLDX, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - if (NPLDS > 0) { - MPI_Send(send_buff_pld.data(), - npart_send_in * NPLDS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - } - } else if ((recv_rank >= 0) and (npart_recv_in > 0)) { - raise::ErrorIf(recv_offset_int + npart_recv_in * NINTS > - recv_buff_int.extent(0), - "incorrect # of recv particles", - HERE); - MPI_Recv(recv_buff_int.data() + recv_offset_int, - npart_recv_in * NINTS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Recv(recv_buff_real.data() + recv_offset_real, - npart_recv_in * NREALS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Recv(recv_buff_prtldx.data() + recv_offset_prtldx, - npart_recv_in * NPRTLDX, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - if (NPLDS > 0) { - MPI_Recv(recv_buff_pld.data() + recv_offset_pld, - npart_recv_in * NPLDS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } + prtls::communicate(send_buff_int, + recv_buff_int, + send_rank, + recv_rank, + npart_send_in * NINTS, + npart_recv_in * NINTS, + recv_offset_int); + prtls::communicate(send_buff_real, + recv_buff_real, + send_rank, + recv_rank, + npart_send_in * NREALS, + npart_recv_in * NREALS, + recv_offset_real); + prtls::communicate(send_buff_prtldx, + recv_buff_prtldx, + send_rank, + recv_rank, + npart_send_in * NPRTLDX, + npart_recv_in * NPRTLDX, + recv_offset_prtldx); + if (NPLDS > 0) { + prtls::communicate(send_buff_pld, + recv_buff_pld, + send_rank, + recv_rank, + npart_send_in * NPLDS, + npart_recv_in * NPLDS, + recv_offset_pld); } current_received += npart_recv_in; iteration++;