diff --git a/CMakeLists.txt b/CMakeLists.txt index cbea5ab5..06a7690d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,10 @@ set(mpi ${default_mpi} CACHE BOOL "Use MPI") +set(gpu_aware_mpi + ${default_gpu_aware_mpi} + CACHE BOOL "Enable GPU-aware MPI") + # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -90,10 +94,18 @@ include_directories(${plog_SRC}/include) set_precision(${precision}) if("${Kokkos_DEVICES}" MATCHES "CUDA") add_compile_options("-D CUDA_ENABLED") + set(DEVICE_ENABLED ON) + add_compile_options("-D DEVICE_ENABLED") elseif("${Kokkos_DEVICES}" MATCHES "HIP") add_compile_options("-D HIP_ENABLED") + set(DEVICE_ENABLED ON) + add_compile_options("-D DEVICE_ENABLED") elseif("${Kokkos_DEVICES}" MATCHES "SYCL") add_compile_options("-D SYCL_ENABLED") + set(DEVICE_ENABLED ON) + add_compile_options("-D DEVICE_ENABLED") +else() + set(DEVICE_ENABLED OFF) endif() if(("${Kokkos_DEVICES}" MATCHES "CUDA") @@ -110,14 +122,12 @@ if(${mpi}) include_directories(${MPI_CXX_INCLUDE_PATH}) add_compile_options("-D MPI_ENABLED") set(DEPENDENCIES ${DEPENDENCIES} MPI::MPI_CXX) - if(${DEVICE_ENABLED}) - set(mpi_device_copy - ${default_mpi_device_copy} - CACHE BOOL "Use explicit copy when using MPI + GPU") - add_compile_options("-D MPI_DEVICE_COPY") + if(${gpu_aware_mpi}) + add_compile_options("-D GPU_AWARE_MPI") + endif() else() - set(mpi_device_copy + set(gpu_aware_mpi OFF CACHE BOOL "Use explicit copy when using MPI + GPU") endif() diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index d106f420..2bfa9a61 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -74,3 +74,15 @@ else() endif() set_property(CACHE default_mpi PROPERTY TYPE BOOL) + +if(DEFINED ENV{Entity_ENABLE_GPU_AWARE_MPI}) + set(default_gpu_aware_mpi + $ENV{Entity_ENABLE_GPU_AWARE_MPI} + CACHE INTERNAL "Default flag for GPU-aware MPI") +else() + set(default_gpu_aware_mpi + ON + CACHE INTERNAL "Default flag for GPU-aware MPI") +endif() + +set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL) diff --git a/cmake/report.cmake b/cmake/report.cmake index 6578cfd5..b0e299d8 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -55,6 +55,17 @@ printchoices( "${Green}" MPI_REPORT 46) +if(${mpi} AND ${DEVICE_ENABLED}) + printchoices( + "GPU-aware MPI" + "gpu_aware_mpi" + "${ON_OFF_VALUES}" + ${gpu_aware_mpi} + OFF + "${Green}" + GPU_AWARE_MPI_REPORT + 46) +endif() printchoices( "Debug mode" "DEBUG" @@ -65,18 +76,6 @@ printchoices( DEBUG_REPORT 46) -if(${mpi} AND ${DEVICE_ENABLED}) - printchoices( - "MPI explicit copy" - "mpi_device_copy" - "${ON_OFF_VALUES}" - ${mpi_device_copy} - OFF - "${Green}" - MPI_DEVICE_COPY_REPORT - 46) -endif() - if(NOT ${PROJECT_VERSION_TWEAK} EQUAL 0) set(VERSION_SYMBOL "v${PROJECT_VERSION_MAJOR}." "${PROJECT_VERSION_MINOR}.") string(APPEND VERSION_SYMBOL @@ -134,7 +133,7 @@ string( "\n") if(${mpi} AND ${DEVICE_ENABLED}) - string(APPEND REPORT_TEXT " " ${MPI_DEVICE_COPY_REPORT} "\n") + string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n") endif() string( diff --git a/src/engines/engine_printer.cpp b/src/engines/engine_printer.cpp index 66725fd5..9d1f74a7 100644 --- a/src/engines/engine_printer.cpp +++ b/src/engines/engine_printer.cpp @@ -106,8 +106,8 @@ namespace ntt { color::RESET); } - auto bytes_to_human_readable( - std::size_t bytes) -> std::pair { + auto bytes_to_human_readable(std::size_t bytes) + -> std::pair { const std::vector units { "B", "KB", "MB", "GB", "TB" }; idx_t unit_idx = 0; auto size = static_cast(bytes); @@ -214,6 +214,10 @@ namespace ntt { report += "\n\n"; add_header(report, { entity_version }, { color::BRIGHT_GREEN }); report += "\n"; + + /* + * Backend + */ add_category(report, 4, "Backend"); add_param(report, 4, "Build hash", "%s", hash.c_str()); add_param(report, 4, "CXX", "%s [%s]", ccx.c_str(), cpp_standard.c_str()); @@ -223,11 +227,76 @@ namespace ntt { add_param(report, 4, "HIP", "%s", hip_version.c_str()); #endif add_param(report, 4, "MPI", "%s", mpi_version.c_str()); +#if defined(MPI_ENABLED) && defined(DEVICE_ENABLED) + #if defined(GPU_AWARE_MPI) + const std::string gpu_aware_mpi = "ON"; + #else + const std::string gpu_aware_mpi = "OFF"; + #endif + add_param(report, 4, "GPU-aware MPI", "%s", gpu_aware_mpi.c_str()); +#endif add_param(report, 4, "Kokkos", "%s", kokkos_version.c_str()); add_param(report, 4, "ADIOS2", "%s", adios2_version.c_str()); add_param(report, 4, "Precision", "%s", precision); add_param(report, 4, "Debug", "%s", dbg.c_str()); report += "\n"; + + /* + * Compilation flags + */ + add_category(report, 4, "Compilation flags"); +#if defined(SINGLE_PRECISION) + add_param(report, 4, "SINGLE_PRECISION", "%s", "ON"); +#else + add_param(report, 4, "SINGLE_PRECISION", "%s", "OFF"); +#endif + +#if defined(OUTPUT_ENABLED) + add_param(report, 4, "OUTPUT_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "OUTPUT_ENABLED", "%s", "OFF"); +#endif + +#if defined(DEBUG) + add_param(report, 4, "DEBUG", "%s", "ON"); +#else + add_param(report, 4, "DEBUG", "%s", "OFF"); +#endif + +#if defined(CUDA_ENABLED) + add_param(report, 4, "CUDA_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "CUDA_ENABLED", "%s", "OFF"); +#endif + +#if defined(HIP_ENABLED) + add_param(report, 4, "HIP_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "HIP_ENABLED", "%s", "OFF"); +#endif + +#if defined(DEVICE_ENABLED) + add_param(report, 4, "DEVICE_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "DEVICE_ENABLED", "%s", "OFF"); +#endif + +#if defined(MPI_ENABLED) + add_param(report, 4, "MPI_ENABLED", "%s", "ON"); +#else + add_param(report, 4, "MPI_ENABLED", "%s", "OFF"); +#endif + +#if defined(GPU_AWARE_MPI) + add_param(report, 4, "GPU_AWARE_MPI", "%s", "ON"); +#else + add_param(report, 4, "GPU_AWARE_MPI", "%s", "OFF"); +#endif + report += "\n"; + + /* + * Simulation configs + */ add_category(report, 4, "Configuration"); add_param(report, 4, diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index eb778129..e0d0cb4b 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -33,85 +33,213 @@ namespace comm { using namespace ntt; - namespace { + namespace flds { + template + void send_recv(ndarray_t& send_arr, + ndarray_t& recv_arr, + int send_rank, + int recv_rank, + ncells_t nsend, + ncells_t nrecv) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Sendrecv(send_arr.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_arr.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#else + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + auto recv_arr_h = Kokkos::create_mirror_view(recv_arr); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Sendrecv(send_arr_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_arr_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_arr, recv_arr_h); +#endif + } + + template + void send(ndarray_t& send_arr, int send_rank, ncells_t nsend) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#else + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Send(send_arr_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#endif + } + + template + void recv(ndarray_t& recv_arr, int recv_rank, ncells_t nrecv) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Recv(recv_arr.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#else + auto recv_arr_h = Kokkos::create_mirror_view(recv_arr); + MPI_Recv(recv_arr_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_arr, recv_arr_h); +#endif + } + + template + void communicate(ndarray_t& send_arr, + ndarray_t& recv_arr, + int send_rank, + int recv_rank, + ncells_t nsend, + ncells_t nrecv) { + if (send_rank >= 0 and recv_rank >= 0 and nsend > 0 and nrecv > 0) { + send_recv(send_arr, recv_arr, send_rank, recv_rank, nsend, nrecv); + } else if (send_rank >= 0 and nsend > 0) { + send(send_arr, send_rank, nsend); + } else if (recv_rank >= 0 and nrecv > 0) { + recv(recv_arr, recv_rank, nrecv); + } + } + + } // namespace flds + + namespace prtls { template - void send_recv(const array_t& send_buff, - array_t& recv_buff, - npart_t nsend, - npart_t nrecv, - int send_rank, - int recv_rank, - npart_t recv_offset = 0u) { -#if defined(MPI_DEVICE_COPY) - auto send_buff_h = Kokkos::create_mirror_view(send_buff); - auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); - Kokkos::deep_copy(send_buff_h, send_buff); - MPI_Sendrecv(send_buff_h.data(), + void send_recv(array_t& send_arr, + array_t& recv_arr, + int send_rank, + int recv_rank, + npart_t nsend, + npart_t nrecv, + npart_t offset) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Sendrecv(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, - recv_buff_h.data() + recv_offset, + recv_arr.data() + offset, nrecv, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff, recv_buff_h); #else - MPI_Sendrecv(send_buff.data(), + const auto slice = std::make_pair(offset, offset + nrecv); + + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + auto recv_arr_h = Kokkos::create_mirror_view( + Kokkos::subview(recv_arr, slice)); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Sendrecv(send_arr_h.data(), nsend, mpi::get_type(), send_rank, 0, - recv_buff.data() + recv_offset, + recv_arr_h.data(), nrecv, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(Kokkos::subview(recv_arr, slice), recv_arr_h); #endif } template - void send(const array_t& send_buff, npart_t nsend, int send_rank) { -#if defined(MPI_DEVICE_COPY) - auto send_buff_h = Kokkos::create_mirror_view(send_buff); - Kokkos::deep_copy(send_buff_h, send_buff); - MPI_Send(send_buff_h.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); + void send(array_t& send_arr, int send_rank, npart_t nsend) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); #else - MPI_Send(send_buff.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); + auto send_arr_h = Kokkos::create_mirror_view(send_arr); + Kokkos::deep_copy(send_arr_h, send_arr); + MPI_Send(send_arr_h.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); #endif } template - void recv(const array_t& recv_buff, - npart_t nrecv, - int recv_rank, - npart_t recv_offset = 0u) { -#if defined(MPI_DEVICE_COPY) - auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); - MPI_Recv(recv_buff_h.data() + recv_offset, + void recv(array_t& recv_arr, int recv_rank, npart_t nrecv, npart_t offset) { +#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) + MPI_Recv(recv_arr.data() + offset, nrecv, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff, recv_buff_h); #else - MPI_Recv(recv_buff.data() + recv_offset, + const auto slice = std::make_pair(offset, offset + nrecv); + + auto recv_arr_h = Kokkos::create_mirror_view( + Kokkos::subview(recv_arr, slice)); + MPI_Recv(recv_arr_h.data(), nrecv, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(Kokkos::subview(recv_arr, slice), recv_arr_h); #endif } - } // namespace + + template + void communicate(array_t& send_arr, + array_t& recv_arr, + int send_rank, + int recv_rank, + npart_t nsend, + npart_t nrecv, + npart_t offset) { + if (send_rank >= 0 && recv_rank >= 0) { + raise::ErrorIf( + nrecv + offset > recv_arr.extent(0), + "recv_arr is not large enough to hold the received particles", + HERE); + send_recv(send_arr, recv_arr, send_rank, recv_rank, nsend, nrecv, offset); + } else if (send_rank >= 0) { + send(send_arr, send_rank, nsend); + } else if (recv_rank >= 0) { + raise::ErrorIf( + nrecv + offset > recv_arr.extent(0), + "recv_arr is not large enough to hold the received particles", + HERE); + recv(recv_arr, recv_rank, nrecv, offset); + } else { + raise::Error("CommunicateParticles called with negative ranks", HERE); + } + } + } // namespace prtls template inline void CommunicateField(unsigned int idx, @@ -265,79 +393,12 @@ namespace comm { } } - if (send_rank >= 0 && recv_rank >= 0) { -#if defined(MPI_DEVICE_COPY) - auto send_fld_h = Kokkos::create_mirror_view(send_fld); - auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); - Kokkos::deep_copy(send_fld_h, send_fld); - MPI_Sendrecv(send_fld_h.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - recv_fld_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_fld, recv_fld_h); -#else - MPI_Sendrecv(send_fld.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - recv_fld.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); -#endif - } else if (send_rank >= 0) { -#if defined(MPI_DEVICE_COPY) - auto send_fld_h = Kokkos::create_mirror_view(send_fld); - Kokkos::deep_copy(send_fld_h, send_fld); - MPI_Send(send_fld_h.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); -#else - MPI_Send(send_fld.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); -#endif - } else if (recv_rank >= 0) { -#if defined(MPI_DEVICE_COPY) - auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); - MPI_Recv(recv_fld_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_fld, recv_fld_h); -#else - MPI_Recv(recv_fld.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); -#endif - } else { - raise::Error("CommunicateField called with negative ranks", HERE); - } + flds::communicate(D) + 1>(send_fld, + recv_fld, + send_rank, + recv_rank, + nsend, + nrecv); if (recv_rank >= 0) { @@ -519,67 +580,35 @@ namespace comm { const auto recv_offset_prtldx = current_received * NPRTLDX; const auto recv_offset_pld = current_received * NPLDS; - if ((send_rank >= 0) and (recv_rank >= 0) and (npart_send_in > 0) and - (npart_recv_in > 0)) { - raise::ErrorIf(recv_offset_int + npart_recv_in * NINTS > - recv_buff_int.extent(0), - "incorrect # of recv particles", - HERE); - send_recv(send_buff_int, - recv_buff_int, - NINTS * npart_send_in, - NINTS * npart_recv_in, - send_rank, - recv_rank, - recv_offset_int); - - send_recv(send_buff_real, - recv_buff_real, - NREALS * npart_send_in, - NREALS * npart_recv_in, - send_rank, - recv_rank, - recv_offset_real); - - send_recv(send_buff_prtldx, - recv_buff_prtldx, - NPRTLDX * npart_send_in, - NPRTLDX * npart_recv_in, - send_rank, - recv_rank, - recv_offset_prtldx); - - if (NPLDS > 0) { - send_recv(send_buff_pld, - recv_buff_pld, - NPLDS * npart_send_in, - NPLDS * npart_recv_in, - send_rank, - recv_rank, - recv_offset_pld); - } - } else if ((send_rank >= 0) and (npart_send_in > 0)) { - send(send_buff_int, NINTS * npart_send_in, send_rank); - send(send_buff_real, NREALS * npart_send_in, send_rank); - send(send_buff_prtldx, NPRTLDX * npart_send_in, send_rank); - if (NPLDS > 0) { - send(send_buff_pld, NPLDS * npart_send_in, send_rank); - } - } else if ((recv_rank >= 0) and (npart_recv_in > 0)) { - raise::ErrorIf(recv_offset_int + npart_recv_in * NINTS > - recv_buff_int.extent(0), - "incorrect # of recv particles", - HERE); - - recv(recv_buff_int, NINTS * npart_recv_in, recv_rank, recv_offset_int); - recv(recv_buff_real, NREALS * npart_recv_in, recv_rank, recv_offset_real); - recv(recv_buff_prtldx, - NPRTLDX * npart_recv_in, - recv_rank, - recv_offset_prtldx); - if (NPLDS > 0) { - recv(recv_buff_pld, NPLDS * npart_recv_in, recv_rank, recv_offset_pld); - } + prtls::communicate(send_buff_int, + recv_buff_int, + send_rank, + recv_rank, + npart_send_in * NINTS, + npart_recv_in * NINTS, + recv_offset_int); + prtls::communicate(send_buff_real, + recv_buff_real, + send_rank, + recv_rank, + npart_send_in * NREALS, + npart_recv_in * NREALS, + recv_offset_real); + prtls::communicate(send_buff_prtldx, + recv_buff_prtldx, + send_rank, + recv_rank, + npart_send_in * NPRTLDX, + npart_recv_in * NPRTLDX, + recv_offset_prtldx); + if (NPLDS > 0) { + prtls::communicate(send_buff_pld, + recv_buff_pld, + send_rank, + recv_rank, + npart_send_in * NPLDS, + npart_recv_in * NPLDS, + recv_offset_pld); } current_received += npart_recv_in; iteration++;