From f609fb236cacef34a6ae6451eeecf7dac6f2527a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Mon, 21 Apr 2025 14:22:53 -0500 Subject: [PATCH 1/9] explicitly copying mpi buffers from GPU to CPU to solve GPU pinning issue on DeltaAI --- src/framework/domain/comm_mpi.hpp | 89 ++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index fbeb7ecd..5e91fff9 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -186,34 +186,42 @@ namespace comm { } if (send_rank >= 0 && recv_rank >= 0) { - MPI_Sendrecv(send_fld.data(), + auto send_fld_h = Kokkos::create_mirror_view(send_fld); + auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); + Kokkos::deep_copy(send_fld_h, send_fld); + MPI_Sendrecv(send_fld_h.data(), nsend, mpi::get_type(), send_rank, 0, - recv_fld.data(), + recv_fld_h.data(), nrecv, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_fld, recv_fld_h); } else if (send_rank >= 0) { - MPI_Send(send_fld.data(), + auto send_fld_h = Kokkos::create_mirror_view(send_fld); + Kokkos::deep_copy(send_fld_h, send_fld); + MPI_Send(send_fld_h.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); - } else if (recv_rank >= 0) { - MPI_Recv(recv_fld.data(), + auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); + Kokkos::deep_copy(recv_fld_h, recv_fld); + MPI_Recv(recv_fld_h.data(), nrecv, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_fld, recv_fld_h); } else { raise::Error("CommunicateField called with negative ranks", HERE); } @@ -404,77 +412,106 @@ namespace comm { recv_buff_int.extent(0), "incorrect # of recv particles", HERE); - MPI_Sendrecv(send_buff_int.data(), + auto send_buff_int_h = Kokkos::create_mirror_view(send_buff_int); + auto recv_buff_int_h = Kokkos::create_mirror_view(recv_buff_int); + Kokkos::deep_copy(send_buff_int_h, send_buff_int); + MPI_Sendrecv(send_buff_int_h.data(), npart_send_in * NINTS, mpi::get_type(), send_rank, 0, - recv_buff_int.data() + recv_offset_int, + recv_buff_int_h.data() + recv_offset_int, npart_recv_in * NINTS, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Sendrecv(send_buff_real.data(), + Kokkos::deep_copy(recv_buff_int, recv_buff_int_h); + + auto send_buff_real_h = Kokkos::create_mirror_view(send_buff_real); + auto recv_buff_real_h = Kokkos::create_mirror_view(recv_buff_real); + Kokkos::deep_copy(send_buff_real_h, send_buff_real); + MPI_Sendrecv(send_buff_real_h.data(), npart_send_in * NREALS, mpi::get_type(), send_rank, 0, - recv_buff_real.data() + recv_offset_real, + recv_buff_real_h.data() + recv_offset_real, npart_recv_in * NREALS, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Sendrecv(send_buff_prtldx.data(), + Kokkos::deep_copy(recv_buff_real, recv_buff_real_h); + + auto send_buff_prtldx_h = Kokkos::create_mirror_view(send_buff_prtldx); + auto recv_buff_prtldx_h = Kokkos::create_mirror_view(recv_buff_prtldx); + Kokkos::deep_copy(send_buff_prtldx_h, send_buff_prtldx); + MPI_Sendrecv(send_buff_prtldx_h.data(), npart_send_in * NPRTLDX, mpi::get_type(), send_rank, 0, - recv_buff_prtldx.data() + recv_offset_prtldx, + recv_buff_prtldx_h.data() + recv_offset_prtldx, npart_recv_in * NPRTLDX, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff_prtldx, recv_buff_prtldx_h); + if (NPLDS > 0) { - MPI_Sendrecv(send_buff_pld.data(), + auto send_buff_pld_h = Kokkos::create_mirror_view(send_buff_pld); + auto recv_buff_pld_h = Kokkos::create_mirror_view(recv_buff_pld); + Kokkos::deep_copy(send_buff_pld_h, send_buff_pld); + MPI_Sendrecv(send_buff_pld_h.data(), npart_send_in * NPLDS, mpi::get_type(), send_rank, 0, - recv_buff_pld.data() + recv_offset_pld, + recv_buff_pld_h.data() + recv_offset_pld, npart_recv_in * NPLDS, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff_pld, recv_buff_pld_h); } } else if ((send_rank >= 0) and (npart_send_in > 0)) { - MPI_Send(send_buff_int.data(), + auto send_buff_int_h = Kokkos::create_mirror_view(send_buff_int); + Kokkos::deep_copy(send_buff_int_h, send_buff_int); + MPI_Send(send_buff_int_h.data(), npart_send_in * NINTS, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); + + auto send_buff_real_h = Kokkos::create_mirror_view(send_buff_real); + Kokkos::deep_copy(send_buff_real_h, send_buff_real); MPI_Send(send_buff_real.data(), npart_send_in * NREALS, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); - MPI_Send(send_buff_prtldx.data(), + + auto send_buff_prtldx_h = Kokkos::create_mirror_view(send_buff_prtldx); + Kokkos::deep_copy(send_buff_prtldx_h, send_buff_prtldx); + MPI_Send(send_buff_prtldx_h.data(), npart_send_in * NPRTLDX, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); if (NPLDS > 0) { - MPI_Send(send_buff_pld.data(), + auto send_buff_pld_h = Kokkos::create_mirror_view(send_buff_pld); + Kokkos::deep_copy(send_buff_pld_h, send_buff_pld); + MPI_Send(send_buff_pld_h.data(), npart_send_in * NPLDS, mpi::get_type(), send_rank, @@ -486,35 +523,47 @@ namespace comm { recv_buff_int.extent(0), "incorrect # of recv particles", HERE); - MPI_Recv(recv_buff_int.data() + recv_offset_int, + + auto recv_buff_int_h = Kokkos::create_mirror_view(recv_buff_int); + MPI_Recv(recv_buff_int_h.data() + recv_offset_int, npart_recv_in * NINTS, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Recv(recv_buff_real.data() + recv_offset_real, + Kokkos::deep_copy(recv_buff_int, recv_buff_int_h); + + auto recv_buff_real_h = Kokkos::create_mirror_view(recv_buff_real); + MPI_Recv(recv_buff_real_h.data() + recv_offset_real, npart_recv_in * NREALS, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Recv(recv_buff_prtldx.data() + recv_offset_prtldx, + Kokkos::deep_copy(recv_buff_real, recv_buff_real_h); + + auto recv_buff_prtldx_h = Kokkos::create_mirror_view(recv_buff_prtldx); + MPI_Recv(recv_buff_prtldx_h.data() + recv_offset_prtldx, npart_recv_in * NPRTLDX, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff_prtldx, recv_buff_prtldx_h); + if (NPLDS > 0) { - MPI_Recv(recv_buff_pld.data() + recv_offset_pld, + auto rrecv_buff_pld_h = Kokkos::create_mirror_view(recv_buff_pld); + MPI_Recv(rrecv_buff_pld_h.data() + recv_offset_pld, npart_recv_in * NPLDS, mpi::get_type(), recv_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff_pld, rrecv_buff_pld_h); } } current_received += npart_recv_in; From d64dceebeb5069d9754c921c368695348ad502c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Mon, 21 Apr 2025 18:41:09 -0500 Subject: [PATCH 2/9] bugfix --- src/framework/domain/comm_mpi.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index 5e91fff9..a2b248dc 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -213,7 +213,6 @@ namespace comm { MPI_COMM_WORLD); } else if (recv_rank >= 0) { auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); - Kokkos::deep_copy(recv_fld_h, recv_fld); MPI_Recv(recv_fld_h.data(), nrecv, mpi::get_type(), From e2c2b51356941c1f81787b6e600d1eee768ff492 Mon Sep 17 00:00:00 2001 From: hayk Date: Wed, 23 Apr 2025 15:38:12 -0400 Subject: [PATCH 3/9] compile-time flag: mpi_device_copy --- CMakeLists.txt | 18 ++ cmake/defaults.cmake | 10 + cmake/report.cmake | 34 +++- src/framework/domain/comm_mpi.hpp | 314 +++++++++++++++++------------- 4 files changed, 230 insertions(+), 146 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c328e8b8..20d73978 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,13 @@ elseif("${Kokkos_DEVICES}" MATCHES "HIP") elseif("${Kokkos_DEVICES}" MATCHES "SYCL") add_compile_options("-D SYCL_ENABLED") endif() +if((${Kokkos_DEVICES} MATCHES "CUDA") + OR (${Kokkos_DEVICES} MATCHES "HIP") + OR (${Kokkos_DEVICES} MATCHES "SYCL")) + set(DEVICE_ENABLED ON) +else() + set(DEVICE_ENABLED OFF) +endif() # MPI if(${mpi}) @@ -102,6 +109,17 @@ if(${mpi}) include_directories(${MPI_CXX_INCLUDE_PATH}) add_compile_options("-D MPI_ENABLED") set(DEPENDENCIES ${DEPENDENCIES} MPI::MPI_CXX) + + if(${DEVICE_ENABLED}) + set(mpi_device_copy + ${default_mpi_device_copy} + CACHE BOOL "Use explicit copy when using MPI + GPU") + add_compile_options("-D MPI_DEVICE_COPY") + else() + set(mpi_device_copy + OFF + CACHE BOOL "Use explicit copy when using MPI + GPU") + endif() endif() # Output diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index 30e605a5..9829ea14 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -63,4 +63,14 @@ else() CACHE INTERNAL "Default flag for MPI") endif() +if(DEFINED ENV{Entity_MPI_DEVICE_COPY}) + set(default_mpi_device_copy + $ENV{Entity_MPI_DEVICE_COPY} + CACHE INTERNAL "Default flag for copying from device to host for MPI") +else() + set(default_mpi_device_copy + ON + CACHE INTERNAL "Default flag for copying from device to host for MPI") +endif() + set_property(CACHE default_mpi PROPERTY TYPE BOOL) diff --git a/cmake/report.cmake b/cmake/report.cmake index 626de15a..c4958fa3 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -36,7 +36,7 @@ printchoices( ${default_precision} "${Blue}" PRECISION_REPORT - 36) + 46) printchoices( "Output" "output" @@ -45,7 +45,7 @@ printchoices( ${default_output} "${Green}" OUTPUT_REPORT - 36) + 46) printchoices( "MPI" "mpi" @@ -54,7 +54,7 @@ printchoices( OFF "${Green}" MPI_REPORT - 36) + 46) printchoices( "Debug mode" "DEBUG" @@ -63,7 +63,17 @@ printchoices( OFF "${Green}" DEBUG_REPORT - 36) + 46) + +printchoices( + "MPI explicit copy" + "mpi_device_copy" + "${ON_OFF_VALUES}" + ${mpi_device_copy} + OFF + "${Green}" + MPI_DEVICE_COPY_REPORT + 46) if(NOT ${PROJECT_VERSION_TWEAK} EQUAL 0) set(VERSION_SYMBOL "v${PROJECT_VERSION_MAJOR}." "${PROJECT_VERSION_MINOR}.") @@ -111,13 +121,23 @@ string(REPLACE ";" "+" Kokkos_DEVICES "${Kokkos_DEVICES}") string( APPEND REPORT_TEXT - " - ARCH [${Magenta}Kokkos_ARCH_***${ColorReset}]: ${Kokkos_ARCH}" + " - ARCH [${Magenta}Kokkos_ARCH_***${ColorReset}]: " + "${Kokkos_ARCH}" "\n" - " - DEVICES [${Magenta}Kokkos_ENABLE_***${ColorReset}]: ${Kokkos_DEVICES}" + " - DEVICES [${Magenta}Kokkos_ENABLE_***${ColorReset}]: " + "${Kokkos_DEVICES}" "\n" " " ${MPI_REPORT} - "\n" + "\n") + +if(${DEVICE_ENABLED}) + string(APPEND REPORT_TEXT " " ${MPI_DEVICE_COPY_REPORT} "\n") +endif() + +string( + APPEND + REPORT_TEXT " " ${DEBUG_REPORT} "\n" diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index a2b248dc..f63761af 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -186,6 +186,7 @@ namespace comm { } if (send_rank >= 0 && recv_rank >= 0) { +#if defined(MPI_DEVICE_COPY) auto send_fld_h = Kokkos::create_mirror_view(send_fld); auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); Kokkos::deep_copy(send_fld_h, send_fld); @@ -202,7 +203,22 @@ namespace comm { MPI_COMM_WORLD, MPI_STATUS_IGNORE); Kokkos::deep_copy(recv_fld, recv_fld_h); +#else + MPI_Sendrecv(send_fld.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_fld.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif } else if (send_rank >= 0) { +#if defined(MPI_DEVICE_COPY) auto send_fld_h = Kokkos::create_mirror_view(send_fld); Kokkos::deep_copy(send_fld_h, send_fld); MPI_Send(send_fld_h.data(), @@ -211,7 +227,16 @@ namespace comm { send_rank, 0, MPI_COMM_WORLD); +#else + MPI_Send(send_fld.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#endif } else if (recv_rank >= 0) { +#if defined(MPI_DEVICE_COPY) auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); MPI_Recv(recv_fld_h.data(), nrecv, @@ -221,6 +246,15 @@ namespace comm { MPI_COMM_WORLD, MPI_STATUS_IGNORE); Kokkos::deep_copy(recv_fld, recv_fld_h); +#else + MPI_Recv(recv_fld.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif } else { raise::Error("CommunicateField called with negative ranks", HERE); } @@ -325,6 +359,101 @@ namespace comm { } } + namespace { + template + void send_recv(const array_t& send_buff, + array_t& recv_buff, + unsigned short Narrs, + npart_t nsend, + npart_t nrecv, + int send_rank, + int recv_rank, + npart_t recv_offset) { +#if defined(MPI_DEVICE_COPY) + auto send_buff_h = Kokkos::create_mirror_view(send_buff); + auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); + Kokkos::deep_copy(send_buff_h, send_buff); + MPI_Sendrecv(send_buff_h.data(), + nsend * Narrs, + mpi::get_type(), + send_rank, + 0, + recv_buff_h.data() + recv_offset, + nrecv * Narrs, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff, recv_buff_h); +#else + MPI_Sendrecv(send_buff.data(), + nsend * Narrs, + mpi::get_type(), + send_rank, + 0, + recv_buff.data() + recv_offset, + nrecv * Narrs, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif + } + + template + void send(const array_t& send_buff, + unsigned short Narrs, + npart_t nsend, + int send_rank) { +#if defined(MPI_DEVICE_COPY) + auto send_buff_h = Kokkos::create_mirror_view(send_buff); + Kokkos::deep_copy(send_buff_h, send_buff); + MPI_Send(send_buff_h.data(), + nsend * Narrs, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#else + MPI_Send(send_buff.data(), + nsend * Narrs, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#endif + } + + template + void recv(const array_t& recv_buff, + unsigned short Narrs, + npart_t nrecv, + int recv_rank, + npart_t recv_offset) { +#if defined(MPI_DEVICE_COPY) + auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); + MPI_Recv(recv_buff_h.data() + recv_offset, + nrecv * Narrs, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff, recv_buff_h); +#else + MPI_Recv(recv_buff.data() + recv_offset, + nrecv * Narrs, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif + } + } // namespace + template void CommunicateParticles(Particles& species, const array_t& outgoing_indices, @@ -411,111 +540,49 @@ namespace comm { recv_buff_int.extent(0), "incorrect # of recv particles", HERE); - auto send_buff_int_h = Kokkos::create_mirror_view(send_buff_int); - auto recv_buff_int_h = Kokkos::create_mirror_view(recv_buff_int); - Kokkos::deep_copy(send_buff_int_h, send_buff_int); - MPI_Sendrecv(send_buff_int_h.data(), - npart_send_in * NINTS, - mpi::get_type(), - send_rank, - 0, - recv_buff_int_h.data() + recv_offset_int, - npart_recv_in * NINTS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_int, recv_buff_int_h); - - auto send_buff_real_h = Kokkos::create_mirror_view(send_buff_real); - auto recv_buff_real_h = Kokkos::create_mirror_view(recv_buff_real); - Kokkos::deep_copy(send_buff_real_h, send_buff_real); - MPI_Sendrecv(send_buff_real_h.data(), - npart_send_in * NREALS, - mpi::get_type(), - send_rank, - 0, - recv_buff_real_h.data() + recv_offset_real, - npart_recv_in * NREALS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_real, recv_buff_real_h); - - auto send_buff_prtldx_h = Kokkos::create_mirror_view(send_buff_prtldx); - auto recv_buff_prtldx_h = Kokkos::create_mirror_view(recv_buff_prtldx); - Kokkos::deep_copy(send_buff_prtldx_h, send_buff_prtldx); - MPI_Sendrecv(send_buff_prtldx_h.data(), - npart_send_in * NPRTLDX, - mpi::get_type(), - send_rank, - 0, - recv_buff_prtldx_h.data() + recv_offset_prtldx, - npart_recv_in * NPRTLDX, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_prtldx, recv_buff_prtldx_h); - - if (NPLDS > 0) { - auto send_buff_pld_h = Kokkos::create_mirror_view(send_buff_pld); - auto recv_buff_pld_h = Kokkos::create_mirror_view(recv_buff_pld); - Kokkos::deep_copy(send_buff_pld_h, send_buff_pld); - MPI_Sendrecv(send_buff_pld_h.data(), - npart_send_in * NPLDS, - mpi::get_type(), + send_recv(send_buff_int, + recv_buff_int, + NINTS, + npart_send_in, + npart_recv_in, send_rank, - 0, - recv_buff_pld_h.data() + recv_offset_pld, - npart_recv_in * NPLDS, - mpi::get_type(), recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_pld, recv_buff_pld_h); + recv_offset_int); + + send_recv(send_buff_real, + recv_buff_real, + NREALS, + npart_send_in, + npart_recv_in, + send_rank, + recv_rank, + recv_offset_real); + + send_recv(send_buff_prtldx, + recv_buff_prtldx, + NPRTLDX, + npart_send_in, + npart_recv_in, + send_rank, + recv_rank, + recv_offset_prtldx); + + if (NPLDS > 0) { + send_recv(send_buff_pld, + recv_buff_pld, + NPLDS, + npart_send_in, + npart_recv_in, + send_rank, + recv_rank, + recv_offset_pld); } } else if ((send_rank >= 0) and (npart_send_in > 0)) { - auto send_buff_int_h = Kokkos::create_mirror_view(send_buff_int); - Kokkos::deep_copy(send_buff_int_h, send_buff_int); - MPI_Send(send_buff_int_h.data(), - npart_send_in * NINTS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - - auto send_buff_real_h = Kokkos::create_mirror_view(send_buff_real); - Kokkos::deep_copy(send_buff_real_h, send_buff_real); - MPI_Send(send_buff_real.data(), - npart_send_in * NREALS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - - auto send_buff_prtldx_h = Kokkos::create_mirror_view(send_buff_prtldx); - Kokkos::deep_copy(send_buff_prtldx_h, send_buff_prtldx); - MPI_Send(send_buff_prtldx_h.data(), - npart_send_in * NPRTLDX, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); + send(send_buff_int, NINTS, npart_send_in, send_rank); + send(send_buff_real, NREALS, npart_send_in, send_rank); + send(send_buff_prtldx, NPRTLDX, npart_send_in, send_rank); if (NPLDS > 0) { - auto send_buff_pld_h = Kokkos::create_mirror_view(send_buff_pld); - Kokkos::deep_copy(send_buff_pld_h, send_buff_pld); - MPI_Send(send_buff_pld_h.data(), - npart_send_in * NPLDS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); + send(send_buff_pld, NPLDS, npart_send_in, send_rank); } } else if ((recv_rank >= 0) and (npart_recv_in > 0)) { raise::ErrorIf(recv_offset_int + npart_recv_in * NINTS > @@ -523,46 +590,15 @@ namespace comm { "incorrect # of recv particles", HERE); - auto recv_buff_int_h = Kokkos::create_mirror_view(recv_buff_int); - MPI_Recv(recv_buff_int_h.data() + recv_offset_int, - npart_recv_in * NINTS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_int, recv_buff_int_h); - - auto recv_buff_real_h = Kokkos::create_mirror_view(recv_buff_real); - MPI_Recv(recv_buff_real_h.data() + recv_offset_real, - npart_recv_in * NREALS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_real, recv_buff_real_h); - - auto recv_buff_prtldx_h = Kokkos::create_mirror_view(recv_buff_prtldx); - MPI_Recv(recv_buff_prtldx_h.data() + recv_offset_prtldx, - npart_recv_in * NPRTLDX, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_prtldx, recv_buff_prtldx_h); - + recv(recv_buff_int, NINTS, npart_recv_in, recv_rank, recv_offset_int); + recv(recv_buff_real, NREALS, npart_recv_in, recv_rank, recv_offset_real); + recv(recv_buff_prtldx, + NPRTLDX, + npart_recv_in, + recv_rank, + recv_offset_prtldx); if (NPLDS > 0) { - auto rrecv_buff_pld_h = Kokkos::create_mirror_view(recv_buff_pld); - MPI_Recv(rrecv_buff_pld_h.data() + recv_offset_pld, - npart_recv_in * NPLDS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff_pld, rrecv_buff_pld_h); + recv(recv_buff_pld, NPLDS, npart_recv_in, recv_rank, recv_offset_pld); } } current_received += npart_recv_in; From 4b670b475572f38283ad7e0b6fea031647a200b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Wed, 23 Apr 2025 16:39:29 -0500 Subject: [PATCH 4/9] bugfix --- CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20d73978..8c9a8c18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,9 +95,12 @@ elseif("${Kokkos_DEVICES}" MATCHES "HIP") elseif("${Kokkos_DEVICES}" MATCHES "SYCL") add_compile_options("-D SYCL_ENABLED") endif() -if((${Kokkos_DEVICES} MATCHES "CUDA") - OR (${Kokkos_DEVICES} MATCHES "HIP") - OR (${Kokkos_DEVICES} MATCHES "SYCL")) + +if("${Kokkos_DEVICES}" MATCHES "CUDA") + set(DEVICE_ENABLED ON) +elseif("${Kokkos_DEVICES}" MATCHES "HIP") + set(DEVICE_ENABLED ON) +elseif("${Kokkos_DEVICES}" MATCHES "SYCL") set(DEVICE_ENABLED ON) else() set(DEVICE_ENABLED OFF) From 19527083002468889fa967c2eedc041f33e79ace Mon Sep 17 00:00:00 2001 From: hayk Date: Thu, 24 Apr 2025 01:12:35 -0400 Subject: [PATCH 5/9] added quotes --- CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c9a8c18..cbea5ab5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,11 +96,9 @@ elseif("${Kokkos_DEVICES}" MATCHES "SYCL") add_compile_options("-D SYCL_ENABLED") endif() -if("${Kokkos_DEVICES}" MATCHES "CUDA") - set(DEVICE_ENABLED ON) -elseif("${Kokkos_DEVICES}" MATCHES "HIP") - set(DEVICE_ENABLED ON) -elseif("${Kokkos_DEVICES}" MATCHES "SYCL") +if(("${Kokkos_DEVICES}" MATCHES "CUDA") + OR ("${Kokkos_DEVICES}" MATCHES "HIP") + OR ("${Kokkos_DEVICES}" MATCHES "SYCL")) set(DEVICE_ENABLED ON) else() set(DEVICE_ENABLED OFF) From 30d0afde4f778286bfed12e47af1c63ac880fb77 Mon Sep 17 00:00:00 2001 From: hayk Date: Thu, 24 Apr 2025 01:20:26 -0400 Subject: [PATCH 6/9] mpi_device_copy default to OFF --- cmake/defaults.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index 9829ea14..d106f420 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -69,7 +69,7 @@ if(DEFINED ENV{Entity_MPI_DEVICE_COPY}) CACHE INTERNAL "Default flag for copying from device to host for MPI") else() set(default_mpi_device_copy - ON + OFF CACHE INTERNAL "Default flag for copying from device to host for MPI") endif() From 5d7c25492358341ab5a604c15b301456f0b37873 Mon Sep 17 00:00:00 2001 From: haykh Date: Thu, 24 Apr 2025 10:47:24 -0400 Subject: [PATCH 7/9] reformat comm --- src/framework/domain/comm_mpi.hpp | 282 +++++++++++------------------- 1 file changed, 99 insertions(+), 183 deletions(-) diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index f63761af..d6b876a4 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -33,6 +33,86 @@ namespace comm { using namespace ntt; + namespace { + template + void send_recv(const array_t& send_buff, + array_t& recv_buff, + npart_t nsend, + npart_t nrecv, + int send_rank, + int recv_rank, + npart_t recv_offset = 0u) { +#if defined(MPI_DEVICE_COPY) + auto send_buff_h = Kokkos::create_mirror_view(send_buff); + auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); + Kokkos::deep_copy(send_buff_h, send_buff); + MPI_Sendrecv(send_buff_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_buff_h.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff, recv_buff_h); +#else + MPI_Sendrecv(send_buff.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_buff.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif + } + + template + void send(const array_t& send_buff, npart_t nsend, int send_rank) { +#if defined(MPI_DEVICE_COPY) + auto send_buff_h = Kokkos::create_mirror_view(send_buff); + Kokkos::deep_copy(send_buff_h, send_buff); + MPI_Send(send_buff_h.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#else + MPI_Send(send_buff.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#endif + } + + template + void recv(const array_t& recv_buff, + npart_t nrecv, + int recv_rank, + npart_t recv_offset = 0u) { +#if defined(MPI_DEVICE_COPY) + auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); + MPI_Recv(recv_buff_h.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff, recv_buff_h); +#else + MPI_Recv(recv_buff.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif + } + } // namespace + template inline void CommunicateField(unsigned int idx, ndfield_t& fld, @@ -186,75 +266,11 @@ namespace comm { } if (send_rank >= 0 && recv_rank >= 0) { -#if defined(MPI_DEVICE_COPY) - auto send_fld_h = Kokkos::create_mirror_view(send_fld); - auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); - Kokkos::deep_copy(send_fld_h, send_fld); - MPI_Sendrecv(send_fld_h.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - recv_fld_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_fld, recv_fld_h); -#else - MPI_Sendrecv(send_fld.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - recv_fld.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); -#endif + sendrecv(send_fld, recv_fld, nsend, nrecv, send_rank, recv_rank); } else if (send_rank >= 0) { -#if defined(MPI_DEVICE_COPY) - auto send_fld_h = Kokkos::create_mirror_view(send_fld); - Kokkos::deep_copy(send_fld_h, send_fld); - MPI_Send(send_fld_h.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); -#else - MPI_Send(send_fld.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); -#endif + send(send_fld, nsend, send_rank); } else if (recv_rank >= 0) { -#if defined(MPI_DEVICE_COPY) - auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); - MPI_Recv(recv_fld_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_fld, recv_fld_h); -#else - MPI_Recv(recv_fld.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); -#endif + recv(recv_fld, nrecv, recv_rank); } else { raise::Error("CommunicateField called with negative ranks", HERE); } @@ -359,101 +375,6 @@ namespace comm { } } - namespace { - template - void send_recv(const array_t& send_buff, - array_t& recv_buff, - unsigned short Narrs, - npart_t nsend, - npart_t nrecv, - int send_rank, - int recv_rank, - npart_t recv_offset) { -#if defined(MPI_DEVICE_COPY) - auto send_buff_h = Kokkos::create_mirror_view(send_buff); - auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); - Kokkos::deep_copy(send_buff_h, send_buff); - MPI_Sendrecv(send_buff_h.data(), - nsend * Narrs, - mpi::get_type(), - send_rank, - 0, - recv_buff_h.data() + recv_offset, - nrecv * Narrs, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff, recv_buff_h); -#else - MPI_Sendrecv(send_buff.data(), - nsend * Narrs, - mpi::get_type(), - send_rank, - 0, - recv_buff.data() + recv_offset, - nrecv * Narrs, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); -#endif - } - - template - void send(const array_t& send_buff, - unsigned short Narrs, - npart_t nsend, - int send_rank) { -#if defined(MPI_DEVICE_COPY) - auto send_buff_h = Kokkos::create_mirror_view(send_buff); - Kokkos::deep_copy(send_buff_h, send_buff); - MPI_Send(send_buff_h.data(), - nsend * Narrs, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); -#else - MPI_Send(send_buff.data(), - nsend * Narrs, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); -#endif - } - - template - void recv(const array_t& recv_buff, - unsigned short Narrs, - npart_t nrecv, - int recv_rank, - npart_t recv_offset) { -#if defined(MPI_DEVICE_COPY) - auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); - MPI_Recv(recv_buff_h.data() + recv_offset, - nrecv * Narrs, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_buff, recv_buff_h); -#else - MPI_Recv(recv_buff.data() + recv_offset, - nrecv * Narrs, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); -#endif - } - } // namespace - template void CommunicateParticles(Particles& species, const array_t& outgoing_indices, @@ -542,27 +463,24 @@ namespace comm { HERE); send_recv(send_buff_int, recv_buff_int, - NINTS, - npart_send_in, - npart_recv_in, + NINTS * npart_send_in, + NINTS * npart_recv_in, send_rank, recv_rank, recv_offset_int); send_recv(send_buff_real, recv_buff_real, - NREALS, - npart_send_in, - npart_recv_in, + NREALS * npart_send_in, + NREALS * npart_recv_in, send_rank, recv_rank, recv_offset_real); send_recv(send_buff_prtldx, recv_buff_prtldx, - NPRTLDX, - npart_send_in, - npart_recv_in, + NPRTLDX * npart_send_in, + NPRTLDX * npart_recv_in, send_rank, recv_rank, recv_offset_prtldx); @@ -570,19 +488,18 @@ namespace comm { if (NPLDS > 0) { send_recv(send_buff_pld, recv_buff_pld, - NPLDS, - npart_send_in, - npart_recv_in, + NPLDS * npart_send_in, + NPLDS * npart_recv_in, send_rank, recv_rank, recv_offset_pld); } } else if ((send_rank >= 0) and (npart_send_in > 0)) { - send(send_buff_int, NINTS, npart_send_in, send_rank); - send(send_buff_real, NREALS, npart_send_in, send_rank); - send(send_buff_prtldx, NPRTLDX, npart_send_in, send_rank); + send(send_buff_int, NINTS * npart_send_in, send_rank); + send(send_buff_real, NREALS * npart_send_in, send_rank); + send(send_buff_prtldx, NPRTLDX * npart_send_in, send_rank); if (NPLDS > 0) { - send(send_buff_pld, NPLDS, npart_send_in, send_rank); + send(send_buff_pld, NPLDS * npart_send_in, send_rank); } } else if ((recv_rank >= 0) and (npart_recv_in > 0)) { raise::ErrorIf(recv_offset_int + npart_recv_in * NINTS > @@ -590,15 +507,14 @@ namespace comm { "incorrect # of recv particles", HERE); - recv(recv_buff_int, NINTS, npart_recv_in, recv_rank, recv_offset_int); - recv(recv_buff_real, NREALS, npart_recv_in, recv_rank, recv_offset_real); + recv(recv_buff_int, NINTS * npart_recv_in, recv_rank, recv_offset_int); + recv(recv_buff_real, NREALS * npart_recv_in, recv_rank, recv_offset_real); recv(recv_buff_prtldx, - NPRTLDX, - npart_recv_in, + NPRTLDX * npart_recv_in, recv_rank, recv_offset_prtldx); if (NPLDS > 0) { - recv(recv_buff_pld, NPLDS, npart_recv_in, recv_rank, recv_offset_pld); + recv(recv_buff_pld, NPLDS * npart_recv_in, recv_rank, recv_offset_pld); } } current_received += npart_recv_in; From d9f96f0f0adaa71d91b7a4c5745dc2ea23c31b35 Mon Sep 17 00:00:00 2001 From: hayk Date: Thu, 24 Apr 2025 17:01:45 -0400 Subject: [PATCH 8/9] return older explicit copy for flds --- src/framework/domain/comm_mpi.hpp | 70 +++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index d6b876a4..eb778129 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -266,11 +266,75 @@ namespace comm { } if (send_rank >= 0 && recv_rank >= 0) { - sendrecv(send_fld, recv_fld, nsend, nrecv, send_rank, recv_rank); +#if defined(MPI_DEVICE_COPY) + auto send_fld_h = Kokkos::create_mirror_view(send_fld); + auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); + Kokkos::deep_copy(send_fld_h, send_fld); + MPI_Sendrecv(send_fld_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_fld_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_fld, recv_fld_h); +#else + MPI_Sendrecv(send_fld.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_fld.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif } else if (send_rank >= 0) { - send(send_fld, nsend, send_rank); +#if defined(MPI_DEVICE_COPY) + auto send_fld_h = Kokkos::create_mirror_view(send_fld); + Kokkos::deep_copy(send_fld_h, send_fld); + MPI_Send(send_fld_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#else + MPI_Send(send_fld.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#endif } else if (recv_rank >= 0) { - recv(recv_fld, nrecv, recv_rank); +#if defined(MPI_DEVICE_COPY) + auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); + MPI_Recv(recv_fld_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_fld, recv_fld_h); +#else + MPI_Recv(recv_fld.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif } else { raise::Error("CommunicateField called with negative ranks", HERE); } From 802916bd76d241c1b9e42ee36b76e575df0afc99 Mon Sep 17 00:00:00 2001 From: hayk Date: Thu, 24 Apr 2025 18:00:02 -0400 Subject: [PATCH 9/9] mpi flag bug in cmake --- cmake/report.cmake | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/cmake/report.cmake b/cmake/report.cmake index c4958fa3..6578cfd5 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -65,15 +65,17 @@ printchoices( DEBUG_REPORT 46) -printchoices( - "MPI explicit copy" - "mpi_device_copy" - "${ON_OFF_VALUES}" - ${mpi_device_copy} - OFF - "${Green}" - MPI_DEVICE_COPY_REPORT - 46) +if(${mpi} AND ${DEVICE_ENABLED}) + printchoices( + "MPI explicit copy" + "mpi_device_copy" + "${ON_OFF_VALUES}" + ${mpi_device_copy} + OFF + "${Green}" + MPI_DEVICE_COPY_REPORT + 46) +endif() if(NOT ${PROJECT_VERSION_TWEAK} EQUAL 0) set(VERSION_SYMBOL "v${PROJECT_VERSION_MAJOR}." "${PROJECT_VERSION_MINOR}.") @@ -131,7 +133,7 @@ string( ${MPI_REPORT} "\n") -if(${DEVICE_ENABLED}) +if(${mpi} AND ${DEVICE_ENABLED}) string(APPEND REPORT_TEXT " " ${MPI_DEVICE_COPY_REPORT} "\n") endif()