diff --git a/CMakeLists.txt b/CMakeLists.txt index c328e8b8..cbea5ab5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,12 +96,31 @@ elseif("${Kokkos_DEVICES}" MATCHES "SYCL") add_compile_options("-D SYCL_ENABLED") endif() +if(("${Kokkos_DEVICES}" MATCHES "CUDA") + OR ("${Kokkos_DEVICES}" MATCHES "HIP") + OR ("${Kokkos_DEVICES}" MATCHES "SYCL")) + set(DEVICE_ENABLED ON) +else() + set(DEVICE_ENABLED OFF) +endif() + # MPI if(${mpi}) find_or_fetch_dependency(MPI FALSE REQUIRED) include_directories(${MPI_CXX_INCLUDE_PATH}) add_compile_options("-D MPI_ENABLED") set(DEPENDENCIES ${DEPENDENCIES} MPI::MPI_CXX) + + if(${DEVICE_ENABLED}) + set(mpi_device_copy + ${default_mpi_device_copy} + CACHE BOOL "Use explicit copy when using MPI + GPU") + add_compile_options("-D MPI_DEVICE_COPY") + else() + set(mpi_device_copy + OFF + CACHE BOOL "Use explicit copy when using MPI + GPU") + endif() endif() # Output diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index 30e605a5..d106f420 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -63,4 +63,14 @@ else() CACHE INTERNAL "Default flag for MPI") endif() +if(DEFINED ENV{Entity_MPI_DEVICE_COPY}) + set(default_mpi_device_copy + $ENV{Entity_MPI_DEVICE_COPY} + CACHE INTERNAL "Default flag for copying from device to host for MPI") +else() + set(default_mpi_device_copy + OFF + CACHE INTERNAL "Default flag for copying from device to host for MPI") +endif() + set_property(CACHE default_mpi PROPERTY TYPE BOOL) diff --git a/cmake/report.cmake b/cmake/report.cmake index 626de15a..6578cfd5 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -36,7 +36,7 @@ printchoices( ${default_precision} "${Blue}" PRECISION_REPORT - 36) + 46) printchoices( "Output" "output" @@ -45,7 +45,7 @@ printchoices( ${default_output} "${Green}" OUTPUT_REPORT - 36) + 46) printchoices( "MPI" "mpi" @@ -54,7 +54,7 @@ printchoices( OFF "${Green}" MPI_REPORT - 36) + 46) printchoices( "Debug mode" "DEBUG" @@ -63,7 +63,19 @@ printchoices( OFF "${Green}" DEBUG_REPORT - 36) + 46) + +if(${mpi} AND ${DEVICE_ENABLED}) + printchoices( + "MPI explicit copy" + "mpi_device_copy" + "${ON_OFF_VALUES}" + ${mpi_device_copy} + OFF + "${Green}" + MPI_DEVICE_COPY_REPORT + 46) +endif() if(NOT ${PROJECT_VERSION_TWEAK} EQUAL 0) set(VERSION_SYMBOL "v${PROJECT_VERSION_MAJOR}." "${PROJECT_VERSION_MINOR}.") @@ -111,13 +123,23 @@ string(REPLACE ";" "+" Kokkos_DEVICES "${Kokkos_DEVICES}") string( APPEND REPORT_TEXT - " - ARCH [${Magenta}Kokkos_ARCH_***${ColorReset}]: ${Kokkos_ARCH}" + " - ARCH [${Magenta}Kokkos_ARCH_***${ColorReset}]: " + "${Kokkos_ARCH}" "\n" - " - DEVICES [${Magenta}Kokkos_ENABLE_***${ColorReset}]: ${Kokkos_DEVICES}" + " - DEVICES [${Magenta}Kokkos_ENABLE_***${ColorReset}]: " + "${Kokkos_DEVICES}" "\n" " " ${MPI_REPORT} - "\n" + "\n") + +if(${mpi} AND ${DEVICE_ENABLED}) + string(APPEND REPORT_TEXT " " ${MPI_DEVICE_COPY_REPORT} "\n") +endif() + +string( + APPEND + REPORT_TEXT " " ${DEBUG_REPORT} "\n" diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index fbeb7ecd..eb778129 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -33,6 +33,86 @@ namespace comm { using namespace ntt; + namespace { + template + void send_recv(const array_t& send_buff, + array_t& recv_buff, + npart_t nsend, + npart_t nrecv, + int send_rank, + int recv_rank, + npart_t recv_offset = 0u) { +#if defined(MPI_DEVICE_COPY) + auto send_buff_h = Kokkos::create_mirror_view(send_buff); + auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); + Kokkos::deep_copy(send_buff_h, send_buff); + MPI_Sendrecv(send_buff_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_buff_h.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff, recv_buff_h); +#else + MPI_Sendrecv(send_buff.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_buff.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif + } + + template + void send(const array_t& send_buff, npart_t nsend, int send_rank) { +#if defined(MPI_DEVICE_COPY) + auto send_buff_h = Kokkos::create_mirror_view(send_buff); + Kokkos::deep_copy(send_buff_h, send_buff); + MPI_Send(send_buff_h.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#else + MPI_Send(send_buff.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#endif + } + + template + void recv(const array_t& recv_buff, + npart_t nrecv, + int recv_rank, + npart_t recv_offset = 0u) { +#if defined(MPI_DEVICE_COPY) + auto recv_buff_h = Kokkos::create_mirror_view(recv_buff); + MPI_Recv(recv_buff_h.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_buff, recv_buff_h); +#else + MPI_Recv(recv_buff.data() + recv_offset, + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); +#endif + } + } // namespace + template inline void CommunicateField(unsigned int idx, ndfield_t& fld, @@ -186,6 +266,24 @@ namespace comm { } if (send_rank >= 0 && recv_rank >= 0) { +#if defined(MPI_DEVICE_COPY) + auto send_fld_h = Kokkos::create_mirror_view(send_fld); + auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); + Kokkos::deep_copy(send_fld_h, send_fld); + MPI_Sendrecv(send_fld_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + recv_fld_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_fld, recv_fld_h); +#else MPI_Sendrecv(send_fld.data(), nsend, mpi::get_type(), @@ -198,15 +296,37 @@ namespace comm { 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); +#endif } else if (send_rank >= 0) { +#if defined(MPI_DEVICE_COPY) + auto send_fld_h = Kokkos::create_mirror_view(send_fld); + Kokkos::deep_copy(send_fld_h, send_fld); + MPI_Send(send_fld_h.data(), + nsend, + mpi::get_type(), + send_rank, + 0, + MPI_COMM_WORLD); +#else MPI_Send(send_fld.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); - +#endif } else if (recv_rank >= 0) { +#if defined(MPI_DEVICE_COPY) + auto recv_fld_h = Kokkos::create_mirror_view(recv_fld); + MPI_Recv(recv_fld_h.data(), + nrecv, + mpi::get_type(), + recv_rank, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::deep_copy(recv_fld, recv_fld_h); +#else MPI_Recv(recv_fld.data(), nrecv, mpi::get_type(), @@ -214,6 +334,7 @@ namespace comm { 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); +#endif } else { raise::Error("CommunicateField called with negative ranks", HERE); } @@ -404,117 +525,60 @@ namespace comm { recv_buff_int.extent(0), "incorrect # of recv particles", HERE); - MPI_Sendrecv(send_buff_int.data(), - npart_send_in * NINTS, - mpi::get_type(), - send_rank, - 0, - recv_buff_int.data() + recv_offset_int, - npart_recv_in * NINTS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Sendrecv(send_buff_real.data(), - npart_send_in * NREALS, - mpi::get_type(), - send_rank, - 0, - recv_buff_real.data() + recv_offset_real, - npart_recv_in * NREALS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Sendrecv(send_buff_prtldx.data(), - npart_send_in * NPRTLDX, - mpi::get_type(), - send_rank, - 0, - recv_buff_prtldx.data() + recv_offset_prtldx, - npart_recv_in * NPRTLDX, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - if (NPLDS > 0) { - MPI_Sendrecv(send_buff_pld.data(), - npart_send_in * NPLDS, - mpi::get_type(), + send_recv(send_buff_int, + recv_buff_int, + NINTS * npart_send_in, + NINTS * npart_recv_in, send_rank, - 0, - recv_buff_pld.data() + recv_offset_pld, - npart_recv_in * NPLDS, - mpi::get_type(), recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); + recv_offset_int); + + send_recv(send_buff_real, + recv_buff_real, + NREALS * npart_send_in, + NREALS * npart_recv_in, + send_rank, + recv_rank, + recv_offset_real); + + send_recv(send_buff_prtldx, + recv_buff_prtldx, + NPRTLDX * npart_send_in, + NPRTLDX * npart_recv_in, + send_rank, + recv_rank, + recv_offset_prtldx); + + if (NPLDS > 0) { + send_recv(send_buff_pld, + recv_buff_pld, + NPLDS * npart_send_in, + NPLDS * npart_recv_in, + send_rank, + recv_rank, + recv_offset_pld); } } else if ((send_rank >= 0) and (npart_send_in > 0)) { - MPI_Send(send_buff_int.data(), - npart_send_in * NINTS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - MPI_Send(send_buff_real.data(), - npart_send_in * NREALS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); - MPI_Send(send_buff_prtldx.data(), - npart_send_in * NPRTLDX, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); + send(send_buff_int, NINTS * npart_send_in, send_rank); + send(send_buff_real, NREALS * npart_send_in, send_rank); + send(send_buff_prtldx, NPRTLDX * npart_send_in, send_rank); if (NPLDS > 0) { - MPI_Send(send_buff_pld.data(), - npart_send_in * NPLDS, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); + send(send_buff_pld, NPLDS * npart_send_in, send_rank); } } else if ((recv_rank >= 0) and (npart_recv_in > 0)) { raise::ErrorIf(recv_offset_int + npart_recv_in * NINTS > recv_buff_int.extent(0), "incorrect # of recv particles", HERE); - MPI_Recv(recv_buff_int.data() + recv_offset_int, - npart_recv_in * NINTS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Recv(recv_buff_real.data() + recv_offset_real, - npart_recv_in * NREALS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Recv(recv_buff_prtldx.data() + recv_offset_prtldx, - npart_recv_in * NPRTLDX, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); + + recv(recv_buff_int, NINTS * npart_recv_in, recv_rank, recv_offset_int); + recv(recv_buff_real, NREALS * npart_recv_in, recv_rank, recv_offset_real); + recv(recv_buff_prtldx, + NPRTLDX * npart_recv_in, + recv_rank, + recv_offset_prtldx); if (NPLDS > 0) { - MPI_Recv(recv_buff_pld.data() + recv_offset_pld, - npart_recv_in * NPLDS, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); + recv(recv_buff_pld, NPLDS * npart_recv_in, recv_rank, recv_offset_pld); } } current_received += npart_recv_in;