From 6e6c67559a0aca9ff1e40c7a79b917cc1ce939f9 Mon Sep 17 00:00:00 2001 From: haykh Date: Mon, 7 Jul 2025 17:36:26 -0400 Subject: [PATCH 1/9] fast Aphi calc (WIP MPI) --- src/framework/domain/output.cpp | 81 +++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/src/framework/domain/output.cpp b/src/framework/domain/output.cpp index e458de0e..6ce0c393 100644 --- a/src/framework/domain/output.cpp +++ b/src/framework/domain/output.cpp @@ -24,6 +24,7 @@ #include "kernels/prtls_to_phys.hpp" #include +#include #include #include @@ -32,7 +33,6 @@ #endif // MPI_ENABLED #include -#include #include #include @@ -193,26 +193,66 @@ namespace ntt { unsigned short buff_idx, const Mesh mesh) { if constexpr (M::Dim == Dim::_2D) { - const auto i2_min = mesh.i_min(in::x2); - // !TODO: this is quite slow + using TeamPolicy = Kokkos::TeamPolicy; + const auto nx1 = mesh.n_active(in::x1); + const auto nx2 = mesh.n_active(in::x2); + + TeamPolicy policy(nx1, Kokkos::AUTO); + Kokkos::parallel_for( "ComputeVectorPotential", - mesh.rangeActiveCells(), - Lambda(index_t i, index_t j) { - const real_t i_ { static_cast(static_cast(i) - (N_GHOSTS)) }; - const auto k_min = (i2_min - (N_GHOSTS)) + 1; - const auto k_max = (j - (N_GHOSTS)); - real_t A3 = ZERO; - for (auto k { k_min }; k <= k_max; ++k) { - real_t k_ = static_cast(k); - real_t sqrt_detH_ij1 { mesh.metric.sqrt_det_h({ i_, k_ - HALF }) }; - real_t sqrt_detH_ij2 { mesh.metric.sqrt_det_h({ i_, k_ + HALF }) }; - auto k1 { k + N_GHOSTS }; - A3 += HALF * (sqrt_detH_ij1 * EM(i, k1 - 1, em::bx1) + - sqrt_detH_ij2 * EM(i, k1, em::bx1)); - } - buffer(i, j, buff_idx) = A3; + policy, + Lambda(const TeamPolicy::member_type& team_member) { + index_t i1 = team_member.league_rank(); + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team_member, nx2), + [=](index_t i2, real_t& update, const bool final_pass) { + const auto i1_ { static_cast(i1) }; + const auto i2_ { static_cast(i2) }; + real_t sqrt_detH_ijM { mesh.metric.sqrt_det_h({ i1_, i2_ - HALF }) }; + real_t sqrt_detH_ijP { mesh.metric.sqrt_det_h({ i1_, i2_ + HALF }) }; + const auto input_val = + HALF * + (sqrt_detH_ijM * EM(i1 + N_GHOSTS, i2 + N_GHOSTS - 1, em::bx1) + + sqrt_detH_ijP * EM(i1 + N_GHOSTS, i2 + N_GHOSTS, em::bx1)); + if (final_pass) { + buffer(i1 + N_GHOSTS, i2 + N_GHOSTS, buff_idx) = update; + } + update += input_val; + }); }); + +#if defined(MPI_ENABLED) + array_t aphi_r { "Aphi_r", nx1 }; + Kokkos::deep_copy(aphi_r, + Kokkos::subview(buffer, + std::make_pair(N_GHOSTS, N_GHOSTS + nx1), + N_GHOSTS + nx2 - 1, + buff_idx)); +#endif + + // !TODO: this is quite slow + // Kokkos::parallel_for( + // "ComputeVectorPotential", + // mesh.rangeActiveCells(), + // Lambda(index_t i1, index_t i2) { + // const real_t i1_ { COORD(i1) }; + // // const auto k_min = (i2_min - (N_GHOSTS)) + 1; + // // const auto k_max = (j - (N_GHOSTS)); + // // real_t A3 = ZERO; + // // for (auto k { k_min }; k <= k_max; ++k) { + // // real_t k_ = static_cast(k); + // // real_t sqrt_detH_ij1 { mesh.metric.sqrt_det_h({ i_, k_ - HALF }) }; + // // real_t sqrt_detH_ij2 { mesh.metric.sqrt_det_h({ i_, k_ + HALF }) }; + // // auto k1 { k + N_GHOSTS }; + // // A3 += HALF * (sqrt_detH_ij1 * EM(i, k1 - 1, em::bx1) + + // // sqrt_detH_ij2 * EM(i, k1, em::bx1)); + // // } + // buffer(i1, i2, buff_idx) = HALF * + // (sqrt_detH_ij1 * EM(i, k1 - 1, em::bx1) + + // sqrt_detH_ij2 * EM(i, k1, em::bx1)); + // }); + // buffer(:, i2_max - 1) } else { raise::KernelError( HERE, @@ -434,8 +474,9 @@ namespace ntt { c, local_domain->mesh); } else { - raise::Error("Vector potential can only be computed for GRPIC in 2D", - HERE); + raise::Error( + "Vector potential can only be computed for GRPIC in 2D", + HERE); } } else { raise::Error("Wrong # of components requested for " From bb5b03d1f2bbc636c150fe7122a3cd66dd2a0fe3 Mon Sep 17 00:00:00 2001 From: haykh Date: Mon, 7 Jul 2025 18:35:53 -0400 Subject: [PATCH 2/9] mpicomm for vector pot --- src/framework/domain/metadomain.h | 3 + src/framework/domain/output.cpp | 162 +++++++++++++++++++----------- 2 files changed, 108 insertions(+), 57 deletions(-) diff --git a/src/framework/domain/metadomain.h b/src/framework/domain/metadomain.h index efc639a8..7ddacffb 100644 --- a/src/framework/domain/metadomain.h +++ b/src/framework/domain/metadomain.h @@ -261,6 +261,9 @@ namespace ntt { #if defined(OUTPUT_ENABLED) out::Writer g_writer; checkpoint::Writer g_checkpoint_writer; + #if defined(MPI_ENABLED) + void CommunicateVectorPotential(unsigned short); + #endif #endif #if defined(MPI_ENABLED) diff --git a/src/framework/domain/output.cpp b/src/framework/domain/output.cpp index af97febb..347517be 100644 --- a/src/framework/domain/output.cpp +++ b/src/framework/domain/output.cpp @@ -192,66 +192,52 @@ namespace ntt { unsigned short buff_idx, const Mesh mesh) { if constexpr (M::Dim == Dim::_2D) { - using TeamPolicy = Kokkos::TeamPolicy; - const auto nx1 = mesh.n_active(in::x1); - const auto nx2 = mesh.n_active(in::x2); - - TeamPolicy policy(nx1, Kokkos::AUTO); - - Kokkos::parallel_for( - "ComputeVectorPotential", - policy, - Lambda(const TeamPolicy::member_type& team_member) { - index_t i1 = team_member.league_rank(); - Kokkos::parallel_scan( - Kokkos::TeamThreadRange(team_member, nx2), - [=](index_t i2, real_t& update, const bool final_pass) { - const auto i1_ { static_cast(i1) }; - const auto i2_ { static_cast(i2) }; - real_t sqrt_detH_ijM { mesh.metric.sqrt_det_h({ i1_, i2_ - HALF }) }; - real_t sqrt_detH_ijP { mesh.metric.sqrt_det_h({ i1_, i2_ + HALF }) }; - const auto input_val = - HALF * - (sqrt_detH_ijM * EM(i1 + N_GHOSTS, i2 + N_GHOSTS - 1, em::bx1) + - sqrt_detH_ijP * EM(i1 + N_GHOSTS, i2 + N_GHOSTS, em::bx1)); - if (final_pass) { - buffer(i1 + N_GHOSTS, i2 + N_GHOSTS, buff_idx) = update; - } - update += input_val; - }); - }); - -#if defined(MPI_ENABLED) - array_t aphi_r { "Aphi_r", nx1 }; - Kokkos::deep_copy(aphi_r, - Kokkos::subview(buffer, - std::make_pair(N_GHOSTS, N_GHOSTS + nx1), - N_GHOSTS + nx2 - 1, - buff_idx)); -#endif - - // !TODO: this is quite slow + // using TeamPolicy = Kokkos::TeamPolicy; + // const auto nx1 = mesh.n_active(in::x1); + // const auto nx2 = mesh.n_active(in::x2); + // + // TeamPolicy policy(nx1, Kokkos::AUTO); + // // Kokkos::parallel_for( // "ComputeVectorPotential", - // mesh.rangeActiveCells(), - // Lambda(index_t i1, index_t i2) { - // const real_t i1_ { COORD(i1) }; - // // const auto k_min = (i2_min - (N_GHOSTS)) + 1; - // // const auto k_max = (j - (N_GHOSTS)); - // // real_t A3 = ZERO; - // // for (auto k { k_min }; k <= k_max; ++k) { - // // real_t k_ = static_cast(k); - // // real_t sqrt_detH_ij1 { mesh.metric.sqrt_det_h({ i_, k_ - HALF }) }; - // // real_t sqrt_detH_ij2 { mesh.metric.sqrt_det_h({ i_, k_ + HALF }) }; - // // auto k1 { k + N_GHOSTS }; - // // A3 += HALF * (sqrt_detH_ij1 * EM(i, k1 - 1, em::bx1) + - // // sqrt_detH_ij2 * EM(i, k1, em::bx1)); - // // } - // buffer(i1, i2, buff_idx) = HALF * - // (sqrt_detH_ij1 * EM(i, k1 - 1, em::bx1) + - // sqrt_detH_ij2 * EM(i, k1, em::bx1)); + // policy, + // Lambda(const TeamPolicy::member_type& team_member) { + // index_t i1 = team_member.league_rank(); + // Kokkos::parallel_scan( + // Kokkos::TeamThreadRange(team_member, nx2), + // [=](index_t i2, real_t& update, const bool final_pass) { + // const auto i1_ { static_cast(i1) }; + // const auto i2_ { static_cast(i2) }; + // real_t sqrt_detH_ijM { mesh.metric.sqrt_det_h({ i1_, i2_ - HALF }) }; + // real_t sqrt_detH_ijP { mesh.metric.sqrt_det_h({ i1_, i2_ + HALF }) }; + // const auto input_val = + // HALF * + // (sqrt_detH_ijM * EM(i1 + N_GHOSTS, i2 + N_GHOSTS - 1, em::bx1) + + // sqrt_detH_ijP * EM(i1 + N_GHOSTS, i2 + N_GHOSTS, em::bx1)); + // if (final_pass) { + // buffer(i1 + N_GHOSTS, i2 + N_GHOSTS, buff_idx) = update; + // } + // update += input_val; + // }); // }); - // buffer(:, i2_max - 1) + Kokkos::parallel_for( + "ComputeVectorPotential", + mesh.rangeActiveCells(), + Lambda(index_t i1, index_t i2) { + const real_t i1_ { COORD(i1) }; + const ncells_t k_min = 1; + const ncells_t k_max = (i2 - (N_GHOSTS)); + real_t A3 = ZERO; + for (auto k { k_min }; k <= k_max; ++k) { + real_t k_ = static_cast(k); + real_t sqrt_detH_ij1 { mesh.metric.sqrt_det_h({ i1_, k_ - HALF }) }; + real_t sqrt_detH_ij2 { mesh.metric.sqrt_det_h({ i1_, k_ + HALF }) }; + auto k1 { k + N_GHOSTS }; + A3 += HALF * (sqrt_detH_ij1 * EM(i1, k - 1, em::bx1) + + sqrt_detH_ij2 * EM(i1, k, em::bx1)); + } + buffer(i1, i2, buff_idx) = A3; + }); } else { raise::KernelError( HERE, @@ -259,6 +245,65 @@ namespace ntt { } } +#if defined(MPI_ENABLED) && defined(OUTPUT_ENABLED) + template + void Metadomain::CommunicateVectorPotential(unsigned short buff_idx) { + if constexpr (M::Dim == Dim::_2D) { + auto local_domain = subdomain_ptr(l_subdomain_indices()[0]); + const auto nx1 = local_domain->mesh.n_active(in::x1); + const auto nx2 = local_domain->mesh.n_active(in::x2); + + auto& buffer = local_domain->fields.bckp; + + const auto nranks_x1 = ndomains_per_dim()[0]; + const auto nranks_x2 = ndomains_per_dim()[1]; + + for (auto nr2 { 1u }; nr2 < nranks_x2; ++nr2) { + const auto rank_send_pre = (nr2 - 1u) * nranks_x1; + const auto rank_recv_pre = nr2 * nranks_x1; + for (auto nr1 { 0u }; nr1 < nranks_x1; ++nr1) { + const auto rank_send = rank_send_pre + nr1; + const auto rank_recv = rank_recv_pre + nr1; + if (local_domain->mpi_rank() == rank_send) { + array_t aphi_r { "Aphi_r", nx1 }; + Kokkos::deep_copy( + aphi_r, + Kokkos::subview(buffer, + std::make_pair(N_GHOSTS, N_GHOSTS + nx1), + N_GHOSTS + nx2 - 1, + buff_idx)); + MPI_Send(aphi_r.data(), + nx1, + mpi::get_type(), + rank_recv, + 0, + MPI_COMM_WORLD); + } else if (local_domain->mpi_rank() == rank_recv) { + array_t aphi_r { "Aphi_r", nx1 }; + MPI_Recv(aphi_r.data(), + nx1, + mpi::get_type(), + rank_send, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + Kokkos::parallel_for( + "AddVectorPotential", + local_domain->mesh.rangeActiveCells(), + Lambda(index_t i1, index_t i2) { + buffer(i1, i2, buff_idx) += aphi_r(i1 - N_GHOSTS); + }); + } + } + } + } else { + raise::Error("CommunicateVectorPotential: comm vector potential only " + "possible for 2D", + HERE); + } + } +#endif + template auto Metadomain::Write( const SimulationParams& params, @@ -472,6 +517,9 @@ namespace ntt { local_domain->fields.em, c, local_domain->mesh); +#if defined(MPI_ENABLED) + CommunicateVectorPotential(c); +#endif } else { raise::Error( "Vector potential can only be computed for GRPIC in 2D", From 5bf3be304ee7e52c66394f08ad5cbe252f166858 Mon Sep 17 00:00:00 2001 From: haykh Date: Mon, 7 Jul 2025 18:45:42 -0400 Subject: [PATCH 3/9] template spec --- src/framework/domain/output.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/framework/domain/output.cpp b/src/framework/domain/output.cpp index 347517be..fd4787eb 100644 --- a/src/framework/domain/output.cpp +++ b/src/framework/domain/output.cpp @@ -860,7 +860,8 @@ namespace ntt { index_t, \ timestep_t, \ simtime_t, \ - const Domain&)>) -> bool; + const Domain&)>) -> bool; \ + template void Metadomain::CommunicateVectorPotential(unsigned short); METADOMAIN_OUTPUT(SimEngine::SRPIC, metric::Minkowski) METADOMAIN_OUTPUT(SimEngine::SRPIC, metric::Minkowski) From d9a7a4bbc86bcc1f4c0613c582290d7e4539faa9 Mon Sep 17 00:00:00 2001 From: haykh Date: Mon, 7 Jul 2025 18:59:07 -0400 Subject: [PATCH 4/9] extract vector potential to outside func --- src/framework/domain/output.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/framework/domain/output.cpp b/src/framework/domain/output.cpp index fd4787eb..b73b87ca 100644 --- a/src/framework/domain/output.cpp +++ b/src/framework/domain/output.cpp @@ -246,6 +246,19 @@ namespace ntt { } #if defined(MPI_ENABLED) && defined(OUTPUT_ENABLED) + template + void ExtractVectorPotential(ndfield_t& buffer, + array_t& aphi_r, + unsigned short buff_idx, + const Mesh mesh) { + Kokkos::parallel_for( + "AddVectorPotential", + mesh.rangeActiveCells(), + Lambda(index_t i1, index_t i2) { + buffer(i1, i2, buff_idx) += aphi_r(i1 - N_GHOSTS); + }); + } + template void Metadomain::CommunicateVectorPotential(unsigned short buff_idx) { if constexpr (M::Dim == Dim::_2D) { @@ -287,12 +300,7 @@ namespace ntt { 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - Kokkos::parallel_for( - "AddVectorPotential", - local_domain->mesh.rangeActiveCells(), - Lambda(index_t i1, index_t i2) { - buffer(i1, i2, buff_idx) += aphi_r(i1 - N_GHOSTS); - }); + ExtractVectorPotential(buffer, aphi_r, buff_idx, local_domain->mesh); } } } From a35451b21bde10949bf7abbd2946d6ed9fb1e55a Mon Sep 17 00:00:00 2001 From: Alisa Galishnikova <55898700+alisagk@users.noreply.github.com> Date: Mon, 7 Jul 2025 19:37:50 -0400 Subject: [PATCH 5/9] minor adjustments of indexes --- src/framework/domain/output.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/framework/domain/output.cpp b/src/framework/domain/output.cpp index b73b87ca..7109bc07 100644 --- a/src/framework/domain/output.cpp +++ b/src/framework/domain/output.cpp @@ -220,12 +220,13 @@ namespace ntt { // update += input_val; // }); // }); + // TODO: this is slow Kokkos::parallel_for( "ComputeVectorPotential", mesh.rangeActiveCells(), Lambda(index_t i1, index_t i2) { const real_t i1_ { COORD(i1) }; - const ncells_t k_min = 1; + const ncells_t k_min = 0; const ncells_t k_max = (i2 - (N_GHOSTS)); real_t A3 = ZERO; for (auto k { k_min }; k <= k_max; ++k) { @@ -233,8 +234,8 @@ namespace ntt { real_t sqrt_detH_ij1 { mesh.metric.sqrt_det_h({ i1_, k_ - HALF }) }; real_t sqrt_detH_ij2 { mesh.metric.sqrt_det_h({ i1_, k_ + HALF }) }; auto k1 { k + N_GHOSTS }; - A3 += HALF * (sqrt_detH_ij1 * EM(i1, k - 1, em::bx1) + - sqrt_detH_ij2 * EM(i1, k, em::bx1)); + A3 += HALF * (sqrt_detH_ij1 * EM(i1, k + N_GHOSTS - 1, em::bx1) + + sqrt_detH_ij2 * EM(i1, k + N_GHOSTS, em::bx1)); } buffer(i1, i2, buff_idx) = A3; }); From f5ca864c3e2419ca0a1c516f7e839e830d897e3c Mon Sep 17 00:00:00 2001 From: haykh Date: Tue, 15 Jul 2025 17:11:10 -0400 Subject: [PATCH 6/9] aphi with teampolicy --- dev/nix/kokkos.nix | 1 + src/framework/domain/output.cpp | 87 +++++++++++++++------------------ 2 files changed, 41 insertions(+), 47 deletions(-) diff --git a/dev/nix/kokkos.nix b/dev/nix/kokkos.nix index e12b9d57..b9373015 100644 --- a/dev/nix/kokkos.nix +++ b/dev/nix/kokkos.nix @@ -13,6 +13,7 @@ let rocm-core clr rocthrust + llvm.rocm-merged-llvm rocprim rocminfo rocm-smi diff --git a/src/framework/domain/output.cpp b/src/framework/domain/output.cpp index 7109bc07..9d59367d 100644 --- a/src/framework/domain/output.cpp +++ b/src/framework/domain/output.cpp @@ -192,52 +192,35 @@ namespace ntt { unsigned short buff_idx, const Mesh mesh) { if constexpr (M::Dim == Dim::_2D) { - // using TeamPolicy = Kokkos::TeamPolicy; - // const auto nx1 = mesh.n_active(in::x1); - // const auto nx2 = mesh.n_active(in::x2); - // - // TeamPolicy policy(nx1, Kokkos::AUTO); - // - // Kokkos::parallel_for( - // "ComputeVectorPotential", - // policy, - // Lambda(const TeamPolicy::member_type& team_member) { - // index_t i1 = team_member.league_rank(); - // Kokkos::parallel_scan( - // Kokkos::TeamThreadRange(team_member, nx2), - // [=](index_t i2, real_t& update, const bool final_pass) { - // const auto i1_ { static_cast(i1) }; - // const auto i2_ { static_cast(i2) }; - // real_t sqrt_detH_ijM { mesh.metric.sqrt_det_h({ i1_, i2_ - HALF }) }; - // real_t sqrt_detH_ijP { mesh.metric.sqrt_det_h({ i1_, i2_ + HALF }) }; - // const auto input_val = - // HALF * - // (sqrt_detH_ijM * EM(i1 + N_GHOSTS, i2 + N_GHOSTS - 1, em::bx1) + - // sqrt_detH_ijP * EM(i1 + N_GHOSTS, i2 + N_GHOSTS, em::bx1)); - // if (final_pass) { - // buffer(i1 + N_GHOSTS, i2 + N_GHOSTS, buff_idx) = update; - // } - // update += input_val; - // }); - // }); - // TODO: this is slow + using TeamPolicy = Kokkos::TeamPolicy; + const auto nx1 = mesh.n_active(in::x1); + const auto nx2 = mesh.n_active(in::x2); + + TeamPolicy policy(nx1, Kokkos::AUTO); + + const auto metric = mesh.metric; + Kokkos::parallel_for( "ComputeVectorPotential", - mesh.rangeActiveCells(), - Lambda(index_t i1, index_t i2) { - const real_t i1_ { COORD(i1) }; - const ncells_t k_min = 0; - const ncells_t k_max = (i2 - (N_GHOSTS)); - real_t A3 = ZERO; - for (auto k { k_min }; k <= k_max; ++k) { - real_t k_ = static_cast(k); - real_t sqrt_detH_ij1 { mesh.metric.sqrt_det_h({ i1_, k_ - HALF }) }; - real_t sqrt_detH_ij2 { mesh.metric.sqrt_det_h({ i1_, k_ + HALF }) }; - auto k1 { k + N_GHOSTS }; - A3 += HALF * (sqrt_detH_ij1 * EM(i1, k + N_GHOSTS - 1, em::bx1) + - sqrt_detH_ij2 * EM(i1, k + N_GHOSTS, em::bx1)); - } - buffer(i1, i2, buff_idx) = A3; + policy, + Lambda(const TeamPolicy::member_type& team_member) { + index_t i1 = team_member.league_rank(); + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team_member, nx2), + [=](index_t i2, real_t& update, const bool final_pass) { + const auto i1_ { static_cast(i1) }; + const auto i2_ { static_cast(i2) }; + real_t sqrt_detH_ijM { metric.sqrt_det_h({ i1_, i2_ - HALF }) }; + real_t sqrt_detH_ijP { metric.sqrt_det_h({ i1_, i2_ + HALF }) }; + const auto input_val = + HALF * + (sqrt_detH_ijM * EM(i1 + N_GHOSTS, i2 + N_GHOSTS - 1, em::bx1) + + sqrt_detH_ijP * EM(i1 + N_GHOSTS, i2 + N_GHOSTS, em::bx1)); + update += input_val; + if (final_pass) { + buffer(i1 + N_GHOSTS, i2 + N_GHOSTS, buff_idx) = update; + } + }); }); } else { raise::KernelError( @@ -520,7 +503,7 @@ namespace ntt { HERE); } } else if (fld.is_vpotential()) { - if (S == SimEngine::GRPIC && M::Dim == Dim::_2D) { + if constexpr (S == SimEngine::GRPIC && M::Dim == Dim::_2D) { const auto c = static_cast(addresses.back()); ComputeVectorPotential(local_domain->fields.bckp, local_domain->fields.em, @@ -869,8 +852,7 @@ namespace ntt { index_t, \ timestep_t, \ simtime_t, \ - const Domain&)>) -> bool; \ - template void Metadomain::CommunicateVectorPotential(unsigned short); + const Domain&)>) -> bool; METADOMAIN_OUTPUT(SimEngine::SRPIC, metric::Minkowski) METADOMAIN_OUTPUT(SimEngine::SRPIC, metric::Minkowski) @@ -883,4 +865,15 @@ namespace ntt { #undef METADOMAIN_OUTPUT +#if defined(MPI_ENABLED) + #define COMMVECTORPOTENTIAL(S, M) \ + template void Metadomain::CommunicateVectorPotential(unsigned short); + + COMMVECTORPOTENTIAL(SimEngine::GRPIC, metric::KerrSchild) + COMMVECTORPOTENTIAL(SimEngine::GRPIC, metric::QKerrSchild) + COMMVECTORPOTENTIAL(SimEngine::GRPIC, metric::KerrSchild0) + + #undef COMMVECTORPOTENTIAL +#endif + } // namespace ntt From 4612481440c409c20da3b47a36ef03a27ff58ee7 Mon Sep 17 00:00:00 2001 From: haykh Date: Tue, 15 Jul 2025 21:30:00 -0400 Subject: [PATCH 7/9] wald toml fixed (slightly) --- dev/nix/kokkos.nix | 6 +++--- dev/nix/shell.nix | 3 --- pgens/wald/wald.toml | 7 +++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/dev/nix/kokkos.nix b/dev/nix/kokkos.nix index b9373015..2f6ee6b9 100644 --- a/dev/nix/kokkos.nix +++ b/dev/nix/kokkos.nix @@ -10,20 +10,22 @@ let pversion = "4.6.01"; compilerPkgs = { "HIP" = with pkgs.rocmPackages; [ + llvm.rocm-merged-llvm rocm-core clr rocthrust - llvm.rocm-merged-llvm rocprim rocminfo rocm-smi ]; "CUDA" = with pkgs.cudaPackages; [ + llvmPackages_18.clang-tools cudatoolkit cuda_cudart pkgs.gcc13 ]; "NONE" = [ + pkgs.llvmPackages_18.clang-tools pkgs.gcc13 ]; }; @@ -37,9 +39,7 @@ let "HIP" = [ "-D Kokkos_ENABLE_HIP=ON" "-D Kokkos_ARCH_${getArch { }}=ON" - # remove leading AMD_ "-D AMDGPU_TARGETS=${builtins.replaceStrings [ "amd_" ] [ "" ] (pkgs.lib.toLower (getArch { }))}" - "-D CMAKE_C_COMPILER=hipcc" "-D CMAKE_CXX_COMPILER=hipcc" ]; "CUDA" = [ diff --git a/dev/nix/shell.nix b/dev/nix/shell.nix index ae5451ec..33ae5709 100644 --- a/dev/nix/shell.nix +++ b/dev/nix/shell.nix @@ -42,9 +42,6 @@ pkgs.mkShell { zlib cmake - llvmPackages_18.clang-tools - libgcc - adios2Pkg kokkosPkg diff --git a/pgens/wald/wald.toml b/pgens/wald/wald.toml index 1104f7a0..75a3ef2e 100644 --- a/pgens/wald/wald.toml +++ b/pgens/wald/wald.toml @@ -4,11 +4,11 @@ runtime = 100.0 [grid] - resolution = [128, 128] + resolution = [512, 512] extent = [[1.0, 10.0]] [grid.metric] - metric = "kerr_schild" + metric = "qkerr_schild" qsph_r0 = 0.0 qsph_h = 0.0 ks_a = 0.95 @@ -40,8 +40,7 @@ [setup] [output] - format = "hdf5" - separate_files = false + format = "hdf5" [output.fields] interval_time = 1.0 From 5c481aa39c889869dc3f5b596297779b74f46344 Mon Sep 17 00:00:00 2001 From: haykh Date: Tue, 15 Jul 2025 21:50:39 -0400 Subject: [PATCH 8/9] added vertical field support in wald --- pgens/wald/pgen.hpp | 217 ++++++++++++++++++++++++++----------------- pgens/wald/wald.toml | 1 + 2 files changed, 134 insertions(+), 84 deletions(-) diff --git a/pgens/wald/pgen.hpp b/pgens/wald/pgen.hpp index b2c2aeb0..71ee905e 100644 --- a/pgens/wald/pgen.hpp +++ b/pgens/wald/pgen.hpp @@ -7,6 +7,7 @@ #include "arch/kokkos_aliases.h" #include "arch/traits.h" #include "utils/comparators.h" +#include "utils/error.h" #include "utils/formatting.h" #include "utils/log.h" #include "utils/numeric.h" @@ -17,14 +18,31 @@ #include "framework/domain/domain.h" #include "framework/domain/metadomain.h" +#include #include +enum InitFieldGeometry { + Wald, + Vertical, +}; + namespace user { using namespace ntt; template struct InitFields { - InitFields(M metric_) : metric { metric_ } {} + InitFields(M metric_, const std::string& init_field_geometry) + : metric { metric_ } { + if (init_field_geometry == "wald") { + field_geometry = InitFieldGeometry::Wald; + } else if (init_field_geometry == "vertical") { + field_geometry = InitFieldGeometry::Vertical; + } else { + raise::Error(fmt::format("Unrecognized field geometry: %s", + init_field_geometry.c_str()), + HERE); + } + } Inline auto A_3(const coord_t& x_Cd) const -> real_t { return HALF * (metric.template h_<3, 3>(x_Cd) + @@ -83,102 +101,132 @@ namespace user { Inline auto bx3( const coord_t& x_Ph) const -> real_t { // at ( i + HALF , j + HALF ) - coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; - metric.template convert(x_Ph, xi); - - x0m[0] = xi[0]; - x0m[1] = xi[1] - HALF; - x0p[0] = xi[0]; - x0p[1] = xi[1] + HALF; - - real_t inv_sqrt_detH_iPjP { ONE / metric.sqrt_det_h({ xi[0], xi[1] }) }; - return -(A_1(x0p) - A_1(x0m)) * inv_sqrt_detH_iPjP; + if (field_geometry == InitFieldGeometry::Wald) { + coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; + metric.template convert(x_Ph, xi); + + x0m[0] = xi[0]; + x0m[1] = xi[1] - HALF; + x0p[0] = xi[0]; + x0p[1] = xi[1] + HALF; + + real_t inv_sqrt_detH_iPjP { ONE / metric.sqrt_det_h({ xi[0], xi[1] }) }; + return -(A_1(x0p) - A_1(x0m)) * inv_sqrt_detH_iPjP; + } else if (field_geometry == InitFieldGeometry::Vertical) { + return ZERO; + } else { + raise::KernelError(HERE, "Unrecognized field geometry"); + return ZERO; + } } Inline auto dx1(const coord_t& x_Ph) const -> real_t { // at ( i + HALF , j ) - coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; - metric.template convert(x_Ph, xi); - - real_t alpha_iPj { metric.alpha({ xi[0], xi[1] }) }; - real_t inv_sqrt_detH_ij { ONE / metric.sqrt_det_h({ xi[0] - HALF, xi[1] }) }; - real_t sqrt_detH_ij { metric.sqrt_det_h({ xi[0] - HALF, xi[1] }) }; - real_t beta_ij { metric.beta1({ xi[0] - HALF, xi[1] }) }; - real_t alpha_ij { metric.alpha({ xi[0] - HALF, xi[1] }) }; - - // D1 at ( i + HALF , j ) - x0m[0] = xi[0] - HALF; - x0m[1] = xi[1]; - x0p[0] = xi[0] + HALF; - x0p[1] = xi[1]; - real_t E1d { (A_0(x0p) - A_0(x0m)) }; - real_t D1d { E1d / alpha_iPj }; - - // D3 at ( i , j ) - x0m[0] = xi[0] - HALF - HALF; - x0m[1] = xi[1]; - x0p[0] = xi[0] - HALF + HALF; - x0p[1] = xi[1]; - real_t D3d { (A_3(x0p) - A_3(x0m)) * beta_ij / alpha_ij }; - - real_t D1u { metric.template h<1, 1>({ xi[0], xi[1] }) * D1d + - metric.template h<1, 3>({ xi[0], xi[1] }) * D3d }; - - return D1u; + if (field_geometry == InitFieldGeometry::Wald) { + coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; + metric.template convert(x_Ph, xi); + + real_t alpha_iPj { metric.alpha({ xi[0], xi[1] }) }; + real_t inv_sqrt_detH_ij { ONE / metric.sqrt_det_h({ xi[0] - HALF, xi[1] }) }; + real_t sqrt_detH_ij { metric.sqrt_det_h({ xi[0] - HALF, xi[1] }) }; + real_t beta_ij { metric.beta1({ xi[0] - HALF, xi[1] }) }; + real_t alpha_ij { metric.alpha({ xi[0] - HALF, xi[1] }) }; + + // D1 at ( i + HALF , j ) + x0m[0] = xi[0] - HALF; + x0m[1] = xi[1]; + x0p[0] = xi[0] + HALF; + x0p[1] = xi[1]; + real_t E1d { (A_0(x0p) - A_0(x0m)) }; + real_t D1d { E1d / alpha_iPj }; + + // D3 at ( i , j ) + x0m[0] = xi[0] - HALF - HALF; + x0m[1] = xi[1]; + x0p[0] = xi[0] - HALF + HALF; + x0p[1] = xi[1]; + real_t D3d { (A_3(x0p) - A_3(x0m)) * beta_ij / alpha_ij }; + + real_t D1u { metric.template h<1, 1>({ xi[0], xi[1] }) * D1d + + metric.template h<1, 3>({ xi[0], xi[1] }) * D3d }; + + return D1u; + } else if (field_geometry == InitFieldGeometry::Vertical) { + return ZERO; + } else { + raise::KernelError(HERE, "Unrecognized field geometry"); + return ZERO; + } } Inline auto dx2(const coord_t& x_Ph) const -> real_t { // at ( i , j + HALF ) - coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; - metric.template convert(x_Ph, xi); - x0m[0] = xi[0]; - x0m[1] = xi[1] - HALF; - x0p[0] = xi[0]; - x0p[1] = xi[1] + HALF; - real_t inv_sqrt_detH_ijP { ONE / metric.sqrt_det_h({ xi[0], xi[1] }) }; - real_t sqrt_detH_ijP { metric.sqrt_det_h({ xi[0], xi[1] }) }; - real_t alpha_ijP { metric.alpha({ xi[0], xi[1] }) }; - real_t beta_ijP { metric.beta1({ xi[0], xi[1] }) }; - - real_t E2d { (A_0(x0p) - A_0(x0m)) }; - real_t D2d { E2d / alpha_ijP - (A_1(x0p) - A_1(x0m)) * beta_ijP / alpha_ijP }; - real_t D2u { metric.template h<2, 2>({ xi[0], xi[1] }) * D2d }; - - return D2u; + if (field_geometry == InitFieldGeometry::Wald) { + coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; + metric.template convert(x_Ph, xi); + x0m[0] = xi[0]; + x0m[1] = xi[1] - HALF; + x0p[0] = xi[0]; + x0p[1] = xi[1] + HALF; + real_t inv_sqrt_detH_ijP { ONE / metric.sqrt_det_h({ xi[0], xi[1] }) }; + real_t sqrt_detH_ijP { metric.sqrt_det_h({ xi[0], xi[1] }) }; + real_t alpha_ijP { metric.alpha({ xi[0], xi[1] }) }; + real_t beta_ijP { metric.beta1({ xi[0], xi[1] }) }; + + real_t E2d { (A_0(x0p) - A_0(x0m)) }; + real_t D2d { E2d / alpha_ijP - + (A_1(x0p) - A_1(x0m)) * beta_ijP / alpha_ijP }; + real_t D2u { metric.template h<2, 2>({ xi[0], xi[1] }) * D2d }; + + return D2u; + } else if (field_geometry == InitFieldGeometry::Vertical) { + return ZERO; + } else { + raise::KernelError(HERE, "Unrecognized field geometry"); + return ZERO; + } } Inline auto dx3(const coord_t& x_Ph) const -> real_t { // at ( i , j ) - coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; - metric.template convert(x_Ph, xi); - real_t inv_sqrt_detH_ij { ONE / metric.sqrt_det_h({ xi[0], xi[1] }) }; - real_t sqrt_detH_ij { metric.sqrt_det_h({ xi[0], xi[1] }) }; - real_t beta_ij { metric.beta1({ xi[0], xi[1] }) }; - real_t alpha_ij { metric.alpha({ xi[0], xi[1] }) }; - real_t alpha_iPj { metric.alpha({ xi[0] + HALF, xi[1] }) }; - - // D3 at ( i , j ) - x0m[0] = xi[0] - HALF; - x0m[1] = xi[1]; - x0p[0] = xi[0] + HALF; - x0p[1] = xi[1]; - real_t D3d { (A_3(x0p) - A_3(x0m)) * beta_ij / alpha_ij }; - - // D1 at ( i + HALF , j ) - x0m[0] = xi[0] + HALF - HALF; - x0m[1] = xi[1]; - x0p[0] = xi[0] + HALF + HALF; - x0p[1] = xi[1]; - real_t E1d { (A_0(x0p) - A_0(x0m)) }; - real_t D1d { E1d / alpha_iPj }; - - if (cmp::AlmostZero(x_Ph[1])) { - return metric.template h<1, 3>({ xi[0], xi[1] }) * D1d; + if (field_geometry == InitFieldGeometry::Wald) { + coord_t xi { ZERO }, x0m { ZERO }, x0p { ZERO }; + metric.template convert(x_Ph, xi); + real_t inv_sqrt_detH_ij { ONE / metric.sqrt_det_h({ xi[0], xi[1] }) }; + real_t sqrt_detH_ij { metric.sqrt_det_h({ xi[0], xi[1] }) }; + real_t beta_ij { metric.beta1({ xi[0], xi[1] }) }; + real_t alpha_ij { metric.alpha({ xi[0], xi[1] }) }; + real_t alpha_iPj { metric.alpha({ xi[0] + HALF, xi[1] }) }; + + // D3 at ( i , j ) + x0m[0] = xi[0] - HALF; + x0m[1] = xi[1]; + x0p[0] = xi[0] + HALF; + x0p[1] = xi[1]; + real_t D3d { (A_3(x0p) - A_3(x0m)) * beta_ij / alpha_ij }; + + // D1 at ( i + HALF , j ) + x0m[0] = xi[0] + HALF - HALF; + x0m[1] = xi[1]; + x0p[0] = xi[0] + HALF + HALF; + x0p[1] = xi[1]; + real_t E1d { (A_0(x0p) - A_0(x0m)) }; + real_t D1d { E1d / alpha_iPj }; + + if (cmp::AlmostZero(x_Ph[1])) { + return metric.template h<1, 3>({ xi[0], xi[1] }) * D1d; + } else { + return metric.template h<3, 3>({ xi[0], xi[1] }) * D3d + + metric.template h<1, 3>({ xi[0], xi[1] }) * D1d; + } + } else if (field_geometry == InitFieldGeometry::Vertical) { + return ZERO; } else { - return metric.template h<3, 3>({ xi[0], xi[1] }) * D3d + - metric.template h<1, 3>({ xi[0], xi[1] }) * D1d; + raise::KernelError(HERE, "Unrecognized field geometry"); + return ZERO; } } private: - const M metric; + const M metric; + InitFieldGeometry field_geometry; }; template @@ -201,7 +249,8 @@ namespace user { inline PGen(const SimulationParams& p, const Metadomain& m) : arch::ProblemGenerator { p } , global_domain { m } - , init_flds { m.mesh().metric } {} + , init_flds { m.mesh().metric, + p.template get("setup.init_field", "wald") } {} }; } // namespace user diff --git a/pgens/wald/wald.toml b/pgens/wald/wald.toml index 75a3ef2e..2b05fbac 100644 --- a/pgens/wald/wald.toml +++ b/pgens/wald/wald.toml @@ -38,6 +38,7 @@ ppc0 = 2.0 [setup] + init_field = "wald" # or "vertical" [output] format = "hdf5" From ec47ff89ec667199eeb0ee933d9351f0ccdffc8d Mon Sep 17 00:00:00 2001 From: haykh Date: Wed, 16 Jul 2025 15:01:55 -0400 Subject: [PATCH 9/9] old implementation returned --- src/framework/domain/output.cpp | 72 +++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/src/framework/domain/output.cpp b/src/framework/domain/output.cpp index 9d59367d..960b2c71 100644 --- a/src/framework/domain/output.cpp +++ b/src/framework/domain/output.cpp @@ -192,36 +192,56 @@ namespace ntt { unsigned short buff_idx, const Mesh mesh) { if constexpr (M::Dim == Dim::_2D) { - using TeamPolicy = Kokkos::TeamPolicy; - const auto nx1 = mesh.n_active(in::x1); - const auto nx2 = mesh.n_active(in::x2); - - TeamPolicy policy(nx1, Kokkos::AUTO); - const auto metric = mesh.metric; - Kokkos::parallel_for( "ComputeVectorPotential", - policy, - Lambda(const TeamPolicy::member_type& team_member) { - index_t i1 = team_member.league_rank(); - Kokkos::parallel_scan( - Kokkos::TeamThreadRange(team_member, nx2), - [=](index_t i2, real_t& update, const bool final_pass) { - const auto i1_ { static_cast(i1) }; - const auto i2_ { static_cast(i2) }; - real_t sqrt_detH_ijM { metric.sqrt_det_h({ i1_, i2_ - HALF }) }; - real_t sqrt_detH_ijP { metric.sqrt_det_h({ i1_, i2_ + HALF }) }; - const auto input_val = - HALF * - (sqrt_detH_ijM * EM(i1 + N_GHOSTS, i2 + N_GHOSTS - 1, em::bx1) + - sqrt_detH_ijP * EM(i1 + N_GHOSTS, i2 + N_GHOSTS, em::bx1)); - update += input_val; - if (final_pass) { - buffer(i1 + N_GHOSTS, i2 + N_GHOSTS, buff_idx) = update; - } - }); + mesh.rangeActiveCells(), + Lambda(index_t i1, index_t i2) { + const real_t i1_ { COORD(i1) }; + const ncells_t k_min = 0; + const ncells_t k_max = (i2 - (N_GHOSTS)); + real_t A3 = ZERO; + for (auto k { k_min }; k <= k_max; ++k) { + real_t k_ = static_cast(k); + real_t sqrt_detH_ij1 { metric.sqrt_det_h({ i1_, k_ - HALF }) }; + real_t sqrt_detH_ij2 { metric.sqrt_det_h({ i1_, k_ + HALF }) }; + auto k1 { k + N_GHOSTS }; + A3 += HALF * (sqrt_detH_ij1 * EM(i1, k + N_GHOSTS - 1, em::bx1) + + sqrt_detH_ij2 * EM(i1, k + N_GHOSTS, em::bx1)); + } + buffer(i1, i2, buff_idx) = A3; }); + + // @TODO: Implementation with team policies works on AMD, but not on NVIDIA GPUs + // + // using TeamPolicy = Kokkos::TeamPolicy; + // const auto nx1 = mesh.n_active(in::x1); + // const auto nx2 = mesh.n_active(in::x2); + // + // TeamPolicy policy(nx1, Kokkos::AUTO); + // + // Kokkos::parallel_for( + // "ComputeVectorPotential", + // policy, + // Lambda(const TeamPolicy::member_type& team_member) { + // index_t i1 = team_member.league_rank(); + // Kokkos::parallel_scan( + // Kokkos::TeamThreadRange(team_member, nx2), + // [=](index_t i2, real_t& update, const bool final_pass) { + // const auto i1_ { static_cast(i1) }; + // const auto i2_ { static_cast(i2) }; + // const real_t sqrt_detH_ijM { metric.sqrt_det_h({ i1_, i2_ - HALF }) }; + // const real_t sqrt_detH_ijP { metric.sqrt_det_h({ i1_, i2_ + HALF }) }; + // const auto input_val = + // HALF * + // (sqrt_detH_ijM * EM(i1 + N_GHOSTS, i2 + N_GHOSTS - 1, em::bx1) + + // sqrt_detH_ijP * EM(i1 + N_GHOSTS, i2 + N_GHOSTS, em::bx1)); + // if (final_pass) { + // buffer(i1 + N_GHOSTS, i2 + N_GHOSTS, buff_idx) = update; + // } + // update += input_val; + // }); + // }); } else { raise::KernelError( HERE,