From 56033ff61a40fa9e8f3cd9e7b1424bad1b442a4a Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Fri, 24 Apr 2026 16:41:36 +0200 Subject: [PATCH 01/20] support mixed precision --- src/core/common/include/context.h | 2 +- src/core/common/include/md_types.h | 28 +++++----- src/core/common/include/precision.h | 7 +++ src/core/common/include/vdw_rules.h | 26 ++++----- src/core/common/src/init.cpp | 12 ++-- src/core/cpu/src/cpu_nonbonded_pp_force.cpp | 21 ++++--- src/core/cpu/src/cpu_nonbonded_pw_force.cpp | 28 +++++----- src/core/cpu/src/cpu_nonbonded_qp_force.cpp | 24 ++++---- src/core/cpu/src/cpu_nonbonded_qq_force.cpp | 23 ++++---- src/core/cpu/src/cpu_nonbonded_qw_force.cpp | 47 ++++++++-------- src/core/cpu/src/cpu_nonbonded_ww_force.cpp | 49 +++++++++-------- src/core/cuda/src/cuda_angle_force.cu | 12 ++-- src/core/cuda/src/cuda_nonbonded_14_force.cu | 42 +++++++------- src/core/cuda/src/cuda_nonbonded_force.cu | 58 ++++++++++++-------- 14 files changed, 202 insertions(+), 177 deletions(-) create mode 100644 src/core/common/include/precision.h diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h index c77a2c91..83817bb8 100644 --- a/src/core/common/include/context.h +++ b/src/core/common/include/context.h @@ -187,7 +187,7 @@ class Context { std::unique_ptr> p_atoms_list; std::unique_ptr> w_atoms_list; std::unique_ptr> q_atoms_list; - std::unique_ptr> charge_pair_products; + std::unique_ptr> charge_pair_products; std::unique_ptr> p_charge_types; std::unique_ptr> w_charge_types; std::unique_ptr> q_charge_types; diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h index 60f1f56a..6a4d2865 100644 --- a/src/core/common/include/md_types.h +++ b/src/core/common/include/md_types.h @@ -2,6 +2,8 @@ #include #include + +#include "common/include/precision.h" /* ============================================= * == FROM MD FILE * ============================================= @@ -47,9 +49,9 @@ struct md_t { }; struct 
coord_t { - double x; - double y; - double z; + real_t x; + real_t y; + real_t z; }; struct bond_t { @@ -114,7 +116,7 @@ struct charge_t { struct ccharge_t { int code; - double charge; + real_t charge; }; struct atype_t { @@ -125,17 +127,17 @@ struct atype_t { struct catype_t { int code; double m; - double aii_normal; - double bii_normal; + real_t aii_normal; + real_t bii_normal; // double aii_polar; // double bii_polar; - double aii_1_4; - double bii_1_4; + real_t aii_1_4; + real_t bii_1_4; }; struct vdw_pair_param_t { - double a; - double b; + real_t a; + real_t b; }; struct topo_t { @@ -302,9 +304,9 @@ struct shake_bond_t { */ struct vel_t { - double x; - double y; - double z; + real_t x; + real_t y; + real_t z; }; struct dvel_t { diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h new file mode 100644 index 00000000..f15fc6ca --- /dev/null +++ b/src/core/common/include/precision.h @@ -0,0 +1,7 @@ +#pragma once + +#ifdef QDYN_SPFP +using real_t = float; +#else +using real_t = double; +#endif diff --git a/src/core/common/include/vdw_rules.h b/src/core/common/include/vdw_rules.h index ca7bd762..5b8e8604 100644 --- a/src/core/common/include/vdw_rules.h +++ b/src/core/common/include/vdw_rules.h @@ -4,15 +4,10 @@ #include - -// Geometric rule: A_ij = sqrt(A_i) * sqrt(A_j), B_ij = sqrt(B_i) * sqrt(B_j) -// Energy: V = A_ij * r^-12 - B_ij * r^-6 -// Parameters: ai_aii, aj_aii are sqrt(A_i), sqrt(A_j) -// ai_bii, aj_bii are sqrt(B_i), sqrt(B_j) -// r6 is 1/r^6 +template __device__ __host__ inline void calc_vdw_geometric( - double ai_aii, double aj_aii, double ai_bii, double aj_bii, - double r6, double* V_a, double* V_b) { + Real ai_aii, Real aj_aii, Real ai_bii, Real aj_bii, + Real r6, Real* V_a, Real* V_b) { *V_a = r6 * r6 * ai_aii * aj_aii; *V_b = r6 * ai_bii * aj_bii; } @@ -24,16 +19,17 @@ __device__ __host__ inline void calc_vdw_geometric( // ai_aii, aj_aii store R*_i, R*_j (vdW radius) // ai_bii, aj_bii store sqrt(eps_i), 
sqrt(eps_j) (after preprocessing) // r6 is 1/r^6 +template __device__ __host__ inline void calc_vdw_arithmetic( - double Rstar_i, double Rstar_j, double sqrt_eps_i, double sqrt_eps_j, - double r6, double* V_a, double* V_b) { - double Rstar_ij = Rstar_i + Rstar_j; // Arithmetic combination - double sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j; // Geometric combination (already sqrt) + Real Rstar_i, Real Rstar_j, Real sqrt_eps_i, Real sqrt_eps_j, + Real r6, Real* V_a, Real* V_b) { + Real Rstar_ij = Rstar_i + Rstar_j; // Arithmetic combination + Real sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j; // Geometric combination (already sqrt) // Compute R6 = (R*_ij)^6 - double R2 = Rstar_ij * Rstar_ij; - double R6 = R2 * R2 * R2; + Real R2 = Rstar_ij * Rstar_ij; + Real R6 = R2 * R2 * R2; *V_a = sqrt_eps_ij * R6 * R6 * r6 * r6; // sqrt(eps_i * eps_j) * R^12 * r^-12 - *V_b = 2.0 * sqrt_eps_ij * R6 * r6; // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6 + *V_b = static_cast(2.0) * sqrt_eps_ij * R6 * r6; // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6 } diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp index dc519a9f..499c01cb 100644 --- a/src/core/common/src/init.cpp +++ b/src/core/common/src/init.cpp @@ -77,9 +77,11 @@ void initialize_catype_tables() { const catype_t& cj = h_catype_table_all[j]; vdw_pair_param_t pair_param = {}; if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { - calc_vdw_geometric(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b); + calc_vdw_geometric( + ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast(1.0), &pair_param.a, &pair_param.b); } else { - calc_vdw_arithmetic(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b); + calc_vdw_arithmetic( + ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast(1.0), &pair_param.a, &pair_param.b); } h_catype_pair_params[i * ctx.n_catype_types + j] = pair_param; } @@ -168,10 +170,11 @@ void 
initialize_charge_tables() { ctx.zero_charge_type = add_charge(0.0); ctx.n_charge_types = static_cast(h_charge_table_all.size()); - std::vector h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types); + std::vector h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types); for (int i = 0; i < ctx.n_charge_types; i++) { for (int j = 0; j < ctx.n_charge_types; j++) { - h_charge_pair_products[i * ctx.n_charge_types + j] = h_charge_table_all[i].charge * h_charge_table_all[j].charge; + h_charge_pair_products[i * ctx.n_charge_types + j] = + static_cast(h_charge_table_all[i].charge * h_charge_table_all[j].charge); } } @@ -913,4 +916,3 @@ void write_headers() { write_header("velocities.csv"); write_energy_header(); } - diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp index ce744ad0..390c67eb 100644 --- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp @@ -15,11 +15,10 @@ void calc_nonbonded_pp_forces() { bool bond14, bond23; double scaling; coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double crg_i, crg_j; - double ai_aii, aj_aii, ai_bii, aj_bii; + real_t r2a, ra, r6a; + real_t V_a, V_b; + real_t crg_i, crg_j; + real_t ai_aii, aj_aii, ai_bii, aj_bii; int i, j; for (int pi = 0; pi < ctx.n_patoms; pi++) { for (int pj = pi + 1; pj < ctx.n_patoms; pj++) { @@ -42,11 +41,11 @@ void calc_nonbonded_pp_forces() { da.x = coords[j].x - coords[i].x; da.y = coords[j].y - coords[i].y; da.z = coords[j].z - coords[i].z; - r2a = 1 / (std::pow(da.x, 2) + std::pow(da.y, 2) + std::pow(da.z, 2)); - ra = sqrt(r2a); + r2a = static_cast(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z); + ra = static_cast(std::sqrt(r2a)); r6a = r2a * r2a * r2a; - Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra; + const real_t Vela = static_cast(scaling * ctx.topo.coulomb_constant) * crg_i * crg_j * ra; ai_aii = bond14 ? 
ai_type.aii_1_4 : ai_type.aii_normal; aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; @@ -58,7 +57,7 @@ void calc_nonbonded_pp_forces() { } else { calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b); } - dva = r2a * (-Vela - 12 * V_a + 6 * V_b); + const real_t dva = r2a * (-Vela - static_cast(12.0) * V_a + static_cast(6.0) * V_b); dvelocities[i].x -= dva * da.x; dvelocities[i].y -= dva * da.y; @@ -68,8 +67,8 @@ void calc_nonbonded_pp_forces() { dvelocities[j].y += dva * da.y; dvelocities[j].z += dva * da.z; - ctx.E_nonbond_pp.Ucoul += Vela; - ctx.E_nonbond_pp.Uvdw += (V_a - V_b); + ctx.E_nonbond_pp.Ucoul += static_cast(Vela); + ctx.E_nonbond_pp.Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp index 6bf2c27e..030c1290 100644 --- a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp @@ -22,21 +22,21 @@ void calc_nonbonded_pw_forces() { continue; } - const double qi = ctx.unified_ccharge(atom_i, 0).charge; - const double qj = ctx.unified_ccharge(atom_j, 0).charge; + const real_t qi = ctx.unified_ccharge(atom_i, 0).charge; + const real_t qj = ctx.unified_ccharge(atom_j, 0).charge; const catype_t& atom_i_type = ctx.unified_catype(atom_i, 0); const catype_t& atom_j_type = ctx.unified_catype(atom_j, 0); - double v_a = 0.0; - double v_b = 0.0; - const double dx = coords[atom_j].x - coords[atom_i].x; - const double dy = coords[atom_j].y - coords[atom_i].y; - const double dz = coords[atom_j].z - coords[atom_i].z; - const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz); - const double rinv = std::sqrt(r2inv); - const double r6inv = r2inv * r2inv * r2inv; - const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv; + real_t v_a = 0.0; + real_t v_b = 0.0; + const real_t dx = coords[atom_j].x - coords[atom_i].x; + const real_t dy = coords[atom_j].y - coords[atom_i].y; + const real_t dz = coords[atom_j].z - 
coords[atom_i].z; + const real_t r2inv = static_cast(1.0) / (dx * dx + dy * dy + dz * dz); + const real_t rinv = static_cast(std::sqrt(r2inv)); + const real_t r6inv = r2inv * r2inv * r2inv; + const real_t ecoul = static_cast(ctx.topo.coulomb_constant) * qi * qj * rinv; if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { calc_vdw_geometric(atom_i_type.aii_normal, @@ -56,7 +56,7 @@ void calc_nonbonded_pw_forces() { &v_b); } - const double scale = r2inv * (-ecoul - 12.0 * v_a + 6.0 * v_b); + const real_t scale = r2inv * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); dvelocities[atom_i].x -= scale * dx; dvelocities[atom_i].y -= scale * dy; @@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() { dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - ctx.E_nonbond_pw.Ucoul += ecoul; - ctx.E_nonbond_pw.Uvdw += (v_a - v_b); + ctx.E_nonbond_pw.Ucoul += static_cast(ecoul); + ctx.E_nonbond_pw.Uvdw += static_cast(v_a - v_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp index 65a74a6c..7a81a516 100644 --- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp @@ -15,10 +15,11 @@ void calc_nonbonded_qp_forces() { auto *excluded = ctx.excluded->cpu_data_p; int i, j; coord_t da; - double r2, r6, r; - double ai_aii, aj_aii, ai_bii, aj_bii; + real_t r2, r; + real_t ai_aii, aj_aii, ai_bii, aj_bii; bool bond23, bond14; - double scaling, Vel, V_a, V_b, dv; + double scaling; + real_t Vel, V_a, V_b, dv; for (int qi = 0; qi < ctx.n_qatoms; qi++) { for (int pj = 0; pj < ctx.n_patoms; pj++) { @@ -37,12 +38,10 @@ void calc_nonbonded_qp_forces() { da.y = coords[j].y - coords[i].y; da.z = coords[j].z - coords[i].z; - r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2); - - r6 = r2 * r2 * r2; - r2 = 1 / r2; - r = sqrt(r2); - double r6inv = r2 * r2 * r2; // 1/r^6 for vdW calculation + r2 = da.x * da.x + da.y * da.y + da.z * da.z; + r2 = static_cast(1.0) / r2; + r = 
static_cast(std::sqrt(r2)); + const real_t r6inv = r2 * r2 * r2; // 1/r^6 for vdW calculation for (int state = 0; state < ctx.n_lambdas; state++) { const catype_t& qi_type = ctx.unified_catype(i, state); @@ -53,7 +52,8 @@ void calc_nonbonded_qp_forces() { ai_bii = bond14 ? qi_type.bii_1_4 : qi_type.bii_normal; aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; - Vel = ctx.topo.coulomb_constant * scaling * ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r; + Vel = static_cast(ctx.topo.coulomb_constant * scaling) * + ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r; if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { calc_vdw_geometric(ai_aii, aj_aii, ai_bii, aj_bii, r6inv, &V_a, &V_b); } else { @@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() { dvelocities[j].z += dv * da.z; // Update Q totals - ctx.EQ_nonbond_qp[state].Ucoul += Vel; - ctx.EQ_nonbond_qp[state].Uvdw += (V_a - V_b); + ctx.EQ_nonbond_qp[state].Ucoul += static_cast(Vel); + ctx.EQ_nonbond_qp[state].Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp index 2b062d48..006a3c0e 100644 --- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp @@ -15,14 +15,14 @@ void calc_nonbonded_qq_forces() { auto *excluded = ctx.excluded->cpu_data_p; auto *q_elscales = ctx.q_elscales->cpu_data_p; int ai, aj; - double crg_i, crg_j; + real_t crg_i, crg_j; double elscale, scaling; bool bond23, bond14; coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double ai_aii, aj_aii, ai_bii, aj_bii; + real_t r2a, ra, r6a; + real_t Vela, V_a, V_b; + real_t dva; + real_t ai_aii, aj_aii, ai_bii, aj_bii; for (int state = 0; state < ctx.n_lambdas; state++) { for (int qi = 0; qi < ctx.n_qatoms; qi++) { @@ -54,11 +54,11 @@ void calc_nonbonded_qq_forces() { da.x = coords[aj].x - coords[ai].x; da.y = coords[aj].y - coords[ai].y; da.z 
= coords[aj].z - coords[ai].z; - r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2)); - ra = sqrt(r2a); + r2a = static_cast(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z); + ra = static_cast(std::sqrt(r2a)); r6a = r2a * r2a * r2a; - Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra * elscale; + Vela = static_cast(scaling * ctx.topo.coulomb_constant * elscale) * crg_i * crg_j * ra; ai_aii = bond14 ? qi_type.aii_1_4 : qi_type.aii_normal; aj_aii = bond14 ? qj_type.aii_1_4 : qj_type.aii_normal; @@ -70,7 +70,8 @@ void calc_nonbonded_qq_forces() { } else { calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b); } - dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * lambdas[state]; + dva = r2a * (-Vela - static_cast(12.0) * V_a + static_cast(6.0) * V_b) * + static_cast(lambdas[state]); dvelocities[ai].x -= dva * da.x; dvelocities[ai].y -= dva * da.y; @@ -80,8 +81,8 @@ void calc_nonbonded_qq_forces() { dvelocities[aj].y += dva * da.y; dvelocities[aj].z += dva * da.z; - ctx.EQ_nonbond_qq[state].Ucoul += Vela; - ctx.EQ_nonbond_qq[state].Uvdw += (V_a - V_b); + ctx.EQ_nonbond_qq[state].Ucoul += static_cast(Vela); + ctx.EQ_nonbond_qq[state].Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp index 17530a16..8d18bc55 100644 --- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp @@ -13,17 +13,17 @@ void calc_nonbonded_qw_forces() { auto *excluded = ctx.excluded->cpu_data_p; int i; coord_t dO, dH1, dH2; - double r2O, rH1, rH2, r6O, rO, r2H1, r2H2; - double dvO, dvH1, dvH2; - double V_a, V_b, VelO, VelH1, VelH2; - double ai_aii, ai_bii; + real_t r2O, rH1, rH2, rO, r2H1, r2H2; + real_t dvO, dvH1, dvH2; + real_t V_a, V_b, VelO, VelH1, VelH2; + real_t ai_aii, ai_bii; // Loop over O-atoms, q-atoms for (int j = ctx.n_atoms_solute; j < ctx.n_atoms; j += 3) { const catype_t& ow_type = ctx.unified_catype(j, 0); - const double 
ow_charge = ctx.unified_ccharge(j, 0).charge; - const double hw1_charge = ctx.unified_ccharge(j + 1, 0).charge; - const double hw2_charge = ctx.unified_ccharge(j + 2, 0).charge; + const real_t ow_charge = ctx.unified_ccharge(j, 0).charge; + const real_t hw1_charge = ctx.unified_ccharge(j + 1, 0).charge; + const real_t hw2_charge = ctx.unified_ccharge(j + 2, 0).charge; for (int qi = 0; qi < ctx.n_qatoms; qi++) { i = ctx.q_atoms[qi]; if (excluded[i] || excluded[j]) continue; @@ -36,13 +36,12 @@ void calc_nonbonded_qw_forces() { dH2.x = coords[j + 2].x - coords[i].x; dH2.y = coords[j + 2].y - coords[i].y; dH2.z = coords[j + 2].z - coords[i].z; - r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2); - rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2))); - rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2))); - r6O = r2O * r2O * r2O; - r2O = 1.0 / r2O; - rO = sqrt(r2O); - double r6Oinv = r2O * r2O * r2O; // 1/r^6 for vdW calculation + r2O = dO.x * dO.x + dO.y * dO.y + dO.z * dO.z; + rH1 = static_cast(std::sqrt(static_cast(1.0) / (dH1.x * dH1.x + dH1.y * dH1.y + dH1.z * dH1.z))); + rH2 = static_cast(std::sqrt(static_cast(1.0) / (dH2.x * dH2.x + dH2.y * dH2.y + dH2.z * dH2.z))); + r2O = static_cast(1.0) / r2O; + rO = static_cast(std::sqrt(r2O)); + const real_t r6Oinv = r2O * r2O * r2O; // 1/r^6 for vdW calculation r2H1 = rH1 * rH1; r2H2 = rH2 * rH2; @@ -63,19 +62,21 @@ void calc_nonbonded_qw_forces() { calc_vdw_arithmetic(ai_aii, ow_type.aii_normal, ai_bii, ow_type.bii_normal, r6Oinv, &V_a, &V_b); } - const double q_charge = ctx.unified_ccharge(i, state).charge; - VelO = ctx.topo.coulomb_constant * ow_charge * q_charge * rO; - VelH1 = ctx.topo.coulomb_constant * hw1_charge * q_charge * rH1; - VelH2 = ctx.topo.coulomb_constant * hw2_charge * q_charge * rH2; + const real_t q_charge = ctx.unified_ccharge(i, state).charge; + const real_t coulomb_constant = static_cast(ctx.topo.coulomb_constant); + VelO = coulomb_constant * ow_charge * q_charge * rO; + 
VelH1 = coulomb_constant * hw1_charge * q_charge * rH1; + VelH2 = coulomb_constant * hw2_charge * q_charge * rH2; // if (state == 0 && qi == 1) printf("j = %d ai__aii = %f A_O = %f B_O = %f V_a = %f V_b = %f r6O = %f\n", j, ai_aii, A_O, B_O, V_a, V_b, r6O); - dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * lambdas[state]; - dvH1 -= r2H1 * VelH1 * lambdas[state]; - dvH2 -= r2H2 * VelH2 * lambdas[state]; + const real_t lambda = static_cast(lambdas[state]); + dvO += r2O * (-VelO - (static_cast(12.0) * V_a - static_cast(6.0) * V_b)) * lambda; + dvH1 -= r2H1 * VelH1 * lambda; + dvH2 -= r2H2 * VelH2 * lambda; - ctx.EQ_nonbond_qw[state].Ucoul += (VelO + VelH1 + VelH2); - ctx.EQ_nonbond_qw[state].Uvdw += (V_a - V_b); + ctx.EQ_nonbond_qw[state].Ucoul += static_cast(VelO + VelH1 + VelH2); + ctx.EQ_nonbond_qw[state].Uvdw += static_cast(V_a - V_b); } // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!! diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp index 505dd45a..3be5e6f0 100644 --- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp @@ -1,18 +1,21 @@ #include "cpu_nonbonded_ww_force.h" +#include + #include "constants.h" #include "context.h" #include "vdw_rules.h" namespace { -void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, double* vdw_a, double* vdw_b) { +void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, real_t* vdw_a, real_t* vdw_b) { const catype_t& oi_type = ctx.unified_catype(oxygen_i, 0); const catype_t& oj_type = ctx.unified_catype(oxygen_j, 0); if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { *vdw_a = oi_type.aii_normal * oj_type.aii_normal; *vdw_b = oi_type.bii_normal * oj_type.bii_normal; } else { - calc_vdw_arithmetic(oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, oj_type.bii_normal, 1.0, vdw_a, vdw_b); + calc_vdw_arithmetic( + oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, 
oj_type.bii_normal, static_cast(1.0), vdw_a, vdw_b); } } } // namespace @@ -20,33 +23,33 @@ void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, void accumulate_pair_force(Context& ctx, int atom_i, int atom_j, - double qi, - double qj, + real_t qi, + real_t qj, bool include_vdw, - double vdw_a, - double vdw_b, + real_t vdw_a, + real_t vdw_b, E_nonbonded_t& energy) { auto &coords = ctx.coords->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; - const double dx = coords[atom_j].x - coords[atom_i].x; - const double dy = coords[atom_j].y - coords[atom_i].y; - const double dz = coords[atom_j].z - coords[atom_i].z; + const real_t dx = coords[atom_j].x - coords[atom_i].x; + const real_t dy = coords[atom_j].y - coords[atom_i].y; + const real_t dz = coords[atom_j].z - coords[atom_i].z; - const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz); - const double rinv = std::sqrt(r2inv); - const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv; + const real_t r2inv = static_cast(1.0) / (dx * dx + dy * dy + dz * dz); + const real_t rinv = static_cast(std::sqrt(r2inv)); + const real_t ecoul = static_cast(ctx.topo.coulomb_constant) * qi * qj * rinv; - double evdw = 0.0; - double dva = -ecoul; + real_t evdw = 0.0; + real_t dva = -ecoul; if (include_vdw) { - const double r6inv = r2inv * r2inv * r2inv; - const double v_a = vdw_a * r6inv * r6inv; - const double v_b = vdw_b * r6inv; + const real_t r6inv = r2inv * r2inv * r2inv; + const real_t v_a = vdw_a * r6inv * r6inv; + const real_t v_b = vdw_b * r6inv; evdw = v_a - v_b; - dva -= 12.0 * v_a - 6.0 * v_b; + dva -= static_cast(12.0) * v_a - static_cast(6.0) * v_b; } - const double scale = r2inv * dva; + const real_t scale = r2inv * dva; dvelocities[atom_i].x -= scale * dx; dvelocities[atom_i].y -= scale * dy; @@ -56,8 +59,8 @@ void accumulate_pair_force(Context& ctx, dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - energy.Ucoul += ecoul; - energy.Uvdw += evdw; + 
energy.Ucoul += static_cast(ecoul); + energy.Uvdw += static_cast(evdw); } void calc_nonbonded_ww_forces() { @@ -70,8 +73,8 @@ void calc_nonbonded_ww_forces() { const int base_i = ctx.n_atoms_solute + 3 * water_i; for (int water_j = water_i + 1; water_j < ctx.n_waters; ++water_j) { const int base_j = ctx.n_atoms_solute + 3 * water_j; - double oxygen_vdw_a = 0.0; - double oxygen_vdw_b = 0.0; + real_t oxygen_vdw_a = 0.0; + real_t oxygen_vdw_b = 0.0; calc_oxygen_vdw_parameters(ctx, base_i, base_j, &oxygen_vdw_a, &oxygen_vdw_b); for (int atom_i = 0; atom_i < 3; ++atom_i) { for (int atom_j = 0; atom_j < 3; ++atom_j) { diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu index 7c49cffb..dcd044ce 100644 --- a/src/core/cuda/src/cuda_angle_force.cu +++ b/src/core/cuda/src/cuda_angle_force.cu @@ -48,14 +48,14 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co atomicAdd(energy_sum, energy); coord_t di = { - f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length)), - f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length)), - f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length))}; + static_cast(f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length))), + static_cast(f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length))), + static_cast(f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length)))}; coord_t dk = { - f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length)), - f1 * (rji.y / (rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length)), - f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length))}; + static_cast(f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length))), + static_cast(f1 * (rji.y / 
(rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length))), + static_cast(f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length)))}; atomicAdd(&dvelocities[i].x, dv * di.x); atomicAdd(&dvelocities[i].y, dv * di.y); diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu index fa404ee7..a33bb695 100644 --- a/src/core/cuda/src/cuda_nonbonded_14_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu @@ -29,12 +29,12 @@ __device__ __forceinline__ int unified_parameter_index( __device__ void calculate_nonbonded_14_pair( const coord_t& x, const coord_t& y, - double x_charge, - double y_charge, - double x_aii, - double y_aii, - double x_bii, - double y_bii, + real_t x_charge, + real_t y_charge, + real_t x_aii, + real_t y_aii, + real_t x_bii, + real_t y_bii, double coulomb_constant, double scaling, int vdw_rule, @@ -42,15 +42,17 @@ __device__ void calculate_nonbonded_14_pair( double& evdw, double& ecoul, double& dv) { - const double3 d = {x.x - y.x, x.y - y.y, x.z - y.z}; - const double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z); - const double r2 = r * r; - const double r6 = r2 * r2 * r2; + const real_t dx = x.x - y.x; + const real_t dy = x.y - y.y; + const real_t dz = x.z - y.z; + const real_t r = rsqrt(dx * dx + dy * dy + dz * dz); + const real_t r2 = r * r; + const real_t r6 = r2 * r2 * r2; ecoul = scaling * coulomb_constant * x_charge * y_charge * r * lambda; - double v_a = 0.0; - double v_b = 0.0; + real_t v_a = 0.0; + real_t v_b = 0.0; if (vdw_rule == VDW_GEOMETRIC) { calc_vdw_geometric(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b); } else { @@ -124,13 +126,15 @@ __global__ void calc_nonbonded_14_force_kernel( ecoul, dv); - const double3 d = {rj.x - ri.x, rj.y - ri.y, rj.z - ri.z}; - atomicAdd(&d_dvelocities[ai].x, -dv * d.x); - atomicAdd(&d_dvelocities[ai].y, -dv * d.y); - atomicAdd(&d_dvelocities[ai].z, -dv * d.z); - atomicAdd(&d_dvelocities[aj].x, dv * d.x); - 
atomicAdd(&d_dvelocities[aj].y, dv * d.y); - atomicAdd(&d_dvelocities[aj].z, dv * d.z); + const real_t dx = rj.x - ri.x; + const real_t dy = rj.y - ri.y; + const real_t dz = rj.z - ri.z; + atomicAdd(&d_dvelocities[ai].x, -dv * dx); + atomicAdd(&d_dvelocities[ai].y, -dv * dy); + atomicAdd(&d_dvelocities[ai].z, -dv * dz); + atomicAdd(&d_dvelocities[aj].x, dv * dx); + atomicAdd(&d_dvelocities[aj].y, dv * dy); + atomicAdd(&d_dvelocities[aj].z, dv * dz); atomicAdd(&evdw_totals[mode], evdw); atomicAdd(&ecoul_totals[mode], ecoul); diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu index 432a7137..097a3550 100644 --- a/src/core/cuda/src/cuda_nonbonded_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_force.cu @@ -19,7 +19,13 @@ __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { y += x; } -__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) { +template +__device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffffffffu) { + return __shfl_sync(mask, v, srcLane); +} + +template <> +__device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) { int2 a = *reinterpret_cast(&v); a.x = __shfl_sync(mask, a.x, srcLane); a.y = __shfl_sync(mask, a.y, srcLane); @@ -27,9 +33,9 @@ __device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0x } __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) { - v.x = shfl(v.x, srcLane, mask); - v.y = shfl(v.y, srcLane, mask); - v.z = shfl(v.z, srcLane, mask); + v.x = shfl_value(v.x, srcLane, mask); + v.y = shfl_value(v.y, srcLane, mask); + v.z = shfl_value(v.z, srcLane, mask); return v; } @@ -37,7 +43,7 @@ __device__ void calculate_unforce_bound( const coord_t& x, const coord_t& y, - const double charge_product, + const real_t charge_product, const vdw_pair_param_t& pair_param, const double coulomb_constant, @@ -48,10 +54,12 @@ __device__ 
void calculate_unforce_bound( double& evdw, double& ecoul, double& dv) { - double3 d = {x.x - y.x, x.y - y.y, x.z - y.z}; - double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z); - double r2 = r * r; - double r6 = r2 * r2 * r2; + const real_t dx = x.x - y.x; + const real_t dy = x.y - y.y; + const real_t dz = x.z - y.z; + const real_t r = rsqrt(dx * dx + dy * dy + dz * dz); + const real_t r2 = r * r; + const real_t r6 = r2 * r2 * r2; // double v_a = r6 * r6; // double v_b = r6; // ecoul = r; @@ -60,8 +68,8 @@ __device__ void calculate_unforce_bound( ecoul = scaling * coulomb_constant * charge_product * r * lambda; - double v_a = pair_param.a * r6 * r6 * lambda; - double v_b = pair_param.b * r6 * lambda; + const real_t v_a = pair_param.a * r6 * r6 * static_cast(lambda); + const real_t v_b = pair_param.b * r6 * static_cast(lambda); evdw = v_a - v_b; dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); } @@ -72,7 +80,7 @@ __global__ void calc_nonbonded_force_kernel( const int* x_charges_types, const int* y_charges_types, - const double* charge_pair_products, + const real_t* charge_pair_products, const int* x_atypes_types, const int* y_atypes_types, @@ -139,7 +147,7 @@ __global__ void calc_nonbonded_force_kernel( int x_atom_idx = (x_idx < nx) ? x_idx_list[x_idx] : -1; int y_atom_idx = (y_idx < ny) ? y_idx_list[y_idx] : -1; - coord_t invalid = {-1e9, -1e9, -1e9}; + coord_t invalid = {static_cast(-1e9), static_cast(-1e9), static_cast(-1e9)}; coord_t x_coord = (x_atom_idx >= 0) ? d_coords[x_atom_idx] : invalid; coord_t y_coord = (y_atom_idx >= 0) ? 
d_coords[y_atom_idx] : invalid; @@ -194,9 +202,9 @@ __global__ void calc_nonbonded_force_kernel( y_charge_type_idx = __shfl_sync(mask, y_charge_type_idx, src); y_catype_type_idx = __shfl_sync(mask, y_catype_type_idx, src); - y_force.x = shfl(y_force.x, src, mask); - y_force.y = shfl(y_force.y, src, mask); - y_force.z = shfl(y_force.z, src, mask); + y_force.x = shfl_value(y_force.x, src, mask); + y_force.y = shfl_value(y_force.y, src, mask); + y_force.z = shfl_value(y_force.z, src, mask); }; if (disable_water_h_lj) { @@ -214,7 +222,7 @@ __global__ void calc_nonbonded_force_kernel( for (int i = 0; i < 32; i++) { if (is_valid()) { double scaling = 1.0; - double charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx]; + real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx]; vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx]; // todo: Now the idx is wrong, should optimize it later @@ -242,14 +250,16 @@ __global__ void calc_nonbonded_force_kernel( evdw_sum += evdw; ecoul_sum += ecoul; - double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z}; - y_force.x -= dv * d.x; - y_force.y -= dv * d.y; - y_force.z -= dv * d.z; + const real_t dx = x_coord.x - y_coord.x; + const real_t dy = x_coord.y - y_coord.y; + const real_t dz = x_coord.z - y_coord.z; + y_force.x -= dv * dx; + y_force.y -= dv * dy; + y_force.z -= dv * dz; - x_force.x += dv * d.x; - x_force.y += dv * d.y; - x_force.z += dv * d.z; + x_force.x += dv * dx; + x_force.y += dv * dy; + x_force.z += dv * dz; } do_shuffle(); } From e9406befed0cfa6b6779cd9a5e7dde24a8fb0a0c Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 09:28:42 +0200 Subject: [PATCH 02/20] add benchmark script --- benchmark-qgpu/benchmark_correctness.py | 417 ++++++++++++++++ benchmark-qgpu/benchmark_nsday.py | 427 +++++++++++++++++ benchmark-qgpu/benchmark_system_scaling.py | 413 ++++++++++++++++ benchmark-qgpu/benchmark_test.py | 527 
def run_qgpu_once(qgpu_bin, prepared_data_dir, run_dir):
    """Run the QGPU binary once on a private copy of the prepared input.

    ``run_dir`` is removed if it already exists and then recreated, and
    ``prepared_data_dir`` is copied into it under its own directory name.
    The binary is invoked with ``--gpu`` and the copied directory through
    ``run_timed``; stdout and stderr go to ``qgpu.log`` and ``qgpu.err``
    inside ``run_dir``.

    Returns:
        A ``(data_dir, run_info)`` tuple: the copied input directory and a
        dict with the command text, return code, wall time and log paths.

    Raises:
        RuntimeError: if the binary exits with a non-zero return code.
    """
    if run_dir.exists():
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True)

    working_copy = run_dir / prepared_data_dir.name
    shutil.copytree(prepared_data_dir, working_copy)

    log_out = run_dir / "qgpu.log"
    log_err = run_dir / "qgpu.err"
    argv = [str(qgpu_bin), "--gpu", str(working_copy)]

    exit_status, elapsed = run_timed(argv, ROOT, log_out, log_err)
    if exit_status != 0:
        raise RuntimeError(
            "QGPU correctness run failed. "
            f"Command: {command_text(argv)} Logs: stdout={log_out} stderr={log_err}"
        )

    run_info = {
        "command": command_text(argv),
        "return_code": exit_status,
        "wall_seconds": elapsed,
        "stdout": str(log_out),
        "stderr": str(log_err),
    }
    return working_copy, run_info
def write_outputs(rows, summary, out_dir, metadata):
    """Persist the per-term comparison rows and the run summary.

    Writes ``correctness_terms.csv`` (one line per entry of ``rows``) and
    ``correctness_summary.json`` (creation timestamp, ``metadata`` and
    ``summary``) into ``out_dir``.

    Returns:
        ``(terms_csv_path, summary_json_path)``.
    """
    columns = [
        "frame",
        "term",
        "fortran",
        "qgpu",
        "abs_error",
        "rel_error",
        "passed_tolerance",
        "frame_passed",
    ]
    csv_target = out_dir / "correctness_terms.csv"
    json_target = out_dir / "correctness_summary.json"

    with open(csv_target, "w", newline="", encoding="utf-8") as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        table.writerows(rows)

    # The timestamp is taken after the CSV has been written, as before.
    document = {
        "created_at": datetime.now().isoformat(timespec="seconds"),
        "metadata": metadata,
        "summary": summary,
    }
    with open(json_target, "w", encoding="utf-8") as handle:
        json.dump(document, handle, indent=2)

    return csv_target, json_target
def select_term_rows(rows, term):
    """Return the rows whose ``"term"`` equals ``term``, ordered by ``"frame"``.

    Raises:
        ValueError: if no row carries that term; the message lists every
            term present in ``rows``.
    """
    matching = []
    for entry in rows:
        if entry["term"] == term:
            matching.append(entry)

    if matching:
        matching.sort(key=lambda entry: entry["frame"])
        return matching

    terms = ", ".join(sorted({entry["term"] for entry in rows}))
    raise ValueError(f"Term '{term}' not found. Available terms: {terms}")
ax_error.plot(frames, plotted_errors, color="#d62728", linewidth=1.6) + ax_error.fill_between(frames, plotted_errors, color="#d62728", alpha=0.13) + if tolerance is not None: + ax_error.axhline(tolerance, color="#777777", linestyle=":", linewidth=1.0, label=tolerance_label) + ax_error.legend(frameon=False, loc="best", fontsize=8) + ax_error.set_xlabel("MD step") + ax_error.set_ylabel(error_ylabel) + ax_error.grid(axis="y", color="#e5e8ee", linewidth=0.8) + ax_error.spines["top"].set_visible(False) + ax_error.spines["right"].set_visible(False) + + ax_panel.set_facecolor("#eef5fd") + for spine in ax_panel.spines.values(): + spine.set_color("#8ab9ef") + ax_panel.set_xticks([]) + ax_panel.set_yticks([]) + if args.error_mode == "relative": + ax_panel.text(0.5, 0.84, "Consistency", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970") + ax_panel.text(0.5, 0.64, f"{max_rel_error:.3f}%", ha="center", va="center", fontsize=24, weight="bold", color="#003c7f") + ax_panel.text(0.5, 0.50, "max rel. 
def nonnegative_float(value):
    """argparse ``type=`` converter accepting floats that are >= 0.

    NaN is rejected together with negative numbers. ``float("nan") < 0`` is
    false, so a plain ``< 0`` test would let ``nan`` through, and every later
    ``abs_error <= tolerance`` comparison against it would be false.

    Raises:
        argparse.ArgumentTypeError: if the parsed value is negative or NaN.
        ValueError: if ``value`` is not a float literal (raised by ``float``).
    """
    parsed = float(value)
    # `not (parsed >= 0)` is true for negative numbers and for NaN.
    if not parsed >= 0:
        raise argparse.ArgumentTypeError("must be >= 0")
    return parsed
help="Perturbation lambda suffix, e.g. eq5.") + collect_parser.add_argument("--shake", action="store_true", help="Enable shake.") + collect_parser.add_argument("--out", help="Output directory.") + collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.") + collect_parser.add_argument( + "--prep-fortran-bin", + default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"), + help="Path to qdyn_test used to generate Fortran reference data.", + ) + collect_parser.add_argument( + "--tolerance", + type=nonnegative_float, + default=1e-3, + help="Absolute energy tolerance in kcal/mol for pass/fail summary.", + ) + + plot_parser = subparsers.add_parser("plot", help="Plot correctness from correctness_terms.csv.") + plot_parser.add_argument("csv", help="correctness_terms.csv from collect.") + plot_parser.add_argument("--out", required=True, help="Output PNG path.") + plot_parser.add_argument("--term", default="total-Utot", help="Energy term to plot.") + plot_parser.add_argument( + "--title", + default="Long-Run Energy Consistency", + help="Plot title.", + ) + plot_parser.add_argument( + "--error-mode", + choices=["absolute", "relative"], + default="absolute", + help="Plot absolute kcal/mol error or relative percent error.", + ) + plot_parser.add_argument( + "--tolerance", + type=nonnegative_float, + default=None, + help="Optional horizontal tolerance line on the error panel. 
def read_steps_from_md_csv(data_dir):
    """Read the MD step count from ``md.csv`` inside ``data_dir``.

    The first line starting with ``steps;`` is used; everything after the
    first ``;`` on that line is parsed as an integer.

    Raises:
        FileNotFoundError: if ``md.csv`` does not exist.
        RuntimeError: if no line starts with ``steps;``.
    """
    md_file = Path(data_dir) / "md.csv"
    if not md_file.exists():
        raise FileNotFoundError(f"md.csv not found: {md_file}")

    with open(md_file, encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.startswith("steps;"):
                continue
            _key, _sep, step_text = raw_line.strip().partition(";")
            return int(step_text)

    raise RuntimeError(f"Could not find steps in {md_file}")
def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, steps, label, repeat):
    """Launch ``concurrency`` QGPU processes at once and time the batch.

    Each process gets its own copy of ``prepared_data_dir`` under
    ``run_dir/proc_NNN`` and its own ``qgpu.log`` / ``qgpu.err``.

    All input copies are staged *before* the batch timer starts, so the
    reported wall time and throughput cover only the simulations, not the
    file copying, and the launches are not staggered by copy time.

    Log handles are always closed. If launching or polling raises, any child
    that is still running is killed before the exception propagates.

    Returns:
        ``(batch_row, process_rows)``: one summary dict for the batch and one
        dict per process, using the same keys as the CSV writers expect.

    Raises:
        ValueError: if ``concurrency`` is smaller than 1.
    """
    if concurrency < 1:
        raise ValueError("concurrency must be >= 1")

    if run_dir.exists():
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True)

    # Stage every private input copy up front, outside the timed region.
    staged = []
    for index in range(1, concurrency + 1):
        proc_dir = run_dir / f"proc_{index:03d}"
        data_dir = proc_dir / prepared_data_dir.name
        proc_dir.mkdir(parents=True)
        shutil.copytree(prepared_data_dir, data_dir)
        staged.append((index, proc_dir, data_dir))

    # Same for every process, so it is built once.
    command_template = command_text([str(qgpu_bin), "--gpu", ""])

    processes = []
    batch_start = time.perf_counter()
    try:
        for index, proc_dir, data_dir in staged:
            stdout_path = proc_dir / "qgpu.log"
            stderr_path = proc_dir / "qgpu.err"
            args = [str(qgpu_bin), "--gpu", str(data_dir)]
            item = {
                "index": index,
                "stdout": stdout_path,
                "stderr": stderr_path,
                "command": command_text(args),
            }
            # Register first so the cleanup below sees partially set-up entries.
            processes.append(item)
            item["stdout_file"] = open(stdout_path, "w", encoding="utf-8")
            item["stderr_file"] = open(stderr_path, "w", encoding="utf-8")
            item["start"] = time.perf_counter()
            item["process"] = subprocess.Popen(
                args,
                cwd=ROOT,
                stdout=item["stdout_file"],
                stderr=item["stderr_file"],
            )

        # Poll instead of wait() so each process gets its own end timestamp.
        remaining = set(range(len(processes)))
        while remaining:
            for item_index in list(remaining):
                item = processes[item_index]
                return_code = item["process"].poll()
                if return_code is None:
                    continue
                item["return_code"] = return_code
                item["end"] = time.perf_counter()
                remaining.remove(item_index)
            if remaining:
                time.sleep(0.01)
        batch_wall_seconds = time.perf_counter() - batch_start
    except BaseException:
        # Do not leave simulations running when the batch is abandoned.
        for item in processes:
            process = item.get("process")
            if process is not None and process.poll() is None:
                process.kill()
                process.wait()
        raise
    finally:
        for item in processes:
            for key in ("stdout_file", "stderr_file"):
                handle = item.get(key)
                if handle is not None:
                    handle.close()

    process_rows = []
    for item in processes:
        wall_seconds = item["end"] - item["start"]
        process_rows.append(
            {
                "label": label,
                "concurrency": concurrency,
                "repeat": repeat,
                "process_index": item["index"],
                "return_code": item["return_code"],
                "process_wall_seconds": wall_seconds,
                "process_ns_per_day": steps * TIME_STEP_NS * 86400 / wall_seconds if wall_seconds > 0 else "",
                "stdout": str(item["stdout"]),
                "stderr": str(item["stderr"]),
                "command": item["command"],
            }
        )

    failed = sum(1 for row in process_rows if row["return_code"] != 0)
    total_ns_per_day = concurrency * steps * TIME_STEP_NS * 86400 / batch_wall_seconds
    # Average only over processes that produced a rate; dividing by the full
    # row count would understate the mean whenever a rate is missing.
    rates = [float(row["process_ns_per_day"]) for row in process_rows if row["process_ns_per_day"] != ""]
    mean_process_ns_per_day = sum(rates) / len(rates) if rates else 0.0
    return {
        "label": label,
        "concurrency": concurrency,
        "repeat": repeat,
        "steps": steps,
        "batch_wall_seconds": batch_wall_seconds,
        "total_ns_per_day": total_ns_per_day,
        "mean_process_ns_per_day": mean_process_ns_per_day,
        "failed_processes": failed,
        "command": command_template,
    }, process_rows
= [ + "label", + "concurrency", + "repeat", + "steps", + "batch_wall_seconds", + "total_ns_per_day", + "mean_process_ns_per_day", + "failed_processes", + "command", + ] + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(batch_rows) + + with open(process_csv, "w", newline="", encoding="utf-8") as csv_f: + fieldnames = [ + "label", + "concurrency", + "repeat", + "process_index", + "return_code", + "process_wall_seconds", + "process_ns_per_day", + "stdout", + "stderr", + "command", + ] + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(process_rows) + + with open(meta_json, "w", encoding="utf-8") as json_f: + json.dump(meta, json_f, indent=2) + + return summary_csv, process_csv, meta_json + + +def collect(args): + label = args.label or args.test or Path(args.data_dir).name + out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out(label) + out_dir.mkdir(parents=True, exist_ok=True) + qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) + prepared_data_dir, steps = resolve_collect_data_dir(args, out_dir) + + batch_rows = [] + process_rows = [] + for concurrency in args.concurrency: + for repeat in range(1, args.repeat + 1): + run_dir = out_dir / "runs" / f"c{concurrency:03d}" / f"repeat_{repeat:03d}" + print(f"Running {label}: concurrency={concurrency}, repeat={repeat}") + batch_row, rows = run_concurrency_batch( + qgpu_bin=qgpu_bin, + prepared_data_dir=prepared_data_dir, + run_dir=run_dir, + concurrency=concurrency, + steps=steps, + label=label, + repeat=repeat, + ) + batch_rows.append(batch_row) + process_rows.extend(rows) + if batch_row["failed_processes"]: + summary_csv, process_csv, meta_json = write_collect_outputs( + batch_rows, + process_rows, + out_dir, + { + "created_at": datetime.now().isoformat(timespec="seconds"), + "label": label, + "qgpu_bin": str(qgpu_bin), + "prepared_data_dir": str(prepared_data_dir), + "steps": steps, + }, + ) + raise 
def load_plot_series(csv_paths, metric):
    """Build one plot series per label from summary CSV files.

    Rows whose ``failed_processes`` column is non-zero are ignored. For each
    label the values of ``metric`` are grouped by concurrency and reduced to
    their median.

    Returns:
        A list of ``{"label", "xs", "ys"}`` dicts ordered by label, with
        ``xs`` the sorted concurrency levels and ``ys`` the matching medians.

    Raises:
        RuntimeError: if no usable row was found in any file.
    """
    samples = {}
    for source in csv_paths:
        with open(source, newline="", encoding="utf-8") as handle:
            for record in csv.DictReader(handle):
                failures = int(record.get("failed_processes") or 0)
                if failures != 0:
                    continue
                series_name = record["label"]
                level = int(record["concurrency"])
                measurement = float(record[metric])
                per_level = samples.setdefault(series_name, {})
                per_level.setdefault(level, []).append(measurement)

    if not samples:
        raise RuntimeError("No successful rows found in input CSV file(s).")

    result = []
    for series_name in sorted(samples):
        per_level = samples[series_name]
        levels = sorted(per_level)
        result.append(
            {
                "label": series_name,
                "xs": levels,
                "ys": [median(per_level[level]) for level in levels],
            }
        )
    return result
def positive_int(value):
    """argparse ``type=`` converter accepting integers that are >= 1.

    Raises:
        argparse.ArgumentTypeError: if the parsed integer is below 1.
        ValueError: if ``value`` is not an integer literal (raised by ``int``).
    """
    number = int(value)
    if number >= 1:
        return number
    raise argparse.ArgumentTypeError("must be >= 1")
'A100 (thrombin)'.") + collect_parser.add_argument("--out", help="Output directory.") + collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.") + collect_parser.add_argument( + "--prep-fortran-bin", + default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"), + help="Path to qdyn_test used only when preparing from --test.", + ) + collect_parser.add_argument("--pause-seconds", type=float, default=0.0, help="Pause between batches.") + + plot_parser = subparsers.add_parser("plot", help="Plot ns/day vs concurrency from one or more CSV files.") + plot_parser.add_argument("csv", nargs="+", help="One or more nsday_summary.csv files from collect.") + plot_parser.add_argument("--out", required=True, help="Output PNG path.") + plot_parser.add_argument( + "--metric", + choices=["total_ns_per_day", "mean_process_ns_per_day"], + default="total_ns_per_day", + help="Y-axis metric.", + ) + plot_parser.add_argument("--title", default="Multi-System Concurrency (MPS)", help="Plot title.") + plot_parser.add_argument( + "--subtitle", + default="Total simulation throughput at different concurrency levels", + help="Plot subtitle.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + if args.command == "collect": + return collect(args) + if args.command == "plot": + return plot(args) + raise SystemExit(f"Unknown command: {args.command}") + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except (FileNotFoundError, RuntimeError, ValueError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(1) diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py new file mode 100644 index 00000000..d481244b --- /dev/null +++ b/benchmark-qgpu/benchmark_system_scaling.py @@ -0,0 +1,413 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json +import math +import os +import sys +from datetime import datetime +from pathlib import Path +from statistics import median + 
+os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib") + +import matplotlib + +matplotlib.use("Agg") +from matplotlib import pyplot as plt + +from benchmark_test import ( + ROOT, + ns_per_day, + prepare_qgpu_input, + prepare_restart_with_qdyn_test, + resolve_fortran_bin, + resolve_qgpu_bin, + resolve_test_data, + run_fortran_repeats, + run_qgpu_repeats, + write_md_input, +) + + +def default_collect_out(): + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_system_scaling" + + +def count_atoms(prepared_data_dir): + coords_path = Path(prepared_data_dir) / "coords.csv" + if not coords_path.exists(): + raise FileNotFoundError(f"coords.csv not found: {coords_path}") + with open(coords_path, encoding="utf-8") as coords_f: + return int(coords_f.readline().strip()) + + +def successful_times(records): + return [float(record["wall_seconds"]) for record in records if int(record["return_code"]) == 0] + + +def write_raw_records(records, out_dir): + path = out_dir / "system_scaling_raw.csv" + fieldnames = [ + "test", + "runner", + "repeat", + "command", + "return_code", + "wall_seconds", + "steps", + "ns_per_day", + "stdout", + "stderr", + ] + with open(path, "w", newline="", encoding="utf-8") as csv_f: + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(records) + return path + + +def write_summary(rows, out_dir, metadata): + summary_csv = out_dir / "system_scaling.csv" + meta_json = out_dir / "system_scaling_meta.json" + + fieldnames = [ + "test", + "atoms", + "steps", + "fortran_wall_median_s", + "qgpu_wall_median_s", + "fortran_ns_per_day", + "qgpu_ns_per_day", + "speedup_x", + "fortran_repeats", + "qgpu_repeats", + ] + with open(summary_csv, "w", newline="", encoding="utf-8") as csv_f: + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + with open(meta_json, "w", encoding="utf-8") as json_f: + 
json.dump(metadata, json_f, indent=2) + + return summary_csv, meta_json + + +def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin): + test_dir = out_dir / test_name + fortran_dir = test_dir / "fortran" + prep_dir = test_dir / "qgpu_prepare" + qgpu_runs_dir = test_dir / "qgpu_runs" + fortran_dir.mkdir(parents=True, exist_ok=True) + + data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake) + print(f"Preparing {test_name}") + write_md_input(data, fortran_dir) + + print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))") + fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps) + if not fortran_ok: + return None, fortran_records + + print(f"Preparing QGPU input for {test_name}") + prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) + prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) + atoms = count_atoms(prepared_data_dir) + + print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))") + qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps) + + fortran_times = successful_times(fortran_records) + qgpu_times = successful_times(qgpu_records) + if not fortran_times or not qgpu_times: + return None, [*fortran_records, *qgpu_records] + + fortran_wall = median(fortran_times) + qgpu_wall = median(qgpu_times) + row = { + "test": test_name, + "atoms": atoms, + "steps": args.steps, + "fortran_wall_median_s": fortran_wall, + "qgpu_wall_median_s": qgpu_wall, + "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall), + "qgpu_ns_per_day": ns_per_day(args.steps, qgpu_wall), + "speedup_x": fortran_wall / qgpu_wall if qgpu_wall > 0 else "", + "fortran_repeats": len(fortran_records), + "qgpu_repeats": len(qgpu_records), + } + return row, [*fortran_records, *qgpu_records] + + +def collect(args): + out_dir = Path(args.out).expanduser().resolve() if args.out else 
default_collect_out() + out_dir.mkdir(parents=True, exist_ok=True) + fortran_bin = resolve_fortran_bin(args.fortran_bin) + prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin) + qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) + + rows = [] + raw_records = [] + try: + for test_name in args.test: + row, records = collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin) + raw_records.extend(records) + write_raw_records(raw_records, out_dir) + if row is not None: + rows.append(row) + write_summary( + rows, + out_dir, + { + "created_at": datetime.now().isoformat(timespec="seconds"), + "tests": args.test, + "steps": args.steps, + "repeat": args.repeat, + "fortran_bin": str(fortran_bin), + "prep_fortran_bin": str(prep_fortran_bin), + "qgpu_bin": str(qgpu_bin), + }, + ) + finally: + raw_path = write_raw_records(raw_records, out_dir) + + failures = [record for record in raw_records if int(record["return_code"]) != 0] + if failures: + first = failures[0] + raise RuntimeError( + f"{first['runner']} failed for {first['test']} repeat {first['repeat']}. 
" + f"Logs: stdout={first['stdout']} stderr={first['stderr']}; raw CSV: {raw_path}" + ) + + summary_csv, meta_json = write_summary( + rows, + out_dir, + { + "created_at": datetime.now().isoformat(timespec="seconds"), + "tests": args.test, + "steps": args.steps, + "repeat": args.repeat, + "fortran_bin": str(fortran_bin), + "prep_fortran_bin": str(prep_fortran_bin), + "qgpu_bin": str(qgpu_bin), + }, + ) + print(f"Summary CSV: {summary_csv}") + print(f"Raw CSV: {raw_path}") + print(f"Metadata JSON: {meta_json}") + return 0 + + +def load_rows(csv_path): + rows = [] + with open(csv_path, newline="", encoding="utf-8") as csv_f: + reader = csv.DictReader(csv_f) + for row in reader: + parsed = dict(row) + for key in [ + "atoms", + "steps", + "fortran_wall_median_s", + "qgpu_wall_median_s", + "fortran_ns_per_day", + "qgpu_ns_per_day", + "speedup_x", + ]: + parsed[key] = float(parsed[key]) + rows.append(parsed) + if not rows: + raise RuntimeError(f"No rows found in {csv_path}") + return rows + + +def fmt_atoms(atoms): + atoms = int(atoms) + if atoms >= 1000: + return f"{atoms / 1000:.1f}k atoms" + return f"{atoms} atoms" + + +def annotate_bars(ax, bars, formatter): + for bar in bars: + height = bar.get_height() + ax.text( + bar.get_x() + bar.get_width() / 2, + height, + formatter(height), + ha="center", + va="bottom", + fontsize=8, + weight="bold", + ) + + +def plot_speedup(rows, out_path, title): + labels = [row["test"] for row in rows] + speedups = [row["speedup_x"] for row in rows] + atoms = [row["atoms"] for row in rows] + + fig, (ax, panel) = plt.subplots( + 1, + 2, + figsize=(9.2, 3.3), + gridspec_kw={"width_ratios": [4.3, 1.55]}, + ) + x = range(len(rows)) + bars = ax.bar(x, speedups, color="#0b71c8", width=0.62) + annotate_bars(ax, bars, lambda value: f"{value:.1f}x") + ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f") + ax.set_ylabel("Speedup vs Fortran (x)") + ax.set_xticks(list(x)) + ax.set_xticklabels(labels) + ax.grid(axis="y", 
color="#e5e8ee", linewidth=0.8) + ax.set_axisbelow(True) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + for xpos, atom_count in zip(x, atoms): + ax.text(xpos, -0.08, fmt_atoms(atom_count), transform=ax.get_xaxis_transform(), ha="center", va="top", fontsize=8) + + best = max(rows, key=lambda row: row["speedup_x"]) + panel.set_facecolor("#eef5fd") + for spine in panel.spines.values(): + spine.set_color("#8ab9ef") + panel.set_xticks([]) + panel.set_yticks([]) + panel.text(0.5, 0.80, "Best", ha="center", va="center", fontsize=12, weight="bold", color="#0b3970") + panel.text(0.5, 0.55, f"{best['speedup_x']:.1f}x", ha="center", va="center", fontsize=30, weight="bold", color="#003c7f") + panel.text(0.5, 0.35, "speedup", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970") + panel.text(0.5, 0.18, best["test"], ha="center", va="center", fontsize=10, color="#0b3970") + panel.text(0.5, 0.08, fmt_atoms(best["atoms"]), ha="center", va="center", fontsize=9, color="#0b3970") + + fig.tight_layout() + fig.savefig(out_path, dpi=220) + plt.close(fig) + + +def plot_nsday(rows, out_path, title): + labels = [row["test"] for row in rows] + x = list(range(len(rows))) + width = 0.34 + + fig, ax = plt.subplots(figsize=(8.6, 3.5)) + fortran = [row["fortran_ns_per_day"] for row in rows] + qgpu = [row["qgpu_ns_per_day"] for row in rows] + bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b") + bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8") + annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}") + annotate_bars(ax, bars_gpu, lambda value: f"{value:.1f}") + ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f") + ax.set_ylabel("ns/day") + ax.set_xticks(x) + ax.set_xticklabels(labels) + ax.grid(axis="y", color="#e5e8ee", linewidth=0.8) + ax.set_axisbelow(True) + ax.legend(frameon=False, loc="best") + 
ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + for xpos, row in zip(x, rows): + ax.text(xpos, -0.08, fmt_atoms(row["atoms"]), transform=ax.get_xaxis_transform(), ha="center", va="top", fontsize=8) + + fig.tight_layout() + fig.savefig(out_path, dpi=220) + plt.close(fig) + + +def plot_atoms(rows, out_path, title): + fig, ax = plt.subplots(figsize=(6.5, 3.8)) + xs = [row["atoms"] for row in rows] + ys = [row["speedup_x"] for row in rows] + ax.plot(xs, ys, color="#0b71c8", marker="o", linewidth=1.8) + for row in rows: + ax.text(row["atoms"], row["speedup_x"], f" {row['test']} ({row['speedup_x']:.1f}x)", va="center", fontsize=8) + ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f") + ax.set_xlabel("Atoms") + ax.set_ylabel("Speedup vs Fortran (x)") + ax.grid(True, color="#e5e8ee", linewidth=0.8) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + fig.tight_layout() + fig.savefig(out_path, dpi=220) + plt.close(fig) + + +def plot(args): + rows = load_rows(Path(args.csv).expanduser().resolve()) + rows.sort(key=lambda row: row["atoms"] if args.sort == "atoms" else row["test"]) + out_path = Path(args.out).expanduser().resolve() + out_path.parent.mkdir(parents=True, exist_ok=True) + + if args.metric == "speedup": + plot_speedup(rows, out_path, args.title) + elif args.metric == "nsday": + plot_nsday(rows, out_path, args.title) + elif args.metric == "atoms": + plot_atoms(rows, out_path, args.title) + else: + raise SystemExit(f"Unknown metric: {args.metric}") + + print(f"Plot written to: {out_path}") + return 0 + + +def positive_int(value): + parsed = int(value) + if parsed < 1: + raise argparse.ArgumentTypeError("must be >= 1") + return parsed + + +def parse_args(): + parser = argparse.ArgumentParser(description="Collect and plot QGPU scaling across molecular systems.") + subparsers = parser.add_subparsers(dest="command", required=True) + + collect_parser = subparsers.add_parser("collect", 
help="Run Fortran/QGPU benchmark for multiple tests.") + collect_parser.add_argument("--test", nargs="+", required=True, help="runTEST.py test names.") + collect_parser.add_argument("--steps", type=positive_int, required=True, help="MD steps.") + collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.") + collect_parser.add_argument("--shake", action="store_true", help="Enable shake.") + collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per runner per system.") + collect_parser.add_argument("--out", help="Output directory.") + collect_parser.add_argument( + "--fortran-bin", + default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"), + help="Path to production Fortran qdyn binary.", + ) + collect_parser.add_argument( + "--prep-fortran-bin", + default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"), + help="Path to qdyn_test used only to prepare QGPU restart CSVs.", + ) + collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.") + + plot_parser = subparsers.add_parser("plot", help="Plot system scaling from system_scaling.csv.") + plot_parser.add_argument("csv", help="system_scaling.csv from collect.") + plot_parser.add_argument("--out", required=True, help="Output PNG path.") + plot_parser.add_argument( + "--metric", + choices=["speedup", "nsday", "atoms"], + default="speedup", + help="Plot style.", + ) + plot_parser.add_argument("--sort", choices=["atoms", "test"], default="atoms", help="System order.") + plot_parser.add_argument("--title", default="Performance Across Molecular Systems", help="Plot title.") + return parser.parse_args() + + +def main(): + args = parse_args() + if args.command == "collect": + return collect(args) + if args.command == "plot": + return plot(args) + raise SystemExit(f"Unknown command: {args.command}") + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except (FileNotFoundError, RuntimeError, ValueError) 
as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(1) diff --git a/benchmark-qgpu/benchmark_test.py b/benchmark-qgpu/benchmark_test.py new file mode 100644 index 00000000..a30695f0 --- /dev/null +++ b/benchmark-qgpu/benchmark_test.py @@ -0,0 +1,527 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json +import os +import shlex +import shutil +import subprocess +import sys +import time +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +from statistics import median + +os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib") + +import matplotlib + +matplotlib.use("Agg") +from matplotlib import pyplot as plt + + +ROOT = Path(__file__).resolve().parents[1] +TIME_STEP_NS = 2e-6 + +sys.path.insert(0, str(ROOT / "test")) +sys.path.insert(0, str(ROOT / "src" / "qligfep-newbin-unfinished")) + +import runTEST # noqa: E402 +import qdyn as qdyn_prepare # noqa: E402 + + +@contextmanager +def pushd(path): + previous = Path.cwd() + os.chdir(path) + try: + yield + finally: + os.chdir(previous) + + +def abs_path(path): + if path is None: + return None + return str(Path(path).expanduser().resolve()) + + +def command_text(args): + return " ".join(shlex.quote(str(arg)) for arg in args) + + +def resolve_qgpu_bin(path): + if path: + candidate = Path(path).expanduser() + if not candidate.is_absolute(): + candidate = (Path.cwd() / candidate).resolve() + if not candidate.exists(): + raise FileNotFoundError(f"QGPU binary not found: {candidate}") + return candidate + + for candidate in (ROOT / "bin" / "qdyn", ROOT / "src" / "core" / "qdyn"): + if candidate.exists(): + return candidate + raise FileNotFoundError( + "QGPU binary not found. Expected bin/qdyn or src/core/qdyn, " + "or pass --qgpu-bin." 
+ ) + + +def resolve_fortran_bin(path): + candidate = Path(path).expanduser() + if not candidate.is_absolute(): + candidate = (Path.cwd() / candidate).resolve() + if not candidate.exists(): + raise FileNotFoundError(f"Fortran binary not found: {candidate}") + return candidate + + +def resolve_test_data(test_name, steps, lambda_name, shake): + testinfo = runTEST.get_default_testinfo() + if test_name not in testinfo: + available = ", ".join(sorted(testinfo)) + raise ValueError(f"Unknown test '{test_name}'. Available tests: {available}") + + topdir = ROOT / "test" / "data" / "topology" + inputdir = ROOT / "test" / "data" / "inputs" + info = testinfo[test_name] + topfile = info[0] + if len(info) >= 3 and lambda_name is not None: + stem, suffix = topfile.rsplit(".", 1) + topfile = f"{stem}_{lambda_name}.{suffix}" + + data = { + "avg": False, + "curtest": None, + "fep_path": None, + "inputdir": str(inputdir), + "lambda": lambda_name, + "plot": False, + "restraints_path": None, + "shake": shake, + "test": test_name, + "testinfo": testinfo, + "timestep": str(steps), + "topdir": str(topdir), + "topfile": topfile, + "topology_path": str(topdir / topfile), + "verbose": False, + } + if len(info) >= 3: + data["fep_path"] = str(inputdir / info[2]) + if len(info) >= 4: + data["restraints_path"] = str(inputdir / info[3]) + + required = [Path(data["topology_path"])] + if data["fep_path"] is not None: + required.append(Path(data["fep_path"])) + if data["restraints_path"] is not None: + required.append(Path(data["restraints_path"])) + missing = [str(path) for path in required if not path.exists()] + if missing: + raise FileNotFoundError("Required input file(s) not found: " + ", ".join(missing)) + + return data + + +def run_timed(args, cwd, stdout_path, stderr_path): + start = time.perf_counter() + with open(stdout_path, "w", encoding="utf-8") as stdout_f, open( + stderr_path, "w", encoding="utf-8" + ) as stderr_f: + completed = subprocess.run(args, cwd=cwd, stdout=stdout_f, 
stderr=stderr_f) + wall_seconds = time.perf_counter() - start + return completed.returncode, wall_seconds + + +def ns_per_day(steps, wall_seconds): + if wall_seconds <= 0: + return None + return steps * TIME_STEP_NS * 86400 / wall_seconds + + +def write_md_input(data, fortran_dir): + data["curtest"] = str(fortran_dir) + with pushd(fortran_dir): + runTEST.create_MD_input(data) + + +def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps): + records = [] + saw_success = False + + for index in range(1, repeat + 1): + stdout_name = "fortran.log" if repeat == 1 else f"fortran_{index}.log" + stderr_name = "fortran.err" if repeat == 1 else f"fortran_{index}.err" + stdout_path = fortran_dir / stdout_name + stderr_path = fortran_dir / stderr_name + args = [str(fortran_bin), "eq1.inp"] + return_code, wall_seconds = run_timed(args, fortran_dir, stdout_path, stderr_path) + if return_code == 0: + saw_success = True + records.append( + { + "test": data["test"], + "runner": "fortran", + "repeat": index, + "command": command_text(args), + "return_code": return_code, + "wall_seconds": wall_seconds, + "steps": steps, + "ns_per_day": ns_per_day(steps, wall_seconds), + "stdout": str(stdout_path), + "stderr": str(stderr_path), + } + ) + + return records, saw_success + + +def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir): + stdout_path = fortran_dir / "restart_prep_qdyn_test.log" + stderr_path = fortran_dir / "restart_prep_qdyn_test.err" + args = [str(prep_fortran_bin), "eq1.inp"] + return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path) + if return_code != 0: + raise RuntimeError( + "QGPU restart preparation failed. 
" + f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}" + ) + + shutil.copyfile(stdout_path, fortran_dir / "eq1.log") + with pushd(fortran_dir): + runTEST.Parse_Q6_data(data) + + +def prepare_qgpu_input(data, fortran_dir, prep_dir): + prep_dir.mkdir(parents=True, exist_ok=True) + restart_dir = prep_dir / "restart" + restart_dir.mkdir(exist_ok=True) + shutil.copyfile(fortran_dir / "coords.csv", restart_dir / "coords.csv") + shutil.copyfile(fortran_dir / "velocities.csv", restart_dir / "velocities.csv") + + top_stem = Path(data["topfile"]).stem + wd_rel = f"TEST/{top_stem}" + with pushd(prep_dir): + qdyn_prepare.Create_Environment(top=data["topology_path"], wd=wd_rel) + qdyn_prepare.Prepare_Topology(top=data["topology_path"], wd=wd_rel) + qdyn_prepare.Prepare_MD(top=data["topology_path"], md=str(fortran_dir / "eq1.inp"), wd=wd_rel) + qdyn_prepare.Prepare_FEP( + fepfile=data["fep_path"], + wd=wd_rel, + top=data["topology_path"], + ) + qdyn_prepare.Read_Restart(restart=str(restart_dir), wd=wd_rel, top=data["topology_path"]) + + prepared_data_dir = prep_dir / wd_rel + if not (prepared_data_dir / "md.csv").exists(): + raise RuntimeError(f"Prepared QGPU data is missing md.csv: {prepared_data_dir}") + return prepared_data_dir + + +def run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, repeat, steps): + qgpu_runs_dir.mkdir(parents=True, exist_ok=True) + records = [] + + for index in range(1, repeat + 1): + run_dir = qgpu_runs_dir / f"repeat_{index:03d}" + data_dir = run_dir / prepared_data_dir.name + if run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True) + shutil.copytree(prepared_data_dir, data_dir) + + stdout_path = run_dir / "qgpu.log" + stderr_path = run_dir / "qgpu.err" + args = [str(qgpu_bin), "--gpu", str(data_dir)] + return_code, wall_seconds = run_timed(args, ROOT, stdout_path, stderr_path) + records.append( + { + "test": data["test"], + "runner": "qgpu", + "repeat": index, + "command": 
command_text(args), + "return_code": return_code, + "wall_seconds": wall_seconds, + "steps": steps, + "ns_per_day": ns_per_day(steps, wall_seconds), + "stdout": str(stdout_path), + "stderr": str(stderr_path), + } + ) + + return records + + +def write_summary_csv(records, out_dir): + csv_path = out_dir / "summary.csv" + fieldnames = [ + "test", + "runner", + "repeat", + "command", + "return_code", + "wall_seconds", + "steps", + "ns_per_day", + "stdout", + "stderr", + ] + with open(csv_path, "w", newline="", encoding="utf-8") as csv_f: + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(records) + return csv_path + + +def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin): + by_test = {} + for record in records: + by_test.setdefault(record["test"], {}).setdefault(record["runner"], []).append(record) + + tests = [] + for test_name in sorted(by_test): + fortran_records = by_test[test_name].get("fortran", []) + qgpu_records = by_test[test_name].get("qgpu", []) + fortran_ok = [r["wall_seconds"] for r in fortran_records if r["return_code"] == 0] + qgpu_ok = [r["wall_seconds"] for r in qgpu_records if r["return_code"] == 0] + if not fortran_ok or not qgpu_ok: + continue + fortran_median = median(fortran_ok) + qgpu_median = median(qgpu_ok) + speedup = fortran_median / qgpu_median if qgpu_median > 0 else None + tests.append( + { + "test": test_name, + "fortran_median_seconds": fortran_median, + "qgpu_median_seconds": qgpu_median, + "speedup_x": speedup, + "improvement_pct": (speedup - 1) * 100 if speedup is not None else None, + "fortran_repeats": len(fortran_records), + "qgpu_repeats": len(qgpu_records), + } + ) + + return { + "created_at": datetime.now().isoformat(timespec="seconds"), + "args": { + "test": args.test, + "steps": args.steps, + "lambda": args.lambda_name, + "shake": args.shake, + "repeat": args.repeat, + }, + "binaries": { + "fortran": str(fortran_bin), + "restart_prep_fortran": 
str(prep_fortran_bin), + "qgpu": str(qgpu_bin), + }, + "tests": tests, + } + + +def write_summary_json(summary, out_dir): + json_path = out_dir / "summary.json" + with open(json_path, "w", encoding="utf-8") as json_f: + json.dump(summary, json_f, indent=2) + return json_path + + +def plot_speedup(summary, out_dir): + tests = summary["tests"] + if not tests: + return None + + fig_width = max(8.0, 2.0 + len(tests) * 1.2) + fig, (ax, panel) = plt.subplots( + 1, + 2, + figsize=(fig_width, 3.0), + gridspec_kw={"width_ratios": [3.6, 1.8]}, + ) + + x_positions = list(range(len(tests))) + width = 0.34 + fortran_times = [item["fortran_median_seconds"] for item in tests] + qgpu_times = [item["qgpu_median_seconds"] for item in tests] + labels = [item["test"] for item in tests] + + ax.bar([x - width / 2 for x in x_positions], fortran_times, width, label="Fortran", color="#9b9b9b") + ax.bar([x + width / 2 for x in x_positions], qgpu_times, width, label="QGPU", color="#0b71c8") + ax.set_title("Execution Time (s)", fontsize=11, weight="bold") + ax.set_ylabel("Time (s)") + ax.set_xticks(x_positions) + ax.set_xticklabels(labels, rotation=0 if len(labels) <= 3 else 30, ha="center") + ax.legend(frameon=False, loc="upper right") + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + ax.grid(axis="y", color="#e7e7e7", linewidth=0.8) + ax.set_axisbelow(True) + + for x, value in zip([x - width / 2 for x in x_positions], fortran_times): + ax.text(x, value, f"{value:.1f}", ha="center", va="bottom", fontsize=8, weight="bold") + for x, value in zip([x + width / 2 for x in x_positions], qgpu_times): + ax.text(x, value, f"{value:.1f}", ha="center", va="bottom", fontsize=8, weight="bold") + + if len(tests) == 1: + ymax = max(fortran_times[0], qgpu_times[0]) + ax.annotate( + "", + xy=(x_positions[0] + width / 2, qgpu_times[0] + ymax * 0.15), + xytext=(x_positions[0] - width / 2, fortran_times[0] * 0.85), + arrowprops={"arrowstyle": "->", "linestyle": "--", "color": 
"#0b71c8", "lw": 1.2}, + ) + + best = max(tests, key=lambda item: item["speedup_x"] or 0) + panel.set_facecolor("#eef5fd") + for spine in panel.spines.values(): + spine.set_color("#8ab9ef") + panel.set_xticks([]) + panel.set_yticks([]) + panel.text(0.5, 0.82, "Up to", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970") + panel.text( + 0.5, + 0.52, + f"{best['speedup_x']:.1f}x", + ha="center", + va="center", + fontsize=32, + weight="bold", + color="#003c7f", + ) + panel.text(0.5, 0.28, "speedup", ha="center", va="center", fontsize=14, weight="bold", color="#0b3970") + panel.text(0.5, 0.12, "(vs. Fortran)", ha="center", va="center", fontsize=10, color="#0b3970") + + fig.tight_layout() + png_path = out_dir / "speedup.png" + fig.savefig(png_path, dpi=200) + plt.close(fig) + return png_path + + +def default_out_dir(test_names): + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + label = test_names[0] if len(test_names) == 1 else "multi" + return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_{label}" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Benchmark Fortran vs QGPU for runTEST.py test cases.") + parser.add_argument("--test", nargs="+", help="Test name(s) from test/runTEST.py.") + parser.add_argument("--list-tests", action="store_true", help="List available tests and exit.") + parser.add_argument("--steps", type=int, help="MD steps to write into eq1.inp.") + parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. 
eq5.") + parser.add_argument("--shake", action="store_true", help="Enable shake in generated MD input.") + parser.add_argument("--repeat", type=int, default=1, help="Number of repeats for each runner.") + parser.add_argument("--out", default=None, help="Output directory.") + parser.add_argument( + "--fortran-bin", + default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"), + help="Path to production Fortran qdyn binary used for timed Fortran runs.", + ) + parser.add_argument( + "--prep-fortran-bin", + default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"), + help="Path to qdyn_test binary used only to prepare QGPU restart CSVs.", + ) + parser.add_argument("--qgpu-bin", default=None, help="Path to QGPU qdyn binary.") + return parser.parse_args() + + +def validate_args(args): + if args.list_tests: + return + if not args.test: + raise SystemExit("--test is required unless --list-tests is used.") + if args.steps is None: + raise SystemExit("--steps is required unless --list-tests is used.") + if args.steps < 1: + raise SystemExit("--steps must be >= 1.") + if args.repeat < 1: + raise SystemExit("--repeat must be >= 1.") + + +def main(): + args = parse_args() + validate_args(args) + + testinfo = runTEST.get_default_testinfo() + if args.list_tests: + for test_name in sorted(testinfo): + print(test_name) + return 0 + + qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) + fortran_bin = resolve_fortran_bin(args.fortran_bin) + prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin) + out_dir = Path(args.out).expanduser().resolve() if args.out else default_out_dir(args.test) + out_dir.mkdir(parents=True, exist_ok=True) + + all_records = [] + try: + for test_name in args.test: + test_dir = out_dir / test_name + fortran_dir = test_dir / "fortran" + prep_dir = test_dir / "qgpu_prepare" + qgpu_runs_dir = test_dir / "qgpu_runs" + fortran_dir.mkdir(parents=True, exist_ok=True) + + data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake) + print(f"Preparing 
Fortran input for {test_name} in {fortran_dir}") + write_md_input(data, fortran_dir) + + print(f"Running Fortran for {test_name} ({args.repeat} repeat(s))") + fortran_records, fortran_ok = run_fortran_repeats( + data, fortran_bin, fortran_dir, args.repeat, args.steps + ) + all_records.extend(fortran_records) + if not fortran_ok: + continue + + print(f"Preparing QGPU restart with qdyn_test for {test_name}") + prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) + + print(f"Preparing QGPU CSV input for {test_name}") + prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) + + print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))") + all_records.extend( + run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps) + ) + finally: + write_summary_csv(all_records, out_dir) + + failures = [record for record in all_records if record["return_code"] != 0] + if failures: + first = failures[0] + raise RuntimeError( + f"{first['runner']} failed for {first['test']} repeat {first['repeat']}. 
" + f"Logs: stdout={first['stdout']} stderr={first['stderr']}" + ) + + summary = summarize(all_records, args, qgpu_bin, fortran_bin, prep_fortran_bin) + csv_path = write_summary_csv(all_records, out_dir) + json_path = write_summary_json(summary, out_dir) + png_path = plot_speedup(summary, out_dir) + + print(f"Summary CSV: {csv_path}") + print(f"Summary JSON: {json_path}") + if png_path is not None: + print(f"Speedup plot: {png_path}") + for item in summary["tests"]: + print( + f"{item['test']}: Fortran {item['fortran_median_seconds']:.3f}s, " + f"QGPU {item['qgpu_median_seconds']:.3f}s, speedup {item['speedup_x']:.2f}x" + ) + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except (FileNotFoundError, RuntimeError, ValueError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(1) diff --git a/test/runTEST.py b/test/runTEST.py index ea8d5256..0b694155 100644 --- a/test/runTEST.py +++ b/test/runTEST.py @@ -17,6 +17,92 @@ lambdas = ['eq5', '0744_0256', '0998_0002'] + +def get_default_testinfo(): + return { + 'p-p' : [ + 'benzene-vacuum.top', + '20' + ], + 'q-p_benzene' : [ + 'Na-benzene-vacuum.top', + '20', + 'FEP_benzene.fep' + ], + 'q-p_Na' : [ + 'Na-benzene-vacuum.top', + '20', + 'FEP_Na.fep' + ], + 'q-p-w_benzene' : [ + 'Na-benzene-water.top', + '20', + 'FEP_benzene.fep' + ], + 'q-p-w_Na' : [ + 'Na-benzene-water.top', + '20', + 'FEP_Na.fep' + ], + 'q-q' : [ + 'benzene-vacuum.top', + '20', + 'FEP_benzene.fep' + ], + 'w-p' : [ + 'benzene-water.top', + '20' + ], + 'w-q' : [ + 'benzene-water.top', + '20', + 'FEP_benzene.fep' + ], + 'w-w' : [ + 'water.top', + '20' + ], + 'boundary' : [ + 'ala_wat.top', + '14' + ], + 'polypeptide' : [ + 'ala_wat.top', + '15' + ], + 'polypeptide25' : [ + 'ala_wat25.top', + '25' + ], + 'q-q-large_vac' : [ + 'dualtop_vacuum.top', + '22', + 'dualtop.fep' + ], + 'cdk2' : [ + 'cdk2.top', + '22', + 'FEPm_cdk2.fep', + 'restraints_cdk2.inp' + ], + 'thrombin' : [ + 'thrombin.top', + '25', + 
'FEPm_thrombin.fep', + 'restraints_thrombin.inp' + ], + } + + +def resolve_path(path, base_dir=None): + if path is None: + return None + if os.path.isabs(path): + return path + if base_dir is not None: + return os.path.abspath(os.path.join(base_dir, path)) + return os.path.abspath(path) + class Create_Environment(object): """ Creates the workdirectory environment. @@ -44,7 +130,7 @@ def __init__(self,data): _inv_lambda = None # Check if a lambda has been specified - if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None: + if data.get('fep_path') is not None and data['lambda'] is not None: if not data['lambda'].startswith('eq'): str_lambda = data['lambda'].split("_")[0] str_inv_lambda = data['lambda'].split("_")[1] @@ -86,31 +172,32 @@ def __init__(self,data): non_bond 1 [files] -topology {}{} +topology {} final eq1.re """.format(data['timestep'], shake, shake, shake, data['testinfo'][data['test']][1], - data['topdir'], - data['topfile']) - if len(data['testinfo'][test]) >= 3: - filename = data['testinfo'][data['test']][2] + data['topology_path']) + if data.get('fep_path') is not None: + filename = data['fep_path'] + fep_name = os.path.basename(filename) fep_part = """fep {}{} [lambdas] -""".format(data['inputdir'], filename) +""".format("" if os.path.isabs(filename) else "", + filename) if _lambda is not None: fep_part += _lambda + " " + _inv_lambda + "\n" else: - if filename.startswith("FEPm"): + if fep_name.startswith("FEPm"): fep_part += "0.500 0.500\n" else: fep_part += "1.000 0.000\n" md_content = md_content + fep_part # Check if there are boundary conditions - if len(data['testinfo'][test]) >= 4: - filename = data['inputdir'] + '/' + data['testinfo'][data['test']][3] + if data.get('restraints_path') is not None: + filename = data['restraints_path'] with open(filename, 'r') as f: restraint_part = f.read() md_content = md_content + restraint_part @@ -192,9 +279,7 @@ def __init__(self,data): outfile.write('{}\n'.format(v)) # Parse the topology - 
Qtopology = '{}{}'.format(data['topdir'], - data['topfile']) - read_top = TOPOLOGY.Read_Topology(Qtopology) + read_top = TOPOLOGY.Read_Topology(data['topology_path']) top_data = read_top.Q() with open('coords.csv','w') as outfile: outfile.write('{}\n'.format(len(top_data['coords']))) @@ -210,17 +295,16 @@ def __init__(self,data): shutil.copy('coords.csv', 'tmp/coords.csv') args = [ ' {}src/bin/qdyn.py'.format(settings.ROOT), - '-t', '{}{}'.format(data['topdir'], - data['topfile']), + '-t', data['topology_path'], '-m', 'eq1.inp', '-d', 'TEST', '-r', 'tmp' ] # FEP file? - if len(data['testinfo'][data['test']]) >= 3: + if data.get('fep_path') is not None: args.append('-f') - args.append('{}{}'.format(data['inputdir'],data['testinfo'][data['test']][2])) + args.append(data['fep_path']) if data['verbose']: args.append('--verbose') @@ -330,8 +414,8 @@ def __init__(self, data): self.data = data self.data['curdir'] = os.getcwd() self.data['executable'] = sys.executable - self.data['topdir'] = '{}test/data/topology/'.format(settings.ROOT) - self.data['inputdir'] = '{}test/data/inputs/'.format(settings.ROOT) + self.data['topdir'] = os.path.join(settings.ROOT, 'test/data/topology') + self.data['inputdir'] = os.path.join(settings.ROOT, 'test/data/inputs') # Step = step + 1 self.data['timestep'] = '{}'.format(int(self.data['timestep'])+1) @@ -340,79 +424,20 @@ def __init__(self, data): if self.data['wd'][-1] != '/': self.data['wd'] = self.data['wd'] + '/' - self.data['testinfo'] = { - 'p-p' : [ - 'benzene-vacuum.top', - '20' - ], - 'q-p_benzene' : [ - 'Na-benzene-vacuum.top', - '20', - 'FEP_benzene.fep' - ], - 'q-p_Na' : [ - 'Na-benzene-vacuum.top', - '20', - 'FEP_Na.fep' - ], - 'q-p-w_benzene' : [ - 'Na-benzene-water.top', - '20', - 'FEP_benzene.fep' - ], - 'q-p-w_Na' : [ - 'Na-benzene-water.top', - '20', - 'FEP_Na.fep' - ], - 'q-q' : [ - 'benzene-vacuum.top', - '20', - 'FEP_benzene.fep' - ], - 'w-p' : [ - 'benzene-water.top', - '20' - ], - 'w-q' : [ - 'benzene-water.top', - 
'20', - 'FEP_benzene.fep' - ], - 'w-w' : [ - 'water.top', - '20' - ], - 'boundary' : [ - 'ala_wat.top', - '14' - ], - 'polypeptide' : [ - 'ala_wat.top', - '15' - ], - 'polypeptide25' : [ - 'ala_wat25.top', - '25' - ], - 'q-q-large_vac' : [ - 'dualtop_vacuum.top', - '22', - 'dualtop.fep' - ], - 'cdk2' : [ - 'cdk2.top', - '22', - 'FEPm_cdk2.fep', - 'restraints_cdk2.inp' - ], - 'thrombin' : [ - 'thrombin.top', - '25', - 'FEPm_thrombin.fep', - 'restraints_thrombin.inp' - ], - } + self.data['testinfo'] = get_default_testinfo() + + if self.data['custom_top'] is not None: + custom_info = [ + os.path.basename(self.data['custom_top']), + self.data['custom_shell_radius'] + ] + if self.data['custom_fep'] is not None: + custom_info.append(os.path.basename(self.data['custom_fep'])) + if self.data['custom_restraints'] is not None: + while len(custom_info) < 3: + custom_info.append(None) + custom_info.append(os.path.basename(self.data['custom_restraints'])) + self.data['testinfo'][self.data['custom_name']] = custom_info tests = data['testinfo'].keys() if self.data['run'] is not None: @@ -422,9 +447,21 @@ def __init__(self, data): self.data['test'] = test self.data['curtest'] = self.data['wd'] + test _topfile = data['testinfo'][data['test']][0] - if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None: + if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None and test != self.data['custom_name']: _topfile = _topfile.split(".")[0] + "_" + data['lambda'] + "." 
+ _topfile.split(".")[1] self.data['topfile'] = _topfile + if test == self.data['custom_name'] and self.data['custom_top'] is not None: + self.data['topology_path'] = self.data['custom_top'] + self.data['fep_path'] = self.data['custom_fep'] + self.data['restraints_path'] = self.data['custom_restraints'] + else: + self.data['topology_path'] = os.path.join(self.data['topdir'], self.data['topfile']) + self.data['fep_path'] = None + self.data['restraints_path'] = None + if len(data['testinfo'][test]) >= 3: + self.data['fep_path'] = os.path.join(self.data['inputdir'], data['testinfo'][test][2]) + if len(data['testinfo'][test]) >= 4: + self.data['restraints_path'] = os.path.join(self.data['inputdir'], data['testinfo'][test][3]) # INIT Create_Environment(self.data) @@ -515,6 +552,36 @@ def __init__(self, data): required = False, help = "Specify a particular phase of the perturbation") + parser.add_argument('--custom-top', + dest = "custom_top", + default = None, + required = False, + help = "Path to a custom topology file to add as a test") + + parser.add_argument('--custom-shell-radius', + dest = "custom_shell_radius", + default = '25', + required = False, + help = "Shell radius to use with --custom-top") + + parser.add_argument('--custom-fep', + dest = "custom_fep", + default = None, + required = False, + help = "Optional FEP file for --custom-top") + + parser.add_argument('--custom-restraints', + dest = "custom_restraints", + default = None, + required = False, + help = "Optional restraints file for --custom-top") + + parser.add_argument('--custom-name', + dest = "custom_name", + default = 'custom', + required = False, + help = "Test name to use with --custom-top") + parser.add_argument('--tolerance', dest = "tolerance", type = float, @@ -523,5 +590,9 @@ def __init__(self, data): help = "Energy comparison tolerance (default: 0.0 = exact match)") args = parser.parse_args() - - START = Init(vars(args)) + data = vars(args) + data['custom_top'] = 
resolve_path(data['custom_top']) + data['custom_fep'] = resolve_path(data['custom_fep']) + data['custom_restraints'] = resolve_path(data['custom_restraints']) + + START = Init(data) From c4d416c4c5dc66c72ef9d9c13c58e260ac7d887a Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 10:11:49 +0200 Subject: [PATCH 03/20] fix prepare data --- benchmark-qgpu/benchmark_nsday.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py index d62ee459..253b6b5e 100644 --- a/benchmark-qgpu/benchmark_nsday.py +++ b/benchmark-qgpu/benchmark_nsday.py @@ -32,6 +32,9 @@ ) +RESTART_INIT_STEPS = 1 + + def read_steps_from_md_csv(data_dir): md_path = Path(data_dir) / "md.csv" if not md_path.exists(): @@ -50,15 +53,25 @@ def default_collect_out(label): def prepare_from_test(args, out_dir): - data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake) + init_data = resolve_test_data(args.test, RESTART_INIT_STEPS, args.lambda_name, args.shake) + benchmark_data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake) fortran_dir = out_dir / "prepare" / args.test / "fortran" prep_dir = out_dir / "prepare" / args.test / "qgpu_prepare" fortran_dir.mkdir(parents=True, exist_ok=True) - print(f"Preparing QGPU input for {args.test} in {out_dir}") - write_md_input(data, fortran_dir) - prepare_restart_with_qdyn_test(data, resolve_fortran_bin(args.prep_fortran_bin), fortran_dir) - return prepare_qgpu_input(data, fortran_dir, prep_dir) + print(f"Preparing QGPU restart for {args.test} with {RESTART_INIT_STEPS} MD step(s) in {out_dir}") + write_md_input(init_data, fortran_dir) + prepare_restart_with_qdyn_test(init_data, resolve_fortran_bin(args.prep_fortran_bin), fortran_dir) + + print(f"Writing QGPU benchmark input for {args.test} with {args.steps} MD step(s)") + write_md_input(benchmark_data, fortran_dir) + prepared_data_dir = 
prepare_qgpu_input(benchmark_data, fortran_dir, prep_dir) + prepared_steps = read_steps_from_md_csv(prepared_data_dir) + if prepared_steps != args.steps: + raise RuntimeError( + f"Prepared QGPU input has {prepared_steps} steps, expected {args.steps}: {prepared_data_dir}" + ) + return prepared_data_dir def resolve_collect_data_dir(args, out_dir): From 6eaf637ccd3dc51bbaf74b4d44a9e6a2ccf28871 Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 10:32:57 +0200 Subject: [PATCH 04/20] fix plot png --- benchmark-qgpu/benchmark_nsday.py | 65 ++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py index 253b6b5e..f86708cb 100644 --- a/benchmark-qgpu/benchmark_nsday.py +++ b/benchmark-qgpu/benchmark_nsday.py @@ -95,10 +95,8 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste shutil.rmtree(run_dir) run_dir.mkdir(parents=True) - processes = [] - process_rows = [] command_template = None - batch_start = time.perf_counter() + launch_specs = [] for index in range(1, concurrency + 1): proc_dir = run_dir / f"proc_{index:03d}" data_dir = proc_dir / prepared_data_dir.name @@ -109,20 +107,34 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste stderr_path = proc_dir / "qgpu.err" args = [str(qgpu_bin), "--gpu", str(data_dir)] command_template = command_text([str(qgpu_bin), "--gpu", ""]) - stdout_f = open(stdout_path, "w", encoding="utf-8") - stderr_f = open(stderr_path, "w", encoding="utf-8") + launch_specs.append( + { + "index": index, + "args": args, + "stdout": stdout_path, + "stderr": stderr_path, + "command": command_text(args), + } + ) + + processes = [] + process_rows = [] + batch_start = time.perf_counter() + for spec in launch_specs: + stdout_f = open(spec["stdout"], "w", encoding="utf-8") + stderr_f = open(spec["stderr"], "w", encoding="utf-8") proc_start = time.perf_counter() - process = 
subprocess.Popen(args, cwd=ROOT, stdout=stdout_f, stderr=stderr_f) + process = subprocess.Popen(spec["args"], cwd=ROOT, stdout=stdout_f, stderr=stderr_f) processes.append( { - "index": index, + "index": spec["index"], "process": process, "stdout_file": stdout_f, "stderr_file": stderr_f, - "stdout": stdout_path, - "stderr": stderr_path, + "stdout": spec["stdout"], + "stderr": spec["stderr"], "start": proc_start, - "command": command_text(args), + "command": spec["command"], } ) @@ -324,22 +336,45 @@ def plot(args): ) palette = ["#1f77b4", "#43a047", "#f57c00", "#7b1fa2", "#00838f"] all_points = [] + all_xs = sorted({x for item in series for x in item["xs"]}) for index, item in enumerate(series): color = palette[index % len(palette)] ax.plot(item["xs"], item["ys"], marker="o", linewidth=1.8, markersize=4.5, color=color, label=item["label"]) for x, y in zip(item["xs"], item["ys"]): all_points.append((y, item["label"], x)) - ax.text(x, y, f"{y:.1f}", ha="center", va="bottom", fontsize=8, weight="bold", color="#253142") + ax.annotate( + f"{y:.1f}", + xy=(x, y), + xytext=(0, 6), + textcoords="offset points", + ha="center", + va="bottom", + fontsize=8, + weight="bold", + color="#253142", + ) - ax.set_title(args.title, loc="left", fontsize=13, weight="bold", color="#0f5f18") - ax.text(0.0, 1.02, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142") + y_values = [point[0] for point in all_points] + y_min = min(y_values) + y_max = max(y_values) + y_span = y_max - y_min + y_pad = max(y_span * 0.22, y_max * 0.035, 0.5) + ax.set_ylim(max(0, y_min - y_pad * 0.35), y_max + y_pad) + ax.set_xticks(all_xs) + if len(all_xs) == 1: + ax.set_xlim(all_xs[0] - 0.5, all_xs[0] + 0.5) + else: + ax.set_xlim(all_xs[0] - 0.1, all_xs[-1] + 0.1) + + ax.text(0.0, 1.14, args.title, transform=ax.transAxes, fontsize=13, weight="bold", color="#0f5f18") + ax.text(0.0, 1.07, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142") 
ax.set_xlabel("Concurrent Simulations") ax.set_ylabel("Throughput (ns/day)") ax.grid(axis="y", color="#e3e7ed", linewidth=0.8) ax.set_axisbelow(True) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) - ax.legend(frameon=False, loc="upper left", fontsize=8) + ax.legend(frameon=False, loc="upper right", fontsize=8) best_points = sorted(all_points, reverse=True) best = best_points[0] @@ -363,7 +398,7 @@ def plot(args): panel.axhline(0.12, xmin=0.12, xmax=0.88, color="#7fbf79", linewidth=0.8) panel.text(0.5, 0.05, f"{second[0]:.1f} ns/day", ha="center", va="bottom", fontsize=10, weight="bold", color="#14751c") - fig.tight_layout() + fig.tight_layout(rect=(0, 0, 1, 0.9)) fig.savefig(out_path, dpi=220) plt.close(fig) print(f"Plot written to: {out_path}") From aac3e713ac98b74089150ce4799cef1abf633a56 Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 10:41:40 +0200 Subject: [PATCH 05/20] fix calculation --- benchmark-qgpu/benchmark_nsday.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py index f86708cb..6a7c37b1 100644 --- a/benchmark-qgpu/benchmark_nsday.py +++ b/benchmark-qgpu/benchmark_nsday.py @@ -324,6 +324,10 @@ def load_plot_series(csv_paths, metric): def plot(args): metric = args.metric + metric_labels = { + "total_ns_per_day": "Total Throughput (ns/day)", + "mean_process_ns_per_day": "Mean Per-Process Throughput (ns/day)", + } series = load_plot_series([Path(path).expanduser().resolve() for path in args.csv], metric) out_path = Path(args.out).expanduser().resolve() out_path.parent.mkdir(parents=True, exist_ok=True) @@ -369,7 +373,7 @@ def plot(args): ax.text(0.0, 1.14, args.title, transform=ax.transAxes, fontsize=13, weight="bold", color="#0f5f18") ax.text(0.0, 1.07, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142") ax.set_xlabel("Concurrent Simulations") - ax.set_ylabel("Throughput (ns/day)") + 
ax.set_ylabel(metric_labels[metric]) ax.grid(axis="y", color="#e3e7ed", linewidth=0.8) ax.set_axisbelow(True) ax.spines["top"].set_visible(False) From cc5700e9a9ebce76f9a45d90e758a5e3e92d8217 Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 11:24:14 +0200 Subject: [PATCH 06/20] support only gpu --- benchmark-qgpu/benchmark_system_scaling.py | 86 ++++++++++++++++------ 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py index d481244b..7b0ad6bf 100644 --- a/benchmark-qgpu/benchmark_system_scaling.py +++ b/benchmark-qgpu/benchmark_system_scaling.py @@ -31,6 +31,9 @@ ) +RESTART_INIT_STEPS = 1 + + def default_collect_out(): stamp = datetime.now().strftime("%Y%m%d_%H%M%S") return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_system_scaling" @@ -48,6 +51,12 @@ def successful_times(records): return [float(record["wall_seconds"]) for record in records if int(record["return_code"]) == 0] +def parse_optional_float(value): + if value in (None, ""): + return float("nan") + return float(value) + + def write_raw_records(records, out_dir): path = out_dir / "system_scaling_raw.csv" fieldnames = [ @@ -104,38 +113,51 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg fortran_dir.mkdir(parents=True, exist_ok=True) data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake) - print(f"Preparing {test_name}") - write_md_input(data, fortran_dir) + fortran_records = [] + fortran_times = [] + + if args.gpu_only: + init_data = resolve_test_data(test_name, RESTART_INIT_STEPS, args.lambda_name, args.shake) + print(f"Preparing QGPU restart for {test_name} with {RESTART_INIT_STEPS} MD step(s)") + write_md_input(init_data, fortran_dir) + prepare_restart_with_qdyn_test(init_data, prep_fortran_bin, fortran_dir) + + print(f"Writing QGPU benchmark input for {test_name} with {args.steps} MD step(s)") + write_md_input(data, fortran_dir) + 
else: + print(f"Preparing {test_name}") + write_md_input(data, fortran_dir) + + print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))") + fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps) + if not fortran_ok: + return None, fortran_records + fortran_times = successful_times(fortran_records) - print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))") - fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps) - if not fortran_ok: - return None, fortran_records + print(f"Preparing QGPU input for {test_name}") + prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) - print(f"Preparing QGPU input for {test_name}") - prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) atoms = count_atoms(prepared_data_dir) print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))") qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps) - fortran_times = successful_times(fortran_records) qgpu_times = successful_times(qgpu_records) - if not fortran_times or not qgpu_times: + if not qgpu_times or (not args.gpu_only and not fortran_times): return None, [*fortran_records, *qgpu_records] - fortran_wall = median(fortran_times) + fortran_wall = median(fortran_times) if fortran_times else None qgpu_wall = median(qgpu_times) row = { "test": test_name, "atoms": atoms, "steps": args.steps, - "fortran_wall_median_s": fortran_wall, + "fortran_wall_median_s": fortran_wall if fortran_wall is not None else "", "qgpu_wall_median_s": qgpu_wall, - "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall), + "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall) if fortran_wall is not None else "", "qgpu_ns_per_day": ns_per_day(args.steps, qgpu_wall), - "speedup_x": fortran_wall / qgpu_wall if qgpu_wall > 0 else 
"", + "speedup_x": fortran_wall / qgpu_wall if fortran_wall is not None and qgpu_wall > 0 else "", "fortran_repeats": len(fortran_records), "qgpu_repeats": len(qgpu_records), } @@ -145,7 +167,7 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg def collect(args): out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out() out_dir.mkdir(parents=True, exist_ok=True) - fortran_bin = resolve_fortran_bin(args.fortran_bin) + fortran_bin = None if args.gpu_only else resolve_fortran_bin(args.fortran_bin) prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin) qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) @@ -166,7 +188,8 @@ def collect(args): "tests": args.test, "steps": args.steps, "repeat": args.repeat, - "fortran_bin": str(fortran_bin), + "gpu_only": args.gpu_only, + "fortran_bin": str(fortran_bin) if fortran_bin is not None else None, "prep_fortran_bin": str(prep_fortran_bin), "qgpu_bin": str(qgpu_bin), }, @@ -190,7 +213,8 @@ def collect(args): "tests": args.test, "steps": args.steps, "repeat": args.repeat, - "fortran_bin": str(fortran_bin), + "gpu_only": args.gpu_only, + "fortran_bin": str(fortran_bin) if fortran_bin is not None else None, "prep_fortran_bin": str(prep_fortran_bin), "qgpu_bin": str(qgpu_bin), }, @@ -216,7 +240,7 @@ def load_rows(csv_path): "qgpu_ns_per_day", "speedup_x", ]: - parsed[key] = float(parsed[key]) + parsed[key] = parse_optional_float(parsed[key]) rows.append(parsed) if not rows: raise RuntimeError(f"No rows found in {csv_path}") @@ -245,6 +269,8 @@ def annotate_bars(ax, bars, formatter): def plot_speedup(rows, out_path, title): + if not any(math.isfinite(row["speedup_x"]) for row in rows): + raise RuntimeError("speedup plot requires Fortran data. 
Use --metric nsday for --gpu-only results.") labels = [row["test"] for row in rows] speedups = [row["speedup_x"] for row in rows] atoms = [row["atoms"] for row in rows] @@ -294,9 +320,13 @@ def plot_nsday(rows, out_path, title): fig, ax = plt.subplots(figsize=(8.6, 3.5)) fortran = [row["fortran_ns_per_day"] for row in rows] qgpu = [row["qgpu_ns_per_day"] for row in rows] - bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b") - bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8") - annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}") + has_fortran = any(math.isfinite(value) for value in fortran) + if has_fortran: + bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b") + bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8") + annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}") + else: + bars_gpu = ax.bar(x, qgpu, width * 1.55, label="QGPU", color="#0b71c8") annotate_bars(ax, bars_gpu, lambda value: f"{value:.1f}") ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f") ax.set_ylabel("ns/day") @@ -318,13 +348,16 @@ def plot_nsday(rows, out_path, title): def plot_atoms(rows, out_path, title): fig, ax = plt.subplots(figsize=(6.5, 3.8)) xs = [row["atoms"] for row in rows] - ys = [row["speedup_x"] for row in rows] + has_speedup = any(math.isfinite(row["speedup_x"]) for row in rows) + value_key = "speedup_x" if has_speedup else "qgpu_ns_per_day" + ys = [row[value_key] for row in rows] ax.plot(xs, ys, color="#0b71c8", marker="o", linewidth=1.8) for row in rows: - ax.text(row["atoms"], row["speedup_x"], f" {row['test']} ({row['speedup_x']:.1f}x)", va="center", fontsize=8) + suffix = f"{row['speedup_x']:.1f}x" if has_speedup else f"{row['qgpu_ns_per_day']:.1f} ns/day" + ax.text(row["atoms"], row[value_key], f" {row['test']} ({suffix})", va="center", fontsize=8) ax.set_title(title, 
loc="left", fontsize=13, weight="bold", color="#113b5f") ax.set_xlabel("Atoms") - ax.set_ylabel("Speedup vs Fortran (x)") + ax.set_ylabel("Speedup vs Fortran (x)" if has_speedup else "QGPU ns/day") ax.grid(True, color="#e5e8ee", linewidth=0.8) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) @@ -369,6 +402,11 @@ def parse_args(): collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.") collect_parser.add_argument("--shake", action="store_true", help="Enable shake.") collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per runner per system.") + collect_parser.add_argument( + "--gpu-only", + action="store_true", + help="Skip timed Fortran qdyn runs and collect only QGPU performance.", + ) collect_parser.add_argument("--out", help="Output directory.") collect_parser.add_argument( "--fortran-bin", From a09f086a82af94effc7b948a2fba13c9d3b79d1e Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 11:36:29 +0200 Subject: [PATCH 07/20] support multi instance --- benchmark-qgpu/benchmark_system_scaling.py | 129 ++++++++++++++++++--- 1 file changed, 113 insertions(+), 16 deletions(-) diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py index 7b0ad6bf..b76aa130 100644 --- a/benchmark-qgpu/benchmark_system_scaling.py +++ b/benchmark-qgpu/benchmark_system_scaling.py @@ -29,6 +29,7 @@ run_qgpu_repeats, write_md_input, ) +from benchmark_nsday import run_concurrency_batch RESTART_INIT_STEPS = 1 @@ -78,6 +79,27 @@ def write_raw_records(records, out_dir): return path +def write_qgpu_concurrency_records(records, out_dir): + path = out_dir / "system_scaling_qgpu_concurrency.csv" + fieldnames = [ + "test", + "label", + "concurrency", + "repeat", + "steps", + "batch_wall_seconds", + "total_ns_per_day", + "mean_process_ns_per_day", + "failed_processes", + "command", + ] + with open(path, "w", newline="", 
encoding="utf-8") as csv_f: + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(records) + return path + + def write_summary(rows, out_dir, metadata): summary_csv = out_dir / "system_scaling.csv" meta_json = out_dir / "system_scaling_meta.json" @@ -90,6 +112,7 @@ def write_summary(rows, out_dir, metadata): "qgpu_wall_median_s", "fortran_ns_per_day", "qgpu_ns_per_day", + "qgpu_best_concurrency", "speedup_x", "fortran_repeats", "qgpu_repeats", @@ -105,6 +128,33 @@ def write_summary(rows, out_dir, metadata): return summary_csv, meta_json +def run_qgpu_concurrency_sweep(args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir): + batch_rows = [] + process_rows = [] + for concurrency in args.concurrency: + for repeat in range(1, args.repeat + 1): + run_dir = qgpu_runs_dir / f"c{concurrency:03d}" / f"repeat_{repeat:03d}" + print(f"Running QGPU for {test_name}: concurrency={concurrency}, repeat={repeat}") + batch_row, rows = run_concurrency_batch( + qgpu_bin=qgpu_bin, + prepared_data_dir=prepared_data_dir, + run_dir=run_dir, + concurrency=concurrency, + steps=args.steps, + label=test_name, + repeat=repeat, + ) + batch_row["test"] = test_name + batch_rows.append(batch_row) + process_rows.extend(rows) + if batch_row["failed_processes"]: + raise RuntimeError( + f"{batch_row['failed_processes']} QGPU process(es) failed for {test_name} " + f"at concurrency {concurrency}, repeat {repeat}. 
Logs are under {run_dir}" + ) + return batch_rows, process_rows + + def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin): test_dir = out_dir / test_name fortran_dir = test_dir / "fortran" @@ -131,7 +181,7 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))") fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps) if not fortran_ok: - return None, fortran_records + return None, fortran_records, [] fortran_times = successful_times(fortran_records) print(f"Preparing QGPU input for {test_name}") @@ -140,28 +190,53 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) atoms = count_atoms(prepared_data_dir) - print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))") - qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps) + qgpu_concurrency_rows = [] + if args.concurrency: + qgpu_records = [] + qgpu_concurrency_rows, _ = run_qgpu_concurrency_sweep( + args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir + ) + else: + print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))") + qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps) qgpu_times = successful_times(qgpu_records) - if not qgpu_times or (not args.gpu_only and not fortran_times): - return None, [*fortran_records, *qgpu_records] + if args.concurrency: + successful_batches = [row for row in qgpu_concurrency_rows if int(row["failed_processes"]) == 0] + if not successful_batches: + return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows + best_qgpu = max(successful_batches, key=lambda row: float(row["total_ns_per_day"])) + qgpu_wall = float(best_qgpu["batch_wall_seconds"]) + qgpu_ns_day = 
float(best_qgpu["total_ns_per_day"]) + qgpu_best_concurrency = int(best_qgpu["concurrency"]) + qgpu_repeat_count = len(qgpu_concurrency_rows) + else: + if not qgpu_times: + return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows + qgpu_wall = median(qgpu_times) + qgpu_ns_day = ns_per_day(args.steps, qgpu_wall) + qgpu_best_concurrency = 1 + qgpu_repeat_count = len(qgpu_records) + + if not args.gpu_only and not fortran_times: + return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows fortran_wall = median(fortran_times) if fortran_times else None - qgpu_wall = median(qgpu_times) + fortran_ns_day = ns_per_day(args.steps, fortran_wall) if fortran_wall is not None else None row = { "test": test_name, "atoms": atoms, "steps": args.steps, "fortran_wall_median_s": fortran_wall if fortran_wall is not None else "", "qgpu_wall_median_s": qgpu_wall, - "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall) if fortran_wall is not None else "", - "qgpu_ns_per_day": ns_per_day(args.steps, qgpu_wall), - "speedup_x": fortran_wall / qgpu_wall if fortran_wall is not None and qgpu_wall > 0 else "", + "fortran_ns_per_day": fortran_ns_day if fortran_ns_day is not None else "", + "qgpu_ns_per_day": qgpu_ns_day, + "qgpu_best_concurrency": qgpu_best_concurrency, + "speedup_x": qgpu_ns_day / fortran_ns_day if fortran_ns_day is not None and fortran_ns_day > 0 else "", "fortran_repeats": len(fortran_records), - "qgpu_repeats": len(qgpu_records), + "qgpu_repeats": qgpu_repeat_count, } - return row, [*fortran_records, *qgpu_records] + return row, [*fortran_records, *qgpu_records], qgpu_concurrency_rows def collect(args): @@ -173,11 +248,17 @@ def collect(args): rows = [] raw_records = [] + qgpu_concurrency_records = [] try: for test_name in args.test: - row, records = collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin) + row, records, concurrency_records = collect_one_test( + args, test_name, out_dir, fortran_bin, prep_fortran_bin, 
qgpu_bin + ) raw_records.extend(records) + qgpu_concurrency_records.extend(concurrency_records) write_raw_records(raw_records, out_dir) + if args.concurrency: + write_qgpu_concurrency_records(qgpu_concurrency_records, out_dir) if row is not None: rows.append(row) write_summary( @@ -189,6 +270,7 @@ def collect(args): "steps": args.steps, "repeat": args.repeat, "gpu_only": args.gpu_only, + "concurrency": args.concurrency, "fortran_bin": str(fortran_bin) if fortran_bin is not None else None, "prep_fortran_bin": str(prep_fortran_bin), "qgpu_bin": str(qgpu_bin), @@ -196,6 +278,9 @@ def collect(args): ) finally: raw_path = write_raw_records(raw_records, out_dir) + concurrency_path = ( + write_qgpu_concurrency_records(qgpu_concurrency_records, out_dir) if args.concurrency else None + ) failures = [record for record in raw_records if int(record["return_code"]) != 0] if failures: @@ -214,6 +299,7 @@ def collect(args): "steps": args.steps, "repeat": args.repeat, "gpu_only": args.gpu_only, + "concurrency": args.concurrency, "fortran_bin": str(fortran_bin) if fortran_bin is not None else None, "prep_fortran_bin": str(prep_fortran_bin), "qgpu_bin": str(qgpu_bin), @@ -221,6 +307,8 @@ def collect(args): ) print(f"Summary CSV: {summary_csv}") print(f"Raw CSV: {raw_path}") + if concurrency_path is not None: + print(f"QGPU concurrency CSV: {concurrency_path}") print(f"Metadata JSON: {meta_json}") return 0 @@ -238,9 +326,10 @@ def load_rows(csv_path): "qgpu_wall_median_s", "fortran_ns_per_day", "qgpu_ns_per_day", + "qgpu_best_concurrency", "speedup_x", ]: - parsed[key] = parse_optional_float(parsed[key]) + parsed[key] = parse_optional_float(parsed.get(key)) rows.append(parsed) if not rows: raise RuntimeError(f"No rows found in {csv_path}") @@ -321,15 +410,17 @@ def plot_nsday(rows, out_path, title): fortran = [row["fortran_ns_per_day"] for row in rows] qgpu = [row["qgpu_ns_per_day"] for row in rows] has_fortran = any(math.isfinite(value) for value in fortran) + has_concurrency = 
any(math.isfinite(row["qgpu_best_concurrency"]) and row["qgpu_best_concurrency"] > 1 for row in rows) + qgpu_label = "QGPU best total" if has_concurrency else "QGPU" if has_fortran: bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b") - bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8") + bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label=qgpu_label, color="#0b71c8") annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}") else: - bars_gpu = ax.bar(x, qgpu, width * 1.55, label="QGPU", color="#0b71c8") + bars_gpu = ax.bar(x, qgpu, width * 1.55, label=qgpu_label, color="#0b71c8") annotate_bars(ax, bars_gpu, lambda value: f"{value:.1f}") ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f") - ax.set_ylabel("ns/day") + ax.set_ylabel("Best total ns/day" if has_concurrency else "ns/day") ax.set_xticks(x) ax.set_xticklabels(labels) ax.grid(axis="y", color="#e5e8ee", linewidth=0.8) @@ -402,6 +493,12 @@ def parse_args(): collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. 
eq5.") collect_parser.add_argument("--shake", action="store_true", help="Enable shake.") collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per runner per system.") + collect_parser.add_argument( + "--concurrency", + type=positive_int, + nargs="+", + help="Concurrent QGPU instance counts to sweep; summary uses the maximum total ns/day.", + ) collect_parser.add_argument( "--gpu-only", action="store_true", From 575db10d762d73d6d1eb77795b996f10bb6c4a7c Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 12:52:34 +0200 Subject: [PATCH 08/20] remove files --- benchmark-qgpu/benchmark_system_scaling.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py index b76aa130..650c3d4c 100644 --- a/benchmark-qgpu/benchmark_system_scaling.py +++ b/benchmark-qgpu/benchmark_system_scaling.py @@ -5,6 +5,7 @@ import json import math import os +import shutil import sys from datetime import datetime from pathlib import Path @@ -128,6 +129,13 @@ def write_summary(rows, out_dir, metadata): return summary_csv, meta_json +def cleanup_test_artifacts(out_dir, test_name): + test_dir = Path(out_dir) / test_name + if test_dir.exists(): + shutil.rmtree(test_dir) + print(f"Removed intermediate run data: {test_dir}") + + def run_qgpu_concurrency_sweep(args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir): batch_rows = [] process_rows = [] @@ -152,6 +160,8 @@ def run_qgpu_concurrency_sweep(args, test_name, qgpu_bin, prepared_data_dir, qgp f"{batch_row['failed_processes']} QGPU process(es) failed for {test_name} " f"at concurrency {concurrency}, repeat {repeat}. 
Logs are under {run_dir}" ) + if not args.keep_run_data: + shutil.rmtree(run_dir) return batch_rows, process_rows @@ -271,11 +281,14 @@ def collect(args): "repeat": args.repeat, "gpu_only": args.gpu_only, "concurrency": args.concurrency, + "keep_run_data": args.keep_run_data, "fortran_bin": str(fortran_bin) if fortran_bin is not None else None, "prep_fortran_bin": str(prep_fortran_bin), "qgpu_bin": str(qgpu_bin), }, ) + if not args.keep_run_data: + cleanup_test_artifacts(out_dir, test_name) finally: raw_path = write_raw_records(raw_records, out_dir) concurrency_path = ( @@ -300,6 +313,7 @@ def collect(args): "repeat": args.repeat, "gpu_only": args.gpu_only, "concurrency": args.concurrency, + "keep_run_data": args.keep_run_data, "fortran_bin": str(fortran_bin) if fortran_bin is not None else None, "prep_fortran_bin": str(prep_fortran_bin), "qgpu_bin": str(qgpu_bin), @@ -504,6 +518,11 @@ def parse_args(): action="store_true", help="Skip timed Fortran qdyn runs and collect only QGPU performance.", ) + collect_parser.add_argument( + "--keep-run-data", + action="store_true", + help="Keep per-test run directories and logs. 
By default successful intermediate data is deleted.", + ) collect_parser.add_argument("--out", help="Output directory.") collect_parser.add_argument( "--fortran-bin", From 4cf9e0e47bc35f4d81ba7cdeea642bcbce3f5f2e Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 15:25:54 +0200 Subject: [PATCH 09/20] support specify concurrency --- benchmark-qgpu/benchmark_nsday.py | 22 ++++++++++++++++++---- benchmark-qgpu/benchmark_report.html.j2 | 10 +++++----- benchmark-qgpu/benchmark_report.py | 6 +++--- benchmark-qgpu/benchmark_run.py | 10 +++++++--- benchmark-qgpu/main.py | 20 +++++++++++++++++--- 5 files changed, 50 insertions(+), 18 deletions(-) diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py index 6a7c37b1..71ddbe9b 100644 --- a/benchmark-qgpu/benchmark_nsday.py +++ b/benchmark-qgpu/benchmark_nsday.py @@ -346,12 +346,24 @@ def plot(args): ax.plot(item["xs"], item["ys"], marker="o", linewidth=1.8, markersize=4.5, color=color, label=item["label"]) for x, y in zip(item["xs"], item["ys"]): all_points.append((y, item["label"], x)) + if len(item["xs"]) == 1: + x_offset = 0 + ha = "center" + elif x == item["xs"][0]: + x_offset = 6 + ha = "left" + elif x == item["xs"][-1]: + x_offset = -6 + ha = "right" + else: + x_offset = 0 + ha = "center" ax.annotate( f"{y:.1f}", xy=(x, y), - xytext=(0, 6), + xytext=(x_offset, 6), textcoords="offset points", - ha="center", + ha=ha, va="bottom", fontsize=8, weight="bold", @@ -368,7 +380,8 @@ def plot(args): if len(all_xs) == 1: ax.set_xlim(all_xs[0] - 0.5, all_xs[0] + 0.5) else: - ax.set_xlim(all_xs[0] - 0.1, all_xs[-1] + 0.1) + x_pad = max((all_xs[-1] - all_xs[0]) * 0.06, 0.25) + ax.set_xlim(all_xs[0] - x_pad, all_xs[-1] + x_pad) ax.text(0.0, 1.14, args.title, transform=ax.transAxes, fontsize=13, weight="bold", color="#0f5f18") ax.text(0.0, 1.07, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142") @@ -378,7 +391,8 @@ def plot(args): ax.set_axisbelow(True) 
ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) - ax.legend(frameon=False, loc="upper right", fontsize=8) + if len(series) > 1: + ax.legend(frameon=False, loc="upper left", bbox_to_anchor=(0.0, 1.01), ncols=2, fontsize=8) best_points = sorted(all_points, reverse=True) best = best_points[0] diff --git a/benchmark-qgpu/benchmark_report.html.j2 b/benchmark-qgpu/benchmark_report.html.j2 index 61c3a634..6cae93c7 100644 --- a/benchmark-qgpu/benchmark_report.html.j2 +++ b/benchmark-qgpu/benchmark_report.html.j2 @@ -23,9 +23,9 @@ Logs root: {{ logs_root }}
-

Simulation performance (ns/day)

+

Simulation throughput (total ns/day)

-
Simulated nanoseconds per wall-clock day.
+
Total simulated nanoseconds per wall-clock day across all concurrent processes.
@@ -209,8 +209,8 @@ function lineChart(containerId, cfg) { }); lineChart("nsday-chart", { - xLabel: "Processes", yLabel: "ns/day", xs: payload.procs, - series: [{ label: "Performance (ns/day)", ys: payload.ns_per_day }] + xLabel: "Processes", yLabel: "total ns/day", xs: payload.procs, + series: [{ label: "Total throughput (ns/day)", ys: payload.ns_per_day }] }); @@ -233,7 +233,7 @@ function lineChart(containerId, cfg) { GPU util mean (%)GPU util peak (%) VRAM util mean (%)VRAM util peak (%) Speedup (×) - ns/day + Total ns/day `; table.appendChild(thead); diff --git a/benchmark-qgpu/benchmark_report.py b/benchmark-qgpu/benchmark_report.py index 2d393147..c5a85fb4 100644 --- a/benchmark-qgpu/benchmark_report.py +++ b/benchmark-qgpu/benchmark_report.py @@ -85,6 +85,7 @@ def fget(k): mem_means = [v["mem_util_mean"] for v in vals if math.isfinite(v["mem_util_mean"])] mem_peaks = [v["mem_util_peak"] for v in vals if math.isfinite(v["mem_util_peak"])] tmp_ns_per_day = [v["ns_per_day"] for v in vals if math.isfinite(v["ns_per_day"])] + total_ns_per_day = statistics.mean(tmp_ns_per_day) * p if tmp_ns_per_day else float("nan") rc_bad = sum(1 for v in vals if v["rc"] != 0) @@ -99,7 +100,7 @@ def fget(k): gpu_util_peak.append(statistics.mean(util_peak) if util_peak else float("nan")) util_mem_mean.append(statistics.mean(mem_means) if mem_means else float("nan")) util_mem_peak.append(statistics.mean(mem_peaks) if mem_peaks else float("nan")) - ns_per_day.append(statistics.mean(tmp_ns_per_day) if tmp_ns_per_day else float("nan")) + ns_per_day.append(total_ns_per_day) Tn = max(walls) if walls else float("nan") @@ -126,7 +127,7 @@ def fget(k): "vram_util_peak": statistics.mean(mem_peaks) if mem_peaks else float("nan"), "Tn": Tn, "speedup": speedup, - "ns_per_day": statistics.mean(tmp_ns_per_day) if tmp_ns_per_day else float("nan"), + "ns_per_day": total_ns_per_day, }) @@ -156,4 +157,3 @@ def fget(k): with open(out_html, "w", encoding="utf-8") as f: f.write(html_out) print(f"Report 
written to: {out_html}") - diff --git a/benchmark-qgpu/benchmark_run.py b/benchmark-qgpu/benchmark_run.py index b119423b..7e0c396d 100644 --- a/benchmark-qgpu/benchmark_run.py +++ b/benchmark-qgpu/benchmark_run.py @@ -350,7 +350,12 @@ def _get(d, dotted, default=None): def run(args): data_dir = os.path.expanduser(args.data_dir) # e.g., TEST/water bin_path = os.path.expanduser(args.bin) # e.g., /path/to/qdyn - max_procs = int(args.max_processes) + if getattr(args, "concurrency", None): + concurrency = sorted(dict.fromkeys(int(value) for value in args.concurrency)) + elif args.max_processes is not None: + concurrency = list(range(1, int(args.max_processes) + 1)) + else: + raise ValueError("Pass --concurrency or --max_processes.") if not os.path.isdir(data_dir): raise FileNotFoundError(f"data_dir not found: {data_dir}") @@ -386,7 +391,7 @@ def run(args): os.makedirs(logs_dir, exist_ok=True) work(1, logs_dir, f'"{bin_path}" "{data_dir}"', steps) - for process_num in range(1, max_procs + 1): + for process_num in concurrency: print(f"Will run {process_num} processes in parallel:") logs_dir = os.path.join(current_dir, f"benchmark_logs/{process_num:02d}_procs") os.makedirs(logs_dir, exist_ok=True) @@ -405,4 +410,3 @@ def run(args): # generate report out_html = os.path.join(current_dir, "benchmark_report.html") make_html_report(logs_root, out_html) - diff --git a/benchmark-qgpu/main.py b/benchmark-qgpu/main.py index f379f021..4d4b6f96 100644 --- a/benchmark-qgpu/main.py +++ b/benchmark-qgpu/main.py @@ -1,5 +1,11 @@ import argparse -from benchmark_run import run + + +def positive_int(value): + parsed = int(value) + if parsed < 1: + raise argparse.ArgumentTypeError("must be >= 1") + return parsed if __name__ == "__main__": @@ -7,8 +13,16 @@ parser.add_argument('--data_dir', type=str, help='Directory containing a single test case.') parser.add_argument('--bin', type=str, help='Path to the Qdyn GPU executable.') - parser.add_argument('--max_processes', type=int, help='Max 
number of parallel processes to run.') + parser.add_argument('--max_processes', type=positive_int, help='Max number of parallel processes to run.') + parser.add_argument( + '--concurrency', + type=positive_int, + nargs='+', + help='Specific parallel process counts to run, e.g. --concurrency 1 2 3 4 5 10 15 20.', + ) args = parser.parse_args() - run(args) \ No newline at end of file + from benchmark_run import run + + run(args) From 1db80d76eab89e6e0229911ef4a302b66e47faaa Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 15:53:48 +0200 Subject: [PATCH 10/20] support specify fortran files --- benchmark-qgpu/benchmark_correctness.py | 69 ++++++++++++++++++++----- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/benchmark-qgpu/benchmark_correctness.py b/benchmark-qgpu/benchmark_correctness.py index 07f25046..d30f321d 100644 --- a/benchmark-qgpu/benchmark_correctness.py +++ b/benchmark-qgpu/benchmark_correctness.py @@ -83,6 +83,38 @@ def load_fortran_energy(fortran_dir): return json.load(json_f), q_data_path +def find_prepared_qgpu_dir(reference_dir): + prepare_root = Path(reference_dir) / "qgpu_prepare" / "TEST" + if not prepare_root.is_dir(): + raise FileNotFoundError(f"Prepared QGPU TEST directory not found: {prepare_root}") + candidates = sorted(path for path in prepare_root.iterdir() if path.is_dir() and (path / "md.csv").exists()) + if len(candidates) != 1: + shown = ", ".join(str(path) for path in candidates) + raise RuntimeError(f"Expected exactly one prepared QGPU directory under {prepare_root}; found: {shown}") + return candidates[0] + + +def copy_reference_inputs(reference_dir, out_dir): + reference_dir = Path(reference_dir).expanduser().resolve() + source_fortran_dir = reference_dir / "fortran_reference" + if not (source_fortran_dir / "Q_data.json").exists(): + raise FileNotFoundError(f"Fortran reference Q_data.json not found: {source_fortran_dir / 'Q_data.json'}") + + source_prepared_dir = find_prepared_qgpu_dir(reference_dir) + 
fortran_dir = out_dir / "fortran_reference" + prep_dir = out_dir / "qgpu_prepare" + prepared_data_dir = prep_dir / "TEST" / source_prepared_dir.name + + if fortran_dir.exists(): + shutil.rmtree(fortran_dir) + if prep_dir.exists(): + shutil.rmtree(prep_dir) + + shutil.copytree(source_fortran_dir, fortran_dir) + shutil.copytree(source_prepared_dir, prepared_data_dir) + return fortran_dir, prep_dir, prepared_data_dir, reference_dir + + def build_correctness_rows(fortran_data, qgpu_data, tolerance): compare.ENERGY_TOLERANCE = tolerance rows = [] @@ -182,20 +214,27 @@ def collect(args): out_dir.mkdir(parents=True, exist_ok=True) qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) - prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin) - data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake) - - fortran_dir = out_dir / "fortran_reference" - prep_dir = out_dir / "qgpu_prepare" qgpu_run_dir = out_dir / "qgpu_run" - fortran_dir.mkdir(parents=True, exist_ok=True) - print(f"Preparing Fortran reference for {args.test}") - write_md_input(data, fortran_dir) - prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) + if args.reference_dir: + print(f"Reusing Fortran/QGPU prepared reference from {args.reference_dir}") + fortran_dir, prep_dir, prepared_data_dir, reference_dir = copy_reference_inputs(args.reference_dir, out_dir) + prep_fortran_bin = None + else: + prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin) + data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake) + + fortran_dir = out_dir / "fortran_reference" + prep_dir = out_dir / "qgpu_prepare" + fortran_dir.mkdir(parents=True, exist_ok=True) - print("Preparing QGPU input") - prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) + print(f"Preparing Fortran reference for {args.test}") + write_md_input(data, fortran_dir) + prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) + + print("Preparing QGPU input") + prepared_data_dir = 
prepare_qgpu_input(data, fortran_dir, prep_dir) + reference_dir = None print("Running QGPU correctness simulation") qgpu_data_dir, qgpu_run = run_qgpu_once(qgpu_bin, prepared_data_dir, qgpu_run_dir) @@ -215,7 +254,9 @@ def collect(args): "lambda": args.lambda_name, "shake": args.shake, "qgpu_bin": str(qgpu_bin), - "prep_fortran_bin": str(prep_fortran_bin), + "prep_fortran_bin": str(prep_fortran_bin) if prep_fortran_bin is not None else None, + "reference_dir": str(reference_dir) if reference_dir is not None else None, + "prepared_qgpu_input": str(prepared_data_dir), "fortran_energy": str(fortran_energy_path), "qgpu_energy": str(qgpu_energy_path), "qgpu_run": qgpu_run, @@ -364,6 +405,10 @@ def parse_args(): collect_parser.add_argument("--shake", action="store_true", help="Enable shake.") collect_parser.add_argument("--out", help="Output directory.") collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.") + collect_parser.add_argument( + "--reference-dir", + help="Existing correctness result directory containing fortran_reference/ and qgpu_prepare/ to reuse.", + ) collect_parser.add_argument( "--prep-fortran-bin", default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"), From a73e56a112ff821e3dd29a9bad126caa4b0efd3e Mon Sep 17 00:00:00 2001 From: shen Date: Tue, 28 Apr 2026 15:56:07 +0200 Subject: [PATCH 11/20] support spfp --- src/core/Makefile | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/core/Makefile b/src/core/Makefile index 367be528..0b607d06 100644 --- a/src/core/Makefile +++ b/src/core/Makefile @@ -1,12 +1,22 @@ CC = nvcc -CFLAGS = -O3 -std=c++17 -arch=sm_89 -I./cuda/include -I./common/include -I./cpu/include -I. +CFLAGS = -O3 -std=c++17 -arch=sm_86 -I./cuda/include -I./common/include -I./cpu/include -I. 
DEPFLAGS = -MMD -MF $(@:.o=.d) +QDYN_SPFP ?= 0 + +ifneq ($(filter 1 true TRUE yes YES on ON,$(QDYN_SPFP)),) +CFLAGS += -DQDYN_SPFP +PRECISION = spfp +else +PRECISION = dpfp +endif + +BUILD_DIR = build/$(PRECISION) # collect all .cu files except main.cu SRCS = $(filter-out main.cu, $(wildcard *.cu cuda/src/*.cu)) CPPSRCS = $(wildcard common/*.cpp common/src/*.cpp cpu/*.cpp cpu/src/*.cpp) -MAIN_OBJ = main.o -OBJS = $(SRCS:.cu=.o) $(CPPSRCS:.cpp=.o) +MAIN_OBJ = $(BUILD_DIR)/main.cu.o +OBJS = $(addprefix $(BUILD_DIR)/,$(SRCS:.cu=.cu.o)) $(addprefix $(BUILD_DIR)/,$(CPPSRCS:.cpp=.cpp.o)) DEPS = $(MAIN_OBJ:.o=.d) $(OBJS:.o=.d) all: qdyn move @@ -14,13 +24,16 @@ all: qdyn move qdyn: $(MAIN_OBJ) $(OBJS) $(CC) $(CFLAGS) -o $@ $^ -%.o: %.cu +$(BUILD_DIR)/%.cu.o: %.cu + mkdir -p $(dir $@) $(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@ -%.o: %.cpp +$(BUILD_DIR)/%.cpp.o: %.cpp + mkdir -p $(dir $@) $(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@ clean: + rm -rf build rm -f *.o *.d cuda/src/*.o cuda/src/*.d common/*.o common/*.d common/src/*.o common/src/*.d cpu/*.o cpu/*.d cpu/src/*.o cpu/src/*.d ../../bin/qdyn move: From bc0cb51b066cd38ec2a4ae0796634d68a502be38 Mon Sep 17 00:00:00 2001 From: shen Date: Wed, 29 Apr 2026 16:35:23 +0200 Subject: [PATCH 12/20] support mpi --- benchmark-qgpu/benchmark_test.py | 191 ++++++++++++++++++++++++++++--- 1 file changed, 172 insertions(+), 19 deletions(-) diff --git a/benchmark-qgpu/benchmark_test.py b/benchmark-qgpu/benchmark_test.py index a30695f0..afefcfbc 100644 --- a/benchmark-qgpu/benchmark_test.py +++ b/benchmark-qgpu/benchmark_test.py @@ -52,6 +52,21 @@ def command_text(args): return " ".join(shlex.quote(str(arg)) for arg in args) +def split_mpirun_args(args): + if args is None: + return [] + if isinstance(args, str): + return shlex.split(args) + return [str(arg) for arg in args] + + +def build_fortran_command(fortran_bin, input_file, mpi_procs=None, mpirun_bin="mpirun", mpirun_args=None): + command = [str(fortran_bin), input_file] + if mpi_procs is 
None: + return command + return [str(mpirun_bin), "-np", str(mpi_procs), *split_mpirun_args(mpirun_args), *command] + + def resolve_qgpu_bin(path): if path: candidate = Path(path).expanduser() @@ -149,7 +164,16 @@ def write_md_input(data, fortran_dir): runTEST.create_MD_input(data) -def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps): +def run_fortran_repeats( + data, + fortran_bin, + fortran_dir, + repeat, + steps, + mpi_procs=None, + mpirun_bin="mpirun", + mpirun_args=None, +): records = [] saw_success = False @@ -158,7 +182,13 @@ def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps): stderr_name = "fortran.err" if repeat == 1 else f"fortran_{index}.err" stdout_path = fortran_dir / stdout_name stderr_path = fortran_dir / stderr_name - args = [str(fortran_bin), "eq1.inp"] + args = build_fortran_command( + fortran_bin, + "eq1.inp", + mpi_procs=mpi_procs, + mpirun_bin=mpirun_bin, + mpirun_args=mpirun_args, + ) return_code, wall_seconds = run_timed(args, fortran_dir, stdout_path, stderr_path) if return_code == 0: saw_success = True @@ -180,20 +210,33 @@ def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps): return records, saw_success -def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir): +def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir, prep_steps=None): + input_path = fortran_dir / "eq1.inp" + original_input = input_path.read_text(encoding="utf-8") + parse_data = data + if prep_steps is not None: + prep_data = dict(data) + prep_data["timestep"] = str(prep_steps) + write_md_input(prep_data, fortran_dir) + parse_data = prep_data + stdout_path = fortran_dir / "restart_prep_qdyn_test.log" stderr_path = fortran_dir / "restart_prep_qdyn_test.err" args = [str(prep_fortran_bin), "eq1.inp"] - return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path) - if return_code != 0: - raise RuntimeError( - "QGPU restart preparation failed. 
" - f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}" - ) + try: + return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path) + if return_code != 0: + raise RuntimeError( + "QGPU restart preparation failed. " + f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}" + ) - shutil.copyfile(stdout_path, fortran_dir / "eq1.log") - with pushd(fortran_dir): - runTEST.Parse_Q6_data(data) + shutil.copyfile(stdout_path, fortran_dir / "eq1.log") + with pushd(fortran_dir): + runTEST.Parse_Q6_data(parse_data) + finally: + if prep_steps is not None: + input_path.write_text(original_input, encoding="utf-8") def prepare_qgpu_input(data, fortran_dir, prep_dir): @@ -277,6 +320,23 @@ def write_summary_csv(records, out_dir): return csv_path +def read_summary_csv(csv_path): + records = [] + with open(csv_path, newline="", encoding="utf-8") as csv_f: + reader = csv.DictReader(csv_f) + for row in reader: + parsed = dict(row) + parsed["repeat"] = int(parsed["repeat"]) + parsed["return_code"] = int(parsed["return_code"]) + parsed["wall_seconds"] = float(parsed["wall_seconds"]) + parsed["steps"] = int(parsed["steps"]) + parsed["ns_per_day"] = float(parsed["ns_per_day"]) if parsed.get("ns_per_day") else None + records.append(parsed) + if not records: + raise RuntimeError(f"No records found in {csv_path}") + return records + + def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin): by_test = {} for record in records: @@ -286,8 +346,8 @@ def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin): for test_name in sorted(by_test): fortran_records = by_test[test_name].get("fortran", []) qgpu_records = by_test[test_name].get("qgpu", []) - fortran_ok = [r["wall_seconds"] for r in fortran_records if r["return_code"] == 0] - qgpu_ok = [r["wall_seconds"] for r in qgpu_records if r["return_code"] == 0] + fortran_ok = [float(r["wall_seconds"]) for r in fortran_records if int(r["return_code"]) == 0] + qgpu_ok 
= [float(r["wall_seconds"]) for r in qgpu_records if int(r["return_code"]) == 0] if not fortran_ok or not qgpu_ok: continue fortran_median = median(fortran_ok) @@ -313,6 +373,10 @@ def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin): "lambda": args.lambda_name, "shake": args.shake, "repeat": args.repeat, + "restart_prep_steps": getattr(args, "restart_prep_steps", None), + "fortran_mpi_procs": getattr(args, "fortran_mpi_procs", None), + "mpirun_bin": getattr(args, "mpirun_bin", None), + "mpirun_args": getattr(args, "mpirun_args", None), }, "binaries": { "fortran": str(fortran_bin), @@ -323,6 +387,17 @@ def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin): } +def summarize_for_plot(records): + args = argparse.Namespace( + test=sorted({record["test"] for record in records}), + steps=sorted({int(record["steps"]) for record in records}), + lambda_name=None, + shake=None, + repeat=None, + ) + return summarize(records, args, qgpu_bin="", fortran_bin="", prep_fortran_bin="") + + def write_summary_json(summary, out_dir): json_path = out_dir / "summary.json" with open(json_path, "w", encoding="utf-8") as json_f: @@ -402,6 +477,28 @@ def plot_speedup(summary, out_dir): return png_path +def plot_summary_csv(args): + csv_path = Path(args.csv).expanduser().resolve() + records = read_summary_csv(csv_path) + summary = summarize_for_plot(records) + if args.out: + out_path = Path(args.out).expanduser().resolve() + out_dir = out_path.parent + out_dir.mkdir(parents=True, exist_ok=True) + png_path = plot_speedup(summary, out_dir) + if png_path != out_path: + if out_path.exists(): + out_path.unlink() + png_path.rename(out_path) + png_path = out_path + else: + png_path = plot_speedup(summary, csv_path.parent) + if png_path is None: + raise RuntimeError("No successful Fortran/QGPU pairs found to plot.") + print(f"Speedup plot: {png_path}") + return 0 + + def default_out_dir(test_names): stamp = datetime.now().strftime("%Y%m%d_%H%M%S") label = 
test_names[0] if len(test_names) == 1 else "multi" @@ -409,6 +506,13 @@ def default_out_dir(test_names): def parse_args(): + if len(sys.argv) > 1 and sys.argv[1] == "plot": + parser = argparse.ArgumentParser(description="Plot benchmark_test.py speedup from an existing summary.csv.") + parser.add_argument("command", choices=["plot"]) + parser.add_argument("csv", help="summary.csv written by benchmark_test.py.") + parser.add_argument("--out", help="Output PNG path. Defaults to speedup.png next to the CSV.") + return parser.parse_args() + parser = argparse.ArgumentParser(description="Benchmark Fortran vs QGPU for runTEST.py test cases.") parser.add_argument("--test", nargs="+", help="Test name(s) from test/runTEST.py.") parser.add_argument("--list-tests", action="store_true", help="List available tests and exit.") @@ -417,10 +521,32 @@ def parse_args(): parser.add_argument("--shake", action="store_true", help="Enable shake in generated MD input.") parser.add_argument("--repeat", type=int, default=1, help="Number of repeats for each runner.") parser.add_argument("--out", default=None, help="Output directory.") + parser.add_argument( + "--restart-prep-steps", + type=int, + default=1, + help="MD steps used only for qdyn_test restart preparation. Defaults to 1.", + ) parser.add_argument( "--fortran-bin", default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"), - help="Path to production Fortran qdyn binary used for timed Fortran runs.", + help="Path to production Fortran qdyn/qdynp binary used for timed Fortran runs.", + ) + parser.add_argument( + "--fortran-mpi-procs", + type=int, + default=None, + help="Run the timed Fortran binary through mpirun with this many MPI ranks.", + ) + parser.add_argument( + "--mpirun-bin", + default="mpirun", + help="MPI launcher to use with --fortran-mpi-procs. Defaults to mpirun.", + ) + parser.add_argument( + "--mpirun-args", + default=None, + help='Extra MPI launcher arguments, quoted as one string, e.g. 
"--bind-to core".', ) parser.add_argument( "--prep-fortran-bin", @@ -432,6 +558,8 @@ def parse_args(): def validate_args(args): + if getattr(args, "command", None) == "plot": + return if args.list_tests: return if not args.test: @@ -442,12 +570,19 @@ def validate_args(args): raise SystemExit("--steps must be >= 1.") if args.repeat < 1: raise SystemExit("--repeat must be >= 1.") + if args.restart_prep_steps < 1: + raise SystemExit("--restart-prep-steps must be >= 1.") + if args.fortran_mpi_procs is not None and args.fortran_mpi_procs < 1: + raise SystemExit("--fortran-mpi-procs must be >= 1.") def main(): args = parse_args() validate_args(args) + if getattr(args, "command", None) == "plot": + return plot_summary_csv(args) + testinfo = runTEST.get_default_testinfo() if args.list_tests: for test_name in sorted(testinfo): @@ -473,16 +608,34 @@ def main(): print(f"Preparing Fortran input for {test_name} in {fortran_dir}") write_md_input(data, fortran_dir) - print(f"Running Fortran for {test_name} ({args.repeat} repeat(s))") + if args.fortran_mpi_procs is None: + print(f"Running Fortran for {test_name} ({args.repeat} repeat(s))") + else: + print( + f"Running Fortran for {test_name} with {args.fortran_mpi_procs} MPI rank(s) " + f"({args.repeat} repeat(s))" + ) fortran_records, fortran_ok = run_fortran_repeats( - data, fortran_bin, fortran_dir, args.repeat, args.steps + data, + fortran_bin, + fortran_dir, + args.repeat, + args.steps, + mpi_procs=args.fortran_mpi_procs, + mpirun_bin=args.mpirun_bin, + mpirun_args=args.mpirun_args, ) all_records.extend(fortran_records) if not fortran_ok: continue - print(f"Preparing QGPU restart with qdyn_test for {test_name}") - prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) + print(f"Preparing QGPU restart with qdyn_test for {test_name} ({args.restart_prep_steps} step(s))") + prepare_restart_with_qdyn_test( + data, + prep_fortran_bin, + fortran_dir, + prep_steps=args.restart_prep_steps, + ) print(f"Preparing QGPU CSV 
input for {test_name}") prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) From e95b427fbb87b9323aa5ee31efd6c73af23d64fb Mon Sep 17 00:00:00 2001 From: shen Date: Wed, 29 Apr 2026 17:12:23 +0200 Subject: [PATCH 13/20] support mixed precision --- src/core/common/include/precision.h | 6 ++ src/core/common/src/handler.cpp | 5 ++ src/core/cuda/src/cuda_improper2_force.cu | 4 +- src/core/cuda/src/cuda_leapfrog.cu | 35 +++++---- src/core/cuda/src/cuda_nonbonded_14_force.cu | 74 +++++++++++++------- src/core/cuda/src/cuda_nonbonded_force.cu | 64 ++++++++++------- src/core/cuda/src/cuda_polx_water_force.cu | 13 ++-- src/core/cuda/src/cuda_pshell_force.cu | 2 +- src/core/cuda/src/cuda_radix_water_force.cu | 9 ++- src/core/cuda/src/cuda_restrang_force.cu | 6 +- src/core/cuda/src/cuda_restrdis_force.cu | 4 +- src/core/cuda/src/cuda_restrpos_force.cu | 6 +- src/core/cuda/src/cuda_restrseq_force.cu | 6 +- src/core/cuda/src/cuda_restrwall_force.cu | 4 +- src/core/cuda/src/cuda_shake_constraints.cu | 5 +- src/core/cuda/src/cuda_temperature.cu | 5 +- src/core/cuda/src/cuda_torsion_force.cu | 4 +- 17 files changed, 155 insertions(+), 97 deletions(-) diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h index f15fc6ca..fc633f45 100644 --- a/src/core/common/include/precision.h +++ b/src/core/common/include/precision.h @@ -2,6 +2,12 @@ #ifdef QDYN_SPFP using real_t = float; +using nonbond_work_t = float; #else using real_t = double; +using nonbond_work_t = double; #endif + +using energy_accum_t = double; +using force_accum_t = double; +using constraint_work_t = double; diff --git a/src/core/common/src/handler.cpp b/src/core/common/src/handler.cpp index 3fdd1341..b462b2c7 100644 --- a/src/core/common/src/handler.cpp +++ b/src/core/common/src/handler.cpp @@ -88,6 +88,11 @@ void Handler::update_energy_totals() { } void Handler::print_outputs(int iteration) { + auto& host = Context::instance(); + if (host.run_gpu && host.md.trajectory != 0 
&& iteration % host.md.trajectory == 0) { + host.coords->download(); + host.velocities->download(); + } print_energies(); write_coords(iteration); write_velocities(iteration); diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu index e44678e0..78707b12 100644 --- a/src/core/cuda/src/cuda_improper2_force.cu +++ b/src/core/cuda/src/cuda_improper2_force.cu @@ -51,8 +51,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z; rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x; - bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2)); - bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + pow(rnk.z, 2)); + bj2inv = 1 / (rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z); + bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z); bjinv = sqrt(bj2inv); bkinv = sqrt(bk2inv); diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu index 49312337..1e010f7e 100644 --- a/src/core/cuda/src/cuda_leapfrog.cu +++ b/src/core/cuda/src/cuda_leapfrog.cu @@ -45,6 +45,20 @@ __global__ void calc_leapfrog_kernel( coords[i].z += velocities[i].z * dt; } +__global__ void update_velocities_from_positions_kernel( + vel_t* velocities, + const coord_t* coords, + const coord_t* xcoords, + int n_atoms, + double dt) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n_atoms) return; + + velocities[idx].x = (coords[idx].x - xcoords[idx].x) / dt; + velocities[idx].y = (coords[idx].y - xcoords[idx].y) / dt; + velocities[idx].z = (coords[idx].z - xcoords[idx].z) / dt; +} + void calc_leapfrog_host() { auto& host = Context::instance(); auto d_atypes = host.atypes->gpu_data_p; @@ -70,24 +84,17 @@ void calc_leapfrog_host() { host.dt); check_cuda(cudaDeviceSynchronize()); - host.velocities->download(); - host.dvelocities->download(); - host.coords->download(); - host.xcoords->download(); - // shake - // todo: Here is some problem, it writes into cpu memory, but 
we use gpu.. printf("n_shake_constraints: %d\n", host.n_shake_constraints); if (host.n_shake_constraints > 0) { calc_shake_constraints_host(); - auto &velocities = host.velocities->cpu_data_p; - auto &coords = host.coords->cpu_data_p; - auto *xcoords = host.xcoords->cpu_data_p; - for (int i = 0; i < host.n_atoms; i++) { - velocities[i].x = (coords[i].x - xcoords[i].x) / host.dt; - velocities[i].y = (coords[i].y - xcoords[i].y) / host.dt; - velocities[i].z = (coords[i].z - xcoords[i].z) / host.dt; - } + update_velocities_from_positions_kernel<<>>( + d_velocities, + d_coords, + d_xcoords, + host.n_atoms, + host.dt); + check_cuda(cudaDeviceSynchronize()); } } diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu index a33bb695..78c4bc91 100644 --- a/src/core/cuda/src/cuda_nonbonded_14_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu @@ -12,6 +12,14 @@ int* d_atom_to_qi = nullptr; double* d_evdw_totals = nullptr; double* d_ecoul_totals = nullptr; +__device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) { +#ifdef QDYN_SPFP + return rsqrtf(value); +#else + return rsqrt(value); +#endif +} + __device__ __forceinline__ int unified_parameter_index( int atom_idx, int state, @@ -35,33 +43,47 @@ __device__ void calculate_nonbonded_14_pair( real_t y_aii, real_t x_bii, real_t y_bii, - double coulomb_constant, - double scaling, + nonbond_work_t coulomb_constant, + nonbond_work_t scaling, int vdw_rule, - double lambda, - double& evdw, - double& ecoul, - double& dv) { - const real_t dx = x.x - y.x; - const real_t dy = x.y - y.y; - const real_t dz = x.z - y.z; - const real_t r = rsqrt(dx * dx + dy * dy + dz * dz); - const real_t r2 = r * r; - const real_t r6 = r2 * r2 * r2; + nonbond_work_t lambda, + nonbond_work_t& evdw, + nonbond_work_t& ecoul, + nonbond_work_t& dv) { + const nonbond_work_t dx = static_cast(x.x - y.x); + const nonbond_work_t dy = static_cast(x.y - y.y); + const nonbond_work_t dz = 
static_cast(x.z - y.z); + const nonbond_work_t r = nonbond14_rsqrt(dx * dx + dy * dy + dz * dz); + const nonbond_work_t r2 = r * r; + const nonbond_work_t r6 = r2 * r2 * r2; ecoul = scaling * coulomb_constant * x_charge * y_charge * r * lambda; - real_t v_a = 0.0; - real_t v_b = 0.0; + nonbond_work_t v_a = 0.0; + nonbond_work_t v_b = 0.0; if (vdw_rule == VDW_GEOMETRIC) { - calc_vdw_geometric(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b); + calc_vdw_geometric( + static_cast(x_aii), + static_cast(y_aii), + static_cast(x_bii), + static_cast(y_bii), + r6, + &v_a, + &v_b); } else { - calc_vdw_arithmetic(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b); + calc_vdw_arithmetic( + static_cast(x_aii), + static_cast(y_aii), + static_cast(x_bii), + static_cast(y_bii), + r6, + &v_a, + &v_b); } v_a *= lambda; v_b *= lambda; evdw = v_a - v_b; - dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); + dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); } __global__ void calc_nonbonded_14_force_kernel( @@ -104,10 +126,10 @@ __global__ void calc_nonbonded_14_force_kernel( const coord_t ri = d_coords[ai]; const coord_t rj = d_coords[aj]; - double evdw = 0.0; - double ecoul = 0.0; - double dv = 0.0; - const double pair_lambda = (mode == NONBONDED_14_PP) ? 1.0 : lambda; + nonbond_work_t evdw = 0.0; + nonbond_work_t ecoul = 0.0; + nonbond_work_t dv = 0.0; + const nonbond_work_t pair_lambda = static_cast((mode == NONBONDED_14_PP) ? 
1.0 : lambda); calculate_nonbonded_14_pair( ri, @@ -118,17 +140,17 @@ __global__ void calc_nonbonded_14_force_kernel( aj_type.aii_1_4, ai_type.bii_1_4, aj_type.bii_1_4, - d_topo.coulomb_constant, - d_topo.el14_scale, + static_cast(d_topo.coulomb_constant), + static_cast(d_topo.el14_scale), d_topo.vdw_rule, pair_lambda, evdw, ecoul, dv); - const real_t dx = rj.x - ri.x; - const real_t dy = rj.y - ri.y; - const real_t dz = rj.z - ri.z; + const nonbond_work_t dx = static_cast(rj.x - ri.x); + const nonbond_work_t dy = static_cast(rj.y - ri.y); + const nonbond_work_t dz = static_cast(rj.z - ri.z); atomicAdd(&d_dvelocities[ai].x, -dv * dx); atomicAdd(&d_dvelocities[ai].y, -dv * dy); atomicAdd(&d_dvelocities[ai].z, -dv * dz); diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu index 097a3550..ce3f73ae 100644 --- a/src/core/cuda/src/cuda_nonbonded_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_force.cu @@ -9,6 +9,20 @@ namespace CudaNonbondedForce { bool is_initialized = false; double *d_evdw_total, *d_ecoul_total; +struct nonbond_vec_t { + nonbond_work_t x; + nonbond_work_t y; + nonbond_work_t z; +}; + +__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) { +#ifdef QDYN_SPFP + return rsqrtf(value); +#else + return rsqrt(value); +#endif +} + __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); y = t - (x * n - (x * (x - 1) >> 1)); @@ -46,20 +60,20 @@ __device__ void calculate_unforce_bound( const real_t charge_product, const vdw_pair_param_t& pair_param, - const double coulomb_constant, + const nonbond_work_t coulomb_constant, - const double scaling, - const double lambda, + const nonbond_work_t scaling, + const nonbond_work_t lambda, - double& evdw, - double& ecoul, - double& dv) { - const real_t dx = x.x - y.x; - const real_t dy = x.y - y.y; - const real_t dz = x.z - y.z; - const real_t r = rsqrt(dx * 
dx + dy * dy + dz * dz); - const real_t r2 = r * r; - const real_t r6 = r2 * r2 * r2; + nonbond_work_t& evdw, + nonbond_work_t& ecoul, + nonbond_work_t& dv) { + const nonbond_work_t dx = static_cast(x.x - y.x); + const nonbond_work_t dy = static_cast(x.y - y.y); + const nonbond_work_t dz = static_cast(x.z - y.z); + const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); + const nonbond_work_t r2 = r * r; + const nonbond_work_t r6 = r2 * r2 * r2; // double v_a = r6 * r6; // double v_b = r6; // ecoul = r; @@ -68,10 +82,10 @@ __device__ void calculate_unforce_bound( ecoul = scaling * coulomb_constant * charge_product * r * lambda; - const real_t v_a = pair_param.a * r6 * r6 * static_cast(lambda); - const real_t v_b = pair_param.b * r6 * static_cast(lambda); + const nonbond_work_t v_a = static_cast(pair_param.a) * r6 * r6 * lambda; + const nonbond_work_t v_b = static_cast(pair_param.b) * r6 * lambda; evdw = v_a - v_b; - dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); + dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); } __global__ void calc_nonbonded_force_kernel( @@ -160,8 +174,8 @@ __global__ void calc_nonbonded_force_kernel( int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1; int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1; - double3 x_force = {0.0, 0.0, 0.0}; - double3 y_force = {0.0, 0.0, 0.0}; + nonbond_vec_t x_force = {0.0, 0.0, 0.0}; + nonbond_vec_t y_force = {0.0, 0.0, 0.0}; double evdw_sum = 0.0; double ecoul_sum = 0.0; @@ -216,12 +230,14 @@ __global__ void calc_nonbonded_force_kernel( } } + const nonbond_work_t kernel_lambda = static_cast(lambda); + const nonbond_work_t coulomb_constant = static_cast(d_topo.coulomb_constant); const int charge_pair_row = x_charge_type_idx * n_charge_types; const int pair_row = (x_catype_type_idx >= 0) ? 
x_catype_type_idx * n_catype_types : 0; for (int i = 0; i < 32; i++) { if (is_valid()) { - double scaling = 1.0; + nonbond_work_t scaling = static_cast(1.0); real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx]; vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx]; @@ -233,16 +249,16 @@ __global__ void calc_nonbonded_force_kernel( // } // } - double evdw = 0, ecoul = 0, dv = 0; + nonbond_work_t evdw = 0, ecoul = 0, dv = 0; calculate_unforce_bound( x_coord, y_coord, charge_product, pair_param, - d_topo.coulomb_constant, + coulomb_constant, scaling, - lambda, + kernel_lambda, evdw, ecoul, dv); @@ -250,9 +266,9 @@ __global__ void calc_nonbonded_force_kernel( evdw_sum += evdw; ecoul_sum += ecoul; - const real_t dx = x_coord.x - y_coord.x; - const real_t dy = x_coord.y - y_coord.y; - const real_t dz = x_coord.z - y_coord.z; + const nonbond_work_t dx = static_cast(x_coord.x - y_coord.x); + const nonbond_work_t dy = static_cast(x_coord.y - y_coord.y); + const nonbond_work_t dz = static_cast(x_coord.z - y_coord.z); y_force.x -= dv * dx; y_force.y -= dv * dy; y_force.z -= dv * dz; diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu index 9b0eb667..13c37fbc 100644 --- a/src/core/cuda/src/cuda_polx_water_force.cu +++ b/src/core/cuda/src/cuda_polx_water_force.cu @@ -46,7 +46,7 @@ __global__ void calc_polx_theta_and_shells( rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y; rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z; - rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2)); + rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z); rmu.x /= rm; rmu.y /= rm; @@ -55,7 +55,7 @@ __global__ void calc_polx_theta_and_shells( rcu.x = coords[wi].x - topo.solvent_center.x; rcu.y = coords[wi].y - topo.solvent_center.y; rcu.z = coords[wi].z - topo.solvent_center.z; - rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2)); + rc = sqrt(rcu.x * rcu.x + 
rcu.y * rcu.y + rcu.z * rcu.z); rcu.x /= rc; rcu.y /= rc; rcu.z /= rc; @@ -106,18 +106,19 @@ __global__ void calc_polx_water_forces_kernel( if (theta_val > M_PI) theta_val = M_PI; avtdum += theta[ii]; - ener = .5 * md.polarisation_force * pow(theta[ii] - theta_val + wshells[is].theta_corr, 2); + const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr; + ener = .5 * md.polarisation_force * dtheta * dtheta; // E_restraint.Upolx += ener; atomicAdd(energy, ener); - dv = md.polarisation_force * (theta[ii] - theta_val + wshells[is].theta_corr); + dv = md.polarisation_force * dtheta; wi = n_atoms_solute + 3 * ii; rmu.x = coords[wi + 1].x + coords[wi + 2].x - 2 * coords[wi].x; rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y; rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z; - rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2)); + rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z); rmu.x /= rm; rmu.y /= rm; @@ -126,7 +127,7 @@ __global__ void calc_polx_water_forces_kernel( rcu.x = coords[wi].x - topo.solvent_center.x; rcu.y = coords[wi].y - topo.solvent_center.y; rcu.z = coords[wi].z - topo.solvent_center.z; - rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2)); + rc = sqrt(rcu.x * rcu.x + rcu.y * rcu.y + rcu.z * rcu.z); rcu.x /= rc; rcu.y /= rc; rcu.z /= rc; diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu index a01fb536..5221cb9e 100644 --- a/src/core/cuda/src/cuda_pshell_force.cu +++ b/src/core/cuda/src/cuda_pshell_force.cu @@ -34,7 +34,7 @@ __global__ void calc_pshell_force_kernel( dr.x = coords[i].x - coords_init[i].x; dr.y = coords[i].y - coords_init[i].y; dr.z = coords[i].z - coords_init[i].z; - r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); + r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; ener = 0.5 * k * r2; // printf("dr = %f %f %f\n", dr.x, dr.y, dr.z); diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu index 
06f5f5a3..f037e9db 100644 --- a/src/core/cuda/src/cuda_radix_water_force.cu +++ b/src/core/cuda/src/cuda_radix_water_force.cu @@ -29,18 +29,18 @@ __global__ void calc_radix_water_forces_kernel( dr.x = coords[i].x - topo.solvent_center.x; dr.y = coords[i].y - topo.solvent_center.y; dr.z = coords[i].z - topo.solvent_center.z; - double b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2)); + double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); double db = b - (topo.solvent_radius - shift); double ener, dv; if (db > 0) { - ener = 0.5 * md.radial_force * pow(db, 2) - Dwmz; + ener = 0.5 * md.radial_force * db * db - Dwmz; dv = md.radial_force * db / b; } else { if (b > 0.0) { double fexp = exp(awmz * db); - ener = Dwmz * (pow(fexp, 2) - 2 * fexp); - dv = -2 * Dwmz * awmz * (fexp - pow(fexp, 2)) / b; + ener = Dwmz * (fexp * fexp - 2 * fexp); + dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b; } else { dv = 0; ener = 0; @@ -91,7 +91,6 @@ void calc_radix_water_forces_host() { d_dvelocities, d_energy); check_cuda(cudaDeviceSynchronize()); - host.dvelocities->download(); check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost)); host.E_restraint.Uradx += energy; } diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu index eb0813f5..b214aee9 100644 --- a/src/core/cuda/src/cuda_restrang_force.cu +++ b/src/core/cuda/src/cuda_restrang_force.cu @@ -45,8 +45,8 @@ __global__ void calc_restrang_force_kernel( lambda = 1; } - r2ij = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); - r2jk = pow(dr2.x, 2) + pow(dr2.y, 2) + pow(dr2.z, 2); + r2ij = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; + r2jk = dr2.x * dr2.x + dr2.y * dr2.y + dr2.z * dr2.z; rij = sqrt(r2ij); rjk = sqrt(r2jk); @@ -60,7 +60,7 @@ __global__ void calc_restrang_force_kernel( th = acos(cos_th); dth = th - to_radians_device(restrangs[ir].ang); - ener = .5 * restrangs[ir].k * pow(dth, 2); + ener = .5 * restrangs[ir].k * dth * dth; dv = lambda * restrangs[ir].k * 
dth; f1 = sin(th); diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu index 9aacf977..14f9b466 100644 --- a/src/core/cuda/src/cuda_restrdis_force.cu +++ b/src/core/cuda/src/cuda_restrdis_force.cu @@ -40,7 +40,7 @@ __global__ void calc_restrdis_forces_kernel( lambda = 1; } - b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2)); + b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); if (b < restrdists[ir].d1) { db = b - restrdists[ir].d1; } else if (b > restrdists[ir].d2) { @@ -50,7 +50,7 @@ __global__ void calc_restrdis_forces_kernel( return; } - ener = .5 * restrdists[ir].k * pow(db, 2); + ener = .5 * restrdists[ir].k * db * db; dv = lambda * restrdists[ir].k * db / b; atomicAdd(&dvelocities[j].x, dr.x * dv); diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu index 5f479364..695e2b33 100644 --- a/src/core/cuda/src/cuda_restrpos_force.cu +++ b/src/core/cuda/src/cuda_restrpos_force.cu @@ -39,9 +39,9 @@ __global__ void calc_restrpos_forces_kernel( lambda = 1; } - x2 = pow(dr.x, 2); - y2 = pow(dr.y, 2); - z2 = pow(dr.z, 2); + x2 = dr.x * dr.x; + y2 = dr.y * dr.y; + z2 = dr.z * dr.z; ener = .5 * restrspos[ir].k.x * x2 + .5 * restrspos[ir].k.y * y2 + .5 * restrspos[ir].k.z * z2; diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu index b5db3552..71835e4e 100644 --- a/src/core/cuda/src/cuda_restrseq_force.cu +++ b/src/core/cuda/src/cuda_restrseq_force.cu @@ -46,7 +46,7 @@ __global__ void calc_restrseq_forces_kernel( dr.x /= n_ctr; dr.y /= n_ctr; dr.z /= n_ctr; - r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); + r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; ener = .5 * k * r2; atomicAdd(upres_energy, ener); @@ -77,7 +77,7 @@ __global__ void calc_restrseq_forces_kernel( dr.x /= totmass; dr.y /= totmass; dr.z /= totmass; - r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); + r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; ener = .5 * k * r2; 
atomicAdd(upres_energy, ener); @@ -100,7 +100,7 @@ __global__ void calc_restrseq_forces_kernel( dr.y = coords[i].y - coords_init[i].y; dr.z = coords[i].z - coords_init[i].z; - r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); + r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; ener = .5 * k * r2; atomicAdd(upres_energy, ener); diff --git a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu index 12d890ad..c928bb71 100644 --- a/src/core/cuda/src/cuda_restrwall_force.cu +++ b/src/core/cuda/src/cuda_restrwall_force.cu @@ -29,11 +29,11 @@ __global__ void calc_restrwall_forces_kernel( dr.y = coords[i].y - topo.solvent_center.y; dr.z = coords[i].z - topo.solvent_center.z; - b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2)); + b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); db = b - restrwalls[ir].d; if (db > 0) { - ener = .5 * k * pow(db, 2) - restrwalls[ir].dMorse; + ener = .5 * k * db * db - restrwalls[ir].dMorse; dv = k * db / b; } else { fexp = exp(restrwalls[ir].aMorse * db); diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu index e9dfd051..bda47e50 100644 --- a/src/core/cuda/src/cuda_shake_constraints.cu +++ b/src/core/cuda/src/cuda_shake_constraints.cu @@ -48,7 +48,7 @@ __global__ void calc_shake_constraints_kernel( xij.x = coords[ai].x - coords[aj].x; xij.y = coords[ai].y - coords[aj].y; xij.z = coords[ai].z - coords[aj].z; - xij2 = pow(xij.x, 2) + pow(xij.y, 2) + pow(xij.z, 2); + xij2 = xij.x * xij.x + xij.y * xij.y + xij.z * xij.z; diff = shake_bonds[shake + i].dist2 - xij2; if (fabs(diff) < shake_tol * shake_bonds[shake + i].dist2) { shake_bonds[shake + i].ready = true; @@ -86,7 +86,7 @@ __global__ void calc_shake_constraints_kernel( xxij.x = xcoords[ai].x - xcoords[aj].x; xxij.y = xcoords[ai].y - xcoords[aj].y; xxij.z = xcoords[ai].z - xcoords[aj].z; - xxij2 = pow(xxij.x, 2) + pow(xxij.y, 2) + pow(xxij.z, 2); + xxij2 = xxij.x * xxij.x + xxij.y * xxij.y + xxij.z * xxij.z; 
printf(">>> Shake failed, i = %d,j = %d, d = %f, d0 = %f", ai, aj, sqrt(xxij2), shake_bonds[shake + i].dist2); } return; @@ -154,6 +154,5 @@ int calc_shake_constraints_host() { d_mol_shake_offset); cudaDeviceSynchronize(); cudaMemcpy(&total_iterations_host, d_total_iterations, sizeof(int), cudaMemcpyDeviceToHost); - host.coords->download(); return host.n_molecules == 0 ? 0 : total_iterations_host / host.n_molecules; } diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu index a02c6cf7..baba687e 100644 --- a/src/core/cuda/src/cuda_temperature.cu +++ b/src/core/cuda/src/cuda_temperature.cu @@ -19,7 +19,10 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_atoms) return; double mass_i = catypes[atypes[idx].code - 1].m; - double ener = .5 * mass_i * (pow(velocities[idx].x, 2) + pow(velocities[idx].y, 2) + pow(velocities[idx].z, 2)); + const double vx = velocities[idx].x; + const double vy = velocities[idx].y; + const double vz = velocities[idx].z; + double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz); bool is_solute = (idx < n_atoms_solute); bool is_excluded = excluded[idx]; diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu index 6ef7cd45..97b687a6 100644 --- a/src/core/cuda/src/cuda_torsion_force.cu +++ b/src/core/cuda/src/cuda_torsion_force.cu @@ -57,8 +57,8 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z; rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x; - bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2)); - bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + pow(rnk.z, 2)); + bj2inv = 1 / (rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z); + bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z); bjinv = sqrt(bj2inv); bkinv = sqrt(bk2inv); From d32c490b3ea8576ef3036ce8d8fcfc1332375bee Mon Sep 17 00:00:00 2001 
From: shen Date: Wed, 29 Apr 2026 17:14:15 +0200 Subject: [PATCH 14/20] save binary in build --- src/core/Makefile | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/core/Makefile b/src/core/Makefile index 0b607d06..6acc5da1 100644 --- a/src/core/Makefile +++ b/src/core/Makefile @@ -1,43 +1,44 @@ CC = nvcc -CFLAGS = -O3 -std=c++17 -arch=sm_86 -I./cuda/include -I./common/include -I./cpu/include -I. -DEPFLAGS = -MMD -MF $(@:.o=.d) -QDYN_SPFP ?= 0 - -ifneq ($(filter 1 true TRUE yes YES on ON,$(QDYN_SPFP)),) -CFLAGS += -DQDYN_SPFP -PRECISION = spfp -else -PRECISION = dpfp +SPFPFLAGS = +ifeq ($(QDYN_SPFP),1) +SPFPFLAGS += -DQDYN_SPFP endif - -BUILD_DIR = build/$(PRECISION) +CFLAGS = -O3 -std=c++17 -arch=sm_86 $(SPFPFLAGS) -I./cuda/include -I./common/include -I./cpu/include -I. +DEPFLAGS = -MMD -MF $(@:.o=.d) +BUILD_MODE = $(if $(filter 1,$(QDYN_SPFP)),spfp,dpfp) +OBJDIR = .build/$(BUILD_MODE) +TARGET = $(OBJDIR)/qdyn # collect all .cu files except main.cu SRCS = $(filter-out main.cu, $(wildcard *.cu cuda/src/*.cu)) CPPSRCS = $(wildcard common/*.cpp common/src/*.cpp cpu/*.cpp cpu/src/*.cpp) -MAIN_OBJ = $(BUILD_DIR)/main.cu.o -OBJS = $(addprefix $(BUILD_DIR)/,$(SRCS:.cu=.cu.o)) $(addprefix $(BUILD_DIR)/,$(CPPSRCS:.cpp=.cpp.o)) +MAIN_OBJ = $(OBJDIR)/main.o +OBJS = $(addprefix $(OBJDIR)/,$(SRCS:.cu=.o)) $(addprefix $(OBJDIR)/,$(CPPSRCS:.cpp=.o)) DEPS = $(MAIN_OBJ:.o=.d) $(OBJS:.o=.d) all: qdyn move -qdyn: $(MAIN_OBJ) $(OBJS) +qdyn: $(TARGET) + cp $< $@ + +$(TARGET): $(MAIN_OBJ) $(OBJS) $(CC) $(CFLAGS) -o $@ $^ -$(BUILD_DIR)/%.cu.o: %.cu - mkdir -p $(dir $@) +$(OBJDIR)/%.o: %.cu + mkdir -p $(@D) $(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@ -$(BUILD_DIR)/%.cpp.o: %.cpp - mkdir -p $(dir $@) +$(OBJDIR)/%.o: %.cpp + mkdir -p $(@D) $(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@ clean: - rm -rf build - rm -f *.o *.d cuda/src/*.o cuda/src/*.d common/*.o common/*.d common/src/*.o common/src/*.d cpu/*.o cpu/*.d cpu/src/*.o 
cpu/src/*.d ../../bin/qdyn + rm -rf .build qdyn ../../bin/qdyn -move: +move: $(TARGET) mkdir -p ../../bin - mv qdyn ../../bin/ + cp $< ../../bin/qdyn + +.PHONY: all qdyn clean move -include $(DEPS) From 100d1c0987afede48eea874592523b9ef761d733 Mon Sep 17 00:00:00 2001 From: shen Date: Wed, 29 Apr 2026 20:04:15 +0200 Subject: [PATCH 15/20] update dvel to float --- src/core/common/include/md_types.h | 6 +- src/core/common/include/precision.h | 9 +- src/core/cpu/src/cpu_angle_force.cpp | 4 +- src/core/cpu/src/cpu_improper2_force.cpp | 4 +- src/core/cpu/src/cpu_polx_water_force.cpp | 4 +- src/core/cpu/src/cpu_q_angle_force.cpp | 4 +- src/core/cpu/src/cpu_q_torsion_force.cpp | 4 +- src/core/cpu/src/cpu_restrang_force.cpp | 4 +- src/core/cpu/src/cpu_torsion_force.cpp | 4 +- src/core/cuda/src/cuda_angle_force.cu | 4 +- src/core/cuda/src/cuda_improper2_force.cu | 2 +- src/core/cuda/src/cuda_nonbonded_force.cu | 126 +++++++++++---------- src/core/cuda/src/cuda_polx_water_force.cu | 2 +- src/core/cuda/src/cuda_restrang_force.cu | 4 +- src/core/cuda/src/cuda_torsion_force.cu | 2 +- 15 files changed, 99 insertions(+), 84 deletions(-) diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h index 6a4d2865..27c20cef 100644 --- a/src/core/common/include/md_types.h +++ b/src/core/common/include/md_types.h @@ -310,9 +310,9 @@ struct vel_t { }; struct dvel_t { - double x; - double y; - double z; + force_accum_t x; + force_accum_t y; + force_accum_t z; }; struct E_bonded_t { diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h index fc633f45..80b790f7 100644 --- a/src/core/common/include/precision.h +++ b/src/core/common/include/precision.h @@ -3,11 +3,18 @@ #ifdef QDYN_SPFP using real_t = float; using nonbond_work_t = float; +using force_accum_t = float; #else using real_t = double; using nonbond_work_t = double; +using force_accum_t = double; #endif using energy_accum_t = double; -using force_accum_t = double; using 
constraint_work_t = double; + +#ifdef QDYN_SPFP +constexpr double k_singular_sin_epsilon = 1.0e-6; +#else +constexpr double k_singular_sin_epsilon = 1.0e-12; +#endif diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp index a9c29c1e..ae600561 100644 --- a/src/core/cpu/src/cpu_angle_force.cpp +++ b/src/core/cpu/src/cpu_angle_force.cpp @@ -64,9 +64,9 @@ double calc_angle_forces(int start, int end) { dv = cangle.kth * dth; f1 = sin(th); - if (std::fabs(f1) < 1.0E-12) { + if (std::fabs(f1) < k_singular_sin_epsilon) { // Avoid division by zero - f1 = -1.0E12; + f1 = -1.0 / k_singular_sin_epsilon; } else { f1 = -1.0 / f1; } diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp index af73a9cc..6e4faa60 100644 --- a/src/core/cpu/src/cpu_improper2_force.cpp +++ b/src/core/cpu/src/cpu_improper2_force.cpp @@ -79,8 +79,8 @@ double calc_improper2_forces(int start, int end) { // Forces f1 = sin(phi); - if (std::fabs(f1) < 1E-12) { - f1 = 1E-12; + if (std::fabs(f1) < k_singular_sin_epsilon) { + f1 = std::copysign(k_singular_sin_epsilon, f1); } f1 = -1 / f1; diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp index 9d0e4711..5116dbbb 100644 --- a/src/core/cpu/src/cpu_polx_water_force.cpp +++ b/src/core/cpu/src/cpu_polx_water_force.cpp @@ -158,8 +158,8 @@ void calc_polx_w_forces(int iteration) { cos_th = -1; } f0 = sin(acos(cos_th)); - if (fabs(f0) < 1.0E-12) { - f0 = 1.0E-12; + if (fabs(f0) < k_singular_sin_epsilon) { + f0 = k_singular_sin_epsilon; } f0 = -1.0 / f0; f0 *= dv; diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp index c9c2ea65..14aa802c 100644 --- a/src/core/cpu/src/cpu_q_angle_force.cpp +++ b/src/core/cpu/src/cpu_q_angle_force.cpp @@ -56,8 +56,8 @@ void calc_qangle_forces(int state) { dv = ctx.q_cangles[ic].kth * dth * lambdas[state]; f1 = sin(th); - if (abs(f1) < 1E-12) { - f1 = 1E-12; + if 
(fabs(f1) < k_singular_sin_epsilon) { + f1 = k_singular_sin_epsilon; } f1 = -1.0 / f1; diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp index be309347..7b7fb271 100644 --- a/src/core/cpu/src/cpu_q_torsion_force.cpp +++ b/src/core/cpu/src/cpu_q_torsion_force.cpp @@ -76,8 +76,8 @@ void calc_qtorsion_forces(int state) { // Forces f1 = sin(phi); - if (abs(f1) < 1E-12) { - f1 = 1E-12; + if (fabs(f1) < k_singular_sin_epsilon) { + f1 = copysign(k_singular_sin_epsilon, f1); } f1 = -1 / f1; diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp index d809a9c1..84f593b0 100644 --- a/src/core/cpu/src/cpu_restrang_force.cpp +++ b/src/core/cpu/src/cpu_restrang_force.cpp @@ -61,8 +61,8 @@ void calc_restrang_forces() { dv = lambda * restrangs[ir].k * dth; f1 = sin(th); - if (fabs(f1) < 1E-12) { - f1 = -1E-12; + if (fabs(f1) < k_singular_sin_epsilon) { + f1 = -1.0 / k_singular_sin_epsilon; } else { f1 = -1 / f1; } diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp index e8aaa2a3..4ebb44b2 100644 --- a/src/core/cpu/src/cpu_torsion_force.cpp +++ b/src/core/cpu/src/cpu_torsion_force.cpp @@ -88,8 +88,8 @@ double calc_torsion_forces(int start, int end) { // Forces f1 = sin(phi); - if (std::fabs(f1) < 1E-12) { - f1 = 1E-12; + if (std::fabs(f1) < k_singular_sin_epsilon) { + f1 = std::copysign(k_singular_sin_epsilon, f1); } f1 = -1 / f1; diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu index dcd044ce..f20b039a 100644 --- a/src/core/cuda/src/cuda_angle_force.cu +++ b/src/core/cuda/src/cuda_angle_force.cu @@ -39,8 +39,8 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co double dv = cang.kth * dtheta; double f1 = sin(theta); - if (fabs(f1) < 1e-12) { - f1 = -1.0e12; + if (fabs(f1) < k_singular_sin_epsilon) { + f1 = -1.0 / k_singular_sin_epsilon; } else { f1 = -1.0 / f1; } diff --git 
a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu index 78707b12..dd7d91aa 100644 --- a/src/core/cuda/src/cuda_improper2_force.cu +++ b/src/core/cuda/src/cuda_improper2_force.cu @@ -76,7 +76,7 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp // Forces f1 = sin(phi); - if (fabs(f1) < 1E-12) f1 = 1E-12; + if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1); f1 = -1 / f1; // printf("f1 = %f phi = %f cos_phi = %f\n", f1, phi, cos_phi); diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu index ce3f73ae..32b4077a 100644 --- a/src/core/cuda/src/cuda_nonbonded_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_force.cu @@ -9,18 +9,19 @@ namespace CudaNonbondedForce { bool is_initialized = false; double *d_evdw_total, *d_ecoul_total; +template struct nonbond_vec_t { - nonbond_work_t x; - nonbond_work_t y; - nonbond_work_t z; + WorkT x; + WorkT y; + WorkT z; }; -__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) { -#ifdef QDYN_SPFP +__device__ __forceinline__ float nonbond_rsqrt(float value) { return rsqrtf(value); -#else +} + +__device__ __forceinline__ double nonbond_rsqrt(double value) { return rsqrt(value); -#endif } __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { @@ -53,6 +54,7 @@ __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned m return v; } +template __device__ void calculate_unforce_bound( const coord_t& x, const coord_t& y, @@ -60,20 +62,20 @@ __device__ void calculate_unforce_bound( const real_t charge_product, const vdw_pair_param_t& pair_param, - const nonbond_work_t coulomb_constant, + const WorkT coulomb_constant, - const nonbond_work_t scaling, - const nonbond_work_t lambda, + const WorkT scaling, + const WorkT lambda, - nonbond_work_t& evdw, - nonbond_work_t& ecoul, - nonbond_work_t& dv) { - const nonbond_work_t dx = static_cast(x.x - 
y.x); - const nonbond_work_t dy = static_cast(x.y - y.y); - const nonbond_work_t dz = static_cast(x.z - y.z); - const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); - const nonbond_work_t r2 = r * r; - const nonbond_work_t r6 = r2 * r2 * r2; + WorkT& evdw, + WorkT& ecoul, + WorkT& dv) { + const WorkT dx = static_cast(x.x - y.x); + const WorkT dy = static_cast(x.y - y.y); + const WorkT dz = static_cast(x.z - y.z); + const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); + const WorkT r2 = r * r; + const WorkT r6 = r2 * r2 * r2; // double v_a = r6 * r6; // double v_b = r6; // ecoul = r; @@ -82,12 +84,13 @@ __device__ void calculate_unforce_bound( ecoul = scaling * coulomb_constant * charge_product * r * lambda; - const nonbond_work_t v_a = static_cast(pair_param.a) * r6 * r6 * lambda; - const nonbond_work_t v_b = static_cast(pair_param.b) * r6 * lambda; + const WorkT v_a = static_cast(pair_param.a) * r6 * r6 * lambda; + const WorkT v_b = static_cast(pair_param.b) * r6 * lambda; evdw = v_a - v_b; - dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); + dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); } +template __global__ void calc_nonbonded_force_kernel( const int nx, const int ny, @@ -174,8 +177,8 @@ __global__ void calc_nonbonded_force_kernel( int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1; int y_catype_type_idx = (y_idx < ny) ? 
y_atypes_types[y_idx] : -1; - nonbond_vec_t x_force = {0.0, 0.0, 0.0}; - nonbond_vec_t y_force = {0.0, 0.0, 0.0}; + nonbond_vec_t x_force = {0.0, 0.0, 0.0}; + nonbond_vec_t y_force = {0.0, 0.0, 0.0}; double evdw_sum = 0.0; double ecoul_sum = 0.0; @@ -230,14 +233,14 @@ __global__ void calc_nonbonded_force_kernel( } } - const nonbond_work_t kernel_lambda = static_cast(lambda); - const nonbond_work_t coulomb_constant = static_cast(d_topo.coulomb_constant); + const WorkT kernel_lambda = static_cast(lambda); + const WorkT coulomb_constant = static_cast(d_topo.coulomb_constant); const int charge_pair_row = x_charge_type_idx * n_charge_types; const int pair_row = (x_catype_type_idx >= 0) ? x_catype_type_idx * n_catype_types : 0; for (int i = 0; i < 32; i++) { if (is_valid()) { - nonbond_work_t scaling = static_cast(1.0); + WorkT scaling = static_cast(1.0); real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx]; vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx]; @@ -249,7 +252,7 @@ __global__ void calc_nonbonded_force_kernel( // } // } - nonbond_work_t evdw = 0, ecoul = 0, dv = 0; + WorkT evdw = 0, ecoul = 0, dv = 0; calculate_unforce_bound( x_coord, @@ -266,9 +269,9 @@ __global__ void calc_nonbonded_force_kernel( evdw_sum += evdw; ecoul_sum += ecoul; - const nonbond_work_t dx = static_cast(x_coord.x - y_coord.x); - const nonbond_work_t dy = static_cast(x_coord.y - y_coord.y); - const nonbond_work_t dz = static_cast(x_coord.z - y_coord.z); + const WorkT dx = static_cast(x_coord.x - y_coord.x); + const WorkT dy = static_cast(x_coord.y - y_coord.y); + const WorkT dz = static_cast(x_coord.z - y_coord.z); y_force.x -= dv * dx; y_force.y -= dv * dy; y_force.z -= dv * dz; @@ -334,34 +337,39 @@ std::pair calc_nonbonded_force_host( cudaMemset(d_ecoul_total, 0, sizeof(double)); cudaMemset(d_evdw_total, 0, sizeof(double)); - calc_nonbonded_force_kernel<<>>( - nx, - ny, - x_charges_types, - y_charges_types, - 
host.charge_pair_products->gpu_data_p, - x_atypes_types, - y_atypes_types, - host.catype_pair_params->gpu_data_p, - host.topo, - host.excluded->gpu_data_p, - host.LJ_matrix->gpu_data_p, - x_idx_list, - y_idx_list, - host.coords->gpu_data_p, - host.dvelocities->gpu_data_p, - d_evdw_total, - d_ecoul_total, - symmetric, - disable_water_h_lj, - host.n_atoms_solute, - host.n_charge_types, - host.zero_charge_type, - host.n_catype_types, - host.zero_catype_type, - host.n_qelscales, - lambda, - host.q_elscales->gpu_data_p); + auto launch_kernel = [&](auto work_tag) { + using WorkT = decltype(work_tag); + calc_nonbonded_force_kernel<<>>( + nx, + ny, + x_charges_types, + y_charges_types, + host.charge_pair_products->gpu_data_p, + x_atypes_types, + y_atypes_types, + host.catype_pair_params->gpu_data_p, + host.topo, + host.excluded->gpu_data_p, + host.LJ_matrix->gpu_data_p, + x_idx_list, + y_idx_list, + host.coords->gpu_data_p, + host.dvelocities->gpu_data_p, + d_evdw_total, + d_ecoul_total, + symmetric, + disable_water_h_lj, + host.n_atoms_solute, + host.n_charge_types, + host.zero_charge_type, + host.n_catype_types, + host.zero_catype_type, + host.n_qelscales, + lambda, + host.q_elscales->gpu_data_p); + }; + + launch_kernel(nonbond_work_t{}); cudaDeviceSynchronize(); diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu index 13c37fbc..7be0656f 100644 --- a/src/core/cuda/src/cuda_polx_water_force.cu +++ b/src/core/cuda/src/cuda_polx_water_force.cu @@ -136,7 +136,7 @@ __global__ void calc_polx_water_forces_kernel( if (cos_th > 1) cos_th = 1; if (cos_th < -1) cos_th = -1; f0 = sin(acos(cos_th)); - if (abs(f0) < 1.0E-12) f0 = 1.0E-12; + if (abs(f0) < k_singular_sin_epsilon) f0 = k_singular_sin_epsilon; f0 = -1.0 / f0; f0 *= dv; diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu index b214aee9..567a78df 100644 --- a/src/core/cuda/src/cuda_restrang_force.cu +++ 
b/src/core/cuda/src/cuda_restrang_force.cu @@ -64,8 +64,8 @@ __global__ void calc_restrang_force_kernel( dv = lambda * restrangs[ir].k * dth; f1 = sin(th); - if (fabs(f1) < 1E-12) { - f1 = -1E-12; + if (fabs(f1) < k_singular_sin_epsilon) { + f1 = -1.0 / k_singular_sin_epsilon; } else { f1 = -1 / f1; } diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu index 97b687a6..5baffbde 100644 --- a/src/core/cuda/src/cuda_torsion_force.cu +++ b/src/core/cuda/src/cuda_torsion_force.cu @@ -76,7 +76,7 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio // Forces f1 = sin(phi); - if (fabs(f1) < 1E-12) f1 = 1E-12; + if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1); f1 = -1 / f1; di.x = f1 * (rnk.x * (bjinv * bkinv) - cos_phi * rnj.x * bj2inv); From 840e1c9f16adee5b330a6a34dd3ef605f17e9a30 Mon Sep 17 00:00:00 2001 From: shen Date: Wed, 29 Apr 2026 20:48:27 +0200 Subject: [PATCH 16/20] update all to float --- src/core/common/include/context.h | 32 ++--- src/core/common/include/md_types.h | 128 +++++++++--------- src/core/common/include/precision.h | 5 +- src/core/common/src/init.cpp | 56 ++++---- src/core/common/src/parse.cpp | 2 +- src/core/cpu/include/cpu_angle_force.h | 5 +- src/core/cpu/include/cpu_bond_force.h | 4 +- src/core/cpu/include/cpu_improper2_force.h | 4 +- src/core/cpu/include/cpu_torsion_force.h | 4 +- src/core/cpu/include/cpu_utils.h | 8 +- src/core/cpu/src/cpu_angle_force.cpp | 10 +- src/core/cpu/src/cpu_bond_force.cpp | 6 +- src/core/cpu/src/cpu_improper2_force.cpp | 8 +- src/core/cpu/src/cpu_leapfrog.cpp | 4 +- src/core/cpu/src/cpu_nonbonded_pp_force.cpp | 6 +- src/core/cpu/src/cpu_nonbonded_pw_force.cpp | 4 +- src/core/cpu/src/cpu_nonbonded_qp_force.cpp | 6 +- src/core/cpu/src/cpu_nonbonded_qq_force.cpp | 6 +- src/core/cpu/src/cpu_nonbonded_qw_force.cpp | 4 +- src/core/cpu/src/cpu_nonbonded_ww_force.cpp | 4 +- src/core/cpu/src/cpu_polx_water_force.cpp | 18 
+-- src/core/cpu/src/cpu_pshell_force.cpp | 2 +- src/core/cpu/src/cpu_q_angle_force.cpp | 4 +- src/core/cpu/src/cpu_q_bond_force.cpp | 2 +- src/core/cpu/src/cpu_q_torsion_force.cpp | 8 +- src/core/cpu/src/cpu_radix_water_force.cpp | 4 +- src/core/cpu/src/cpu_restrang_force.cpp | 4 +- src/core/cpu/src/cpu_restrdis_force.cpp | 2 +- src/core/cpu/src/cpu_restrpos_force.cpp | 2 +- src/core/cpu/src/cpu_restrseq_force.cpp | 4 +- src/core/cpu/src/cpu_restrwall_force.cpp | 2 +- src/core/cpu/src/cpu_shake.cpp | 8 +- src/core/cpu/src/cpu_temperature.cpp | 10 +- src/core/cpu/src/cpu_torsion_force.cpp | 12 +- src/core/cpu/src/utils.cpp | 15 +- src/core/cuda/include/cuda_angle_force.cuh | 4 +- src/core/cuda/include/cuda_bond_force.cuh | 4 +- .../cuda/include/cuda_improper2_force.cuh | 4 +- .../cuda/include/cuda_nonbonded_force.cuh | 8 +- src/core/cuda/include/cuda_torsion_force.cuh | 4 +- src/core/cuda/include/cuda_utility.cuh | 3 +- src/core/cuda/src/cuda_angle_force.cu | 33 ++--- src/core/cuda/src/cuda_bond_force.cu | 28 ++-- src/core/cuda/src/cuda_improper2_force.cu | 18 +-- src/core/cuda/src/cuda_leapfrog.cu | 12 +- src/core/cuda/src/cuda_nonbonded_14_force.cu | 30 ++-- src/core/cuda/src/cuda_nonbonded_force.cu | 38 +++--- src/core/cuda/src/cuda_polx_water_force.cu | 54 ++++---- src/core/cuda/src/cuda_pshell_force.cu | 26 ++-- src/core/cuda/src/cuda_radix_water_force.cu | 28 ++-- src/core/cuda/src/cuda_restrang_force.cu | 18 +-- src/core/cuda/src/cuda_restrdis_force.cu | 16 +-- src/core/cuda/src/cuda_restrpos_force.cu | 16 +-- src/core/cuda/src/cuda_restrseq_force.cu | 16 +-- src/core/cuda/src/cuda_restrwall_force.cu | 14 +- src/core/cuda/src/cuda_shake_constraints.cu | 4 +- src/core/cuda/src/cuda_temperature.cu | 66 ++++----- src/core/cuda/src/cuda_torsion_force.cu | 25 ++-- 58 files changed, 452 insertions(+), 420 deletions(-) diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h index 83817bb8..ee516d7a 100644 --- 
a/src/core/common/include/context.h +++ b/src/core/common/include/context.h @@ -32,8 +32,8 @@ class Context { int n_qatoms = 0; int n_waters = 0; int n_molecules = 0; - double dt = 0.0; - double tau_T = 0.0; + real_t dt = 0.0; + real_t tau_T = 0.0; md_t md; topo_t topo; int n_excluded = 0; @@ -108,7 +108,7 @@ class Context { std::unique_ptr> excluded; - std::unique_ptr> winv; + std::unique_ptr> winv; std::unique_ptr> shell; @@ -137,12 +137,12 @@ class Context { Water */ std::unique_ptr> wshells; - double crgQtot = 0.0; - double Dwmz = 0.0; - double awmz = 0.0; - std::vector theta; - std::vector theta0; - std::vector tdum; + real_t crgQtot = 0.0; + real_t Dwmz = 0.0; + real_t awmz = 0.0; + std::vector theta; + std::vector theta0; + std::vector tdum; int n_max_inshell = 0; int n_shells = 0; std::vector> list_sh; @@ -152,7 +152,7 @@ class Context { /* FEP */ - std::unique_ptr> lambdas; // Actually length is only 2.. + std::unique_ptr> lambdas; // Actually length is only 2.. /* Energy @@ -206,13 +206,13 @@ class Context { Temperature */ - double Temp = 0.0; - double Tfree = 0.0; - double Ndegf = 0.0; - double Ndegfree = 0.0; + real_t Temp = 0.0; + real_t Tfree = 0.0; + real_t Ndegf = 0.0; + real_t Ndegfree = 0.0; - double Tscale_solute = 0.0; - double Tscale_solvent = 0.0; + real_t Tscale_solute = 0.0; + real_t Tscale_solvent = 0.0; /* Info for FEP */ diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h index 27c20cef..dd5ef21d 100644 --- a/src/core/common/include/md_types.h +++ b/src/core/common/include/md_types.h @@ -12,29 +12,29 @@ struct md_t { // [MD] int steps; - double stepsize; - double temperature; + real_t stepsize; + real_t temperature; char thermostat[40]; - double bath_coupling; + real_t bath_coupling; int random_seed; - double initial_temperature; + real_t initial_temperature; bool shake_solvent; bool shake_solute; bool shake_hydrogens; bool lrf; bool charge_groups; // [cut-offs] - double solute_solute; - double 
solvent_solvent; - double solute_solvent; - double q_atom; + real_t solute_solute; + real_t solvent_solvent; + real_t solute_solvent; + real_t q_atom; // [sphere] - double shell_radius; // Note: this is for the pshell - double shell_force; // Note: this is for the pshell + real_t shell_radius; // Note: this is for the pshell + real_t shell_force; // Note: this is for the pshell // [solvent] - double radial_force; + real_t radial_force; bool polarisation; - double polarisation_force; + real_t polarisation_force; // [intervals] int non_bond; int output; @@ -62,8 +62,8 @@ struct bond_t { struct cbond_t { int code; - double kb; - double b0; + real_t kb; + real_t b0; }; struct angle_t { @@ -75,8 +75,8 @@ struct angle_t { struct cangle_t { int code; - double kth; - double th0; + real_t kth; + real_t th0; }; struct torsion_t { @@ -89,10 +89,10 @@ struct torsion_t { struct ctorsion_t { int code; - double k; - double n; - double d; - double paths; + real_t k; + real_t n; + real_t d; + real_t paths; }; struct improper_t { @@ -105,8 +105,8 @@ struct improper_t { struct cimproper_t { int code; - double k; - double phi0; + real_t k; + real_t phi0; }; struct charge_t { @@ -126,11 +126,11 @@ struct atype_t { struct catype_t { int code; - double m; + real_t m; real_t aii_normal; real_t bii_normal; - // double aii_polar; - // double bii_polar; + // real_t aii_polar; + // real_t bii_polar; real_t aii_1_4; real_t bii_1_4; }; @@ -142,12 +142,12 @@ struct vdw_pair_param_t { struct topo_t { int solvent_type; - double exclusion_radius; - double solvent_radius; + real_t exclusion_radius; + real_t solvent_radius; coord_t solute_center; coord_t solvent_center; - double el14_scale; - double coulomb_constant; + real_t el14_scale; + real_t coulomb_constant; int vdw_rule; // 1=geometric, 2=arithmetic }; @@ -177,14 +177,14 @@ struct q_angcouple_t { }; // no use struct q_cimproper_t { - double k; - double phi0; + real_t k; + real_t phi0; }; // no use struct q_elscale_t { int qi; int qj; - double 
mu; + real_t mu; }; struct q_exclpair_t { @@ -211,18 +211,18 @@ struct q_offdiag_t { int j; int qk; int ql; - double Aij; - double muij; + real_t Aij; + real_t muij; }; // no use struct q_shake_t { int ai; int aj; - double dist; + real_t dist; }; // no use struct q_softcore_t { - double s; + real_t s; }; // no use struct q_softpair_t { @@ -243,7 +243,7 @@ struct q_torcouple_t { struct restrseq_t { int ai; int aj; - double k; + real_t k; bool ih; int to_center; // Flag for restraining to geom. or mass center }; @@ -258,32 +258,32 @@ struct restrpos_t { struct restrdis_t { int ai, aj; int ipsi; - double d1, d2; - double k; + real_t d1, d2; + real_t k; char itext[20], jtext[20]; }; struct restrang_t { int ai, aj, ak; int ipsi; - double ang; - double k; + real_t ang; + real_t k; }; struct restrwall_t { int ai, aj; - double d, k, aMorse, dMorse; + real_t d, k, aMorse, dMorse; bool ih; }; struct shell_t { int n_inshell; - double theta_corr; - double avtheta; - double avn_inshell; - double router; - double dr; - double cstb; + real_t theta_corr; + real_t avtheta; + real_t avn_inshell; + real_t router; + real_t dr; + real_t cstb; }; /* ============================================= @@ -294,7 +294,7 @@ struct shell_t { struct shake_bond_t { int ai; int aj; - double dist2; + real_t dist2; bool ready; }; @@ -316,28 +316,28 @@ struct dvel_t { }; struct E_bonded_t { - double Ubond; - double Uangle; - double Utor; - double Uimp; + real_t Ubond; + real_t Uangle; + real_t Utor; + real_t Uimp; }; struct E_nonbonded_t { - double Ucoul; - double Uvdw; + real_t Ucoul; + real_t Uvdw; }; struct E_restraint_t { - double Uradx; - double Upolx; - double Ufix; - double Ushell; - double Upres; - double Urestr; + real_t Uradx; + real_t Upolx; + real_t Ufix; + real_t Ushell; + real_t Upres; + real_t Urestr; }; struct energy_t { - double Ukin; - double Upot; - double Utot; + real_t Ukin; + real_t Upot; + real_t Utot; }; diff --git a/src/core/common/include/precision.h 
b/src/core/common/include/precision.h index 80b790f7..b0978010 100644 --- a/src/core/common/include/precision.h +++ b/src/core/common/include/precision.h @@ -4,14 +4,15 @@ using real_t = float; using nonbond_work_t = float; using force_accum_t = float; +using energy_accum_t = float; +using constraint_work_t = float; #else using real_t = double; using nonbond_work_t = double; using force_accum_t = double; -#endif - using energy_accum_t = double; using constraint_work_t = double; +#endif #ifdef QDYN_SPFP constexpr double k_singular_sin_epsilon = 1.0e-6; diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp index 499c01cb..e7c2b8c0 100644 --- a/src/core/common/src/init.cpp +++ b/src/core/common/src/init.cpp @@ -38,10 +38,10 @@ void initialize_catype_tables() { auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p; std::vector h_catype_table_all; - std::map, int> catype_to_type_host; + std::map, int> catype_to_type_host; auto add_catype = [&](catype_t catype) -> int { - const std::array key = { + const std::array key = { catype.aii_normal, catype.bii_normal, catype.aii_1_4, @@ -91,7 +91,7 @@ void initialize_catype_tables() { for (int i = 0; i < static_cast(ctx.p_atoms_list->length); i++) { const int id = p_atoms_cpu[i]; const catype_t catype = catypes[atypes[id].code - 1]; - const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; + const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; p_catype_types_cpu[i] = catype_to_type_host[key]; } @@ -109,7 +109,7 @@ void initialize_catype_tables() { const int id = q_atoms_cpu[i]; const atype_t& qat = ctx.q_atypes[q_idx[id] + ctx.n_qatoms * state]; const catype_t& qcatype = ctx.q_catypes[qat.code - 1]; - const std::array key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4}; + const std::array key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4}; q_catype_types_cpu[state * 
ctx.q_atoms_list->length + i] = catype_to_type_host[key]; } } @@ -118,7 +118,7 @@ void initialize_catype_tables() { for (int i = 0; i < static_cast(ctx.w_atoms_list->length); i++) { const int id = w_atoms_cpu[i]; const catype_t catype = catypes[atypes[id].code - 1]; - const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; + const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; w_catype_types_cpu[i] = catype_to_type_host[key]; } printf("Total water atom number: %lu, w_catype_types size: %lu\n", ctx.w_atoms_list->length, w_catype_types_cpu.size()); @@ -141,10 +141,10 @@ void initialize_charge_tables() { auto *w_atoms_cpu = ctx.w_atoms_list->cpu_data_p; auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p; - std::map charge_to_type_host; + std::map charge_to_type_host; std::vector h_charge_table_all; - auto add_charge = [&](double charge) -> int { + auto add_charge = [&](real_t charge) -> int { if (charge_to_type_host.count(charge) == 0) { int sz = static_cast(h_charge_table_all.size()); ccharge_t new_ccharge = {}; @@ -161,7 +161,7 @@ void initialize_charge_tables() { } for (int state = 0; state < ctx.n_lambdas; state++) { for (int i = 0; i < ctx.n_qatoms; i++) { - double charge = ctx.q_charges[i + ctx.n_qatoms * state].charge; + real_t charge = ctx.q_charges[i + ctx.n_qatoms * state].charge; add_charge(charge); add_charge(charge * lambda_values[state]); } @@ -181,7 +181,7 @@ void initialize_charge_tables() { std::vector p_charge_types_cpu(ctx.p_atoms_list->length); for (int i = 0; i < static_cast(ctx.p_atoms_list->length); i++) { const int id = p_atoms_cpu[i]; - const double charge = ccharges[charges[id].code - 1].charge; + const real_t charge = ccharges[charges[id].code - 1].charge; p_charge_types_cpu[i] = charge_to_type_host[charge]; } @@ -197,7 +197,7 @@ void initialize_charge_tables() { for (int state = 0; state < ctx.n_lambdas; state++) { for (int i = 0; i < 
static_cast(ctx.q_atoms_list->length); i++) { const int id = q_atoms_cpu[i]; - const double charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge; + const real_t charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge; q_charge_types_cpu[state * ctx.q_atoms_list->length + i] = charge_to_type_host[charge]; } } @@ -205,7 +205,7 @@ void initialize_charge_tables() { std::vector w_charge_types_cpu(ctx.w_atoms_list->length); for (int i = 0; i < static_cast(ctx.w_atoms_list->length); i++) { const int id = w_atoms_cpu[i]; - const double charge = ccharges[charges[id].code - 1].charge; + const real_t charge = ccharges[charges[id].code - 1].charge; w_charge_types_cpu[i] = charge_to_type_host[charge]; } @@ -493,8 +493,8 @@ void init_velocities() { auto& velocities = ctx.velocities->cpu_data_p; // If not previous value set, use a Maxwell distribution to fill velocities - double kT = Boltz * ctx.md.initial_temperature; - double sd, mass; + real_t kT = Boltz * ctx.md.initial_temperature; + real_t sd, mass; for (int i = 0; i < ctx.n_atoms; i++) { mass = catypes[atypes[i].code - 1].m; sd = sqrt(kT / mass); @@ -514,7 +514,7 @@ void init_inv_mass() { auto& ctx = Context::instance(); auto& atypes = ctx.atypes->cpu_data_p; auto& catypes = ctx.catypes->cpu_data_p; - ctx.winv = std::make_unique>(ctx.n_atoms, true, ctx.run_gpu); + ctx.winv = std::make_unique>(ctx.n_atoms, true, ctx.run_gpu); auto* winv = ctx.winv->cpu_data_p; for (int ai = 0; ai < ctx.n_atoms; ai++) { winv[ai] = 1 / catypes[atypes[ai].code - 1].m; @@ -539,7 +539,7 @@ void init_water_sphere() { void init_wshells() { auto& ctx = Context::instance(); int n_inshell; - double drs, router, ri, dr, Vshell, rshell; + real_t drs, router, ri, dr, Vshell, rshell; auto& bonds = ctx.bonds->cpu_data_p; auto& cbonds = ctx.cbonds->cpu_data_p; auto& angles = ctx.angles->cpu_data_p; @@ -547,8 +547,8 @@ void init_wshells() { // Get water properties from the first water molecule. 
cbond_t cbondw = cbonds[bonds[ctx.n_atoms_solute].code - 1]; cangle_t canglew = cangles[angles[ctx.n_atoms_solute].code - 1]; - const double crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge; - const double mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2); + const real_t crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge; + const real_t mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2); drs = wpolr_layer / drouter; @@ -605,7 +605,7 @@ void init_pshells() { auto& catypes = ctx.catypes->cpu_data_p; auto& coords_init = ctx.coords_init->cpu_data_p; auto* excluded = ctx.excluded->cpu_data_p; - double mass, r2, rin2; + real_t mass, r2, rin2; ctx.heavy = std::make_unique>(ctx.n_atoms, true, ctx.run_gpu); auto* heavy = ctx.heavy->cpu_data_p; @@ -655,7 +655,7 @@ static int mark_heavy_atoms(Context& ctx) { auto* heavy = ctx.heavy->cpu_data_p; int n_heavy = 0; for (int i = 0; i < ctx.n_atoms; i++) { - double mass = catypes[atypes[i].code - 1].m; + real_t mass = catypes[atypes[i].code - 1].m; if (mass < 4.0) { heavy[i] = false; } else { @@ -681,7 +681,7 @@ void init_pshells_from_charge_groups() { auto& ctx = Context::instance(); auto& coords_init = ctx.coords_init->cpu_data_p; auto* excluded = ctx.excluded->cpu_data_p; - double r2, rin2; + real_t r2, rin2; auto& charge_groups = ctx.charge_group_config; const bool use_switch_atom = charge_groups.iuse_switch_atom == 1; @@ -697,9 +697,9 @@ void init_pshells_from_charge_groups() { const auto& charge_group = charge_groups.charge_groups[grp]; int i = charge_group.iswitch - 1; if (heavy[i] && !excluded[i] && i < ctx.n_atoms_solute) { - double cx = coords_init[i].x; - double cy = coords_init[i].y; - double cz = coords_init[i].z; + real_t cx = coords_init[i].x; + real_t cy = coords_init[i].y; + real_t cz = coords_init[i].z; if (!use_switch_atom) { cx = 0.0; cy = 0.0; @@ -710,7 +710,7 @@ void init_pshells_from_charge_groups() { cy += coords_init[ai].y; cz += coords_init[ai].z; } - double inv_atoms = 1.0 / 
static_cast(charge_group.atoms.size()); + real_t inv_atoms = 1.0 / static_cast(charge_group.atoms.size()); cx *= inv_atoms; cy *= inv_atoms; cz *= inv_atoms; @@ -748,7 +748,7 @@ void init_shake() { int mol = 0; int shake; int n_solute_shake_constraints = 0; - double excl_shake = 0; + real_t excl_shake = 0; auto& bonds = ctx.bonds->cpu_data_p; auto& cbonds = ctx.cbonds->cpu_data_p; @@ -808,10 +808,10 @@ void init_shake() { ctx.Ndegf = 3 * ctx.n_atoms - ctx.n_shake_constraints; ctx.Ndegfree = ctx.Ndegf - 3 * ctx.n_excluded + excl_shake; - const double Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints; + const real_t Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints; - const double Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints); - const double Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent; + const real_t Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints); + const real_t Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent; printf("n_shake_constrains = %d, n_solute_shake_constraints = %d, excl_shake = %f\n", ctx.n_shake_constraints, n_solute_shake_constraints, excl_shake); diff --git a/src/core/common/src/parse.cpp b/src/core/common/src/parse.cpp index 98e859ae..1b45a7a6 100644 --- a/src/core/common/src/parse.cpp +++ b/src/core/common/src/parse.cpp @@ -132,7 +132,7 @@ void parse_md(const char* filename) { #ifdef VERBOSE printf("reading in %d lambdas (%s in file)\n", ctx.n_lambdas, file.buffer[k][1]); #endif - ctx.lambdas = std::make_unique>(ctx.n_lambdas, true, ctx.run_gpu); + ctx.lambdas = std::make_unique>(ctx.n_lambdas, true, ctx.run_gpu); auto *lambdas = ctx.lambdas->cpu_data_p; k++; for (int i = 0; i < ctx.n_lambdas; i++) { diff --git a/src/core/cpu/include/cpu_angle_force.h b/src/core/cpu/include/cpu_angle_force.h index df2a3a64..ea4f5ef6 100644 --- a/src/core/cpu/include/cpu_angle_force.h +++ 
b/src/core/cpu/include/cpu_angle_force.h @@ -1,2 +1,5 @@ #pragma once -double calc_angle_forces(int start, int end); \ No newline at end of file + +#include "common/include/precision.h" + +real_t calc_angle_forces(int start, int end); diff --git a/src/core/cpu/include/cpu_bond_force.h b/src/core/cpu/include/cpu_bond_force.h index 6a2f7f73..32775c6e 100644 --- a/src/core/cpu/include/cpu_bond_force.h +++ b/src/core/cpu/include/cpu_bond_force.h @@ -1,3 +1,5 @@ #pragma once -double calc_bond_forces(int start, int end); +#include "common/include/precision.h" + +real_t calc_bond_forces(int start, int end); diff --git a/src/core/cpu/include/cpu_improper2_force.h b/src/core/cpu/include/cpu_improper2_force.h index 26d694aa..b6606e57 100644 --- a/src/core/cpu/include/cpu_improper2_force.h +++ b/src/core/cpu/include/cpu_improper2_force.h @@ -1,3 +1,5 @@ #pragma once -double calc_improper2_forces(int start, int end); +#include "common/include/precision.h" + +real_t calc_improper2_forces(int start, int end); diff --git a/src/core/cpu/include/cpu_torsion_force.h b/src/core/cpu/include/cpu_torsion_force.h index 19089318..309bd505 100644 --- a/src/core/cpu/include/cpu_torsion_force.h +++ b/src/core/cpu/include/cpu_torsion_force.h @@ -1,3 +1,5 @@ #pragma once -double calc_torsion_forces(int start, int end); +#include "common/include/precision.h" + +real_t calc_torsion_forces(int start, int end); diff --git a/src/core/cpu/include/cpu_utils.h b/src/core/cpu/include/cpu_utils.h index e7be4557..352d6b3c 100644 --- a/src/core/cpu/include/cpu_utils.h +++ b/src/core/cpu/include/cpu_utils.h @@ -1,5 +1,7 @@ #pragma once -double gauss(double mean, double sd); -double to_degrees(double radians); -double to_radians(double degrees); +#include "common/include/precision.h" + +real_t gauss(real_t mean, real_t sd); +real_t to_degrees(real_t radians); +real_t to_radians(real_t degrees); diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp index ae600561..1f19390f 
100644 --- a/src/core/cpu/src/cpu_angle_force.cpp +++ b/src/core/cpu/src/cpu_angle_force.cpp @@ -5,7 +5,7 @@ #include "context.h" #include "cpu_utils.h" -double calc_angle_forces(int start, int end) { +real_t calc_angle_forces(int start, int end) { auto& ctx = Context::instance(); auto &coords = ctx.coords->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; @@ -15,11 +15,11 @@ double calc_angle_forces(int start, int end) { coord_t rji, rjk; coord_t di, dk; - double bji2inv, bjk2inv, bjiinv, bjkinv; + real_t bji2inv, bjk2inv, bjiinv, bjkinv; cangle_t cangle; - double cos_th, th, dth, dv, f1; - double ener; - double angle = 0; + real_t cos_th, th, dth, dv, f1; + real_t ener; + real_t angle = 0; auto &angles = ctx.angles->cpu_data_p; auto &cangles = ctx.cangles->cpu_data_p; diff --git a/src/core/cpu/src/cpu_bond_force.cpp b/src/core/cpu/src/cpu_bond_force.cpp index 2a539f90..0ab4baff 100644 --- a/src/core/cpu/src/cpu_bond_force.cpp +++ b/src/core/cpu/src/cpu_bond_force.cpp @@ -4,7 +4,7 @@ #include "context.h" -double calc_bond_forces(int start, int end) { +real_t calc_bond_forces(int start, int end) { auto& ctx = Context::instance(); auto &bonds = ctx.bonds->cpu_data_p; auto &cbonds = ctx.cbonds->cpu_data_p; @@ -13,8 +13,8 @@ double calc_bond_forces(int start, int end) { int aii, aji; coord_t ai, aj, dx; cbond_t cbond; - double dx2, dx1, ddx, ener, ampl; - double bond = 0; + real_t dx2, dx1, ddx, ener, ampl; + real_t bond = 0; for (int i = start; i < end; i++) { aii = bonds[i].ai - 1; diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp index 6e4faa60..fea7c724 100644 --- a/src/core/cpu/src/cpu_improper2_force.cpp +++ b/src/core/cpu/src/cpu_improper2_force.cpp @@ -5,7 +5,7 @@ #include "context.h" #include "cpu_utils.h" -double calc_improper2_forces(int start, int end) { +real_t calc_improper2_forces(int start, int end) { auto& ctx = Context::instance(); auto &impropers = ctx.impropers->cpu_data_p; auto &cimpropers 
= ctx.cimpropers->cpu_data_p; @@ -15,13 +15,13 @@ double calc_improper2_forces(int start, int end) { coord_t ai, aj, ak, al; coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; - double bj2inv, bk2inv, bjinv, bkinv; - double cos_phi, phi, arg, ener, dv, f1; + real_t bj2inv, bk2inv, bjinv, bkinv; + real_t cos_phi, phi, arg, ener, dv, f1; coord_t di, dl, dpi, dpj, dpk, dpl; improper_t imp; cimproper_t cimp; - double improper = 0; + real_t improper = 0; for (int i = start; i < end; i++) { imp = impropers[i]; diff --git a/src/core/cpu/src/cpu_leapfrog.cpp b/src/core/cpu/src/cpu_leapfrog.cpp index 9d1ff43a..0927e414 100644 --- a/src/core/cpu/src/cpu_leapfrog.cpp +++ b/src/core/cpu/src/cpu_leapfrog.cpp @@ -11,8 +11,8 @@ void calc_leapfrog() { auto &velocities = ctx.velocities->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; auto *xcoords = ctx.xcoords->cpu_data_p; - double mass_i; - double winv_i; + real_t mass_i; + real_t winv_i; for (int i = 0; i < ctx.n_atoms_solute; i++) { mass_i = catypes[atypes[i].code - 1].m; diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp index 390c67eb..cbeb11f5 100644 --- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp @@ -13,7 +13,7 @@ void calc_nonbonded_pp_forces() { auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p; auto *excluded = ctx.excluded->cpu_data_p; bool bond14, bond23; - double scaling; + real_t scaling; coord_t da; real_t r2a, ra, r6a; real_t V_a, V_b; @@ -67,8 +67,8 @@ void calc_nonbonded_pp_forces() { dvelocities[j].y += dva * da.y; dvelocities[j].z += dva * da.z; - ctx.E_nonbond_pp.Ucoul += static_cast(Vela); - ctx.E_nonbond_pp.Uvdw += static_cast(V_a - V_b); + ctx.E_nonbond_pp.Ucoul += static_cast(Vela); + ctx.E_nonbond_pp.Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp index 030c1290..52c9242b 100644 --- 
a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp @@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() { dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - ctx.E_nonbond_pw.Ucoul += static_cast(ecoul); - ctx.E_nonbond_pw.Uvdw += static_cast(v_a - v_b); + ctx.E_nonbond_pw.Ucoul += static_cast(ecoul); + ctx.E_nonbond_pw.Uvdw += static_cast(v_a - v_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp index 7a81a516..b0df677d 100644 --- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp @@ -18,7 +18,7 @@ void calc_nonbonded_qp_forces() { real_t r2, r; real_t ai_aii, aj_aii, ai_bii, aj_bii; bool bond23, bond14; - double scaling; + real_t scaling; real_t Vel, V_a, V_b, dv; for (int qi = 0; qi < ctx.n_qatoms; qi++) { @@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() { dvelocities[j].z += dv * da.z; // Update Q totals - ctx.EQ_nonbond_qp[state].Ucoul += static_cast(Vel); - ctx.EQ_nonbond_qp[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qp[state].Ucoul += static_cast(Vel); + ctx.EQ_nonbond_qp[state].Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp index 006a3c0e..96462795 100644 --- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp @@ -16,7 +16,7 @@ void calc_nonbonded_qq_forces() { auto *q_elscales = ctx.q_elscales->cpu_data_p; int ai, aj; real_t crg_i, crg_j; - double elscale, scaling; + real_t elscale, scaling; bool bond23, bond14; coord_t da; real_t r2a, ra, r6a; @@ -81,8 +81,8 @@ void calc_nonbonded_qq_forces() { dvelocities[aj].y += dva * da.y; dvelocities[aj].z += dva * da.z; - ctx.EQ_nonbond_qq[state].Ucoul += static_cast(Vela); - ctx.EQ_nonbond_qq[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qq[state].Ucoul += static_cast(Vela); + 
ctx.EQ_nonbond_qq[state].Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp index 8d18bc55..1ab0b469 100644 --- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp @@ -75,8 +75,8 @@ void calc_nonbonded_qw_forces() { dvH1 -= r2H1 * VelH1 * lambda; dvH2 -= r2H2 * VelH2 * lambda; - ctx.EQ_nonbond_qw[state].Ucoul += static_cast(VelO + VelH1 + VelH2); - ctx.EQ_nonbond_qw[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qw[state].Ucoul += static_cast(VelO + VelH1 + VelH2); + ctx.EQ_nonbond_qw[state].Uvdw += static_cast(V_a - V_b); } // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!! diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp index 3be5e6f0..f6d2ac98 100644 --- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp @@ -59,8 +59,8 @@ void accumulate_pair_force(Context& ctx, dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - energy.Ucoul += static_cast(ecoul); - energy.Uvdw += static_cast(evdw); + energy.Ucoul += static_cast(ecoul); + energy.Uvdw += static_cast(evdw); } void calc_nonbonded_ww_forces() { diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp index 5116dbbb..440d03d2 100644 --- a/src/core/cpu/src/cpu_polx_water_force.cpp +++ b/src/core/cpu/src/cpu_polx_water_force.cpp @@ -13,12 +13,12 @@ void calc_polx_w_forces(int iteration) { auto *wshells = ctx.wshells->cpu_data_p; int wi, imin, jw, ii, iis, jmin; - double tmin; + real_t tmin; coord_t rmu, rcu, f1O, f1H1, f1H2, f2; - double rm, rc; - double cos_th; - double avtdum, arg, f0, dv; - double ener; + real_t rm, rc; + real_t cos_th; + real_t avtdum, arg, f0, dv; + real_t ener; for (int is = 0; is < ctx.n_shells; is++) { wshells[is].n_inshell = 0; @@ -93,8 +93,8 @@ void calc_polx_w_forces(int 
iteration) { if (iteration != 0 && iteration % itdis_update == 0) { for (int is = 0; is < ctx.n_shells; is++) { printf("SHELL %d\n", is); - wshells[is].avtheta /= (double)itdis_update; - wshells[is].avn_inshell /= (double)itdis_update; + wshells[is].avtheta /= (real_t)itdis_update; + wshells[is].avn_inshell /= (real_t)itdis_update; wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb); printf("average theta = %f, average in shell = %f, theta_corr = %f\n", @@ -113,7 +113,7 @@ void calc_polx_w_forces(int iteration) { avtdum = 0; for (int il = 0; il < wshells[is].n_inshell; il++) { ii = ctx.nsort[il][is]; - arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell); + arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell); ctx.theta0[il] = acos(arg); ctx.theta0[il] = ctx.theta0[il] - 3 * sin(ctx.theta0[il]) * wshells[is].cstb / 2; if (ctx.theta0[il] < 0) { @@ -189,7 +189,7 @@ void calc_polx_w_forces(int iteration) { dvelocities[wi + 2].z += f0 * f1H2.z; } - wshells[is].avtheta += avtdum / (double)wshells[is].n_inshell; + wshells[is].avtheta += avtdum / (real_t)wshells[is].n_inshell; wshells[is].avn_inshell += wshells[is].n_inshell; } } diff --git a/src/core/cpu/src/cpu_pshell_force.cpp b/src/core/cpu/src/cpu_pshell_force.cpp index 9ff083cc..a547f16d 100644 --- a/src/core/cpu/src/cpu_pshell_force.cpp +++ b/src/core/cpu/src/cpu_pshell_force.cpp @@ -13,7 +13,7 @@ void calc_pshell_forces() { auto *shell = ctx.shell->cpu_data_p; coord_t dr; - double k, r2, ener; + real_t k, r2, ener; for (int i = 0; i < ctx.n_atoms_solute; i++) { if (shell[i] || excluded[i]) { diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp index 14aa802c..c92c904d 100644 --- a/src/core/cpu/src/cpu_q_angle_force.cpp +++ b/src/core/cpu/src/cpu_q_angle_force.cpp @@ -14,8 +14,8 @@ void calc_qangle_forces(int state) { int ic; int ai, aj, ak; coord_t rji, rjk; - double bji, bjk; - double cos_th, th, 
dth, ener, dv, f1; + real_t bji, bjk; + real_t cos_th, th, dth, ener, dv, f1; coord_t di, dk; for (int i = 0; i < ctx.n_qangles; i++) { diff --git a/src/core/cpu/src/cpu_q_bond_force.cpp b/src/core/cpu/src/cpu_q_bond_force.cpp index 5f2f7203..6b924c69 100644 --- a/src/core/cpu/src/cpu_q_bond_force.cpp +++ b/src/core/cpu/src/cpu_q_bond_force.cpp @@ -11,7 +11,7 @@ void calc_qbond_forces(int state) { auto *lambdas = ctx.lambdas->cpu_data_p; int ic; int ai, aj; - double b, db, ener, dv; + real_t b, db, ener, dv; coord_t rij; for (int i = 0; i < ctx.n_qbonds; i++) { diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp index 7b7fb271..2be495b0 100644 --- a/src/core/cpu/src/cpu_q_torsion_force.cpp +++ b/src/core/cpu/src/cpu_q_torsion_force.cpp @@ -15,10 +15,10 @@ void calc_qtorsion_forces(int state) { coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - double bj2inv, bk2inv, bjinv, bkinv; - double bj, bk, cos_phi, phi; - double arg, dv, f1; - double ener; + real_t bj2inv, bk2inv, bjinv, bkinv; + real_t bj, bk, cos_phi, phi; + real_t arg, dv, f1; + real_t ener; for (int i = 0; i < ctx.n_qtorsions; i++) { ic = ctx.q_torsions[i + ctx.n_qtorsions * state].code; diff --git a/src/core/cpu/src/cpu_radix_water_force.cpp b/src/core/cpu/src/cpu_radix_water_force.cpp index a887ad31..a85af35c 100644 --- a/src/core/cpu/src/cpu_radix_water_force.cpp +++ b/src/core/cpu/src/cpu_radix_water_force.cpp @@ -10,9 +10,9 @@ void calc_radix_w_forces() { auto &coords = ctx.coords->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; - double b, db, ener, dv, fexp; + real_t b, db, ener, dv, fexp; coord_t dr; - double shift; + real_t shift; if (ctx.md.radial_force != 0) { shift = sqrt(Boltz * ctx.Tfree / ctx.md.radial_force); diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp index 84f593b0..c2b9ed50 100644 --- a/src/core/cpu/src/cpu_restrang_force.cpp +++ 
b/src/core/cpu/src/cpu_restrang_force.cpp @@ -15,8 +15,8 @@ void calc_restrang_forces() { int state, i, j, k; coord_t dr, dr2, di, dk; - double lambda, r2ij, r2jk, rij, rjk, cos_th, th; - double dth, dv, ener, f1; + real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th; + real_t dth, dv, ener, f1; for (int ir = 0; ir < ctx.n_restrangs; ir++) { state = restrangs[ir].ipsi - 1; diff --git a/src/core/cpu/src/cpu_restrdis_force.cpp b/src/core/cpu/src/cpu_restrdis_force.cpp index c15cbef7..859481f3 100644 --- a/src/core/cpu/src/cpu_restrdis_force.cpp +++ b/src/core/cpu/src/cpu_restrdis_force.cpp @@ -14,7 +14,7 @@ void calc_restrdis_forces() { int state, i, j; coord_t dr; - double lambda, b, db, dv, ener; + real_t lambda, b, db, dv, ener; for (int ir = 0; ir < ctx.n_restrdists; ir++) { state = restrdists[ir].ipsi - 1; diff --git a/src/core/cpu/src/cpu_restrpos_force.cpp b/src/core/cpu/src/cpu_restrpos_force.cpp index 6db044b4..a3e8710d 100644 --- a/src/core/cpu/src/cpu_restrpos_force.cpp +++ b/src/core/cpu/src/cpu_restrpos_force.cpp @@ -14,7 +14,7 @@ void calc_restrpos_forces() { int state, i; coord_t dr; - double lambda, ener, x2, y2, z2; + real_t lambda, ener, x2, y2, z2; for (int ir = 0; ir < ctx.n_restrspos; ir++) { state = restrspos[ir].ipsi - 1; diff --git a/src/core/cpu/src/cpu_restrseq_force.cpp b/src/core/cpu/src/cpu_restrseq_force.cpp index 296762e8..f9ff9fd0 100644 --- a/src/core/cpu/src/cpu_restrseq_force.cpp +++ b/src/core/cpu/src/cpu_restrseq_force.cpp @@ -13,9 +13,9 @@ void calc_restrseq_forces() { auto &restrseqs = ctx.restrseqs->cpu_data_p; auto *heavy = ctx.heavy->cpu_data_p; - double k, mass, totmass; + real_t k, mass, totmass; coord_t dr; - double r2, ener; + real_t r2, ener; for (int s = 0; s < ctx.n_restrseqs; s++) { k = restrseqs[s].k; diff --git a/src/core/cpu/src/cpu_restrwall_force.cpp b/src/core/cpu/src/cpu_restrwall_force.cpp index fd49749a..7da6faa6 100644 --- a/src/core/cpu/src/cpu_restrwall_force.cpp +++ b/src/core/cpu/src/cpu_restrwall_force.cpp 
@@ -11,7 +11,7 @@ void calc_restrwall_forces() { auto &restrwalls = ctx.restrwalls->cpu_data_p; auto *heavy = ctx.heavy->cpu_data_p; - double k, b, db, ener, dv, fexp; + real_t k, b, db, ener, dv, fexp; coord_t dr; for (int ir = 0; ir < ctx.n_restrwalls; ir++) { diff --git a/src/core/cpu/src/cpu_shake.cpp b/src/core/cpu/src/cpu_shake.cpp index cb29a0f0..91162c98 100644 --- a/src/core/cpu/src/cpu_shake.cpp +++ b/src/core/cpu/src/cpu_shake.cpp @@ -34,7 +34,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) { const int aj = shake_bond.aj - 1; coord_t xij; coord_t xxij; - double xij2, diff, corr, scp; + real_t xij2, diff, corr, scp; xij.x = coords[ai].x - coords[aj].x; xij.y = coords[ai].y - coords[aj].y; @@ -75,7 +75,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) { const int ai = shake_bonds[shake + i].ai - 1; const int aj = shake_bonds[shake + i].aj - 1; coord_t xxij; - double xxij2; + real_t xxij2; xxij.x = xcoords[ai].x - xcoords[aj].x; xxij.y = xcoords[ai].y - xcoords[aj].y; @@ -125,11 +125,11 @@ void stop_cm_translation() { auto &atypes = ctx.atypes->cpu_data_p; auto &catypes = ctx.catypes->cpu_data_p; auto &velocities = ctx.velocities->cpu_data_p; - double total_mass = 0; + real_t total_mass = 0; coord_t vcm = {}; for (int ai = 0; ai < ctx.n_atoms; ai++) { - const double rmass = catypes[atypes[ai].code - 1].m; + const real_t rmass = catypes[atypes[ai].code - 1].m; total_mass += rmass; vcm.x += velocities[ai].x * rmass; vcm.y += velocities[ai].y; diff --git a/src/core/cpu/src/cpu_temperature.cpp b/src/core/cpu/src/cpu_temperature.cpp index 6b76139f..537dec77 100644 --- a/src/core/cpu/src/cpu_temperature.cpp +++ b/src/core/cpu/src/cpu_temperature.cpp @@ -17,11 +17,11 @@ void calc_temperature() { auto *excluded = ctx.excluded->cpu_data_p; ctx.Temp = 0; ctx.Tfree = 0; - double Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0; - double Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0; - double Ekinmax = 1000.0 * ctx.Ndegf 
* Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms; - double ener; - double mass_i; + real_t Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0; + real_t Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0; + real_t Ekinmax = 1000.0 * ctx.Ndegf * Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms; + real_t ener; + real_t mass_i; ctx.Temp = 0; for (int i = 0; i < ctx.n_atoms_solute; i++) { diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp index 4ebb44b2..37a68298 100644 --- a/src/core/cpu/src/cpu_torsion_force.cpp +++ b/src/core/cpu/src/cpu_torsion_force.cpp @@ -5,7 +5,7 @@ #include "context.h" #include "cpu_utils.h" -double calc_torsion_forces(int start, int end) { +real_t calc_torsion_forces(int start, int end) { auto& ctx = Context::instance(); auto &torsions = ctx.torsions->cpu_data_p; auto &ctorsions = ctx.ctorsions->cpu_data_p; @@ -17,11 +17,11 @@ double calc_torsion_forces(int start, int end) { coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - double bj2inv, bk2inv, bjinv, bkinv; - double cos_phi, phi; - double arg, dv, f1; - double ener; - double torsion = 0; + real_t bj2inv, bk2inv, bjinv, bkinv; + real_t cos_phi, phi; + real_t arg, dv, f1; + real_t ener; + real_t torsion = 0; torsion_t t; ctorsion_t ctors; diff --git a/src/core/cpu/src/utils.cpp b/src/core/cpu/src/utils.cpp index ed680aa3..00c37e41 100644 --- a/src/core/cpu/src/utils.cpp +++ b/src/core/cpu/src/utils.cpp @@ -1,24 +1,25 @@ #include #include +#include "common/include/precision.h" + // Get a value from a gaussian distributed random variable with // mean mean and standard deviation sd -double gauss(double mean, double sd) { - double v1, v2, nd10; +real_t gauss(real_t mean, real_t sd) { + real_t v1, v2, nd10; - v1 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. ); - v2 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. ); + v1 = ( (real_t)(rand()) + 1. )/( (real_t)(RAND_MAX) + 1. ); + v2 = ( (real_t)(rand()) + 1. 
)/( (real_t)(RAND_MAX) + 1. ); nd10 = cos(2 * M_PI * v2) * sqrt(-2. * log(v1)); return sd * nd10 + mean; } -double to_degrees(double radians) { +real_t to_degrees(real_t radians) { return radians * (180.0 / M_PI); } -double to_radians(double degrees) { +real_t to_radians(real_t degrees) { return degrees * (M_PI / 180.0); } - diff --git a/src/core/cuda/include/cuda_angle_force.cuh b/src/core/cuda/include/cuda_angle_force.cuh index c2e00e15..63ebb011 100644 --- a/src/core/cuda/include/cuda_angle_force.cuh +++ b/src/core/cuda/include/cuda_angle_force.cuh @@ -1,5 +1,7 @@ #pragma once +#include "common/include/precision.h" + void init_angle_force_kernel_data(); -double calc_angle_forces_host(int start, int end); +real_t calc_angle_forces_host(int start, int end); void cleanup_angle_force(); diff --git a/src/core/cuda/include/cuda_bond_force.cuh b/src/core/cuda/include/cuda_bond_force.cuh index 83961ed5..bddc873c 100644 --- a/src/core/cuda/include/cuda_bond_force.cuh +++ b/src/core/cuda/include/cuda_bond_force.cuh @@ -1,5 +1,7 @@ #pragma once +#include "common/include/precision.h" + void init_bond_force_kernel_data(); -double calc_bond_forces_host(int start, int end); +real_t calc_bond_forces_host(int start, int end); void cleanup_bond_force(); diff --git a/src/core/cuda/include/cuda_improper2_force.cuh b/src/core/cuda/include/cuda_improper2_force.cuh index cb0a9635..9e0a2cfd 100644 --- a/src/core/cuda/include/cuda_improper2_force.cuh +++ b/src/core/cuda/include/cuda_improper2_force.cuh @@ -1,5 +1,7 @@ #pragma once +#include "common/include/precision.h" + void init_improper2_force_kernel_data(); -double calc_improper2_forces_host(int start, int end); +real_t calc_improper2_forces_host(int start, int end); void cleanup_improper2_force(); diff --git a/src/core/cuda/include/cuda_nonbonded_force.cuh b/src/core/cuda/include/cuda_nonbonded_force.cuh index f1a9b252..ee227088 100644 --- a/src/core/cuda/include/cuda_nonbonded_force.cuh +++ 
b/src/core/cuda/include/cuda_nonbonded_force.cuh @@ -1,8 +1,12 @@ #pragma once +#include + +#include "common/include/precision.h" + void init_nonbonded_force_kernel_data(); -std::pair calc_nonbonded_force_host( +std::pair calc_nonbonded_force_host( int nx, int ny, int* x_idx_list, @@ -14,7 +18,7 @@ std::pair calc_nonbonded_force_host( const int* x_atypes_types, const int* y_atypes_types, const bool disable_water_h_lj = false, - const double lambda = 1.0 + const real_t lambda = 1.0 ); void cleanup_nonbonded_force(); diff --git a/src/core/cuda/include/cuda_torsion_force.cuh b/src/core/cuda/include/cuda_torsion_force.cuh index 50315181..cac7e191 100644 --- a/src/core/cuda/include/cuda_torsion_force.cuh +++ b/src/core/cuda/include/cuda_torsion_force.cuh @@ -1,6 +1,8 @@ #pragma once +#include "common/include/precision.h" + void init_torsion_force_kernel_data(); -double calc_torsion_forces_host(int start, int end); +real_t calc_torsion_forces_host(int start, int end); void cleanup_torsion_force(); diff --git a/src/core/cuda/include/cuda_utility.cuh b/src/core/cuda/include/cuda_utility.cuh index 36767be0..9cbcefd5 100644 --- a/src/core/cuda/include/cuda_utility.cuh +++ b/src/core/cuda/include/cuda_utility.cuh @@ -3,7 +3,8 @@ #include #include "common/include/cuda_runtime_utility.h" +#include "common/include/precision.h" -__device__ inline double to_radians_device(double degrees) { +__device__ inline real_t to_radians_device(real_t degrees) { return degrees * (M_PI / 180.0); } diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu index f20b039a..445bed51 100644 --- a/src/core/cuda/src/cuda_angle_force.cu +++ b/src/core/cuda/src/cuda_angle_force.cu @@ -4,10 +4,10 @@ namespace CudaAngleForce { bool is_initialized = false; -double* d_energy_sum; +real_t* d_energy_sum; } // namespace CudaAngleForce -__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, double* 
energy_sum) { +__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, real_t* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; @@ -24,21 +24,22 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co coord_t rji = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z}; coord_t rjk = {rk.x - rj.x, rk.y - rj.y, rk.z - rj.z}; - double rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z); - double rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z); + real_t rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z); + real_t rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z); - double cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length); + real_t cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length); - cos_theta = fmax(fmin(cos_theta, 1.0), -1.0); // Clamp value to avoid NaNs - double theta = acos(cos_theta); + cos_theta = cos_theta > static_cast(1.0) ? static_cast(1.0) : cos_theta; + cos_theta = cos_theta < static_cast(-1.0) ? 
static_cast(-1.0) : cos_theta; + real_t theta = acos(cos_theta); - double dtheta = theta - to_radians_device(cang.th0); - double energy = 0.5 * cang.kth * dtheta * dtheta; + real_t dtheta = theta - to_radians_device(cang.th0); + real_t energy = 0.5 * cang.kth * dtheta * dtheta; // calculate force magnitude - double dv = cang.kth * dtheta; + real_t dv = cang.kth * dtheta; - double f1 = sin(theta); + real_t f1 = sin(theta); if (fabs(f1) < k_singular_sin_epsilon) { f1 = -1.0 / k_singular_sin_epsilon; } else { @@ -70,7 +71,7 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co atomicAdd(&dvelocities[j].z, -dv * (di.z + dk.z)); } -double calc_angle_forces_host(int start, int end) { +real_t calc_angle_forces_host(int start, int end) { int N = end - start; if (N <= 0) return 0.0; using namespace CudaAngleForce; @@ -85,8 +86,8 @@ double calc_angle_forces_host(int start, int end) { // todo: now have to do that, after moving all to CudaContext, can remove it // ctx.sync_all_to_device(); - double h_energy_sum = 0.0; - cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(double), cudaMemcpyHostToDevice); + real_t h_energy_sum = 0.0; + cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(real_t), cudaMemcpyHostToDevice); // launch kernel calc_angle_forces_kernel<<>>(start, end, d_angles, d_coords, d_cangles, d_dvelocities, d_energy_sum); @@ -94,14 +95,14 @@ double calc_angle_forces_host(int start, int end) { // todo: Now have to do that, after moving all to CudaContext, can remove it // copy results back to host - cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); return h_energy_sum; } void init_angle_force_kernel_data() { using namespace CudaAngleForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); is_initialized = true; } } diff --git 
a/src/core/cuda/src/cuda_bond_force.cu b/src/core/cuda/src/cuda_bond_force.cu index 9b31a660..476d7209 100644 --- a/src/core/cuda/src/cuda_bond_force.cu +++ b/src/core/cuda/src/cuda_bond_force.cu @@ -3,9 +3,9 @@ #include "cuda_utility.cuh" namespace CudaBondForce { bool is_initialized = false; -double* d_energy_sum; +real_t* d_energy_sum; } // namespace CudaBondForce -__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, double* energy_sum) { +__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, real_t* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; bond_t bond = bonds[idx]; @@ -13,18 +13,18 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord coord_t rj = coords[bond.aj - 1]; cbond_t cbond = cbonds[bond.code - 1]; - double dx = rj.x - ri.x; - double dy = rj.y - ri.y; - double dz = rj.z - ri.z; - double r = sqrt(dx * dx + dy * dy + dz * dz); + real_t dx = rj.x - ri.x; + real_t dy = rj.y - ri.y; + real_t dz = rj.z - ri.z; + real_t r = sqrt(dx * dx + dy * dy + dz * dz); - double dr = r - cbond.b0; - double energy = 0.5 * cbond.kb * dr * dr; + real_t dr = r - cbond.b0; + real_t energy = 0.5 * cbond.kb * dr * dr; atomicAdd(energy_sum, energy); // update forces - double f = cbond.kb * dr / r; + real_t f = cbond.kb * dr / r; atomicAdd(&dvelocities[bond.aj - 1].x, f * dx); atomicAdd(&dvelocities[bond.aj - 1].y, f * dy); atomicAdd(&dvelocities[bond.aj - 1].z, f * dz); @@ -33,15 +33,15 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord atomicAdd(&dvelocities[bond.ai - 1].z, -f * dz); } -double calc_bond_forces_host(int start, int end) { +real_t calc_bond_forces_host(int start, int end) { int N = end - start; if (N <= 0) return 0.0; using namespace CudaBondForce; int blockSize = 256; int numBlocks = (N + blockSize 
- 1) / blockSize; - double energy = 0.0; - cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); + real_t energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice); auto& host_ctx = Context::instance(); bond_t* d_bonds = host_ctx.bonds->gpu_data_p; @@ -51,7 +51,7 @@ double calc_bond_forces_host(int start, int end) { calc_bond_forces_kernel<<>>(start, end, d_bonds, d_coords, d_cbonds, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); return energy; } @@ -59,7 +59,7 @@ double calc_bond_forces_host(int start, int end) { void init_bond_force_kernel_data() { using namespace CudaBondForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu index dd7d91aa..f0f790db 100644 --- a/src/core/cuda/src/cuda_improper2_force.cu +++ b/src/core/cuda/src/cuda_improper2_force.cu @@ -4,10 +4,10 @@ namespace CudaImproper2Force { bool is_initialized = false; -double* d_energy_sum; +real_t* d_energy_sum; } // namespace CudaImproper2Force -__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum) { +__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) { int i = blockIdx.x * blockDim.x + threadIdx.x + start; if (i >= end) return; @@ -15,8 +15,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp coord_t ai, aj, ak, al; coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; - double bj2inv, bk2inv, bjinv, bkinv; - double 
cos_phi, phi, arg, ener, dv, f1; + real_t bj2inv, bk2inv, bjinv, bkinv; + real_t cos_phi, phi, arg, ener, dv, f1; coord_t di, dl, dpi, dpj, dpk, dpl; improper_t imp; @@ -124,15 +124,15 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp atomicAdd(&dvelocities[ali].z, dv * dpl.z); } -double calc_improper2_forces_host(int start, int end) { +real_t calc_improper2_forces_host(int start, int end) { int N = end - start; if (N <= 0) return 0.0; using namespace CudaImproper2Force; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - double energy = 0.0; - cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); + real_t energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice); auto& host_ctx = Context::instance(); coord_t* d_coords = host_ctx.coords->gpu_data_p; @@ -142,14 +142,14 @@ double calc_improper2_forces_host(int start, int end) { calc_improper2_forces_kernel<<>>(start, end, d_impropers, d_cimpropers, d_coords, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); return energy; } void init_improper2_force_kernel_data() { using namespace CudaImproper2Force; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu index 1e010f7e..2ac8245a 100644 --- a/src/core/cuda/src/cuda_leapfrog.cu +++ b/src/core/cuda/src/cuda_leapfrog.cu @@ -18,20 +18,20 @@ __global__ void calc_leapfrog_kernel( coord_t* xcoords, int n_atoms, int n_atoms_solute, - double Tscale_solute, - double Tscale_solvent, - double dt) { + real_t Tscale_solute, + real_t Tscale_solvent, + real_t dt) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= 
n_atoms) return; int i = idx; // Kernel implementation goes here - double mass_i, winv_i; + real_t mass_i, winv_i; mass_i = catypes[atypes[i].code - 1].m; winv_i = 1 / mass_i; - double scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent; + real_t scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent; velocities[i].x = (velocities[i].x - dvelocities[i].x * dt * winv_i) * scale; velocities[i].y = (velocities[i].y - dvelocities[i].y * dt * winv_i) * scale; velocities[i].z = (velocities[i].z - dvelocities[i].z * dt * winv_i) * scale; @@ -50,7 +50,7 @@ __global__ void update_velocities_from_positions_kernel( const coord_t* coords, const coord_t* xcoords, int n_atoms, - double dt) { + real_t dt) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_atoms) return; diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu index 78c4bc91..f925fe6f 100644 --- a/src/core/cuda/src/cuda_nonbonded_14_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu @@ -9,8 +9,8 @@ bool is_initialized = false; constexpr int kNonbonded14ModeCount = 3; int* d_atom_to_qi = nullptr; -double* d_evdw_totals = nullptr; -double* d_ecoul_totals = nullptr; +real_t* d_evdw_totals = nullptr; +real_t* d_ecoul_totals = nullptr; __device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) { #ifdef QDYN_SPFP @@ -96,13 +96,13 @@ __global__ void calc_nonbonded_14_force_kernel( const catype_t* unified_catypes, const coord_t* d_coords, dvel_t* d_dvelocities, - double* evdw_totals, - double* ecoul_totals, + real_t* evdw_totals, + real_t* ecoul_totals, bool include_pp, int state, int n_atoms, int n_qatoms, - double lambda) { + real_t lambda) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_pairs) return; @@ -166,14 +166,14 @@ __global__ void calc_nonbonded_14_force_kernel( namespace { struct Nonbonded14EnergyBuckets { - double evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; - double 
ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; + real_t evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; + real_t ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; }; } static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host( int state, - double lambda, + real_t lambda, bool include_pp) { using namespace CudaNonbonded14Force; @@ -182,8 +182,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host( Nonbonded14EnergyBuckets energies = {}; if (n_ngbrs_14 == 0) return energies; - cudaMemset(d_ecoul_totals, 0, sizeof(double) * kNonbonded14ModeCount); - cudaMemset(d_evdw_totals, 0, sizeof(double) * kNonbonded14ModeCount); + cudaMemset(d_ecoul_totals, 0, sizeof(real_t) * kNonbonded14ModeCount); + cudaMemset(d_evdw_totals, 0, sizeof(real_t) * kNonbonded14ModeCount); const int block_size = 256; const int num_blocks = (n_ngbrs_14 + block_size - 1) / block_size; @@ -208,8 +208,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host( cudaDeviceSynchronize(); - cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); - cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); + cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); + cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); return energies; } @@ -221,7 +221,7 @@ void calc_nonbonded_14_forces_host() { if (host.n_ngbrs14 == 0) return; for (int state = 0; state < host.n_lambdas; state++) { - const double lambda = lambdas[state]; + const real_t lambda = lambdas[state]; const bool include_pp = (state == 0); Nonbonded14EnergyBuckets energies = calc_nonbonded_14_force_state_host(state, lambda, include_pp); @@ -248,8 +248,8 @@ void init_nonbonded_14_force_kernel_data() { check_cudaMalloc((void**)&d_atom_to_qi, sizeof(int) * host.atom_to_qi.size()); 
check_cuda(cudaMemcpy(d_atom_to_qi, host.atom_to_qi.data(), sizeof(int) * host.atom_to_qi.size(), cudaMemcpyHostToDevice)); - check_cudaMalloc((void**)&d_evdw_totals, sizeof(double) * kNonbonded14ModeCount); - check_cudaMalloc((void**)&d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount); + check_cudaMalloc((void**)&d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount); + check_cudaMalloc((void**)&d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount); is_initialized = true; } diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu index 32b4077a..d7f0719c 100644 --- a/src/core/cuda/src/cuda_nonbonded_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_force.cu @@ -7,7 +7,7 @@ namespace CudaNonbondedForce { bool is_initialized = false; -double *d_evdw_total, *d_ecoul_total; +real_t *d_evdw_total, *d_ecoul_total; template struct nonbond_vec_t { @@ -20,9 +20,11 @@ __device__ __forceinline__ float nonbond_rsqrt(float value) { return rsqrtf(value); } +#ifndef QDYN_SPFP __device__ __forceinline__ double nonbond_rsqrt(double value) { return rsqrt(value); } +#endif __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); @@ -39,6 +41,7 @@ __device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffff return __shfl_sync(mask, v, srcLane); } +#ifndef QDYN_SPFP template <> __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) { int2 a = *reinterpret_cast(&v); @@ -46,6 +49,7 @@ __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mas a.y = __shfl_sync(mask, a.y, srcLane); return *reinterpret_cast(&a); } +#endif __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) { v.x = shfl_value(v.x, srcLane, mask); @@ -76,8 +80,8 @@ __device__ void calculate_unforce_bound( const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); const 
WorkT r2 = r * r; const WorkT r6 = r2 * r2 * r2; - // double v_a = r6 * r6; - // double v_b = r6; + // real_t v_a = r6 * r6; + // real_t v_b = r6; // ecoul = r; // evdw = v_a - v_b; // dv = r2 * (-ecoul - v_a + v_b); @@ -116,8 +120,8 @@ __global__ void calc_nonbonded_force_kernel( dvel_t* d_dvelocities, - double* evdw_tot, - double* ecoul_tot, + real_t* evdw_tot, + real_t* ecoul_tot, bool symmetric, @@ -130,7 +134,7 @@ __global__ void calc_nonbonded_force_kernel( const int n_catype_types, const int zero_catype_type, const int n_qelscales, - const double lambda, + const real_t lambda, const q_elscale_t* d_qelscales // todo: Now doesn't use it. Should optimize it later ) { @@ -180,8 +184,8 @@ __global__ void calc_nonbonded_force_kernel( nonbond_vec_t x_force = {0.0, 0.0, 0.0}; nonbond_vec_t y_force = {0.0, 0.0, 0.0}; - double evdw_sum = 0.0; - double ecoul_sum = 0.0; + real_t evdw_sum = 0.0; + real_t ecoul_sum = 0.0; const unsigned mask = 0xffffffffu; @@ -307,7 +311,7 @@ __global__ void calc_nonbonded_force_kernel( } // namespace CudaNonbondedForce -std::pair calc_nonbonded_force_host( +std::pair calc_nonbonded_force_host( int nx, int ny, int* x_idx_list, @@ -318,7 +322,7 @@ std::pair calc_nonbonded_force_host( const int* y_charges_types, const int* x_atypes_types, const int* y_atypes_types, - const bool disable_water_h_lj, const double lambda) { + const bool disable_water_h_lj, const real_t lambda) { using namespace CudaNonbondedForce; Context& host = Context::instance(); const int thread_num = 256; @@ -334,8 +338,8 @@ std::pair calc_nonbonded_force_host( dim3 grid = dim3(grid_sz); - cudaMemset(d_ecoul_total, 0, sizeof(double)); - cudaMemset(d_evdw_total, 0, sizeof(double)); + cudaMemset(d_ecoul_total, 0, sizeof(real_t)); + cudaMemset(d_evdw_total, 0, sizeof(real_t)); auto launch_kernel = [&](auto work_tag) { using WorkT = decltype(work_tag); @@ -373,9 +377,9 @@ std::pair calc_nonbonded_force_host( cudaDeviceSynchronize(); - double evdw_tot = 0, ecoul_tot = 0; - 
cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(double), cudaMemcpyDeviceToHost); + real_t evdw_tot = 0, ecoul_tot = 0; + cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(real_t), cudaMemcpyDeviceToHost); return {evdw_tot, ecoul_tot}; } @@ -383,8 +387,8 @@ std::pair calc_nonbonded_force_host( void init_nonbonded_force_kernel_data() { using namespace CudaNonbondedForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_evdw_total, sizeof(double)); - check_cudaMalloc((void**)&d_ecoul_total, sizeof(double)); + check_cudaMalloc((void**)&d_evdw_total, sizeof(real_t)); + check_cudaMalloc((void**)&d_ecoul_total, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu index 7be0656f..bdb35608 100644 --- a/src/core/cuda/src/cuda_polx_water_force.cu +++ b/src/core/cuda/src/cuda_polx_water_force.cu @@ -14,11 +14,11 @@ int* water_shell = nullptr; int* water_rank = nullptr; int* polx_list_sh = nullptr; // use 1d array to simulate 2d array -double* d_energy; +real_t* d_energy; int* d_list_sh = nullptr; -double* d_theta = nullptr; -double* d_theta0 = nullptr; -double* d_tdum = nullptr; +real_t* d_theta = nullptr; +real_t* d_theta0 = nullptr; +real_t* d_tdum = nullptr; int* d_water_shell = nullptr; int* d_water_rank = nullptr; @@ -27,15 +27,15 @@ int* d_water_rank = nullptr; __global__ void calc_polx_theta_and_shells( int n_waters, int n_shells, int n_atoms_solute, coord_t* coords, topo_t topo, shell_t* wshells, int* list_sh, - double* theta, double* theta0, double* tdum) { + real_t* theta, real_t* theta0, real_t* tdum) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_waters) return; int i = idx; int wi, iis; coord_t rmu, rcu; - double rm, rc; - double cos_th; + real_t rm, rc; + real_t cos_th; theta[i] = 0; theta0[i] = 0; @@ 
-81,7 +81,7 @@ __global__ void calc_polx_theta_and_shells( __global__ void calc_polx_water_forces_kernel( int n_waters, int n_atoms_solute, shell_t* wshells, coord_t* coords, dvel_t* dvelocities, topo_t topo, - double* theta, md_t md, double* energy, + real_t* theta, md_t md, real_t* energy, int* water_rank, int* water_shell) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_waters) return; @@ -92,21 +92,21 @@ __global__ void calc_polx_water_forces_kernel( int wi, ii; coord_t rmu, rcu, f1O, f1H1, f1H2, f2; - double rm, rc; - double cos_th; - double avtdum, arg, f0, dv; - double ener; + real_t rm, rc; + real_t cos_th; + real_t avtdum, arg, f0, dv; + real_t ener; avtdum = 0; ii = idx; - arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell); - double theta_val = acos(arg); + arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell); + real_t theta_val = acos(arg); theta_val = theta_val - 3 * sin(theta_val) * wshells[is].cstb / 2; if (theta_val < 0) theta_val = 0; if (theta_val > M_PI) theta_val = M_PI; avtdum += theta[ii]; - const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr; + const real_t dtheta = theta[ii] - theta_val + wshells[is].theta_corr; ener = .5 * md.polarisation_force * dtheta * dtheta; // E_restraint.Upolx += ener; atomicAdd(energy, ener); @@ -164,7 +164,7 @@ __global__ void calc_polx_water_forces_kernel( atomicAdd(&dvelocities[wi + 2].y, f0 * (f1H2.y)); atomicAdd(&dvelocities[wi + 2].z, f0 * (f1H2.z)); - atomicAdd(&wshells[is].avtheta, avtdum / (double)wshells[is].n_inshell); + atomicAdd(&wshells[is].avtheta, avtdum / (real_t)wshells[is].n_inshell); atomicAdd(&wshells[is].avn_inshell, wshells[is].n_inshell); } @@ -174,7 +174,7 @@ void sort_waters() { auto *wshells = ctx.wshells->cpu_data_p; int imin, jmin, jw; - double tmin; + real_t tmin; // Sort the waters according to theta for (int is = 0; is < ctx.n_shells; is++) { imin = 0; @@ -224,7 +224,7 @@ void calc_polx_water_forces_host(int 
iteration) { // todo: sort in cpu now.. ctx.wshells->download(); - cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(real_t), cudaMemcpyDeviceToHost); cudaMemcpy(polx_list_sh, d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int), cudaMemcpyDeviceToHost); // Reset per-water metadata; only waters placed in shells will be overwritten in sort_waters(). @@ -244,8 +244,8 @@ void calc_polx_water_forces_host(int iteration) { if (iteration != 0 && iteration % itdis_update == 0) { for (int is = 0; is < ctx.n_shells; is++) { printf("SHELL %d\n", is); - wshells[is].avtheta /= (double)itdis_update; - wshells[is].avn_inshell /= (double)itdis_update; + wshells[is].avtheta /= (real_t)itdis_update; + wshells[is].avn_inshell /= (real_t)itdis_update; wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb); printf("average theta = %f, average in shell = %f, theta_corr = %f\n", wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell, wshells[is].theta_corr * 180 / M_PI); @@ -256,12 +256,12 @@ void calc_polx_water_forces_host(int iteration) { } // Calculate energy and force - cudaMemset(d_energy, 0, sizeof(double)); + cudaMemset(d_energy, 0, sizeof(real_t)); calc_polx_water_forces_kernel<<>>( ctx.n_waters, ctx.n_atoms_solute, d_wshells, d_coords, d_dvelocities, ctx.topo, d_theta, ctx.md, d_energy, d_water_rank, d_water_shell); - double energy; - cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost); + real_t energy; + cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost); ctx.E_restraint.Upolx += energy; ctx.wshells->download(); // Copy back forces for all atoms (solute + solvent); water forces were being dropped. 
@@ -275,11 +275,11 @@ void init_polx_water_force_kernel_data() { water_shell = new int[ctx.n_waters]; polx_list_sh = new int[ctx.n_max_inshell * ctx.n_shells]; - check_cudaMalloc((void**)&d_energy, sizeof(double)); + check_cudaMalloc((void**)&d_energy, sizeof(real_t)); check_cudaMalloc((void**)&d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int)); - check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(real_t)); + check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(real_t)); + check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(real_t)); check_cudaMalloc((void**)&d_water_rank, ctx.n_waters * sizeof(int)); check_cudaMalloc((void**)&d_water_shell, ctx.n_waters * sizeof(int)); diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu index 5221cb9e..b6ef257e 100644 --- a/src/core/cuda/src/cuda_pshell_force.cu +++ b/src/core/cuda/src/cuda_pshell_force.cu @@ -5,8 +5,8 @@ #include namespace CudaPshellForce { bool is_initialized = false; -double* d_ufix_energy; -double* d_ushell_energy; +real_t* d_ufix_energy; +real_t* d_ushell_energy; } // namespace CudaPshellForce __global__ void calc_pshell_force_kernel( @@ -15,14 +15,14 @@ __global__ void calc_pshell_force_kernel( bool* excluded, coord_t* coords, coord_t* coords_init, - double* ufix_energy, - double* ushell_energy, + real_t* ufix_energy, + real_t* ushell_energy, dvel_t* dvelocities) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= n_atoms_solute) return; coord_t dr; - double k, r2, ener; + real_t k, r2, ener; if (shell[i] || excluded[i]) { // printf("i = %d excluded = %s shell = %s\n", i, excluded[i] ? "True" : "False", shell[i] ? 
"True" : "False"); @@ -57,8 +57,8 @@ void calc_pshell_forces_host() { auto d_coords_init = host.coords_init->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; - cudaMemset(d_ufix_energy, 0, sizeof(double)); - cudaMemset(d_ushell_energy, 0, sizeof(double)); + cudaMemset(d_ufix_energy, 0, sizeof(real_t)); + cudaMemset(d_ushell_energy, 0, sizeof(real_t)); int blockSize = 256; int numBlocks = (host.n_atoms_solute + blockSize - 1) / blockSize; @@ -72,10 +72,10 @@ void calc_pshell_forces_host() { d_ushell_energy, d_dvelocities); cudaDeviceSynchronize(); - double ufix_energy; - double ushell_energy; - cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(double), cudaMemcpyDeviceToHost); + real_t ufix_energy; + real_t ushell_energy; + cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(real_t), cudaMemcpyDeviceToHost); host.E_restraint.Ufix += ufix_energy; host.E_restraint.Ushell += ushell_energy; @@ -85,8 +85,8 @@ void calc_pshell_forces_host() { void init_pshell_force_kernel_data() { using namespace CudaPshellForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_ufix_energy, sizeof(double)); - check_cudaMalloc((void**)&d_ushell_energy, sizeof(double)); + check_cudaMalloc((void**)&d_ufix_energy, sizeof(real_t)); + check_cudaMalloc((void**)&d_ushell_energy, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu index f037e9db..26c8c94f 100644 --- a/src/core/cuda/src/cuda_radix_water_force.cu +++ b/src/core/cuda/src/cuda_radix_water_force.cu @@ -6,20 +6,20 @@ #include "cuda/include/cuda_utility.cuh" namespace CudaRadixWaterForce { bool is_initialized = false; -double* d_energy; +real_t* d_energy; } // namespace CudaRadixWaterForce __global__ void calc_radix_water_forces_kernel( coord_t* coords, - double 
shift, + real_t shift, int n_atoms_solute, int n_atoms, topo_t topo, md_t md, - double Dwmz, - double awmz, + real_t Dwmz, + real_t awmz, dvel_t* dvelocities, - double* energy) { + real_t* energy) { int i = blockIdx.x * blockDim.x + threadIdx.x; i = n_atoms_solute + i * 3; // Process only oxygen atoms of water molecules if (i >= n_atoms) return; @@ -29,16 +29,16 @@ __global__ void calc_radix_water_forces_kernel( dr.x = coords[i].x - topo.solvent_center.x; dr.y = coords[i].y - topo.solvent_center.y; dr.z = coords[i].z - topo.solvent_center.z; - double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); - double db = b - (topo.solvent_radius - shift); + real_t b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); + real_t db = b - (topo.solvent_radius - shift); - double ener, dv; + real_t ener, dv; if (db > 0) { ener = 0.5 * md.radial_force * db * db - Dwmz; dv = md.radial_force * db / b; } else { if (b > 0.0) { - double fexp = exp(awmz * db); + real_t fexp = exp(awmz * db); ener = Dwmz * (fexp * fexp - 2 * fexp); dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b; } else { @@ -70,16 +70,16 @@ void calc_radix_water_forces_host() { auto d_coords = host.coords->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; - check_cuda(cudaMemset(d_energy, 0, sizeof(double))); + check_cuda(cudaMemset(d_energy, 0, sizeof(real_t))); - double shift; + real_t shift; if (host.md.radial_force != 0) { shift = sqrt(Boltz * host.Tfree / host.md.radial_force); } else { shift = 0; } - double energy = 0.0; + real_t energy = 0.0; calc_radix_water_forces_kernel<<>>(d_coords, shift, host.n_atoms_solute, @@ -91,14 +91,14 @@ void calc_radix_water_forces_host() { d_dvelocities, d_energy); check_cuda(cudaDeviceSynchronize()); - check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost)); + check_cuda(cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost)); host.E_restraint.Uradx += energy; } void init_radix_water_force_kernel_data() { using namespace 
CudaRadixWaterForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy, sizeof(double)); + check_cudaMalloc((void**)&d_energy, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu index 567a78df..e32872b7 100644 --- a/src/core/cuda/src/cuda_restrang_force.cu +++ b/src/core/cuda/src/cuda_restrang_force.cu @@ -3,26 +3,26 @@ #include "common/include/context.h" namespace CudaRestrangForce { bool is_initialized = false; -double* d_E_restraint; +real_t* d_E_restraint; } // namespace CudaRestrangForce __global__ void calc_restrang_force_kernel( restrang_t* restrangs, int n_restrangs, coord_t* coords, - double* lambdas, + real_t* lambdas, int n_lambdas, dvel_t* dvelocities, E_restraint_t* EQ_restraint, - double* E_restraint) { + real_t* E_restraint) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrangs) return; int ir = idx; int state, i, j, k; coord_t dr, dr2, di, dk; - double lambda, r2ij, r2jk, rij, rjk, cos_th, th; - double dth, dv, ener, f1; + real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th; + real_t dth, dv, ener, f1; state = restrangs[ir].ipsi - 1; i = restrangs[ir].ai - 1; @@ -110,8 +110,8 @@ void calc_restrang_force_host() { auto d_dvelocities = host.dvelocities->gpu_data_p; auto d_EQ_restraint = host.EQ_restraint->gpu_data_p; - double val = 0; - cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice); + real_t val = 0; + cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice); int blockSize = 256; int numBlocks = (host.n_restrangs + blockSize - 1) / blockSize; @@ -126,14 +126,14 @@ void calc_restrang_force_host() { d_E_restraint); cudaDeviceSynchronize(); host.EQ_restraint->download(); - cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost); host.E_restraint.Upres += val; } void init_restrang_force_kernel_data() { using 
namespace CudaRestrangForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); + check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu index 14f9b466..cdb035b4 100644 --- a/src/core/cuda/src/cuda_restrdis_force.cu +++ b/src/core/cuda/src/cuda_restrdis_force.cu @@ -5,24 +5,24 @@ #include "common/include/context.h" namespace CudaRestrdisForce { bool is_initialized = false; -double* d_E_restraint; +real_t* d_E_restraint; } // namespace CudaRestrdisForce __global__ void calc_restrdis_forces_kernel( restrdis_t* restrdists, int n_restrdists, coord_t* coords, - double* lambdas, + real_t* lambdas, int n_lambdas, dvel_t* dvelocities, E_restraint_t* EQ_restraint, - double* E_restraint) { + real_t* E_restraint) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrdists) return; int state, i, j; coord_t dr; - double lambda, b, db, dv, ener; + real_t lambda, b, db, dv, ener; int ir = idx; @@ -82,7 +82,7 @@ void calc_restrdis_forces_host() { auto d_dvelocities = host.dvelocities->gpu_data_p; auto d_EQ_restraint = host.EQ_restraint->gpu_data_p; - cudaMemset(d_E_restraint, 0, sizeof(double)); + cudaMemset(d_E_restraint, 0, sizeof(real_t)); int blockSize = 256; int numBlocks = (host.n_restrdists + blockSize - 1) / blockSize; @@ -97,8 +97,8 @@ void calc_restrdis_forces_host() { d_E_restraint); cudaDeviceSynchronize(); host.EQ_restraint->download(); - double ener; - cudaMemcpy(&ener, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); + real_t ener; + cudaMemcpy(&ener, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost); printf("Energy restraint: %f\n", ener); host.E_restraint.Upres += ener; } @@ -106,7 +106,7 @@ void calc_restrdis_forces_host() { void init_restrdis_force_kernel_data() { using namespace CudaRestrdisForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); 
+ check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu index 695e2b33..5307bff5 100644 --- a/src/core/cuda/src/cuda_restrpos_force.cu +++ b/src/core/cuda/src/cuda_restrpos_force.cu @@ -6,17 +6,17 @@ namespace CudaRestrposForce { bool is_initialized = false; -double* d_E_restraint; +real_t* d_E_restraint; } // namespace CudaRestrposForce __global__ void calc_restrpos_forces_kernel( restrpos_t* restrspos, int n_restrspos, coord_t* coords, - double* lambdas, + real_t* lambdas, int n_lambdas, E_restraint_t* EQ_restraint, - double* E_restraint, + real_t* E_restraint, dvel_t* dvelocities) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrspos) return; @@ -24,7 +24,7 @@ __global__ void calc_restrpos_forces_kernel( int state, i; coord_t dr; - double lambda, ener, x2, y2, z2; + real_t lambda, ener, x2, y2, z2; state = restrspos[ir].ipsi - 1; i = restrspos[ir].a - 1; @@ -64,8 +64,8 @@ void calc_restrpos_forces_host() { auto& host = Context::instance(); if (host.n_restrspos == 0) return; using namespace CudaRestrposForce; - double val = 0.0; - cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice); + real_t val = 0.0; + cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice); auto d_restrspos = host.restrspos->gpu_data_p; auto d_coords = host.coords->gpu_data_p; @@ -85,7 +85,7 @@ void calc_restrpos_forces_host() { d_E_restraint, d_dvelocities); cudaDeviceSynchronize(); - cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost); host.E_restraint.Upres += val; host.EQ_restraint->download(); } @@ -93,7 +93,7 @@ void calc_restrpos_forces_host() { void init_restrpos_force_kernel_data() { using namespace CudaRestrposForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); + 
check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu index 71835e4e..e5951303 100644 --- a/src/core/cuda/src/cuda_restrseq_force.cu +++ b/src/core/cuda/src/cuda_restrseq_force.cu @@ -4,7 +4,7 @@ namespace CudaRestrseqForce { bool is_initialized = false; -double* d_upres_energy; +real_t* d_upres_energy; } // namespace CudaRestrseqForce __global__ void calc_restrseq_forces_kernel( int n_restrseqs, @@ -15,13 +15,13 @@ __global__ void calc_restrseq_forces_kernel( catype_t* catypes, bool* heavy, dvel_t* dvelocities, - double* upres_energy) { + real_t* upres_energy) { int s = blockIdx.x * blockDim.x + threadIdx.x; if (s >= n_restrseqs) return; - double k, mass, totmass; + real_t k, mass, totmass; coord_t dr; - double r2, ener; + real_t r2, ener; k = restrseqs[s].k; @@ -123,7 +123,7 @@ void calc_restrseq_forces_host() { auto d_catypes = host.catypes->gpu_data_p; auto d_heavy = host.heavy->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; - cudaMemset(d_upres_energy, 0, sizeof(double)); + cudaMemset(d_upres_energy, 0, sizeof(real_t)); // ctx.sync_all_to_device(); int blockSize = 256; @@ -139,8 +139,8 @@ void calc_restrseq_forces_host() { d_dvelocities, d_upres_energy); cudaDeviceSynchronize(); - double upres_energy; - cudaMemcpy(&upres_energy, d_upres_energy, sizeof(double), cudaMemcpyDeviceToHost); + real_t upres_energy; + cudaMemcpy(&upres_energy, d_upres_energy, sizeof(real_t), cudaMemcpyDeviceToHost); host.E_restraint.Upres = upres_energy; printf("Restrseq U_upres: %f\n", upres_energy); } @@ -148,7 +148,7 @@ void calc_restrseq_forces_host() { void init_restrseq_force_kernel_data() { using namespace CudaRestrseqForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_upres_energy, sizeof(double)); + check_cudaMalloc((void**)&d_upres_energy, sizeof(real_t)); is_initialized = true; } } diff --git 
a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu index c928bb71..2ca01839 100644 --- a/src/core/cuda/src/cuda_restrwall_force.cu +++ b/src/core/cuda/src/cuda_restrwall_force.cu @@ -5,20 +5,20 @@ namespace CudaRestrwallForce { bool is_initialized = false; -double* d_energies; +real_t* d_energies; } // namespace CudaRestrwallForce __global__ void calc_restrwall_forces_kernel( restrwall_t* restrwalls, int n_restrwalls, coord_t* coords, - double* energies, + real_t* energies, dvel_t* dvelocities, bool* heavy, topo_t topo) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrwalls) return; - double k, b, db, ener, dv, fexp; + real_t k, b, db, ener, dv, fexp; coord_t dr; int ir = idx; @@ -58,7 +58,7 @@ void calc_restrwall_forces_host() { auto d_coords = host.coords->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; auto d_heavy = host.heavy->gpu_data_p; - cudaMemset(d_energies, 0, sizeof(double)); + cudaMemset(d_energies, 0, sizeof(real_t)); int blockSize = 256; int numBlocks = (host.n_restrwalls + blockSize - 1) / blockSize; @@ -69,8 +69,8 @@ void calc_restrwall_forces_host() { d_energies, d_dvelocities, d_heavy, host.topo); cudaDeviceSynchronize(); - double h_energy; - cudaMemcpy(&h_energy, d_energies, sizeof(double), cudaMemcpyDeviceToHost); + real_t h_energy; + cudaMemcpy(&h_energy, d_energies, sizeof(real_t), cudaMemcpyDeviceToHost); printf("Restrwall energy: %f\n", h_energy); host.E_restraint.Upres += h_energy; } @@ -78,7 +78,7 @@ void calc_restrwall_forces_host() { void init_restrwall_force_kernel_data() { using namespace CudaRestrwallForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energies, sizeof(double)); + check_cudaMalloc((void**)&d_energies, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu index bda47e50..03fa76cd 100644 --- a/src/core/cuda/src/cuda_shake_constraints.cu +++ 
b/src/core/cuda/src/cuda_shake_constraints.cu @@ -17,7 +17,7 @@ __global__ void calc_shake_constraints_kernel( shake_bond_t* shake_bonds, coord_t* coords, coord_t* xcoords, - double* winv, + real_t* winv, int* total_iterations, int* mol_shake_offset) { int idx = blockIdx.x; @@ -26,7 +26,7 @@ __global__ void calc_shake_constraints_kernel( int mol = idx; int ai, aj, n_iterations, shake; - double xij2, diff, corr, scp, xxij2; + real_t xij2, diff, corr, scp, xxij2; coord_t xij, xxij; if (mol_n_shakes[mol] == 0) return; diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu index baba687e..46c4c373 100644 --- a/src/core/cuda/src/cuda_temperature.cu +++ b/src/core/cuda/src/cuda_temperature.cu @@ -6,23 +6,23 @@ namespace CudaTemperature { bool is_initialized = false; -double* d_Temp_solute; -double* d_Tfree_solute; -double* d_Texcl_solute; -double* d_Temp_solvent; -double* d_Tfree_solvent; -double* d_Texcl_solvent; +real_t* d_Temp_solute; +real_t* d_Tfree_solute; +real_t* d_Texcl_solute; +real_t* d_Temp_solvent; +real_t* d_Tfree_solvent; +real_t* d_Texcl_solvent; } // namespace CudaTemperature -__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, double boltz, double ekinmax, - double* Temp_solute, double* Tfree_solute, double* Texcl_solute, double* Temp_solvent, double* Tfree_solvent, double* Texcl_solvent) { +__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, real_t boltz, real_t ekinmax, + real_t* Temp_solute, real_t* Tfree_solute, real_t* Texcl_solute, real_t* Temp_solvent, real_t* Tfree_solvent, real_t* Texcl_solvent) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_atoms) return; - double mass_i = catypes[atypes[idx].code - 1].m; - const double vx = velocities[idx].x; - const double vy = velocities[idx].y; - const double vz = 
velocities[idx].z; - double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz); + real_t mass_i = catypes[atypes[idx].code - 1].m; + const real_t vx = velocities[idx].x; + const real_t vy = velocities[idx].y; + const real_t vz = velocities[idx].z; + real_t ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz); bool is_solute = (idx < n_atoms_solute); bool is_excluded = excluded[idx]; @@ -49,14 +49,14 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t void calc_temperature_host() { auto& host = Context::instance(); using namespace CudaTemperature; - double h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0; + real_t h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0; - cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(real_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(real_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(real_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(real_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(real_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(real_t), cudaMemcpyHostToDevice); atype_t* d_atypes = host.atypes->gpu_data_p; catype_t* d_catypes = 
host.catypes->gpu_data_p; @@ -66,17 +66,17 @@ void calc_temperature_host() { int blockSize = 256; int numBlocks = (host.n_atoms + blockSize - 1) / blockSize; - double Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms; + real_t Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms; calc_temperature_kernel<<>>(host.n_atoms, host.n_atoms_solute, d_atypes, d_catypes, d_velocities, d_excluded, Boltz, Ekinmax, d_Temp_solute, d_Tfree_solute, d_Texcl_solute, d_Temp_solvent, d_Tfree_solvent, d_Texcl_solvent); cudaDeviceSynchronize(); - cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(real_t), cudaMemcpyDeviceToHost); host.Tfree = h_Tfree_solute + h_Tfree_solvent; host.Temp = h_Temp_solute + h_Temp_solvent; @@ -98,12 +98,12 @@ void calc_temperature_host() { void init_temperature_kernel_data() { using namespace CudaTemperature; if (!is_initialized) { - check_cudaMalloc((void**)&d_Temp_solute, sizeof(double)); - check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double)); - 
check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double)); - check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double)); - check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double)); - check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double)); + check_cudaMalloc((void**)&d_Temp_solute, sizeof(real_t)); + check_cudaMalloc((void**)&d_Tfree_solute, sizeof(real_t)); + check_cudaMalloc((void**)&d_Texcl_solute, sizeof(real_t)); + check_cudaMalloc((void**)&d_Temp_solvent, sizeof(real_t)); + check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(real_t)); + check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(real_t)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu index 5baffbde..1c0692ae 100644 --- a/src/core/cuda/src/cuda_torsion_force.cu +++ b/src/core/cuda/src/cuda_torsion_force.cu @@ -4,10 +4,10 @@ namespace CudaTorsionForce { bool is_initialized = false; -double* d_energy_sum = nullptr; +real_t* d_energy_sum = nullptr; } // namespace CudaTorsionForce -__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, double* energy_sum) { +__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) { int i = blockIdx.x * blockDim.x + threadIdx.x + start; if (i >= end) return; int aii, aji, aki, ali; @@ -16,10 +16,10 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - double bj2inv, bk2inv, bjinv, bkinv; - double cos_phi, phi; - double arg, dv, f1; - double ener; + real_t bj2inv, bk2inv, bjinv, bkinv; + real_t cos_phi, phi; + real_t arg, dv, f1; + real_t ener; torsion_t t; ctorsion_t ctors; @@ -63,7 +63,8 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio bkinv = sqrt(bk2inv); 
cos_phi = (rnj.x * rnk.x + rnj.y * rnk.y + rnj.z * rnk.z) * (bjinv * bkinv); - cos_phi = fmin(fmax(cos_phi, -1.0), 1.0); + cos_phi = cos_phi > static_cast(1.0) ? static_cast(1.0) : cos_phi; + cos_phi = cos_phi < static_cast(-1.0) ? static_cast(-1.0) : cos_phi; phi = acos(cos_phi); if (rjk.x * (rnj.y * rnk.z - rnj.z * rnk.y) + rjk.y * (rnj.z * rnk.x - rnj.x * rnk.z) + rjk.z * (rnj.x * rnk.y - rnj.y * rnk.x) < 0) { phi = -phi; @@ -123,15 +124,15 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio atomicAdd(&dvelocities[ali].z, dv * dpl.z); } -double calc_torsion_forces_host(int start, int end) { +real_t calc_torsion_forces_host(int start, int end) { using namespace CudaTorsionForce; int N = end - start; if (N <= 0) return 0.0; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - double zero = 0.0; - cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice); + real_t zero = 0.0; + cudaMemcpy(d_energy_sum, &zero, sizeof(real_t), cudaMemcpyHostToDevice); auto& host_ctx = Context::instance(); coord_t* d_coords = host_ctx.coords->gpu_data_p; @@ -141,7 +142,7 @@ double calc_torsion_forces_host(int start, int end) { calc_torsion_forces_kernel<<>>(start, end, d_torsions, d_ctorsions, d_coords, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&zero, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); return zero; } @@ -150,7 +151,7 @@ double calc_torsion_forces_host(int start, int end) { void init_torsion_force_kernel_data() { using namespace CudaTorsionForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); is_initialized = true; } } From 6f5940f20d53f1029326e2e1ea1de828ffa9392a Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Thu, 30 Apr 2026 10:24:37 +0200 Subject: [PATCH 17/20] fix benchmark fortran run --- 
benchmark-qgpu/benchmark_correctness.py | 47 ++++++++++++++++++++++--- benchmark-qgpu/benchmark_nsday.py | 42 +++++++++++++++++++++- benchmark-qgpu/benchmark_test.py | 29 ++++++++++++--- 3 files changed, 107 insertions(+), 11 deletions(-) diff --git a/benchmark-qgpu/benchmark_correctness.py b/benchmark-qgpu/benchmark_correctness.py index d30f321d..f8f4fd5c 100644 --- a/benchmark-qgpu/benchmark_correctness.py +++ b/benchmark-qgpu/benchmark_correctness.py @@ -221,16 +221,34 @@ def collect(args): fortran_dir, prep_dir, prepared_data_dir, reference_dir = copy_reference_inputs(args.reference_dir, out_dir) prep_fortran_bin = None else: - prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin) + default_prep_fortran_bin = ( + ROOT / "src" / "q6" / "bin" / "q6" / "qdynp" + if args.prep_fortran_mpi_procs is not None + else ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test" + ) + prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin or default_prep_fortran_bin) data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake) fortran_dir = out_dir / "fortran_reference" prep_dir = out_dir / "qgpu_prepare" fortran_dir.mkdir(parents=True, exist_ok=True) - print(f"Preparing Fortran reference for {args.test}") + if args.prep_fortran_mpi_procs is None: + print(f"Preparing Fortran reference for {args.test}") + else: + print( + f"Preparing Fortran reference for {args.test} " + f"with {args.prep_fortran_mpi_procs} MPI rank(s)" + ) write_md_input(data, fortran_dir) - prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir) + prepare_restart_with_qdyn_test( + data, + prep_fortran_bin, + fortran_dir, + mpi_procs=args.prep_fortran_mpi_procs, + mpirun_bin=args.mpirun_bin, + mpirun_args=args.mpirun_args, + ) print("Preparing QGPU input") prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) @@ -255,6 +273,9 @@ def collect(args): "shake": args.shake, "qgpu_bin": str(qgpu_bin), "prep_fortran_bin": str(prep_fortran_bin) if prep_fortran_bin is 
not None else None, + "prep_fortran_mpi_procs": args.prep_fortran_mpi_procs, + "mpirun_bin": args.mpirun_bin, + "mpirun_args": args.mpirun_args, "reference_dir": str(reference_dir) if reference_dir is not None else None, "prepared_qgpu_input": str(prepared_data_dir), "fortran_energy": str(fortran_energy_path), @@ -411,8 +432,24 @@ def parse_args(): ) collect_parser.add_argument( "--prep-fortran-bin", - default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"), - help="Path to qdyn_test used to generate Fortran reference data.", + default=None, + help="Path to Fortran binary used to generate reference data. Defaults to qdynp with MPI, otherwise qdyn_test.", + ) + collect_parser.add_argument( + "--prep-fortran-mpi-procs", + type=positive_int, + default=None, + help="Run the Fortran reference preparation through mpirun with this many MPI ranks.", + ) + collect_parser.add_argument( + "--mpirun-bin", + default="mpirun", + help="MPI launcher to use with --prep-fortran-mpi-procs. Defaults to mpirun.", + ) + collect_parser.add_argument( + "--mpirun-args", + default=None, + help='Extra MPI launcher arguments, quoted as one string, e.g. 
"--bind-to core".', ) collect_parser.add_argument( "--tolerance", diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py index 71ddbe9b..30ffbebc 100644 --- a/benchmark-qgpu/benchmark_nsday.py +++ b/benchmark-qgpu/benchmark_nsday.py @@ -90,7 +90,28 @@ def resolve_collect_data_dir(args, out_dir): return data_dir, args.steps -def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, steps, label, repeat): +def cleanup_successful_task_data(processes): + removed = 0 + for item in processes: + if item["return_code"] != 0: + continue + data_dir = item["data_dir"] + if data_dir.exists(): + shutil.rmtree(data_dir) + removed += 1 + return removed + + +def run_concurrency_batch( + qgpu_bin, + prepared_data_dir, + run_dir, + concurrency, + steps, + label, + repeat, + cleanup_run_data=False, +): if run_dir.exists(): shutil.rmtree(run_dir) run_dir.mkdir(parents=True) @@ -113,6 +134,7 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste "args": args, "stdout": stdout_path, "stderr": stderr_path, + "data_dir": data_dir, "command": command_text(args), } ) @@ -133,6 +155,7 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste "stderr_file": stderr_f, "stdout": spec["stdout"], "stderr": spec["stderr"], + "data_dir": spec["data_dir"], "start": proc_start, "command": spec["command"], } @@ -172,6 +195,12 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste batch_wall_seconds = time.perf_counter() - batch_start failed = sum(1 for row in process_rows if row["return_code"] != 0) + + if cleanup_run_data: + removed_task_data = cleanup_successful_task_data(processes) + if removed_task_data: + print(f"Removed copied QGPU data for {removed_task_data} successful task(s) under {run_dir}") + total_ns_per_day = concurrency * steps * TIME_STEP_NS * 86400 / batch_wall_seconds mean_process_ns_per_day = ( sum(float(row["process_ns_per_day"]) for row in process_rows 
if row["process_ns_per_day"] != "") @@ -255,6 +284,7 @@ def collect(args): steps=steps, label=label, repeat=repeat, + cleanup_run_data=not args.keep_run_data, ) batch_rows.append(batch_row) process_rows.extend(rows) @@ -269,6 +299,7 @@ def collect(args): "qgpu_bin": str(qgpu_bin), "prepared_data_dir": str(prepared_data_dir), "steps": steps, + "keep_run_data": args.keep_run_data, }, ) raise RuntimeError( @@ -291,6 +322,7 @@ def collect(args): "steps": steps, "concurrency": args.concurrency, "repeat": args.repeat, + "keep_run_data": args.keep_run_data, }, ) print(f"Summary CSV: {summary_csv}") @@ -457,6 +489,14 @@ def parse_args(): help="Path to qdyn_test used only when preparing from --test.", ) collect_parser.add_argument("--pause-seconds", type=float, default=0.0, help="Pause between batches.") + collect_parser.add_argument( + "--keep-run-data", + action="store_true", + help=( + "Keep per-task copied QGPU input/output directories. By default successful task data is " + "deleted after logs and timing are recorded." 
+ ), + ) plot_parser = subparsers.add_parser("plot", help="Plot ns/day vs concurrency from one or more CSV files.") plot_parser.add_argument("csv", nargs="+", help="One or more nsday_summary.csv files from collect.") diff --git a/benchmark-qgpu/benchmark_test.py b/benchmark-qgpu/benchmark_test.py index afefcfbc..fcad0788 100644 --- a/benchmark-qgpu/benchmark_test.py +++ b/benchmark-qgpu/benchmark_test.py @@ -210,7 +210,15 @@ def run_fortran_repeats( return records, saw_success -def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir, prep_steps=None): +def prepare_restart_with_qdyn_test( + data, + prep_fortran_bin, + fortran_dir, + prep_steps=None, + mpi_procs=None, + mpirun_bin="mpirun", + mpirun_args=None, +): input_path = fortran_dir / "eq1.inp" original_input = input_path.read_text(encoding="utf-8") parse_data = data @@ -222,7 +230,13 @@ def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir, prep_ste stdout_path = fortran_dir / "restart_prep_qdyn_test.log" stderr_path = fortran_dir / "restart_prep_qdyn_test.err" - args = [str(prep_fortran_bin), "eq1.inp"] + args = build_fortran_command( + prep_fortran_bin, + "eq1.inp", + mpi_procs=mpi_procs, + mpirun_bin=mpirun_bin, + mpirun_args=mpirun_args, + ) try: return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path) if return_code != 0: @@ -529,8 +543,8 @@ def parse_args(): ) parser.add_argument( "--fortran-bin", - default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"), - help="Path to production Fortran qdyn/qdynp binary used for timed Fortran runs.", + default=None, + help="Path to production Fortran binary used for timed Fortran runs. 
Defaults to qdynp with MPI, otherwise qdyn.", ) parser.add_argument( "--fortran-mpi-procs", @@ -590,7 +604,12 @@ def main(): return 0 qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) - fortran_bin = resolve_fortran_bin(args.fortran_bin) + default_fortran_bin = ( + ROOT / "src" / "q6" / "bin" / "q6" / "qdynp" + if args.fortran_mpi_procs is not None + else ROOT / "src" / "q6" / "bin" / "q6" / "qdyn" + ) + fortran_bin = resolve_fortran_bin(args.fortran_bin or default_fortran_bin) prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin) out_dir = Path(args.out).expanduser().resolve() if args.out else default_out_dir(args.test) out_dir.mkdir(parents=True, exist_ok=True) From 6759c86e0d1cc0c1dce82b5f25b53bc014b6151f Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Thu, 30 Apr 2026 16:07:51 +0200 Subject: [PATCH 18/20] Revert "Merge branch 'feature/qgpu_mixed_precision' into feature/qgpu_benchmark_script" This reverts commit 97775273097769e3c687e16891784843b20795c7, reversing changes made to e6eee26979200b22c1cbc7ee6851dd1653aebfb0. 
--- src/core/common/include/context.h | 32 ++--- src/core/common/include/md_types.h | 128 +++++++++--------- src/core/common/include/precision.h | 5 +- src/core/common/src/init.cpp | 56 ++++---- src/core/common/src/parse.cpp | 2 +- src/core/cpu/include/cpu_angle_force.h | 5 +- src/core/cpu/include/cpu_bond_force.h | 4 +- src/core/cpu/include/cpu_improper2_force.h | 4 +- src/core/cpu/include/cpu_torsion_force.h | 4 +- src/core/cpu/include/cpu_utils.h | 8 +- src/core/cpu/src/cpu_angle_force.cpp | 10 +- src/core/cpu/src/cpu_bond_force.cpp | 6 +- src/core/cpu/src/cpu_improper2_force.cpp | 8 +- src/core/cpu/src/cpu_leapfrog.cpp | 4 +- src/core/cpu/src/cpu_nonbonded_pp_force.cpp | 6 +- src/core/cpu/src/cpu_nonbonded_pw_force.cpp | 4 +- src/core/cpu/src/cpu_nonbonded_qp_force.cpp | 6 +- src/core/cpu/src/cpu_nonbonded_qq_force.cpp | 6 +- src/core/cpu/src/cpu_nonbonded_qw_force.cpp | 4 +- src/core/cpu/src/cpu_nonbonded_ww_force.cpp | 4 +- src/core/cpu/src/cpu_polx_water_force.cpp | 18 +-- src/core/cpu/src/cpu_pshell_force.cpp | 2 +- src/core/cpu/src/cpu_q_angle_force.cpp | 4 +- src/core/cpu/src/cpu_q_bond_force.cpp | 2 +- src/core/cpu/src/cpu_q_torsion_force.cpp | 8 +- src/core/cpu/src/cpu_radix_water_force.cpp | 4 +- src/core/cpu/src/cpu_restrang_force.cpp | 4 +- src/core/cpu/src/cpu_restrdis_force.cpp | 2 +- src/core/cpu/src/cpu_restrpos_force.cpp | 2 +- src/core/cpu/src/cpu_restrseq_force.cpp | 4 +- src/core/cpu/src/cpu_restrwall_force.cpp | 2 +- src/core/cpu/src/cpu_shake.cpp | 8 +- src/core/cpu/src/cpu_temperature.cpp | 10 +- src/core/cpu/src/cpu_torsion_force.cpp | 12 +- src/core/cpu/src/utils.cpp | 15 +- src/core/cuda/include/cuda_angle_force.cuh | 4 +- src/core/cuda/include/cuda_bond_force.cuh | 4 +- .../cuda/include/cuda_improper2_force.cuh | 4 +- .../cuda/include/cuda_nonbonded_force.cuh | 8 +- src/core/cuda/include/cuda_torsion_force.cuh | 4 +- src/core/cuda/include/cuda_utility.cuh | 3 +- src/core/cuda/src/cuda_angle_force.cu | 33 +++-- 
src/core/cuda/src/cuda_bond_force.cu | 28 ++-- src/core/cuda/src/cuda_improper2_force.cu | 18 +-- src/core/cuda/src/cuda_leapfrog.cu | 12 +- src/core/cuda/src/cuda_nonbonded_14_force.cu | 30 ++-- src/core/cuda/src/cuda_nonbonded_force.cu | 38 +++--- src/core/cuda/src/cuda_polx_water_force.cu | 54 ++++---- src/core/cuda/src/cuda_pshell_force.cu | 26 ++-- src/core/cuda/src/cuda_radix_water_force.cu | 28 ++-- src/core/cuda/src/cuda_restrang_force.cu | 18 +-- src/core/cuda/src/cuda_restrdis_force.cu | 16 +-- src/core/cuda/src/cuda_restrpos_force.cu | 16 +-- src/core/cuda/src/cuda_restrseq_force.cu | 16 +-- src/core/cuda/src/cuda_restrwall_force.cu | 14 +- src/core/cuda/src/cuda_shake_constraints.cu | 4 +- src/core/cuda/src/cuda_temperature.cu | 66 ++++----- src/core/cuda/src/cuda_torsion_force.cu | 25 ++-- 58 files changed, 420 insertions(+), 452 deletions(-) diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h index ee516d7a..83817bb8 100644 --- a/src/core/common/include/context.h +++ b/src/core/common/include/context.h @@ -32,8 +32,8 @@ class Context { int n_qatoms = 0; int n_waters = 0; int n_molecules = 0; - real_t dt = 0.0; - real_t tau_T = 0.0; + double dt = 0.0; + double tau_T = 0.0; md_t md; topo_t topo; int n_excluded = 0; @@ -108,7 +108,7 @@ class Context { std::unique_ptr> excluded; - std::unique_ptr> winv; + std::unique_ptr> winv; std::unique_ptr> shell; @@ -137,12 +137,12 @@ class Context { Water */ std::unique_ptr> wshells; - real_t crgQtot = 0.0; - real_t Dwmz = 0.0; - real_t awmz = 0.0; - std::vector theta; - std::vector theta0; - std::vector tdum; + double crgQtot = 0.0; + double Dwmz = 0.0; + double awmz = 0.0; + std::vector theta; + std::vector theta0; + std::vector tdum; int n_max_inshell = 0; int n_shells = 0; std::vector> list_sh; @@ -152,7 +152,7 @@ class Context { /* FEP */ - std::unique_ptr> lambdas; // Actually length is only 2.. + std::unique_ptr> lambdas; // Actually length is only 2.. 
/* Energy @@ -206,13 +206,13 @@ class Context { Temperature */ - real_t Temp = 0.0; - real_t Tfree = 0.0; - real_t Ndegf = 0.0; - real_t Ndegfree = 0.0; + double Temp = 0.0; + double Tfree = 0.0; + double Ndegf = 0.0; + double Ndegfree = 0.0; - real_t Tscale_solute = 0.0; - real_t Tscale_solvent = 0.0; + double Tscale_solute = 0.0; + double Tscale_solvent = 0.0; /* Info for FEP */ diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h index dd5ef21d..27c20cef 100644 --- a/src/core/common/include/md_types.h +++ b/src/core/common/include/md_types.h @@ -12,29 +12,29 @@ struct md_t { // [MD] int steps; - real_t stepsize; - real_t temperature; + double stepsize; + double temperature; char thermostat[40]; - real_t bath_coupling; + double bath_coupling; int random_seed; - real_t initial_temperature; + double initial_temperature; bool shake_solvent; bool shake_solute; bool shake_hydrogens; bool lrf; bool charge_groups; // [cut-offs] - real_t solute_solute; - real_t solvent_solvent; - real_t solute_solvent; - real_t q_atom; + double solute_solute; + double solvent_solvent; + double solute_solvent; + double q_atom; // [sphere] - real_t shell_radius; // Note: this is for the pshell - real_t shell_force; // Note: this is for the pshell + double shell_radius; // Note: this is for the pshell + double shell_force; // Note: this is for the pshell // [solvent] - real_t radial_force; + double radial_force; bool polarisation; - real_t polarisation_force; + double polarisation_force; // [intervals] int non_bond; int output; @@ -62,8 +62,8 @@ struct bond_t { struct cbond_t { int code; - real_t kb; - real_t b0; + double kb; + double b0; }; struct angle_t { @@ -75,8 +75,8 @@ struct angle_t { struct cangle_t { int code; - real_t kth; - real_t th0; + double kth; + double th0; }; struct torsion_t { @@ -89,10 +89,10 @@ struct torsion_t { struct ctorsion_t { int code; - real_t k; - real_t n; - real_t d; - real_t paths; + double k; + double n; + double d; + double 
paths; }; struct improper_t { @@ -105,8 +105,8 @@ struct improper_t { struct cimproper_t { int code; - real_t k; - real_t phi0; + double k; + double phi0; }; struct charge_t { @@ -126,11 +126,11 @@ struct atype_t { struct catype_t { int code; - real_t m; + double m; real_t aii_normal; real_t bii_normal; - // real_t aii_polar; - // real_t bii_polar; + // double aii_polar; + // double bii_polar; real_t aii_1_4; real_t bii_1_4; }; @@ -142,12 +142,12 @@ struct vdw_pair_param_t { struct topo_t { int solvent_type; - real_t exclusion_radius; - real_t solvent_radius; + double exclusion_radius; + double solvent_radius; coord_t solute_center; coord_t solvent_center; - real_t el14_scale; - real_t coulomb_constant; + double el14_scale; + double coulomb_constant; int vdw_rule; // 1=geometric, 2=arithmetic }; @@ -177,14 +177,14 @@ struct q_angcouple_t { }; // no use struct q_cimproper_t { - real_t k; - real_t phi0; + double k; + double phi0; }; // no use struct q_elscale_t { int qi; int qj; - real_t mu; + double mu; }; struct q_exclpair_t { @@ -211,18 +211,18 @@ struct q_offdiag_t { int j; int qk; int ql; - real_t Aij; - real_t muij; + double Aij; + double muij; }; // no use struct q_shake_t { int ai; int aj; - real_t dist; + double dist; }; // no use struct q_softcore_t { - real_t s; + double s; }; // no use struct q_softpair_t { @@ -243,7 +243,7 @@ struct q_torcouple_t { struct restrseq_t { int ai; int aj; - real_t k; + double k; bool ih; int to_center; // Flag for restraining to geom. 
or mass center }; @@ -258,32 +258,32 @@ struct restrpos_t { struct restrdis_t { int ai, aj; int ipsi; - real_t d1, d2; - real_t k; + double d1, d2; + double k; char itext[20], jtext[20]; }; struct restrang_t { int ai, aj, ak; int ipsi; - real_t ang; - real_t k; + double ang; + double k; }; struct restrwall_t { int ai, aj; - real_t d, k, aMorse, dMorse; + double d, k, aMorse, dMorse; bool ih; }; struct shell_t { int n_inshell; - real_t theta_corr; - real_t avtheta; - real_t avn_inshell; - real_t router; - real_t dr; - real_t cstb; + double theta_corr; + double avtheta; + double avn_inshell; + double router; + double dr; + double cstb; }; /* ============================================= @@ -294,7 +294,7 @@ struct shell_t { struct shake_bond_t { int ai; int aj; - real_t dist2; + double dist2; bool ready; }; @@ -316,28 +316,28 @@ struct dvel_t { }; struct E_bonded_t { - real_t Ubond; - real_t Uangle; - real_t Utor; - real_t Uimp; + double Ubond; + double Uangle; + double Utor; + double Uimp; }; struct E_nonbonded_t { - real_t Ucoul; - real_t Uvdw; + double Ucoul; + double Uvdw; }; struct E_restraint_t { - real_t Uradx; - real_t Upolx; - real_t Ufix; - real_t Ushell; - real_t Upres; - real_t Urestr; + double Uradx; + double Upolx; + double Ufix; + double Ushell; + double Upres; + double Urestr; }; struct energy_t { - real_t Ukin; - real_t Upot; - real_t Utot; + double Ukin; + double Upot; + double Utot; }; diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h index b0978010..80b790f7 100644 --- a/src/core/common/include/precision.h +++ b/src/core/common/include/precision.h @@ -4,15 +4,14 @@ using real_t = float; using nonbond_work_t = float; using force_accum_t = float; -using energy_accum_t = float; -using constraint_work_t = float; #else using real_t = double; using nonbond_work_t = double; using force_accum_t = double; +#endif + using energy_accum_t = double; using constraint_work_t = double; -#endif #ifdef QDYN_SPFP constexpr double 
k_singular_sin_epsilon = 1.0e-6; diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp index e7c2b8c0..499c01cb 100644 --- a/src/core/common/src/init.cpp +++ b/src/core/common/src/init.cpp @@ -38,10 +38,10 @@ void initialize_catype_tables() { auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p; std::vector h_catype_table_all; - std::map, int> catype_to_type_host; + std::map, int> catype_to_type_host; auto add_catype = [&](catype_t catype) -> int { - const std::array key = { + const std::array key = { catype.aii_normal, catype.bii_normal, catype.aii_1_4, @@ -91,7 +91,7 @@ void initialize_catype_tables() { for (int i = 0; i < static_cast(ctx.p_atoms_list->length); i++) { const int id = p_atoms_cpu[i]; const catype_t catype = catypes[atypes[id].code - 1]; - const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; + const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; p_catype_types_cpu[i] = catype_to_type_host[key]; } @@ -109,7 +109,7 @@ void initialize_catype_tables() { const int id = q_atoms_cpu[i]; const atype_t& qat = ctx.q_atypes[q_idx[id] + ctx.n_qatoms * state]; const catype_t& qcatype = ctx.q_catypes[qat.code - 1]; - const std::array key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4}; + const std::array key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4}; q_catype_types_cpu[state * ctx.q_atoms_list->length + i] = catype_to_type_host[key]; } } @@ -118,7 +118,7 @@ void initialize_catype_tables() { for (int i = 0; i < static_cast(ctx.w_atoms_list->length); i++) { const int id = w_atoms_cpu[i]; const catype_t catype = catypes[atypes[id].code - 1]; - const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; + const std::array key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4}; w_catype_types_cpu[i] = catype_to_type_host[key]; } printf("Total water 
atom number: %lu, w_catype_types size: %lu\n", ctx.w_atoms_list->length, w_catype_types_cpu.size()); @@ -141,10 +141,10 @@ void initialize_charge_tables() { auto *w_atoms_cpu = ctx.w_atoms_list->cpu_data_p; auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p; - std::map charge_to_type_host; + std::map charge_to_type_host; std::vector h_charge_table_all; - auto add_charge = [&](real_t charge) -> int { + auto add_charge = [&](double charge) -> int { if (charge_to_type_host.count(charge) == 0) { int sz = static_cast(h_charge_table_all.size()); ccharge_t new_ccharge = {}; @@ -161,7 +161,7 @@ void initialize_charge_tables() { } for (int state = 0; state < ctx.n_lambdas; state++) { for (int i = 0; i < ctx.n_qatoms; i++) { - real_t charge = ctx.q_charges[i + ctx.n_qatoms * state].charge; + double charge = ctx.q_charges[i + ctx.n_qatoms * state].charge; add_charge(charge); add_charge(charge * lambda_values[state]); } @@ -181,7 +181,7 @@ void initialize_charge_tables() { std::vector p_charge_types_cpu(ctx.p_atoms_list->length); for (int i = 0; i < static_cast(ctx.p_atoms_list->length); i++) { const int id = p_atoms_cpu[i]; - const real_t charge = ccharges[charges[id].code - 1].charge; + const double charge = ccharges[charges[id].code - 1].charge; p_charge_types_cpu[i] = charge_to_type_host[charge]; } @@ -197,7 +197,7 @@ void initialize_charge_tables() { for (int state = 0; state < ctx.n_lambdas; state++) { for (int i = 0; i < static_cast(ctx.q_atoms_list->length); i++) { const int id = q_atoms_cpu[i]; - const real_t charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge; + const double charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge; q_charge_types_cpu[state * ctx.q_atoms_list->length + i] = charge_to_type_host[charge]; } } @@ -205,7 +205,7 @@ void initialize_charge_tables() { std::vector w_charge_types_cpu(ctx.w_atoms_list->length); for (int i = 0; i < static_cast(ctx.w_atoms_list->length); i++) { const int id = w_atoms_cpu[i]; - const real_t charge = 
ccharges[charges[id].code - 1].charge; + const double charge = ccharges[charges[id].code - 1].charge; w_charge_types_cpu[i] = charge_to_type_host[charge]; } @@ -493,8 +493,8 @@ void init_velocities() { auto& velocities = ctx.velocities->cpu_data_p; // If not previous value set, use a Maxwell distribution to fill velocities - real_t kT = Boltz * ctx.md.initial_temperature; - real_t sd, mass; + double kT = Boltz * ctx.md.initial_temperature; + double sd, mass; for (int i = 0; i < ctx.n_atoms; i++) { mass = catypes[atypes[i].code - 1].m; sd = sqrt(kT / mass); @@ -514,7 +514,7 @@ void init_inv_mass() { auto& ctx = Context::instance(); auto& atypes = ctx.atypes->cpu_data_p; auto& catypes = ctx.catypes->cpu_data_p; - ctx.winv = std::make_unique>(ctx.n_atoms, true, ctx.run_gpu); + ctx.winv = std::make_unique>(ctx.n_atoms, true, ctx.run_gpu); auto* winv = ctx.winv->cpu_data_p; for (int ai = 0; ai < ctx.n_atoms; ai++) { winv[ai] = 1 / catypes[atypes[ai].code - 1].m; @@ -539,7 +539,7 @@ void init_water_sphere() { void init_wshells() { auto& ctx = Context::instance(); int n_inshell; - real_t drs, router, ri, dr, Vshell, rshell; + double drs, router, ri, dr, Vshell, rshell; auto& bonds = ctx.bonds->cpu_data_p; auto& cbonds = ctx.cbonds->cpu_data_p; auto& angles = ctx.angles->cpu_data_p; @@ -547,8 +547,8 @@ void init_wshells() { // Get water properties from the first water molecule. 
cbond_t cbondw = cbonds[bonds[ctx.n_atoms_solute].code - 1]; cangle_t canglew = cangles[angles[ctx.n_atoms_solute].code - 1]; - const real_t crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge; - const real_t mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2); + const double crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge; + const double mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2); drs = wpolr_layer / drouter; @@ -605,7 +605,7 @@ void init_pshells() { auto& catypes = ctx.catypes->cpu_data_p; auto& coords_init = ctx.coords_init->cpu_data_p; auto* excluded = ctx.excluded->cpu_data_p; - real_t mass, r2, rin2; + double mass, r2, rin2; ctx.heavy = std::make_unique>(ctx.n_atoms, true, ctx.run_gpu); auto* heavy = ctx.heavy->cpu_data_p; @@ -655,7 +655,7 @@ static int mark_heavy_atoms(Context& ctx) { auto* heavy = ctx.heavy->cpu_data_p; int n_heavy = 0; for (int i = 0; i < ctx.n_atoms; i++) { - real_t mass = catypes[atypes[i].code - 1].m; + double mass = catypes[atypes[i].code - 1].m; if (mass < 4.0) { heavy[i] = false; } else { @@ -681,7 +681,7 @@ void init_pshells_from_charge_groups() { auto& ctx = Context::instance(); auto& coords_init = ctx.coords_init->cpu_data_p; auto* excluded = ctx.excluded->cpu_data_p; - real_t r2, rin2; + double r2, rin2; auto& charge_groups = ctx.charge_group_config; const bool use_switch_atom = charge_groups.iuse_switch_atom == 1; @@ -697,9 +697,9 @@ void init_pshells_from_charge_groups() { const auto& charge_group = charge_groups.charge_groups[grp]; int i = charge_group.iswitch - 1; if (heavy[i] && !excluded[i] && i < ctx.n_atoms_solute) { - real_t cx = coords_init[i].x; - real_t cy = coords_init[i].y; - real_t cz = coords_init[i].z; + double cx = coords_init[i].x; + double cy = coords_init[i].y; + double cz = coords_init[i].z; if (!use_switch_atom) { cx = 0.0; cy = 0.0; @@ -710,7 +710,7 @@ void init_pshells_from_charge_groups() { cy += coords_init[ai].y; cz += coords_init[ai].z; } - real_t inv_atoms = 1.0 / 
static_cast(charge_group.atoms.size()); + double inv_atoms = 1.0 / static_cast(charge_group.atoms.size()); cx *= inv_atoms; cy *= inv_atoms; cz *= inv_atoms; @@ -748,7 +748,7 @@ void init_shake() { int mol = 0; int shake; int n_solute_shake_constraints = 0; - real_t excl_shake = 0; + double excl_shake = 0; auto& bonds = ctx.bonds->cpu_data_p; auto& cbonds = ctx.cbonds->cpu_data_p; @@ -808,10 +808,10 @@ void init_shake() { ctx.Ndegf = 3 * ctx.n_atoms - ctx.n_shake_constraints; ctx.Ndegfree = ctx.Ndegf - 3 * ctx.n_excluded + excl_shake; - const real_t Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints; + const double Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints; - const real_t Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints); - const real_t Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent; + const double Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints); + const double Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent; printf("n_shake_constrains = %d, n_solute_shake_constraints = %d, excl_shake = %f\n", ctx.n_shake_constraints, n_solute_shake_constraints, excl_shake); diff --git a/src/core/common/src/parse.cpp b/src/core/common/src/parse.cpp index 1b45a7a6..98e859ae 100644 --- a/src/core/common/src/parse.cpp +++ b/src/core/common/src/parse.cpp @@ -132,7 +132,7 @@ void parse_md(const char* filename) { #ifdef VERBOSE printf("reading in %d lambdas (%s in file)\n", ctx.n_lambdas, file.buffer[k][1]); #endif - ctx.lambdas = std::make_unique>(ctx.n_lambdas, true, ctx.run_gpu); + ctx.lambdas = std::make_unique>(ctx.n_lambdas, true, ctx.run_gpu); auto *lambdas = ctx.lambdas->cpu_data_p; k++; for (int i = 0; i < ctx.n_lambdas; i++) { diff --git a/src/core/cpu/include/cpu_angle_force.h b/src/core/cpu/include/cpu_angle_force.h index ea4f5ef6..df2a3a64 100644 --- a/src/core/cpu/include/cpu_angle_force.h +++ 
b/src/core/cpu/include/cpu_angle_force.h @@ -1,5 +1,2 @@ #pragma once - -#include "common/include/precision.h" - -real_t calc_angle_forces(int start, int end); +double calc_angle_forces(int start, int end); \ No newline at end of file diff --git a/src/core/cpu/include/cpu_bond_force.h b/src/core/cpu/include/cpu_bond_force.h index 32775c6e..6a2f7f73 100644 --- a/src/core/cpu/include/cpu_bond_force.h +++ b/src/core/cpu/include/cpu_bond_force.h @@ -1,5 +1,3 @@ #pragma once -#include "common/include/precision.h" - -real_t calc_bond_forces(int start, int end); +double calc_bond_forces(int start, int end); diff --git a/src/core/cpu/include/cpu_improper2_force.h b/src/core/cpu/include/cpu_improper2_force.h index b6606e57..26d694aa 100644 --- a/src/core/cpu/include/cpu_improper2_force.h +++ b/src/core/cpu/include/cpu_improper2_force.h @@ -1,5 +1,3 @@ #pragma once -#include "common/include/precision.h" - -real_t calc_improper2_forces(int start, int end); +double calc_improper2_forces(int start, int end); diff --git a/src/core/cpu/include/cpu_torsion_force.h b/src/core/cpu/include/cpu_torsion_force.h index 309bd505..19089318 100644 --- a/src/core/cpu/include/cpu_torsion_force.h +++ b/src/core/cpu/include/cpu_torsion_force.h @@ -1,5 +1,3 @@ #pragma once -#include "common/include/precision.h" - -real_t calc_torsion_forces(int start, int end); +double calc_torsion_forces(int start, int end); diff --git a/src/core/cpu/include/cpu_utils.h b/src/core/cpu/include/cpu_utils.h index 352d6b3c..e7be4557 100644 --- a/src/core/cpu/include/cpu_utils.h +++ b/src/core/cpu/include/cpu_utils.h @@ -1,7 +1,5 @@ #pragma once -#include "common/include/precision.h" - -real_t gauss(real_t mean, real_t sd); -real_t to_degrees(real_t radians); -real_t to_radians(real_t degrees); +double gauss(double mean, double sd); +double to_degrees(double radians); +double to_radians(double degrees); diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp index 1f19390f..ae600561 
100644 --- a/src/core/cpu/src/cpu_angle_force.cpp +++ b/src/core/cpu/src/cpu_angle_force.cpp @@ -5,7 +5,7 @@ #include "context.h" #include "cpu_utils.h" -real_t calc_angle_forces(int start, int end) { +double calc_angle_forces(int start, int end) { auto& ctx = Context::instance(); auto &coords = ctx.coords->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; @@ -15,11 +15,11 @@ real_t calc_angle_forces(int start, int end) { coord_t rji, rjk; coord_t di, dk; - real_t bji2inv, bjk2inv, bjiinv, bjkinv; + double bji2inv, bjk2inv, bjiinv, bjkinv; cangle_t cangle; - real_t cos_th, th, dth, dv, f1; - real_t ener; - real_t angle = 0; + double cos_th, th, dth, dv, f1; + double ener; + double angle = 0; auto &angles = ctx.angles->cpu_data_p; auto &cangles = ctx.cangles->cpu_data_p; diff --git a/src/core/cpu/src/cpu_bond_force.cpp b/src/core/cpu/src/cpu_bond_force.cpp index 0ab4baff..2a539f90 100644 --- a/src/core/cpu/src/cpu_bond_force.cpp +++ b/src/core/cpu/src/cpu_bond_force.cpp @@ -4,7 +4,7 @@ #include "context.h" -real_t calc_bond_forces(int start, int end) { +double calc_bond_forces(int start, int end) { auto& ctx = Context::instance(); auto &bonds = ctx.bonds->cpu_data_p; auto &cbonds = ctx.cbonds->cpu_data_p; @@ -13,8 +13,8 @@ real_t calc_bond_forces(int start, int end) { int aii, aji; coord_t ai, aj, dx; cbond_t cbond; - real_t dx2, dx1, ddx, ener, ampl; - real_t bond = 0; + double dx2, dx1, ddx, ener, ampl; + double bond = 0; for (int i = start; i < end; i++) { aii = bonds[i].ai - 1; diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp index fea7c724..6e4faa60 100644 --- a/src/core/cpu/src/cpu_improper2_force.cpp +++ b/src/core/cpu/src/cpu_improper2_force.cpp @@ -5,7 +5,7 @@ #include "context.h" #include "cpu_utils.h" -real_t calc_improper2_forces(int start, int end) { +double calc_improper2_forces(int start, int end) { auto& ctx = Context::instance(); auto &impropers = ctx.impropers->cpu_data_p; auto &cimpropers 
= ctx.cimpropers->cpu_data_p; @@ -15,13 +15,13 @@ real_t calc_improper2_forces(int start, int end) { coord_t ai, aj, ak, al; coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; - real_t bj2inv, bk2inv, bjinv, bkinv; - real_t cos_phi, phi, arg, ener, dv, f1; + double bj2inv, bk2inv, bjinv, bkinv; + double cos_phi, phi, arg, ener, dv, f1; coord_t di, dl, dpi, dpj, dpk, dpl; improper_t imp; cimproper_t cimp; - real_t improper = 0; + double improper = 0; for (int i = start; i < end; i++) { imp = impropers[i]; diff --git a/src/core/cpu/src/cpu_leapfrog.cpp b/src/core/cpu/src/cpu_leapfrog.cpp index 0927e414..9d1ff43a 100644 --- a/src/core/cpu/src/cpu_leapfrog.cpp +++ b/src/core/cpu/src/cpu_leapfrog.cpp @@ -11,8 +11,8 @@ void calc_leapfrog() { auto &velocities = ctx.velocities->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; auto *xcoords = ctx.xcoords->cpu_data_p; - real_t mass_i; - real_t winv_i; + double mass_i; + double winv_i; for (int i = 0; i < ctx.n_atoms_solute; i++) { mass_i = catypes[atypes[i].code - 1].m; diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp index cbeb11f5..390c67eb 100644 --- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp @@ -13,7 +13,7 @@ void calc_nonbonded_pp_forces() { auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p; auto *excluded = ctx.excluded->cpu_data_p; bool bond14, bond23; - real_t scaling; + double scaling; coord_t da; real_t r2a, ra, r6a; real_t V_a, V_b; @@ -67,8 +67,8 @@ void calc_nonbonded_pp_forces() { dvelocities[j].y += dva * da.y; dvelocities[j].z += dva * da.z; - ctx.E_nonbond_pp.Ucoul += static_cast(Vela); - ctx.E_nonbond_pp.Uvdw += static_cast(V_a - V_b); + ctx.E_nonbond_pp.Ucoul += static_cast(Vela); + ctx.E_nonbond_pp.Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp index 52c9242b..030c1290 100644 --- 
a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp @@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() { dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - ctx.E_nonbond_pw.Ucoul += static_cast(ecoul); - ctx.E_nonbond_pw.Uvdw += static_cast(v_a - v_b); + ctx.E_nonbond_pw.Ucoul += static_cast(ecoul); + ctx.E_nonbond_pw.Uvdw += static_cast(v_a - v_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp index b0df677d..7a81a516 100644 --- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp @@ -18,7 +18,7 @@ void calc_nonbonded_qp_forces() { real_t r2, r; real_t ai_aii, aj_aii, ai_bii, aj_bii; bool bond23, bond14; - real_t scaling; + double scaling; real_t Vel, V_a, V_b, dv; for (int qi = 0; qi < ctx.n_qatoms; qi++) { @@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() { dvelocities[j].z += dv * da.z; // Update Q totals - ctx.EQ_nonbond_qp[state].Ucoul += static_cast(Vel); - ctx.EQ_nonbond_qp[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qp[state].Ucoul += static_cast(Vel); + ctx.EQ_nonbond_qp[state].Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp index 96462795..006a3c0e 100644 --- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp @@ -16,7 +16,7 @@ void calc_nonbonded_qq_forces() { auto *q_elscales = ctx.q_elscales->cpu_data_p; int ai, aj; real_t crg_i, crg_j; - real_t elscale, scaling; + double elscale, scaling; bool bond23, bond14; coord_t da; real_t r2a, ra, r6a; @@ -81,8 +81,8 @@ void calc_nonbonded_qq_forces() { dvelocities[aj].y += dva * da.y; dvelocities[aj].z += dva * da.z; - ctx.EQ_nonbond_qq[state].Ucoul += static_cast(Vela); - ctx.EQ_nonbond_qq[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qq[state].Ucoul += static_cast(Vela); + 
ctx.EQ_nonbond_qq[state].Uvdw += static_cast(V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp index 1ab0b469..8d18bc55 100644 --- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp @@ -75,8 +75,8 @@ void calc_nonbonded_qw_forces() { dvH1 -= r2H1 * VelH1 * lambda; dvH2 -= r2H2 * VelH2 * lambda; - ctx.EQ_nonbond_qw[state].Ucoul += static_cast(VelO + VelH1 + VelH2); - ctx.EQ_nonbond_qw[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qw[state].Ucoul += static_cast(VelO + VelH1 + VelH2); + ctx.EQ_nonbond_qw[state].Uvdw += static_cast(V_a - V_b); } // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!! diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp index f6d2ac98..3be5e6f0 100644 --- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp @@ -59,8 +59,8 @@ void accumulate_pair_force(Context& ctx, dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - energy.Ucoul += static_cast(ecoul); - energy.Uvdw += static_cast(evdw); + energy.Ucoul += static_cast(ecoul); + energy.Uvdw += static_cast(evdw); } void calc_nonbonded_ww_forces() { diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp index 440d03d2..5116dbbb 100644 --- a/src/core/cpu/src/cpu_polx_water_force.cpp +++ b/src/core/cpu/src/cpu_polx_water_force.cpp @@ -13,12 +13,12 @@ void calc_polx_w_forces(int iteration) { auto *wshells = ctx.wshells->cpu_data_p; int wi, imin, jw, ii, iis, jmin; - real_t tmin; + double tmin; coord_t rmu, rcu, f1O, f1H1, f1H2, f2; - real_t rm, rc; - real_t cos_th; - real_t avtdum, arg, f0, dv; - real_t ener; + double rm, rc; + double cos_th; + double avtdum, arg, f0, dv; + double ener; for (int is = 0; is < ctx.n_shells; is++) { wshells[is].n_inshell = 0; @@ -93,8 +93,8 @@ void calc_polx_w_forces(int 
iteration) { if (iteration != 0 && iteration % itdis_update == 0) { for (int is = 0; is < ctx.n_shells; is++) { printf("SHELL %d\n", is); - wshells[is].avtheta /= (real_t)itdis_update; - wshells[is].avn_inshell /= (real_t)itdis_update; + wshells[is].avtheta /= (double)itdis_update; + wshells[is].avn_inshell /= (double)itdis_update; wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb); printf("average theta = %f, average in shell = %f, theta_corr = %f\n", @@ -113,7 +113,7 @@ void calc_polx_w_forces(int iteration) { avtdum = 0; for (int il = 0; il < wshells[is].n_inshell; il++) { ii = ctx.nsort[il][is]; - arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell); + arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell); ctx.theta0[il] = acos(arg); ctx.theta0[il] = ctx.theta0[il] - 3 * sin(ctx.theta0[il]) * wshells[is].cstb / 2; if (ctx.theta0[il] < 0) { @@ -189,7 +189,7 @@ void calc_polx_w_forces(int iteration) { dvelocities[wi + 2].z += f0 * f1H2.z; } - wshells[is].avtheta += avtdum / (real_t)wshells[is].n_inshell; + wshells[is].avtheta += avtdum / (double)wshells[is].n_inshell; wshells[is].avn_inshell += wshells[is].n_inshell; } } diff --git a/src/core/cpu/src/cpu_pshell_force.cpp b/src/core/cpu/src/cpu_pshell_force.cpp index a547f16d..9ff083cc 100644 --- a/src/core/cpu/src/cpu_pshell_force.cpp +++ b/src/core/cpu/src/cpu_pshell_force.cpp @@ -13,7 +13,7 @@ void calc_pshell_forces() { auto *shell = ctx.shell->cpu_data_p; coord_t dr; - real_t k, r2, ener; + double k, r2, ener; for (int i = 0; i < ctx.n_atoms_solute; i++) { if (shell[i] || excluded[i]) { diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp index c92c904d..14aa802c 100644 --- a/src/core/cpu/src/cpu_q_angle_force.cpp +++ b/src/core/cpu/src/cpu_q_angle_force.cpp @@ -14,8 +14,8 @@ void calc_qangle_forces(int state) { int ic; int ai, aj, ak; coord_t rji, rjk; - real_t bji, bjk; - real_t cos_th, th, 
dth, ener, dv, f1; + double bji, bjk; + double cos_th, th, dth, ener, dv, f1; coord_t di, dk; for (int i = 0; i < ctx.n_qangles; i++) { diff --git a/src/core/cpu/src/cpu_q_bond_force.cpp b/src/core/cpu/src/cpu_q_bond_force.cpp index 6b924c69..5f2f7203 100644 --- a/src/core/cpu/src/cpu_q_bond_force.cpp +++ b/src/core/cpu/src/cpu_q_bond_force.cpp @@ -11,7 +11,7 @@ void calc_qbond_forces(int state) { auto *lambdas = ctx.lambdas->cpu_data_p; int ic; int ai, aj; - real_t b, db, ener, dv; + double b, db, ener, dv; coord_t rij; for (int i = 0; i < ctx.n_qbonds; i++) { diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp index 2be495b0..7b7fb271 100644 --- a/src/core/cpu/src/cpu_q_torsion_force.cpp +++ b/src/core/cpu/src/cpu_q_torsion_force.cpp @@ -15,10 +15,10 @@ void calc_qtorsion_forces(int state) { coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - real_t bj2inv, bk2inv, bjinv, bkinv; - real_t bj, bk, cos_phi, phi; - real_t arg, dv, f1; - real_t ener; + double bj2inv, bk2inv, bjinv, bkinv; + double bj, bk, cos_phi, phi; + double arg, dv, f1; + double ener; for (int i = 0; i < ctx.n_qtorsions; i++) { ic = ctx.q_torsions[i + ctx.n_qtorsions * state].code; diff --git a/src/core/cpu/src/cpu_radix_water_force.cpp b/src/core/cpu/src/cpu_radix_water_force.cpp index a85af35c..a887ad31 100644 --- a/src/core/cpu/src/cpu_radix_water_force.cpp +++ b/src/core/cpu/src/cpu_radix_water_force.cpp @@ -10,9 +10,9 @@ void calc_radix_w_forces() { auto &coords = ctx.coords->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; - real_t b, db, ener, dv, fexp; + double b, db, ener, dv, fexp; coord_t dr; - real_t shift; + double shift; if (ctx.md.radial_force != 0) { shift = sqrt(Boltz * ctx.Tfree / ctx.md.radial_force); diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp index c2b9ed50..84f593b0 100644 --- a/src/core/cpu/src/cpu_restrang_force.cpp +++ 
b/src/core/cpu/src/cpu_restrang_force.cpp @@ -15,8 +15,8 @@ void calc_restrang_forces() { int state, i, j, k; coord_t dr, dr2, di, dk; - real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th; - real_t dth, dv, ener, f1; + double lambda, r2ij, r2jk, rij, rjk, cos_th, th; + double dth, dv, ener, f1; for (int ir = 0; ir < ctx.n_restrangs; ir++) { state = restrangs[ir].ipsi - 1; diff --git a/src/core/cpu/src/cpu_restrdis_force.cpp b/src/core/cpu/src/cpu_restrdis_force.cpp index 859481f3..c15cbef7 100644 --- a/src/core/cpu/src/cpu_restrdis_force.cpp +++ b/src/core/cpu/src/cpu_restrdis_force.cpp @@ -14,7 +14,7 @@ void calc_restrdis_forces() { int state, i, j; coord_t dr; - real_t lambda, b, db, dv, ener; + double lambda, b, db, dv, ener; for (int ir = 0; ir < ctx.n_restrdists; ir++) { state = restrdists[ir].ipsi - 1; diff --git a/src/core/cpu/src/cpu_restrpos_force.cpp b/src/core/cpu/src/cpu_restrpos_force.cpp index a3e8710d..6db044b4 100644 --- a/src/core/cpu/src/cpu_restrpos_force.cpp +++ b/src/core/cpu/src/cpu_restrpos_force.cpp @@ -14,7 +14,7 @@ void calc_restrpos_forces() { int state, i; coord_t dr; - real_t lambda, ener, x2, y2, z2; + double lambda, ener, x2, y2, z2; for (int ir = 0; ir < ctx.n_restrspos; ir++) { state = restrspos[ir].ipsi - 1; diff --git a/src/core/cpu/src/cpu_restrseq_force.cpp b/src/core/cpu/src/cpu_restrseq_force.cpp index f9ff9fd0..296762e8 100644 --- a/src/core/cpu/src/cpu_restrseq_force.cpp +++ b/src/core/cpu/src/cpu_restrseq_force.cpp @@ -13,9 +13,9 @@ void calc_restrseq_forces() { auto &restrseqs = ctx.restrseqs->cpu_data_p; auto *heavy = ctx.heavy->cpu_data_p; - real_t k, mass, totmass; + double k, mass, totmass; coord_t dr; - real_t r2, ener; + double r2, ener; for (int s = 0; s < ctx.n_restrseqs; s++) { k = restrseqs[s].k; diff --git a/src/core/cpu/src/cpu_restrwall_force.cpp b/src/core/cpu/src/cpu_restrwall_force.cpp index 7da6faa6..fd49749a 100644 --- a/src/core/cpu/src/cpu_restrwall_force.cpp +++ b/src/core/cpu/src/cpu_restrwall_force.cpp 
@@ -11,7 +11,7 @@ void calc_restrwall_forces() { auto &restrwalls = ctx.restrwalls->cpu_data_p; auto *heavy = ctx.heavy->cpu_data_p; - real_t k, b, db, ener, dv, fexp; + double k, b, db, ener, dv, fexp; coord_t dr; for (int ir = 0; ir < ctx.n_restrwalls; ir++) { diff --git a/src/core/cpu/src/cpu_shake.cpp b/src/core/cpu/src/cpu_shake.cpp index 91162c98..cb29a0f0 100644 --- a/src/core/cpu/src/cpu_shake.cpp +++ b/src/core/cpu/src/cpu_shake.cpp @@ -34,7 +34,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) { const int aj = shake_bond.aj - 1; coord_t xij; coord_t xxij; - real_t xij2, diff, corr, scp; + double xij2, diff, corr, scp; xij.x = coords[ai].x - coords[aj].x; xij.y = coords[ai].y - coords[aj].y; @@ -75,7 +75,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) { const int ai = shake_bonds[shake + i].ai - 1; const int aj = shake_bonds[shake + i].aj - 1; coord_t xxij; - real_t xxij2; + double xxij2; xxij.x = xcoords[ai].x - xcoords[aj].x; xxij.y = xcoords[ai].y - xcoords[aj].y; @@ -125,11 +125,11 @@ void stop_cm_translation() { auto &atypes = ctx.atypes->cpu_data_p; auto &catypes = ctx.catypes->cpu_data_p; auto &velocities = ctx.velocities->cpu_data_p; - real_t total_mass = 0; + double total_mass = 0; coord_t vcm = {}; for (int ai = 0; ai < ctx.n_atoms; ai++) { - const real_t rmass = catypes[atypes[ai].code - 1].m; + const double rmass = catypes[atypes[ai].code - 1].m; total_mass += rmass; vcm.x += velocities[ai].x * rmass; vcm.y += velocities[ai].y; diff --git a/src/core/cpu/src/cpu_temperature.cpp b/src/core/cpu/src/cpu_temperature.cpp index 537dec77..6b76139f 100644 --- a/src/core/cpu/src/cpu_temperature.cpp +++ b/src/core/cpu/src/cpu_temperature.cpp @@ -17,11 +17,11 @@ void calc_temperature() { auto *excluded = ctx.excluded->cpu_data_p; ctx.Temp = 0; ctx.Tfree = 0; - real_t Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0; - real_t Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0; - real_t Ekinmax = 1000.0 * ctx.Ndegf 
* Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms; - real_t ener; - real_t mass_i; + double Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0; + double Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0; + double Ekinmax = 1000.0 * ctx.Ndegf * Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms; + double ener; + double mass_i; ctx.Temp = 0; for (int i = 0; i < ctx.n_atoms_solute; i++) { diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp index 37a68298..4ebb44b2 100644 --- a/src/core/cpu/src/cpu_torsion_force.cpp +++ b/src/core/cpu/src/cpu_torsion_force.cpp @@ -5,7 +5,7 @@ #include "context.h" #include "cpu_utils.h" -real_t calc_torsion_forces(int start, int end) { +double calc_torsion_forces(int start, int end) { auto& ctx = Context::instance(); auto &torsions = ctx.torsions->cpu_data_p; auto &ctorsions = ctx.ctorsions->cpu_data_p; @@ -17,11 +17,11 @@ real_t calc_torsion_forces(int start, int end) { coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - real_t bj2inv, bk2inv, bjinv, bkinv; - real_t cos_phi, phi; - real_t arg, dv, f1; - real_t ener; - real_t torsion = 0; + double bj2inv, bk2inv, bjinv, bkinv; + double cos_phi, phi; + double arg, dv, f1; + double ener; + double torsion = 0; torsion_t t; ctorsion_t ctors; diff --git a/src/core/cpu/src/utils.cpp b/src/core/cpu/src/utils.cpp index 00c37e41..ed680aa3 100644 --- a/src/core/cpu/src/utils.cpp +++ b/src/core/cpu/src/utils.cpp @@ -1,25 +1,24 @@ #include #include -#include "common/include/precision.h" - // Get a value from a gaussian distributed random variable with // mean mean and standard deviation sd -real_t gauss(real_t mean, real_t sd) { - real_t v1, v2, nd10; +double gauss(double mean, double sd) { + double v1, v2, nd10; - v1 = ( (real_t)(rand()) + 1. )/( (real_t)(RAND_MAX) + 1. ); - v2 = ( (real_t)(rand()) + 1. )/( (real_t)(RAND_MAX) + 1. ); + v1 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. ); + v2 = ( (double)(rand()) + 1. 
)/( (double)(RAND_MAX) + 1. ); nd10 = cos(2 * M_PI * v2) * sqrt(-2. * log(v1)); return sd * nd10 + mean; } -real_t to_degrees(real_t radians) { +double to_degrees(double radians) { return radians * (180.0 / M_PI); } -real_t to_radians(real_t degrees) { +double to_radians(double degrees) { return degrees * (M_PI / 180.0); } + diff --git a/src/core/cuda/include/cuda_angle_force.cuh b/src/core/cuda/include/cuda_angle_force.cuh index 63ebb011..c2e00e15 100644 --- a/src/core/cuda/include/cuda_angle_force.cuh +++ b/src/core/cuda/include/cuda_angle_force.cuh @@ -1,7 +1,5 @@ #pragma once -#include "common/include/precision.h" - void init_angle_force_kernel_data(); -real_t calc_angle_forces_host(int start, int end); +double calc_angle_forces_host(int start, int end); void cleanup_angle_force(); diff --git a/src/core/cuda/include/cuda_bond_force.cuh b/src/core/cuda/include/cuda_bond_force.cuh index bddc873c..83961ed5 100644 --- a/src/core/cuda/include/cuda_bond_force.cuh +++ b/src/core/cuda/include/cuda_bond_force.cuh @@ -1,7 +1,5 @@ #pragma once -#include "common/include/precision.h" - void init_bond_force_kernel_data(); -real_t calc_bond_forces_host(int start, int end); +double calc_bond_forces_host(int start, int end); void cleanup_bond_force(); diff --git a/src/core/cuda/include/cuda_improper2_force.cuh b/src/core/cuda/include/cuda_improper2_force.cuh index 9e0a2cfd..cb0a9635 100644 --- a/src/core/cuda/include/cuda_improper2_force.cuh +++ b/src/core/cuda/include/cuda_improper2_force.cuh @@ -1,7 +1,5 @@ #pragma once -#include "common/include/precision.h" - void init_improper2_force_kernel_data(); -real_t calc_improper2_forces_host(int start, int end); +double calc_improper2_forces_host(int start, int end); void cleanup_improper2_force(); diff --git a/src/core/cuda/include/cuda_nonbonded_force.cuh b/src/core/cuda/include/cuda_nonbonded_force.cuh index ee227088..f1a9b252 100644 --- a/src/core/cuda/include/cuda_nonbonded_force.cuh +++ 
b/src/core/cuda/include/cuda_nonbonded_force.cuh @@ -1,12 +1,8 @@ #pragma once -#include - -#include "common/include/precision.h" - void init_nonbonded_force_kernel_data(); -std::pair calc_nonbonded_force_host( +std::pair calc_nonbonded_force_host( int nx, int ny, int* x_idx_list, @@ -18,7 +14,7 @@ std::pair calc_nonbonded_force_host( const int* x_atypes_types, const int* y_atypes_types, const bool disable_water_h_lj = false, - const real_t lambda = 1.0 + const double lambda = 1.0 ); void cleanup_nonbonded_force(); diff --git a/src/core/cuda/include/cuda_torsion_force.cuh b/src/core/cuda/include/cuda_torsion_force.cuh index cac7e191..50315181 100644 --- a/src/core/cuda/include/cuda_torsion_force.cuh +++ b/src/core/cuda/include/cuda_torsion_force.cuh @@ -1,8 +1,6 @@ #pragma once -#include "common/include/precision.h" - void init_torsion_force_kernel_data(); -real_t calc_torsion_forces_host(int start, int end); +double calc_torsion_forces_host(int start, int end); void cleanup_torsion_force(); diff --git a/src/core/cuda/include/cuda_utility.cuh b/src/core/cuda/include/cuda_utility.cuh index 9cbcefd5..36767be0 100644 --- a/src/core/cuda/include/cuda_utility.cuh +++ b/src/core/cuda/include/cuda_utility.cuh @@ -3,8 +3,7 @@ #include #include "common/include/cuda_runtime_utility.h" -#include "common/include/precision.h" -__device__ inline real_t to_radians_device(real_t degrees) { +__device__ inline double to_radians_device(double degrees) { return degrees * (M_PI / 180.0); } diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu index 445bed51..f20b039a 100644 --- a/src/core/cuda/src/cuda_angle_force.cu +++ b/src/core/cuda/src/cuda_angle_force.cu @@ -4,10 +4,10 @@ namespace CudaAngleForce { bool is_initialized = false; -real_t* d_energy_sum; +double* d_energy_sum; } // namespace CudaAngleForce -__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, real_t* 
energy_sum) { +__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, double* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; @@ -24,22 +24,21 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co coord_t rji = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z}; coord_t rjk = {rk.x - rj.x, rk.y - rj.y, rk.z - rj.z}; - real_t rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z); - real_t rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z); + double rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z); + double rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z); - real_t cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length); + double cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length); - cos_theta = cos_theta > static_cast(1.0) ? static_cast(1.0) : cos_theta; - cos_theta = cos_theta < static_cast(-1.0) ? 
static_cast(-1.0) : cos_theta; - real_t theta = acos(cos_theta); + cos_theta = fmax(fmin(cos_theta, 1.0), -1.0); // Clamp value to avoid NaNs + double theta = acos(cos_theta); - real_t dtheta = theta - to_radians_device(cang.th0); - real_t energy = 0.5 * cang.kth * dtheta * dtheta; + double dtheta = theta - to_radians_device(cang.th0); + double energy = 0.5 * cang.kth * dtheta * dtheta; // calculate force magnitude - real_t dv = cang.kth * dtheta; + double dv = cang.kth * dtheta; - real_t f1 = sin(theta); + double f1 = sin(theta); if (fabs(f1) < k_singular_sin_epsilon) { f1 = -1.0 / k_singular_sin_epsilon; } else { @@ -71,7 +70,7 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co atomicAdd(&dvelocities[j].z, -dv * (di.z + dk.z)); } -real_t calc_angle_forces_host(int start, int end) { +double calc_angle_forces_host(int start, int end) { int N = end - start; if (N <= 0) return 0.0; using namespace CudaAngleForce; @@ -86,8 +85,8 @@ real_t calc_angle_forces_host(int start, int end) { // todo: now have to do that, after moving all to CudaContext, can remove it // ctx.sync_all_to_device(); - real_t h_energy_sum = 0.0; - cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(real_t), cudaMemcpyHostToDevice); + double h_energy_sum = 0.0; + cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(double), cudaMemcpyHostToDevice); // launch kernel calc_angle_forces_kernel<<>>(start, end, d_angles, d_coords, d_cangles, d_dvelocities, d_energy_sum); @@ -95,14 +94,14 @@ real_t calc_angle_forces_host(int start, int end) { // todo: Now have to do that, after moving all to CudaContext, can remove it // copy results back to host - cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); return h_energy_sum; } void init_angle_force_kernel_data() { using namespace CudaAngleForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); + 
check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_bond_force.cu b/src/core/cuda/src/cuda_bond_force.cu index 476d7209..9b31a660 100644 --- a/src/core/cuda/src/cuda_bond_force.cu +++ b/src/core/cuda/src/cuda_bond_force.cu @@ -3,9 +3,9 @@ #include "cuda_utility.cuh" namespace CudaBondForce { bool is_initialized = false; -real_t* d_energy_sum; +double* d_energy_sum; } // namespace CudaBondForce -__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, real_t* energy_sum) { +__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, double* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; bond_t bond = bonds[idx]; @@ -13,18 +13,18 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord coord_t rj = coords[bond.aj - 1]; cbond_t cbond = cbonds[bond.code - 1]; - real_t dx = rj.x - ri.x; - real_t dy = rj.y - ri.y; - real_t dz = rj.z - ri.z; - real_t r = sqrt(dx * dx + dy * dy + dz * dz); + double dx = rj.x - ri.x; + double dy = rj.y - ri.y; + double dz = rj.z - ri.z; + double r = sqrt(dx * dx + dy * dy + dz * dz); - real_t dr = r - cbond.b0; - real_t energy = 0.5 * cbond.kb * dr * dr; + double dr = r - cbond.b0; + double energy = 0.5 * cbond.kb * dr * dr; atomicAdd(energy_sum, energy); // update forces - real_t f = cbond.kb * dr / r; + double f = cbond.kb * dr / r; atomicAdd(&dvelocities[bond.aj - 1].x, f * dx); atomicAdd(&dvelocities[bond.aj - 1].y, f * dy); atomicAdd(&dvelocities[bond.aj - 1].z, f * dz); @@ -33,15 +33,15 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord atomicAdd(&dvelocities[bond.ai - 1].z, -f * dz); } -real_t calc_bond_forces_host(int start, int end) { +double calc_bond_forces_host(int start, int end) { int N = end - start; if (N <= 0) 
return 0.0; using namespace CudaBondForce; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - real_t energy = 0.0; - cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice); + double energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); auto& host_ctx = Context::instance(); bond_t* d_bonds = host_ctx.bonds->gpu_data_p; @@ -51,7 +51,7 @@ real_t calc_bond_forces_host(int start, int end) { calc_bond_forces_kernel<<>>(start, end, d_bonds, d_coords, d_cbonds, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); return energy; } @@ -59,7 +59,7 @@ real_t calc_bond_forces_host(int start, int end) { void init_bond_force_kernel_data() { using namespace CudaBondForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu index f0f790db..dd7d91aa 100644 --- a/src/core/cuda/src/cuda_improper2_force.cu +++ b/src/core/cuda/src/cuda_improper2_force.cu @@ -4,10 +4,10 @@ namespace CudaImproper2Force { bool is_initialized = false; -real_t* d_energy_sum; +double* d_energy_sum; } // namespace CudaImproper2Force -__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) { +__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum) { int i = blockIdx.x * blockDim.x + threadIdx.x + start; if (i >= end) return; @@ -15,8 +15,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp coord_t ai, aj, ak, al; 
coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; - real_t bj2inv, bk2inv, bjinv, bkinv; - real_t cos_phi, phi, arg, ener, dv, f1; + double bj2inv, bk2inv, bjinv, bkinv; + double cos_phi, phi, arg, ener, dv, f1; coord_t di, dl, dpi, dpj, dpk, dpl; improper_t imp; @@ -124,15 +124,15 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp atomicAdd(&dvelocities[ali].z, dv * dpl.z); } -real_t calc_improper2_forces_host(int start, int end) { +double calc_improper2_forces_host(int start, int end) { int N = end - start; if (N <= 0) return 0.0; using namespace CudaImproper2Force; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - real_t energy = 0.0; - cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice); + double energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); auto& host_ctx = Context::instance(); coord_t* d_coords = host_ctx.coords->gpu_data_p; @@ -142,14 +142,14 @@ real_t calc_improper2_forces_host(int start, int end) { calc_improper2_forces_kernel<<>>(start, end, d_impropers, d_cimpropers, d_coords, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); return energy; } void init_improper2_force_kernel_data() { using namespace CudaImproper2Force; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu index 2ac8245a..1e010f7e 100644 --- a/src/core/cuda/src/cuda_leapfrog.cu +++ b/src/core/cuda/src/cuda_leapfrog.cu @@ -18,20 +18,20 @@ __global__ void calc_leapfrog_kernel( coord_t* xcoords, int n_atoms, int n_atoms_solute, - real_t Tscale_solute, - real_t Tscale_solvent, - real_t dt) { + double Tscale_solute, + double 
Tscale_solvent, + double dt) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_atoms) return; int i = idx; // Kernel implementation goes here - real_t mass_i, winv_i; + double mass_i, winv_i; mass_i = catypes[atypes[i].code - 1].m; winv_i = 1 / mass_i; - real_t scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent; + double scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent; velocities[i].x = (velocities[i].x - dvelocities[i].x * dt * winv_i) * scale; velocities[i].y = (velocities[i].y - dvelocities[i].y * dt * winv_i) * scale; velocities[i].z = (velocities[i].z - dvelocities[i].z * dt * winv_i) * scale; @@ -50,7 +50,7 @@ __global__ void update_velocities_from_positions_kernel( const coord_t* coords, const coord_t* xcoords, int n_atoms, - real_t dt) { + double dt) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_atoms) return; diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu index f925fe6f..78c4bc91 100644 --- a/src/core/cuda/src/cuda_nonbonded_14_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu @@ -9,8 +9,8 @@ bool is_initialized = false; constexpr int kNonbonded14ModeCount = 3; int* d_atom_to_qi = nullptr; -real_t* d_evdw_totals = nullptr; -real_t* d_ecoul_totals = nullptr; +double* d_evdw_totals = nullptr; +double* d_ecoul_totals = nullptr; __device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) { #ifdef QDYN_SPFP @@ -96,13 +96,13 @@ __global__ void calc_nonbonded_14_force_kernel( const catype_t* unified_catypes, const coord_t* d_coords, dvel_t* d_dvelocities, - real_t* evdw_totals, - real_t* ecoul_totals, + double* evdw_totals, + double* ecoul_totals, bool include_pp, int state, int n_atoms, int n_qatoms, - real_t lambda) { + double lambda) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_pairs) return; @@ -166,14 +166,14 @@ __global__ void calc_nonbonded_14_force_kernel( namespace { struct 
Nonbonded14EnergyBuckets { - real_t evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; - real_t ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; + double evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; + double ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {}; }; } static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host( int state, - real_t lambda, + double lambda, bool include_pp) { using namespace CudaNonbonded14Force; @@ -182,8 +182,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host( Nonbonded14EnergyBuckets energies = {}; if (n_ngbrs_14 == 0) return energies; - cudaMemset(d_ecoul_totals, 0, sizeof(real_t) * kNonbonded14ModeCount); - cudaMemset(d_evdw_totals, 0, sizeof(real_t) * kNonbonded14ModeCount); + cudaMemset(d_ecoul_totals, 0, sizeof(double) * kNonbonded14ModeCount); + cudaMemset(d_evdw_totals, 0, sizeof(double) * kNonbonded14ModeCount); const int block_size = 256; const int num_blocks = (n_ngbrs_14 + block_size - 1) / block_size; @@ -208,8 +208,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host( cudaDeviceSynchronize(); - cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); - cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); + cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); + cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost); return energies; } @@ -221,7 +221,7 @@ void calc_nonbonded_14_forces_host() { if (host.n_ngbrs14 == 0) return; for (int state = 0; state < host.n_lambdas; state++) { - const real_t lambda = lambdas[state]; + const double lambda = lambdas[state]; const bool include_pp = (state == 0); Nonbonded14EnergyBuckets energies = calc_nonbonded_14_force_state_host(state, lambda, include_pp); @@ -248,8 +248,8 @@ void 
init_nonbonded_14_force_kernel_data() { check_cudaMalloc((void**)&d_atom_to_qi, sizeof(int) * host.atom_to_qi.size()); check_cuda(cudaMemcpy(d_atom_to_qi, host.atom_to_qi.data(), sizeof(int) * host.atom_to_qi.size(), cudaMemcpyHostToDevice)); - check_cudaMalloc((void**)&d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount); - check_cudaMalloc((void**)&d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount); + check_cudaMalloc((void**)&d_evdw_totals, sizeof(double) * kNonbonded14ModeCount); + check_cudaMalloc((void**)&d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount); is_initialized = true; } diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu index d7f0719c..32b4077a 100644 --- a/src/core/cuda/src/cuda_nonbonded_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_force.cu @@ -7,7 +7,7 @@ namespace CudaNonbondedForce { bool is_initialized = false; -real_t *d_evdw_total, *d_ecoul_total; +double *d_evdw_total, *d_ecoul_total; template struct nonbond_vec_t { @@ -20,11 +20,9 @@ __device__ __forceinline__ float nonbond_rsqrt(float value) { return rsqrtf(value); } -#ifndef QDYN_SPFP __device__ __forceinline__ double nonbond_rsqrt(double value) { return rsqrt(value); } -#endif __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); @@ -41,7 +39,6 @@ __device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffff return __shfl_sync(mask, v, srcLane); } -#ifndef QDYN_SPFP template <> __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) { int2 a = *reinterpret_cast(&v); @@ -49,7 +46,6 @@ __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mas a.y = __shfl_sync(mask, a.y, srcLane); return *reinterpret_cast(&a); } -#endif __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) { v.x = shfl_value(v.x, srcLane, mask); @@ 
-80,8 +76,8 @@ __device__ void calculate_unforce_bound( const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); const WorkT r2 = r * r; const WorkT r6 = r2 * r2 * r2; - // real_t v_a = r6 * r6; - // real_t v_b = r6; + // double v_a = r6 * r6; + // double v_b = r6; // ecoul = r; // evdw = v_a - v_b; // dv = r2 * (-ecoul - v_a + v_b); @@ -120,8 +116,8 @@ __global__ void calc_nonbonded_force_kernel( dvel_t* d_dvelocities, - real_t* evdw_tot, - real_t* ecoul_tot, + double* evdw_tot, + double* ecoul_tot, bool symmetric, @@ -134,7 +130,7 @@ __global__ void calc_nonbonded_force_kernel( const int n_catype_types, const int zero_catype_type, const int n_qelscales, - const real_t lambda, + const double lambda, const q_elscale_t* d_qelscales // todo: Now doesn't use it. Should optimize it later ) { @@ -184,8 +180,8 @@ __global__ void calc_nonbonded_force_kernel( nonbond_vec_t x_force = {0.0, 0.0, 0.0}; nonbond_vec_t y_force = {0.0, 0.0, 0.0}; - real_t evdw_sum = 0.0; - real_t ecoul_sum = 0.0; + double evdw_sum = 0.0; + double ecoul_sum = 0.0; const unsigned mask = 0xffffffffu; @@ -311,7 +307,7 @@ __global__ void calc_nonbonded_force_kernel( } // namespace CudaNonbondedForce -std::pair calc_nonbonded_force_host( +std::pair calc_nonbonded_force_host( int nx, int ny, int* x_idx_list, @@ -322,7 +318,7 @@ std::pair calc_nonbonded_force_host( const int* y_charges_types, const int* x_atypes_types, const int* y_atypes_types, - const bool disable_water_h_lj, const real_t lambda) { + const bool disable_water_h_lj, const double lambda) { using namespace CudaNonbondedForce; Context& host = Context::instance(); const int thread_num = 256; @@ -338,8 +334,8 @@ std::pair calc_nonbonded_force_host( dim3 grid = dim3(grid_sz); - cudaMemset(d_ecoul_total, 0, sizeof(real_t)); - cudaMemset(d_evdw_total, 0, sizeof(real_t)); + cudaMemset(d_ecoul_total, 0, sizeof(double)); + cudaMemset(d_evdw_total, 0, sizeof(double)); auto launch_kernel = [&](auto work_tag) { using WorkT = decltype(work_tag); @@ 
-377,9 +373,9 @@ std::pair calc_nonbonded_force_host( cudaDeviceSynchronize(); - real_t evdw_tot = 0, ecoul_tot = 0; - cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(real_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(real_t), cudaMemcpyDeviceToHost); + double evdw_tot = 0, ecoul_tot = 0; + cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(double), cudaMemcpyDeviceToHost); return {evdw_tot, ecoul_tot}; } @@ -387,8 +383,8 @@ std::pair calc_nonbonded_force_host( void init_nonbonded_force_kernel_data() { using namespace CudaNonbondedForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_evdw_total, sizeof(real_t)); - check_cudaMalloc((void**)&d_ecoul_total, sizeof(real_t)); + check_cudaMalloc((void**)&d_evdw_total, sizeof(double)); + check_cudaMalloc((void**)&d_ecoul_total, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu index bdb35608..7be0656f 100644 --- a/src/core/cuda/src/cuda_polx_water_force.cu +++ b/src/core/cuda/src/cuda_polx_water_force.cu @@ -14,11 +14,11 @@ int* water_shell = nullptr; int* water_rank = nullptr; int* polx_list_sh = nullptr; // use 1d array to simulate 2d array -real_t* d_energy; +double* d_energy; int* d_list_sh = nullptr; -real_t* d_theta = nullptr; -real_t* d_theta0 = nullptr; -real_t* d_tdum = nullptr; +double* d_theta = nullptr; +double* d_theta0 = nullptr; +double* d_tdum = nullptr; int* d_water_shell = nullptr; int* d_water_rank = nullptr; @@ -27,15 +27,15 @@ int* d_water_rank = nullptr; __global__ void calc_polx_theta_and_shells( int n_waters, int n_shells, int n_atoms_solute, coord_t* coords, topo_t topo, shell_t* wshells, int* list_sh, - real_t* theta, real_t* theta0, real_t* tdum) { + double* theta, double* theta0, double* tdum) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_waters) return; int i = idx; int wi, iis; 
coord_t rmu, rcu; - real_t rm, rc; - real_t cos_th; + double rm, rc; + double cos_th; theta[i] = 0; theta0[i] = 0; @@ -81,7 +81,7 @@ __global__ void calc_polx_theta_and_shells( __global__ void calc_polx_water_forces_kernel( int n_waters, int n_atoms_solute, shell_t* wshells, coord_t* coords, dvel_t* dvelocities, topo_t topo, - real_t* theta, md_t md, real_t* energy, + double* theta, md_t md, double* energy, int* water_rank, int* water_shell) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_waters) return; @@ -92,21 +92,21 @@ __global__ void calc_polx_water_forces_kernel( int wi, ii; coord_t rmu, rcu, f1O, f1H1, f1H2, f2; - real_t rm, rc; - real_t cos_th; - real_t avtdum, arg, f0, dv; - real_t ener; + double rm, rc; + double cos_th; + double avtdum, arg, f0, dv; + double ener; avtdum = 0; ii = idx; - arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell); - real_t theta_val = acos(arg); + arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell); + double theta_val = acos(arg); theta_val = theta_val - 3 * sin(theta_val) * wshells[is].cstb / 2; if (theta_val < 0) theta_val = 0; if (theta_val > M_PI) theta_val = M_PI; avtdum += theta[ii]; - const real_t dtheta = theta[ii] - theta_val + wshells[is].theta_corr; + const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr; ener = .5 * md.polarisation_force * dtheta * dtheta; // E_restraint.Upolx += ener; atomicAdd(energy, ener); @@ -164,7 +164,7 @@ __global__ void calc_polx_water_forces_kernel( atomicAdd(&dvelocities[wi + 2].y, f0 * (f1H2.y)); atomicAdd(&dvelocities[wi + 2].z, f0 * (f1H2.z)); - atomicAdd(&wshells[is].avtheta, avtdum / (real_t)wshells[is].n_inshell); + atomicAdd(&wshells[is].avtheta, avtdum / (double)wshells[is].n_inshell); atomicAdd(&wshells[is].avn_inshell, wshells[is].n_inshell); } @@ -174,7 +174,7 @@ void sort_waters() { auto *wshells = ctx.wshells->cpu_data_p; int imin, jmin, jw; - real_t tmin; + double tmin; // Sort the waters according to theta 
for (int is = 0; is < ctx.n_shells; is++) { imin = 0; @@ -224,7 +224,7 @@ void calc_polx_water_forces_host(int iteration) { // todo: sort in cpu now.. ctx.wshells->download(); - cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(polx_list_sh, d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int), cudaMemcpyDeviceToHost); // Reset per-water metadata; only waters placed in shells will be overwritten in sort_waters(). @@ -244,8 +244,8 @@ void calc_polx_water_forces_host(int iteration) { if (iteration != 0 && iteration % itdis_update == 0) { for (int is = 0; is < ctx.n_shells; is++) { printf("SHELL %d\n", is); - wshells[is].avtheta /= (real_t)itdis_update; - wshells[is].avn_inshell /= (real_t)itdis_update; + wshells[is].avtheta /= (double)itdis_update; + wshells[is].avn_inshell /= (double)itdis_update; wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb); printf("average theta = %f, average in shell = %f, theta_corr = %f\n", wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell, wshells[is].theta_corr * 180 / M_PI); @@ -256,12 +256,12 @@ void calc_polx_water_forces_host(int iteration) { } // Calculate energy and force - cudaMemset(d_energy, 0, sizeof(real_t)); + cudaMemset(d_energy, 0, sizeof(double)); calc_polx_water_forces_kernel<<>>( ctx.n_waters, ctx.n_atoms_solute, d_wshells, d_coords, d_dvelocities, ctx.topo, d_theta, ctx.md, d_energy, d_water_rank, d_water_shell); - real_t energy; - cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost); + double energy; + cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost); ctx.E_restraint.Upolx += energy; ctx.wshells->download(); // Copy back forces for all atoms (solute + solvent); water forces were being dropped. 
@@ -275,11 +275,11 @@ void init_polx_water_force_kernel_data() { water_shell = new int[ctx.n_waters]; polx_list_sh = new int[ctx.n_max_inshell * ctx.n_shells]; - check_cudaMalloc((void**)&d_energy, sizeof(real_t)); + check_cudaMalloc((void**)&d_energy, sizeof(double)); check_cudaMalloc((void**)&d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int)); - check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(real_t)); - check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(real_t)); - check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(real_t)); + check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(double)); check_cudaMalloc((void**)&d_water_rank, ctx.n_waters * sizeof(int)); check_cudaMalloc((void**)&d_water_shell, ctx.n_waters * sizeof(int)); diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu index b6ef257e..5221cb9e 100644 --- a/src/core/cuda/src/cuda_pshell_force.cu +++ b/src/core/cuda/src/cuda_pshell_force.cu @@ -5,8 +5,8 @@ #include namespace CudaPshellForce { bool is_initialized = false; -real_t* d_ufix_energy; -real_t* d_ushell_energy; +double* d_ufix_energy; +double* d_ushell_energy; } // namespace CudaPshellForce __global__ void calc_pshell_force_kernel( @@ -15,14 +15,14 @@ __global__ void calc_pshell_force_kernel( bool* excluded, coord_t* coords, coord_t* coords_init, - real_t* ufix_energy, - real_t* ushell_energy, + double* ufix_energy, + double* ushell_energy, dvel_t* dvelocities) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= n_atoms_solute) return; coord_t dr; - real_t k, r2, ener; + double k, r2, ener; if (shell[i] || excluded[i]) { // printf("i = %d excluded = %s shell = %s\n", i, excluded[i] ? "True" : "False", shell[i] ? 
"True" : "False"); @@ -57,8 +57,8 @@ void calc_pshell_forces_host() { auto d_coords_init = host.coords_init->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; - cudaMemset(d_ufix_energy, 0, sizeof(real_t)); - cudaMemset(d_ushell_energy, 0, sizeof(real_t)); + cudaMemset(d_ufix_energy, 0, sizeof(double)); + cudaMemset(d_ushell_energy, 0, sizeof(double)); int blockSize = 256; int numBlocks = (host.n_atoms_solute + blockSize - 1) / blockSize; @@ -72,10 +72,10 @@ void calc_pshell_forces_host() { d_ushell_energy, d_dvelocities); cudaDeviceSynchronize(); - real_t ufix_energy; - real_t ushell_energy; - cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(real_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(real_t), cudaMemcpyDeviceToHost); + double ufix_energy; + double ushell_energy; + cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(double), cudaMemcpyDeviceToHost); host.E_restraint.Ufix += ufix_energy; host.E_restraint.Ushell += ushell_energy; @@ -85,8 +85,8 @@ void calc_pshell_forces_host() { void init_pshell_force_kernel_data() { using namespace CudaPshellForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_ufix_energy, sizeof(real_t)); - check_cudaMalloc((void**)&d_ushell_energy, sizeof(real_t)); + check_cudaMalloc((void**)&d_ufix_energy, sizeof(double)); + check_cudaMalloc((void**)&d_ushell_energy, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu index 26c8c94f..f037e9db 100644 --- a/src/core/cuda/src/cuda_radix_water_force.cu +++ b/src/core/cuda/src/cuda_radix_water_force.cu @@ -6,20 +6,20 @@ #include "cuda/include/cuda_utility.cuh" namespace CudaRadixWaterForce { bool is_initialized = false; -real_t* d_energy; +double* d_energy; } // namespace CudaRadixWaterForce __global__ void calc_radix_water_forces_kernel( coord_t* coords, - real_t 
shift, + double shift, int n_atoms_solute, int n_atoms, topo_t topo, md_t md, - real_t Dwmz, - real_t awmz, + double Dwmz, + double awmz, dvel_t* dvelocities, - real_t* energy) { + double* energy) { int i = blockIdx.x * blockDim.x + threadIdx.x; i = n_atoms_solute + i * 3; // Process only oxygen atoms of water molecules if (i >= n_atoms) return; @@ -29,16 +29,16 @@ __global__ void calc_radix_water_forces_kernel( dr.x = coords[i].x - topo.solvent_center.x; dr.y = coords[i].y - topo.solvent_center.y; dr.z = coords[i].z - topo.solvent_center.z; - real_t b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); - real_t db = b - (topo.solvent_radius - shift); + double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); + double db = b - (topo.solvent_radius - shift); - real_t ener, dv; + double ener, dv; if (db > 0) { ener = 0.5 * md.radial_force * db * db - Dwmz; dv = md.radial_force * db / b; } else { if (b > 0.0) { - real_t fexp = exp(awmz * db); + double fexp = exp(awmz * db); ener = Dwmz * (fexp * fexp - 2 * fexp); dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b; } else { @@ -70,16 +70,16 @@ void calc_radix_water_forces_host() { auto d_coords = host.coords->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; - check_cuda(cudaMemset(d_energy, 0, sizeof(real_t))); + check_cuda(cudaMemset(d_energy, 0, sizeof(double))); - real_t shift; + double shift; if (host.md.radial_force != 0) { shift = sqrt(Boltz * host.Tfree / host.md.radial_force); } else { shift = 0; } - real_t energy = 0.0; + double energy = 0.0; calc_radix_water_forces_kernel<<>>(d_coords, shift, host.n_atoms_solute, @@ -91,14 +91,14 @@ void calc_radix_water_forces_host() { d_dvelocities, d_energy); check_cuda(cudaDeviceSynchronize()); - check_cuda(cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost)); + check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost)); host.E_restraint.Uradx += energy; } void init_radix_water_force_kernel_data() { using namespace 
CudaRadixWaterForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy, sizeof(real_t)); + check_cudaMalloc((void**)&d_energy, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu index e32872b7..567a78df 100644 --- a/src/core/cuda/src/cuda_restrang_force.cu +++ b/src/core/cuda/src/cuda_restrang_force.cu @@ -3,26 +3,26 @@ #include "common/include/context.h" namespace CudaRestrangForce { bool is_initialized = false; -real_t* d_E_restraint; +double* d_E_restraint; } // namespace CudaRestrangForce __global__ void calc_restrang_force_kernel( restrang_t* restrangs, int n_restrangs, coord_t* coords, - real_t* lambdas, + double* lambdas, int n_lambdas, dvel_t* dvelocities, E_restraint_t* EQ_restraint, - real_t* E_restraint) { + double* E_restraint) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrangs) return; int ir = idx; int state, i, j, k; coord_t dr, dr2, di, dk; - real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th; - real_t dth, dv, ener, f1; + double lambda, r2ij, r2jk, rij, rjk, cos_th, th; + double dth, dv, ener, f1; state = restrangs[ir].ipsi - 1; i = restrangs[ir].ai - 1; @@ -110,8 +110,8 @@ void calc_restrang_force_host() { auto d_dvelocities = host.dvelocities->gpu_data_p; auto d_EQ_restraint = host.EQ_restraint->gpu_data_p; - real_t val = 0; - cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice); + double val = 0; + cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice); int blockSize = 256; int numBlocks = (host.n_restrangs + blockSize - 1) / blockSize; @@ -126,14 +126,14 @@ void calc_restrang_force_host() { d_E_restraint); cudaDeviceSynchronize(); host.EQ_restraint->download(); - cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); host.E_restraint.Upres += val; } void init_restrang_force_kernel_data() { using 
namespace CudaRestrangForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t)); + check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu index cdb035b4..14f9b466 100644 --- a/src/core/cuda/src/cuda_restrdis_force.cu +++ b/src/core/cuda/src/cuda_restrdis_force.cu @@ -5,24 +5,24 @@ #include "common/include/context.h" namespace CudaRestrdisForce { bool is_initialized = false; -real_t* d_E_restraint; +double* d_E_restraint; } // namespace CudaRestrdisForce __global__ void calc_restrdis_forces_kernel( restrdis_t* restrdists, int n_restrdists, coord_t* coords, - real_t* lambdas, + double* lambdas, int n_lambdas, dvel_t* dvelocities, E_restraint_t* EQ_restraint, - real_t* E_restraint) { + double* E_restraint) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrdists) return; int state, i, j; coord_t dr; - real_t lambda, b, db, dv, ener; + double lambda, b, db, dv, ener; int ir = idx; @@ -82,7 +82,7 @@ void calc_restrdis_forces_host() { auto d_dvelocities = host.dvelocities->gpu_data_p; auto d_EQ_restraint = host.EQ_restraint->gpu_data_p; - cudaMemset(d_E_restraint, 0, sizeof(real_t)); + cudaMemset(d_E_restraint, 0, sizeof(double)); int blockSize = 256; int numBlocks = (host.n_restrdists + blockSize - 1) / blockSize; @@ -97,8 +97,8 @@ void calc_restrdis_forces_host() { d_E_restraint); cudaDeviceSynchronize(); host.EQ_restraint->download(); - real_t ener; - cudaMemcpy(&ener, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost); + double ener; + cudaMemcpy(&ener, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); printf("Energy restraint: %f\n", ener); host.E_restraint.Upres += ener; } @@ -106,7 +106,7 @@ void calc_restrdis_forces_host() { void init_restrdis_force_kernel_data() { using namespace CudaRestrdisForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t)); 
+ check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu index 5307bff5..695e2b33 100644 --- a/src/core/cuda/src/cuda_restrpos_force.cu +++ b/src/core/cuda/src/cuda_restrpos_force.cu @@ -6,17 +6,17 @@ namespace CudaRestrposForce { bool is_initialized = false; -real_t* d_E_restraint; +double* d_E_restraint; } // namespace CudaRestrposForce __global__ void calc_restrpos_forces_kernel( restrpos_t* restrspos, int n_restrspos, coord_t* coords, - real_t* lambdas, + double* lambdas, int n_lambdas, E_restraint_t* EQ_restraint, - real_t* E_restraint, + double* E_restraint, dvel_t* dvelocities) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrspos) return; @@ -24,7 +24,7 @@ __global__ void calc_restrpos_forces_kernel( int state, i; coord_t dr; - real_t lambda, ener, x2, y2, z2; + double lambda, ener, x2, y2, z2; state = restrspos[ir].ipsi - 1; i = restrspos[ir].a - 1; @@ -64,8 +64,8 @@ void calc_restrpos_forces_host() { auto& host = Context::instance(); if (host.n_restrspos == 0) return; using namespace CudaRestrposForce; - real_t val = 0.0; - cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice); + double val = 0.0; + cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice); auto d_restrspos = host.restrspos->gpu_data_p; auto d_coords = host.coords->gpu_data_p; @@ -85,7 +85,7 @@ void calc_restrpos_forces_host() { d_E_restraint, d_dvelocities); cudaDeviceSynchronize(); - cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); host.E_restraint.Upres += val; host.EQ_restraint->download(); } @@ -93,7 +93,7 @@ void calc_restrpos_forces_host() { void init_restrpos_force_kernel_data() { using namespace CudaRestrposForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t)); + 
check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu index e5951303..71835e4e 100644 --- a/src/core/cuda/src/cuda_restrseq_force.cu +++ b/src/core/cuda/src/cuda_restrseq_force.cu @@ -4,7 +4,7 @@ namespace CudaRestrseqForce { bool is_initialized = false; -real_t* d_upres_energy; +double* d_upres_energy; } // namespace CudaRestrseqForce __global__ void calc_restrseq_forces_kernel( int n_restrseqs, @@ -15,13 +15,13 @@ __global__ void calc_restrseq_forces_kernel( catype_t* catypes, bool* heavy, dvel_t* dvelocities, - real_t* upres_energy) { + double* upres_energy) { int s = blockIdx.x * blockDim.x + threadIdx.x; if (s >= n_restrseqs) return; - real_t k, mass, totmass; + double k, mass, totmass; coord_t dr; - real_t r2, ener; + double r2, ener; k = restrseqs[s].k; @@ -123,7 +123,7 @@ void calc_restrseq_forces_host() { auto d_catypes = host.catypes->gpu_data_p; auto d_heavy = host.heavy->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; - cudaMemset(d_upres_energy, 0, sizeof(real_t)); + cudaMemset(d_upres_energy, 0, sizeof(double)); // ctx.sync_all_to_device(); int blockSize = 256; @@ -139,8 +139,8 @@ void calc_restrseq_forces_host() { d_dvelocities, d_upres_energy); cudaDeviceSynchronize(); - real_t upres_energy; - cudaMemcpy(&upres_energy, d_upres_energy, sizeof(real_t), cudaMemcpyDeviceToHost); + double upres_energy; + cudaMemcpy(&upres_energy, d_upres_energy, sizeof(double), cudaMemcpyDeviceToHost); host.E_restraint.Upres = upres_energy; printf("Restrseq U_upres: %f\n", upres_energy); } @@ -148,7 +148,7 @@ void calc_restrseq_forces_host() { void init_restrseq_force_kernel_data() { using namespace CudaRestrseqForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_upres_energy, sizeof(real_t)); + check_cudaMalloc((void**)&d_upres_energy, sizeof(double)); is_initialized = true; } } diff --git 
a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu index 2ca01839..c928bb71 100644 --- a/src/core/cuda/src/cuda_restrwall_force.cu +++ b/src/core/cuda/src/cuda_restrwall_force.cu @@ -5,20 +5,20 @@ namespace CudaRestrwallForce { bool is_initialized = false; -real_t* d_energies; +double* d_energies; } // namespace CudaRestrwallForce __global__ void calc_restrwall_forces_kernel( restrwall_t* restrwalls, int n_restrwalls, coord_t* coords, - real_t* energies, + double* energies, dvel_t* dvelocities, bool* heavy, topo_t topo) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_restrwalls) return; - real_t k, b, db, ener, dv, fexp; + double k, b, db, ener, dv, fexp; coord_t dr; int ir = idx; @@ -58,7 +58,7 @@ void calc_restrwall_forces_host() { auto d_coords = host.coords->gpu_data_p; auto d_dvelocities = host.dvelocities->gpu_data_p; auto d_heavy = host.heavy->gpu_data_p; - cudaMemset(d_energies, 0, sizeof(real_t)); + cudaMemset(d_energies, 0, sizeof(double)); int blockSize = 256; int numBlocks = (host.n_restrwalls + blockSize - 1) / blockSize; @@ -69,8 +69,8 @@ void calc_restrwall_forces_host() { d_energies, d_dvelocities, d_heavy, host.topo); cudaDeviceSynchronize(); - real_t h_energy; - cudaMemcpy(&h_energy, d_energies, sizeof(real_t), cudaMemcpyDeviceToHost); + double h_energy; + cudaMemcpy(&h_energy, d_energies, sizeof(double), cudaMemcpyDeviceToHost); printf("Restrwall energy: %f\n", h_energy); host.E_restraint.Upres += h_energy; } @@ -78,7 +78,7 @@ void calc_restrwall_forces_host() { void init_restrwall_force_kernel_data() { using namespace CudaRestrwallForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energies, sizeof(real_t)); + check_cudaMalloc((void**)&d_energies, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu index 03fa76cd..bda47e50 100644 --- a/src/core/cuda/src/cuda_shake_constraints.cu +++ 
b/src/core/cuda/src/cuda_shake_constraints.cu @@ -17,7 +17,7 @@ __global__ void calc_shake_constraints_kernel( shake_bond_t* shake_bonds, coord_t* coords, coord_t* xcoords, - real_t* winv, + double* winv, int* total_iterations, int* mol_shake_offset) { int idx = blockIdx.x; @@ -26,7 +26,7 @@ __global__ void calc_shake_constraints_kernel( int mol = idx; int ai, aj, n_iterations, shake; - real_t xij2, diff, corr, scp, xxij2; + double xij2, diff, corr, scp, xxij2; coord_t xij, xxij; if (mol_n_shakes[mol] == 0) return; diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu index 46c4c373..baba687e 100644 --- a/src/core/cuda/src/cuda_temperature.cu +++ b/src/core/cuda/src/cuda_temperature.cu @@ -6,23 +6,23 @@ namespace CudaTemperature { bool is_initialized = false; -real_t* d_Temp_solute; -real_t* d_Tfree_solute; -real_t* d_Texcl_solute; -real_t* d_Temp_solvent; -real_t* d_Tfree_solvent; -real_t* d_Texcl_solvent; +double* d_Temp_solute; +double* d_Tfree_solute; +double* d_Texcl_solute; +double* d_Temp_solvent; +double* d_Tfree_solvent; +double* d_Texcl_solvent; } // namespace CudaTemperature -__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, real_t boltz, real_t ekinmax, - real_t* Temp_solute, real_t* Tfree_solute, real_t* Texcl_solute, real_t* Temp_solvent, real_t* Tfree_solvent, real_t* Texcl_solvent) { +__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, double boltz, double ekinmax, + double* Temp_solute, double* Tfree_solute, double* Texcl_solute, double* Temp_solvent, double* Tfree_solvent, double* Texcl_solvent) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_atoms) return; - real_t mass_i = catypes[atypes[idx].code - 1].m; - const real_t vx = velocities[idx].x; - const real_t vy = velocities[idx].y; - const real_t vz = 
velocities[idx].z; - real_t ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz); + double mass_i = catypes[atypes[idx].code - 1].m; + const double vx = velocities[idx].x; + const double vy = velocities[idx].y; + const double vz = velocities[idx].z; + double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz); bool is_solute = (idx < n_atoms_solute); bool is_excluded = excluded[idx]; @@ -49,14 +49,14 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t void calc_temperature_host() { auto& host = Context::instance(); using namespace CudaTemperature; - real_t h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0; + double h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0; - cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(real_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(real_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(real_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(real_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(real_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(real_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(double), cudaMemcpyHostToDevice); atype_t* d_atypes = host.atypes->gpu_data_p; catype_t* d_catypes = 
host.catypes->gpu_data_p; @@ -66,17 +66,17 @@ void calc_temperature_host() { int blockSize = 256; int numBlocks = (host.n_atoms + blockSize - 1) / blockSize; - real_t Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms; + double Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms; calc_temperature_kernel<<>>(host.n_atoms, host.n_atoms_solute, d_atypes, d_catypes, d_velocities, d_excluded, Boltz, Ekinmax, d_Temp_solute, d_Tfree_solute, d_Texcl_solute, d_Temp_solvent, d_Tfree_solvent, d_Texcl_solvent); cudaDeviceSynchronize(); - cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(real_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(real_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(real_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(real_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(real_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(double), cudaMemcpyDeviceToHost); host.Tfree = h_Tfree_solute + h_Tfree_solvent; host.Temp = h_Temp_solute + h_Temp_solvent; @@ -98,12 +98,12 @@ void calc_temperature_host() { void init_temperature_kernel_data() { using namespace CudaTemperature; if (!is_initialized) { - check_cudaMalloc((void**)&d_Temp_solute, sizeof(real_t)); - check_cudaMalloc((void**)&d_Tfree_solute, sizeof(real_t)); - 
check_cudaMalloc((void**)&d_Texcl_solute, sizeof(real_t)); - check_cudaMalloc((void**)&d_Temp_solvent, sizeof(real_t)); - check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(real_t)); - check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(real_t)); + check_cudaMalloc((void**)&d_Temp_solute, sizeof(double)); + check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double)); + check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double)); + check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double)); + check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double)); + check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double)); is_initialized = true; } } diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu index 1c0692ae..5baffbde 100644 --- a/src/core/cuda/src/cuda_torsion_force.cu +++ b/src/core/cuda/src/cuda_torsion_force.cu @@ -4,10 +4,10 @@ namespace CudaTorsionForce { bool is_initialized = false; -real_t* d_energy_sum = nullptr; +double* d_energy_sum = nullptr; } // namespace CudaTorsionForce -__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) { +__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, double* energy_sum) { int i = blockIdx.x * blockDim.x + threadIdx.x + start; if (i >= end) return; int aii, aji, aki, ali; @@ -16,10 +16,10 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - real_t bj2inv, bk2inv, bjinv, bkinv; - real_t cos_phi, phi; - real_t arg, dv, f1; - real_t ener; + double bj2inv, bk2inv, bjinv, bkinv; + double cos_phi, phi; + double arg, dv, f1; + double ener; torsion_t t; ctorsion_t ctors; @@ -63,8 +63,7 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio bkinv = sqrt(bk2inv); 
cos_phi = (rnj.x * rnk.x + rnj.y * rnk.y + rnj.z * rnk.z) * (bjinv * bkinv); - cos_phi = cos_phi > static_cast(1.0) ? static_cast(1.0) : cos_phi; - cos_phi = cos_phi < static_cast(-1.0) ? static_cast(-1.0) : cos_phi; + cos_phi = fmin(fmax(cos_phi, -1.0), 1.0); phi = acos(cos_phi); if (rjk.x * (rnj.y * rnk.z - rnj.z * rnk.y) + rjk.y * (rnj.z * rnk.x - rnj.x * rnk.z) + rjk.z * (rnj.x * rnk.y - rnj.y * rnk.x) < 0) { phi = -phi; @@ -124,15 +123,15 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio atomicAdd(&dvelocities[ali].z, dv * dpl.z); } -real_t calc_torsion_forces_host(int start, int end) { +double calc_torsion_forces_host(int start, int end) { using namespace CudaTorsionForce; int N = end - start; if (N <= 0) return 0.0; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - real_t zero = 0.0; - cudaMemcpy(d_energy_sum, &zero, sizeof(real_t), cudaMemcpyHostToDevice); + double zero = 0.0; + cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice); auto& host_ctx = Context::instance(); coord_t* d_coords = host_ctx.coords->gpu_data_p; @@ -142,7 +141,7 @@ real_t calc_torsion_forces_host(int start, int end) { calc_torsion_forces_kernel<<>>(start, end, d_torsions, d_ctorsions, d_coords, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&zero, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); return zero; } @@ -151,7 +150,7 @@ real_t calc_torsion_forces_host(int start, int end) { void init_torsion_force_kernel_data() { using namespace CudaTorsionForce; if (!is_initialized) { - check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t)); + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); is_initialized = true; } } From b67d111424d837c7b084d2863158c824e5ccb852 Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Thu, 30 Apr 2026 16:10:03 +0200 Subject: [PATCH 19/20] Revert "Merge branch 
'feature/qgpu_mixed_precision' into feature/qgpu_benchmark_script" This reverts commit e6eee26979200b22c1cbc7ee6851dd1653aebfb0, reversing changes made to edd65a3725ac4edd6a218716ee9fb6eb8dd3ce15. --- src/core/common/include/md_types.h | 6 +- src/core/common/include/precision.h | 9 +- src/core/cpu/src/cpu_angle_force.cpp | 4 +- src/core/cpu/src/cpu_improper2_force.cpp | 4 +- src/core/cpu/src/cpu_polx_water_force.cpp | 4 +- src/core/cpu/src/cpu_q_angle_force.cpp | 4 +- src/core/cpu/src/cpu_q_torsion_force.cpp | 4 +- src/core/cpu/src/cpu_restrang_force.cpp | 4 +- src/core/cpu/src/cpu_torsion_force.cpp | 4 +- src/core/cuda/src/cuda_angle_force.cu | 4 +- src/core/cuda/src/cuda_improper2_force.cu | 2 +- src/core/cuda/src/cuda_nonbonded_force.cu | 126 ++++++++++----------- src/core/cuda/src/cuda_polx_water_force.cu | 2 +- src/core/cuda/src/cuda_restrang_force.cu | 4 +- src/core/cuda/src/cuda_torsion_force.cu | 2 +- 15 files changed, 84 insertions(+), 99 deletions(-) diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h index 27c20cef..6a4d2865 100644 --- a/src/core/common/include/md_types.h +++ b/src/core/common/include/md_types.h @@ -310,9 +310,9 @@ struct vel_t { }; struct dvel_t { - force_accum_t x; - force_accum_t y; - force_accum_t z; + double x; + double y; + double z; }; struct E_bonded_t { diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h index 80b790f7..fc633f45 100644 --- a/src/core/common/include/precision.h +++ b/src/core/common/include/precision.h @@ -3,18 +3,11 @@ #ifdef QDYN_SPFP using real_t = float; using nonbond_work_t = float; -using force_accum_t = float; #else using real_t = double; using nonbond_work_t = double; -using force_accum_t = double; #endif using energy_accum_t = double; +using force_accum_t = double; using constraint_work_t = double; - -#ifdef QDYN_SPFP -constexpr double k_singular_sin_epsilon = 1.0e-6; -#else -constexpr double k_singular_sin_epsilon = 1.0e-12; -#endif 
diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp index ae600561..a9c29c1e 100644 --- a/src/core/cpu/src/cpu_angle_force.cpp +++ b/src/core/cpu/src/cpu_angle_force.cpp @@ -64,9 +64,9 @@ double calc_angle_forces(int start, int end) { dv = cangle.kth * dth; f1 = sin(th); - if (std::fabs(f1) < k_singular_sin_epsilon) { + if (std::fabs(f1) < 1.0E-12) { // Avoid division by zero - f1 = -1.0 / k_singular_sin_epsilon; + f1 = -1.0E12; } else { f1 = -1.0 / f1; } diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp index 6e4faa60..af73a9cc 100644 --- a/src/core/cpu/src/cpu_improper2_force.cpp +++ b/src/core/cpu/src/cpu_improper2_force.cpp @@ -79,8 +79,8 @@ double calc_improper2_forces(int start, int end) { // Forces f1 = sin(phi); - if (std::fabs(f1) < k_singular_sin_epsilon) { - f1 = std::copysign(k_singular_sin_epsilon, f1); + if (std::fabs(f1) < 1E-12) { + f1 = 1E-12; } f1 = -1 / f1; diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp index 5116dbbb..9d0e4711 100644 --- a/src/core/cpu/src/cpu_polx_water_force.cpp +++ b/src/core/cpu/src/cpu_polx_water_force.cpp @@ -158,8 +158,8 @@ void calc_polx_w_forces(int iteration) { cos_th = -1; } f0 = sin(acos(cos_th)); - if (fabs(f0) < k_singular_sin_epsilon) { - f0 = k_singular_sin_epsilon; + if (fabs(f0) < 1.0E-12) { + f0 = 1.0E-12; } f0 = -1.0 / f0; f0 *= dv; diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp index 14aa802c..c9c2ea65 100644 --- a/src/core/cpu/src/cpu_q_angle_force.cpp +++ b/src/core/cpu/src/cpu_q_angle_force.cpp @@ -56,8 +56,8 @@ void calc_qangle_forces(int state) { dv = ctx.q_cangles[ic].kth * dth * lambdas[state]; f1 = sin(th); - if (fabs(f1) < k_singular_sin_epsilon) { - f1 = k_singular_sin_epsilon; + if (abs(f1) < 1E-12) { + f1 = 1E-12; } f1 = -1.0 / f1; diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp 
b/src/core/cpu/src/cpu_q_torsion_force.cpp index 7b7fb271..be309347 100644 --- a/src/core/cpu/src/cpu_q_torsion_force.cpp +++ b/src/core/cpu/src/cpu_q_torsion_force.cpp @@ -76,8 +76,8 @@ void calc_qtorsion_forces(int state) { // Forces f1 = sin(phi); - if (fabs(f1) < k_singular_sin_epsilon) { - f1 = copysign(k_singular_sin_epsilon, f1); + if (abs(f1) < 1E-12) { + f1 = 1E-12; } f1 = -1 / f1; diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp index 84f593b0..d809a9c1 100644 --- a/src/core/cpu/src/cpu_restrang_force.cpp +++ b/src/core/cpu/src/cpu_restrang_force.cpp @@ -61,8 +61,8 @@ void calc_restrang_forces() { dv = lambda * restrangs[ir].k * dth; f1 = sin(th); - if (fabs(f1) < k_singular_sin_epsilon) { - f1 = -1.0 / k_singular_sin_epsilon; + if (fabs(f1) < 1E-12) { + f1 = -1E-12; } else { f1 = -1 / f1; } diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp index 4ebb44b2..e8aaa2a3 100644 --- a/src/core/cpu/src/cpu_torsion_force.cpp +++ b/src/core/cpu/src/cpu_torsion_force.cpp @@ -88,8 +88,8 @@ double calc_torsion_forces(int start, int end) { // Forces f1 = sin(phi); - if (std::fabs(f1) < k_singular_sin_epsilon) { - f1 = std::copysign(k_singular_sin_epsilon, f1); + if (std::fabs(f1) < 1E-12) { + f1 = 1E-12; } f1 = -1 / f1; diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu index f20b039a..dcd044ce 100644 --- a/src/core/cuda/src/cuda_angle_force.cu +++ b/src/core/cuda/src/cuda_angle_force.cu @@ -39,8 +39,8 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co double dv = cang.kth * dtheta; double f1 = sin(theta); - if (fabs(f1) < k_singular_sin_epsilon) { - f1 = -1.0 / k_singular_sin_epsilon; + if (fabs(f1) < 1e-12) { + f1 = -1.0e12; } else { f1 = -1.0 / f1; } diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu index dd7d91aa..78707b12 100644 --- 
a/src/core/cuda/src/cuda_improper2_force.cu +++ b/src/core/cuda/src/cuda_improper2_force.cu @@ -76,7 +76,7 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp // Forces f1 = sin(phi); - if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1); + if (fabs(f1) < 1E-12) f1 = 1E-12; f1 = -1 / f1; // printf("f1 = %f phi = %f cos_phi = %f\n", f1, phi, cos_phi); diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu index 32b4077a..ce3f73ae 100644 --- a/src/core/cuda/src/cuda_nonbonded_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_force.cu @@ -9,19 +9,18 @@ namespace CudaNonbondedForce { bool is_initialized = false; double *d_evdw_total, *d_ecoul_total; -template struct nonbond_vec_t { - WorkT x; - WorkT y; - WorkT z; + nonbond_work_t x; + nonbond_work_t y; + nonbond_work_t z; }; -__device__ __forceinline__ float nonbond_rsqrt(float value) { +__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) { +#ifdef QDYN_SPFP return rsqrtf(value); -} - -__device__ __forceinline__ double nonbond_rsqrt(double value) { +#else return rsqrt(value); +#endif } __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { @@ -54,7 +53,6 @@ __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned m return v; } -template __device__ void calculate_unforce_bound( const coord_t& x, const coord_t& y, @@ -62,20 +60,20 @@ __device__ void calculate_unforce_bound( const real_t charge_product, const vdw_pair_param_t& pair_param, - const WorkT coulomb_constant, + const nonbond_work_t coulomb_constant, - const WorkT scaling, - const WorkT lambda, + const nonbond_work_t scaling, + const nonbond_work_t lambda, - WorkT& evdw, - WorkT& ecoul, - WorkT& dv) { - const WorkT dx = static_cast(x.x - y.x); - const WorkT dy = static_cast(x.y - y.y); - const WorkT dz = static_cast(x.z - y.z); - const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); - const 
WorkT r2 = r * r; - const WorkT r6 = r2 * r2 * r2; + nonbond_work_t& evdw, + nonbond_work_t& ecoul, + nonbond_work_t& dv) { + const nonbond_work_t dx = static_cast(x.x - y.x); + const nonbond_work_t dy = static_cast(x.y - y.y); + const nonbond_work_t dz = static_cast(x.z - y.z); + const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); + const nonbond_work_t r2 = r * r; + const nonbond_work_t r6 = r2 * r2 * r2; // double v_a = r6 * r6; // double v_b = r6; // ecoul = r; @@ -84,13 +82,12 @@ __device__ void calculate_unforce_bound( ecoul = scaling * coulomb_constant * charge_product * r * lambda; - const WorkT v_a = static_cast(pair_param.a) * r6 * r6 * lambda; - const WorkT v_b = static_cast(pair_param.b) * r6 * lambda; + const nonbond_work_t v_a = static_cast(pair_param.a) * r6 * r6 * lambda; + const nonbond_work_t v_b = static_cast(pair_param.b) * r6 * lambda; evdw = v_a - v_b; - dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); + dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); } -template __global__ void calc_nonbonded_force_kernel( const int nx, const int ny, @@ -177,8 +174,8 @@ __global__ void calc_nonbonded_force_kernel( int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1; int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1; - nonbond_vec_t x_force = {0.0, 0.0, 0.0}; - nonbond_vec_t y_force = {0.0, 0.0, 0.0}; + nonbond_vec_t x_force = {0.0, 0.0, 0.0}; + nonbond_vec_t y_force = {0.0, 0.0, 0.0}; double evdw_sum = 0.0; double ecoul_sum = 0.0; @@ -233,14 +230,14 @@ __global__ void calc_nonbonded_force_kernel( } } - const WorkT kernel_lambda = static_cast(lambda); - const WorkT coulomb_constant = static_cast(d_topo.coulomb_constant); + const nonbond_work_t kernel_lambda = static_cast(lambda); + const nonbond_work_t coulomb_constant = static_cast(d_topo.coulomb_constant); const int charge_pair_row = x_charge_type_idx * n_charge_types; const int pair_row = (x_catype_type_idx >= 0) ? 
x_catype_type_idx * n_catype_types : 0; for (int i = 0; i < 32; i++) { if (is_valid()) { - WorkT scaling = static_cast(1.0); + nonbond_work_t scaling = static_cast(1.0); real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx]; vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx]; @@ -252,7 +249,7 @@ __global__ void calc_nonbonded_force_kernel( // } // } - WorkT evdw = 0, ecoul = 0, dv = 0; + nonbond_work_t evdw = 0, ecoul = 0, dv = 0; calculate_unforce_bound( x_coord, @@ -269,9 +266,9 @@ __global__ void calc_nonbonded_force_kernel( evdw_sum += evdw; ecoul_sum += ecoul; - const WorkT dx = static_cast(x_coord.x - y_coord.x); - const WorkT dy = static_cast(x_coord.y - y_coord.y); - const WorkT dz = static_cast(x_coord.z - y_coord.z); + const nonbond_work_t dx = static_cast(x_coord.x - y_coord.x); + const nonbond_work_t dy = static_cast(x_coord.y - y_coord.y); + const nonbond_work_t dz = static_cast(x_coord.z - y_coord.z); y_force.x -= dv * dx; y_force.y -= dv * dy; y_force.z -= dv * dz; @@ -337,39 +334,34 @@ std::pair calc_nonbonded_force_host( cudaMemset(d_ecoul_total, 0, sizeof(double)); cudaMemset(d_evdw_total, 0, sizeof(double)); - auto launch_kernel = [&](auto work_tag) { - using WorkT = decltype(work_tag); - calc_nonbonded_force_kernel<<>>( - nx, - ny, - x_charges_types, - y_charges_types, - host.charge_pair_products->gpu_data_p, - x_atypes_types, - y_atypes_types, - host.catype_pair_params->gpu_data_p, - host.topo, - host.excluded->gpu_data_p, - host.LJ_matrix->gpu_data_p, - x_idx_list, - y_idx_list, - host.coords->gpu_data_p, - host.dvelocities->gpu_data_p, - d_evdw_total, - d_ecoul_total, - symmetric, - disable_water_h_lj, - host.n_atoms_solute, - host.n_charge_types, - host.zero_charge_type, - host.n_catype_types, - host.zero_catype_type, - host.n_qelscales, - lambda, - host.q_elscales->gpu_data_p); - }; - - launch_kernel(nonbond_work_t{}); + calc_nonbonded_force_kernel<<>>( + nx, + ny, + x_charges_types, + 
y_charges_types, + host.charge_pair_products->gpu_data_p, + x_atypes_types, + y_atypes_types, + host.catype_pair_params->gpu_data_p, + host.topo, + host.excluded->gpu_data_p, + host.LJ_matrix->gpu_data_p, + x_idx_list, + y_idx_list, + host.coords->gpu_data_p, + host.dvelocities->gpu_data_p, + d_evdw_total, + d_ecoul_total, + symmetric, + disable_water_h_lj, + host.n_atoms_solute, + host.n_charge_types, + host.zero_charge_type, + host.n_catype_types, + host.zero_catype_type, + host.n_qelscales, + lambda, + host.q_elscales->gpu_data_p); cudaDeviceSynchronize(); diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu index 7be0656f..13c37fbc 100644 --- a/src/core/cuda/src/cuda_polx_water_force.cu +++ b/src/core/cuda/src/cuda_polx_water_force.cu @@ -136,7 +136,7 @@ __global__ void calc_polx_water_forces_kernel( if (cos_th > 1) cos_th = 1; if (cos_th < -1) cos_th = -1; f0 = sin(acos(cos_th)); - if (abs(f0) < k_singular_sin_epsilon) f0 = k_singular_sin_epsilon; + if (abs(f0) < 1.0E-12) f0 = 1.0E-12; f0 = -1.0 / f0; f0 *= dv; diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu index 567a78df..b214aee9 100644 --- a/src/core/cuda/src/cuda_restrang_force.cu +++ b/src/core/cuda/src/cuda_restrang_force.cu @@ -64,8 +64,8 @@ __global__ void calc_restrang_force_kernel( dv = lambda * restrangs[ir].k * dth; f1 = sin(th); - if (fabs(f1) < k_singular_sin_epsilon) { - f1 = -1.0 / k_singular_sin_epsilon; + if (fabs(f1) < 1E-12) { + f1 = -1E-12; } else { f1 = -1 / f1; } diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu index 5baffbde..97b687a6 100644 --- a/src/core/cuda/src/cuda_torsion_force.cu +++ b/src/core/cuda/src/cuda_torsion_force.cu @@ -76,7 +76,7 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio // Forces f1 = sin(phi); - if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1); + if 
(fabs(f1) < 1E-12) f1 = 1E-12; f1 = -1 / f1; di.x = f1 * (rnk.x * (bjinv * bkinv) - cos_phi * rnj.x * bj2inv); From 3c39af706d38ed50b490a7c269ee3379ce3e25fd Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Thu, 30 Apr 2026 16:12:34 +0200 Subject: [PATCH 20/20] revert all --- src/core/Makefile | 32 ++---- src/core/common/include/context.h | 2 +- src/core/common/include/md_types.h | 28 +++--- src/core/common/include/vdw_rules.h | 26 +++-- src/core/common/src/handler.cpp | 5 - src/core/common/src/init.cpp | 12 +-- src/core/cpu/src/cpu_nonbonded_pp_force.cpp | 21 ++-- src/core/cpu/src/cpu_nonbonded_pw_force.cpp | 28 +++--- src/core/cpu/src/cpu_nonbonded_qp_force.cpp | 24 ++--- src/core/cpu/src/cpu_nonbonded_qq_force.cpp | 23 ++--- src/core/cpu/src/cpu_nonbonded_qw_force.cpp | 47 +++++---- src/core/cpu/src/cpu_nonbonded_ww_force.cpp | 49 +++++---- src/core/cuda/src/cuda_angle_force.cu | 12 +-- src/core/cuda/src/cuda_improper2_force.cu | 4 +- src/core/cuda/src/cuda_leapfrog.cu | 35 +++---- src/core/cuda/src/cuda_nonbonded_14_force.cu | 94 +++++++---------- src/core/cuda/src/cuda_nonbonded_force.cu | 100 +++++++------------ src/core/cuda/src/cuda_polx_water_force.cu | 13 ++- src/core/cuda/src/cuda_pshell_force.cu | 2 +- src/core/cuda/src/cuda_radix_water_force.cu | 9 +- src/core/cuda/src/cuda_restrang_force.cu | 6 +- src/core/cuda/src/cuda_restrdis_force.cu | 4 +- src/core/cuda/src/cuda_restrpos_force.cu | 6 +- src/core/cuda/src/cuda_restrseq_force.cu | 6 +- src/core/cuda/src/cuda_restrwall_force.cu | 4 +- src/core/cuda/src/cuda_shake_constraints.cu | 5 +- src/core/cuda/src/cuda_temperature.cu | 5 +- src/core/cuda/src/cuda_torsion_force.cu | 4 +- 28 files changed, 261 insertions(+), 345 deletions(-) diff --git a/src/core/Makefile b/src/core/Makefile index 6acc5da1..367be528 100644 --- a/src/core/Makefile +++ b/src/core/Makefile @@ -1,44 +1,30 @@ CC = nvcc -SPFPFLAGS = -ifeq ($(QDYN_SPFP),1) -SPFPFLAGS += -DQDYN_SPFP -endif -CFLAGS = -O3 -std=c++17 -arch=sm_86 $(SPFPFLAGS) 
-I./cuda/include -I./common/include -I./cpu/include -I. +CFLAGS = -O3 -std=c++17 -arch=sm_89 -I./cuda/include -I./common/include -I./cpu/include -I. DEPFLAGS = -MMD -MF $(@:.o=.d) -BUILD_MODE = $(if $(filter 1,$(QDYN_SPFP)),spfp,dpfp) -OBJDIR = .build/$(BUILD_MODE) -TARGET = $(OBJDIR)/qdyn # collect all .cu files except main.cu SRCS = $(filter-out main.cu, $(wildcard *.cu cuda/src/*.cu)) CPPSRCS = $(wildcard common/*.cpp common/src/*.cpp cpu/*.cpp cpu/src/*.cpp) -MAIN_OBJ = $(OBJDIR)/main.o -OBJS = $(addprefix $(OBJDIR)/,$(SRCS:.cu=.o)) $(addprefix $(OBJDIR)/,$(CPPSRCS:.cpp=.o)) +MAIN_OBJ = main.o +OBJS = $(SRCS:.cu=.o) $(CPPSRCS:.cpp=.o) DEPS = $(MAIN_OBJ:.o=.d) $(OBJS:.o=.d) all: qdyn move -qdyn: $(TARGET) - cp $< $@ - -$(TARGET): $(MAIN_OBJ) $(OBJS) +qdyn: $(MAIN_OBJ) $(OBJS) $(CC) $(CFLAGS) -o $@ $^ -$(OBJDIR)/%.o: %.cu - mkdir -p $(@D) +%.o: %.cu $(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@ -$(OBJDIR)/%.o: %.cpp - mkdir -p $(@D) +%.o: %.cpp $(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@ clean: - rm -rf .build qdyn ../../bin/qdyn + rm -f *.o *.d cuda/src/*.o cuda/src/*.d common/*.o common/*.d common/src/*.o common/src/*.d cpu/*.o cpu/*.d cpu/src/*.o cpu/src/*.d ../../bin/qdyn -move: $(TARGET) +move: mkdir -p ../../bin - cp $< ../../bin/qdyn - -.PHONY: all qdyn clean move + mv qdyn ../../bin/ -include $(DEPS) diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h index 83817bb8..c77a2c91 100644 --- a/src/core/common/include/context.h +++ b/src/core/common/include/context.h @@ -187,7 +187,7 @@ class Context { std::unique_ptr> p_atoms_list; std::unique_ptr> w_atoms_list; std::unique_ptr> q_atoms_list; - std::unique_ptr> charge_pair_products; + std::unique_ptr> charge_pair_products; std::unique_ptr> p_charge_types; std::unique_ptr> w_charge_types; std::unique_ptr> q_charge_types; diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h index 6a4d2865..60f1f56a 100644 --- a/src/core/common/include/md_types.h +++ 
b/src/core/common/include/md_types.h @@ -2,8 +2,6 @@ #include #include - -#include "common/include/precision.h" /* ============================================= * == FROM MD FILE * ============================================= @@ -49,9 +47,9 @@ struct md_t { }; struct coord_t { - real_t x; - real_t y; - real_t z; + double x; + double y; + double z; }; struct bond_t { @@ -116,7 +114,7 @@ struct charge_t { struct ccharge_t { int code; - real_t charge; + double charge; }; struct atype_t { @@ -127,17 +125,17 @@ struct atype_t { struct catype_t { int code; double m; - real_t aii_normal; - real_t bii_normal; + double aii_normal; + double bii_normal; // double aii_polar; // double bii_polar; - real_t aii_1_4; - real_t bii_1_4; + double aii_1_4; + double bii_1_4; }; struct vdw_pair_param_t { - real_t a; - real_t b; + double a; + double b; }; struct topo_t { @@ -304,9 +302,9 @@ struct shake_bond_t { */ struct vel_t { - real_t x; - real_t y; - real_t z; + double x; + double y; + double z; }; struct dvel_t { diff --git a/src/core/common/include/vdw_rules.h b/src/core/common/include/vdw_rules.h index 5b8e8604..ca7bd762 100644 --- a/src/core/common/include/vdw_rules.h +++ b/src/core/common/include/vdw_rules.h @@ -4,10 +4,15 @@ #include -template + +// Geometric rule: A_ij = sqrt(A_i) * sqrt(A_j), B_ij = sqrt(B_i) * sqrt(B_j) +// Energy: V = A_ij * r^-12 - B_ij * r^-6 +// Parameters: ai_aii, aj_aii are sqrt(A_i), sqrt(A_j) +// ai_bii, aj_bii are sqrt(B_i), sqrt(B_j) +// r6 is 1/r^6 __device__ __host__ inline void calc_vdw_geometric( - Real ai_aii, Real aj_aii, Real ai_bii, Real aj_bii, - Real r6, Real* V_a, Real* V_b) { + double ai_aii, double aj_aii, double ai_bii, double aj_bii, + double r6, double* V_a, double* V_b) { *V_a = r6 * r6 * ai_aii * aj_aii; *V_b = r6 * ai_bii * aj_bii; } @@ -19,17 +24,16 @@ __device__ __host__ inline void calc_vdw_geometric( // ai_aii, aj_aii store R*_i, R*_j (vdW radius) // ai_bii, aj_bii store sqrt(eps_i), sqrt(eps_j) (after preprocessing) // r6 
is 1/r^6 -template __device__ __host__ inline void calc_vdw_arithmetic( - Real Rstar_i, Real Rstar_j, Real sqrt_eps_i, Real sqrt_eps_j, - Real r6, Real* V_a, Real* V_b) { - Real Rstar_ij = Rstar_i + Rstar_j; // Arithmetic combination - Real sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j; // Geometric combination (already sqrt) + double Rstar_i, double Rstar_j, double sqrt_eps_i, double sqrt_eps_j, + double r6, double* V_a, double* V_b) { + double Rstar_ij = Rstar_i + Rstar_j; // Arithmetic combination + double sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j; // Geometric combination (already sqrt) // Compute R6 = (R*_ij)^6 - Real R2 = Rstar_ij * Rstar_ij; - Real R6 = R2 * R2 * R2; + double R2 = Rstar_ij * Rstar_ij; + double R6 = R2 * R2 * R2; *V_a = sqrt_eps_ij * R6 * R6 * r6 * r6; // sqrt(eps_i * eps_j) * R^12 * r^-12 - *V_b = static_cast(2.0) * sqrt_eps_ij * R6 * r6; // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6 + *V_b = 2.0 * sqrt_eps_ij * R6 * r6; // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6 } diff --git a/src/core/common/src/handler.cpp b/src/core/common/src/handler.cpp index b462b2c7..3fdd1341 100644 --- a/src/core/common/src/handler.cpp +++ b/src/core/common/src/handler.cpp @@ -88,11 +88,6 @@ void Handler::update_energy_totals() { } void Handler::print_outputs(int iteration) { - auto& host = Context::instance(); - if (host.run_gpu && host.md.trajectory != 0 && iteration % host.md.trajectory == 0) { - host.coords->download(); - host.velocities->download(); - } print_energies(); write_coords(iteration); write_velocities(iteration); diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp index 499c01cb..dc519a9f 100644 --- a/src/core/common/src/init.cpp +++ b/src/core/common/src/init.cpp @@ -77,11 +77,9 @@ void initialize_catype_tables() { const catype_t& cj = h_catype_table_all[j]; vdw_pair_param_t pair_param = {}; if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { - calc_vdw_geometric( - ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast(1.0), &pair_param.a, 
&pair_param.b); + calc_vdw_geometric(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b); } else { - calc_vdw_arithmetic( - ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast(1.0), &pair_param.a, &pair_param.b); + calc_vdw_arithmetic(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b); } h_catype_pair_params[i * ctx.n_catype_types + j] = pair_param; } @@ -170,11 +168,10 @@ void initialize_charge_tables() { ctx.zero_charge_type = add_charge(0.0); ctx.n_charge_types = static_cast(h_charge_table_all.size()); - std::vector h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types); + std::vector h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types); for (int i = 0; i < ctx.n_charge_types; i++) { for (int j = 0; j < ctx.n_charge_types; j++) { - h_charge_pair_products[i * ctx.n_charge_types + j] = - static_cast(h_charge_table_all[i].charge * h_charge_table_all[j].charge); + h_charge_pair_products[i * ctx.n_charge_types + j] = h_charge_table_all[i].charge * h_charge_table_all[j].charge; } } @@ -916,3 +913,4 @@ void write_headers() { write_header("velocities.csv"); write_energy_header(); } + diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp index 390c67eb..ce744ad0 100644 --- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp @@ -15,10 +15,11 @@ void calc_nonbonded_pp_forces() { bool bond14, bond23; double scaling; coord_t da; - real_t r2a, ra, r6a; - real_t V_a, V_b; - real_t crg_i, crg_j; - real_t ai_aii, aj_aii, ai_bii, aj_bii; + double r2a, ra, r6a; + double Vela, V_a, V_b; + double dva; + double crg_i, crg_j; + double ai_aii, aj_aii, ai_bii, aj_bii; int i, j; for (int pi = 0; pi < ctx.n_patoms; pi++) { for (int pj = pi + 1; pj < ctx.n_patoms; pj++) { @@ -41,11 +42,11 @@ void calc_nonbonded_pp_forces() { da.x = coords[j].x - coords[i].x; da.y = 
coords[j].y - coords[i].y; da.z = coords[j].z - coords[i].z; - r2a = static_cast(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z); - ra = static_cast(std::sqrt(r2a)); + r2a = 1 / (std::pow(da.x, 2) + std::pow(da.y, 2) + std::pow(da.z, 2)); + ra = sqrt(r2a); r6a = r2a * r2a * r2a; - const real_t Vela = static_cast(scaling * ctx.topo.coulomb_constant) * crg_i * crg_j * ra; + Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra; ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal; aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; @@ -57,7 +58,7 @@ void calc_nonbonded_pp_forces() { } else { calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b); } - const real_t dva = r2a * (-Vela - static_cast(12.0) * V_a + static_cast(6.0) * V_b); + dva = r2a * (-Vela - 12 * V_a + 6 * V_b); dvelocities[i].x -= dva * da.x; dvelocities[i].y -= dva * da.y; @@ -67,8 +68,8 @@ void calc_nonbonded_pp_forces() { dvelocities[j].y += dva * da.y; dvelocities[j].z += dva * da.z; - ctx.E_nonbond_pp.Ucoul += static_cast(Vela); - ctx.E_nonbond_pp.Uvdw += static_cast(V_a - V_b); + ctx.E_nonbond_pp.Ucoul += Vela; + ctx.E_nonbond_pp.Uvdw += (V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp index 030c1290..6bf2c27e 100644 --- a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp @@ -22,21 +22,21 @@ void calc_nonbonded_pw_forces() { continue; } - const real_t qi = ctx.unified_ccharge(atom_i, 0).charge; - const real_t qj = ctx.unified_ccharge(atom_j, 0).charge; + const double qi = ctx.unified_ccharge(atom_i, 0).charge; + const double qj = ctx.unified_ccharge(atom_j, 0).charge; const catype_t& atom_i_type = ctx.unified_catype(atom_i, 0); const catype_t& atom_j_type = ctx.unified_catype(atom_j, 0); - real_t v_a = 0.0; - real_t v_b = 0.0; - const real_t dx = coords[atom_j].x - coords[atom_i].x; - const real_t dy = coords[atom_j].y - coords[atom_i].y; - const 
real_t dz = coords[atom_j].z - coords[atom_i].z; - const real_t r2inv = static_cast(1.0) / (dx * dx + dy * dy + dz * dz); - const real_t rinv = static_cast(std::sqrt(r2inv)); - const real_t r6inv = r2inv * r2inv * r2inv; - const real_t ecoul = static_cast(ctx.topo.coulomb_constant) * qi * qj * rinv; + double v_a = 0.0; + double v_b = 0.0; + const double dx = coords[atom_j].x - coords[atom_i].x; + const double dy = coords[atom_j].y - coords[atom_i].y; + const double dz = coords[atom_j].z - coords[atom_i].z; + const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz); + const double rinv = std::sqrt(r2inv); + const double r6inv = r2inv * r2inv * r2inv; + const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv; if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { calc_vdw_geometric(atom_i_type.aii_normal, @@ -56,7 +56,7 @@ void calc_nonbonded_pw_forces() { &v_b); } - const real_t scale = r2inv * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); + const double scale = r2inv * (-ecoul - 12.0 * v_a + 6.0 * v_b); dvelocities[atom_i].x -= scale * dx; dvelocities[atom_i].y -= scale * dy; @@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() { dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - ctx.E_nonbond_pw.Ucoul += static_cast(ecoul); - ctx.E_nonbond_pw.Uvdw += static_cast(v_a - v_b); + ctx.E_nonbond_pw.Ucoul += ecoul; + ctx.E_nonbond_pw.Uvdw += (v_a - v_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp index 7a81a516..65a74a6c 100644 --- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp @@ -15,11 +15,10 @@ void calc_nonbonded_qp_forces() { auto *excluded = ctx.excluded->cpu_data_p; int i, j; coord_t da; - real_t r2, r; - real_t ai_aii, aj_aii, ai_bii, aj_bii; + double r2, r6, r; + double ai_aii, aj_aii, ai_bii, aj_bii; bool bond23, bond14; - double scaling; - real_t Vel, V_a, V_b, dv; + double scaling, Vel, V_a, V_b, dv; for (int qi = 
0; qi < ctx.n_qatoms; qi++) { for (int pj = 0; pj < ctx.n_patoms; pj++) { @@ -38,10 +37,12 @@ void calc_nonbonded_qp_forces() { da.y = coords[j].y - coords[i].y; da.z = coords[j].z - coords[i].z; - r2 = da.x * da.x + da.y * da.y + da.z * da.z; - r2 = static_cast(1.0) / r2; - r = static_cast(std::sqrt(r2)); - const real_t r6inv = r2 * r2 * r2; // 1/r^6 for vdW calculation + r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2); + + r6 = r2 * r2 * r2; + r2 = 1 / r2; + r = sqrt(r2); + double r6inv = r2 * r2 * r2; // 1/r^6 for vdW calculation for (int state = 0; state < ctx.n_lambdas; state++) { const catype_t& qi_type = ctx.unified_catype(i, state); @@ -52,8 +53,7 @@ void calc_nonbonded_qp_forces() { ai_bii = bond14 ? qi_type.bii_1_4 : qi_type.bii_normal; aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; - Vel = static_cast(ctx.topo.coulomb_constant * scaling) * - ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r; + Vel = ctx.topo.coulomb_constant * scaling * ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r; if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { calc_vdw_geometric(ai_aii, aj_aii, ai_bii, aj_bii, r6inv, &V_a, &V_b); } else { @@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() { dvelocities[j].z += dv * da.z; // Update Q totals - ctx.EQ_nonbond_qp[state].Ucoul += static_cast(Vel); - ctx.EQ_nonbond_qp[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qp[state].Ucoul += Vel; + ctx.EQ_nonbond_qp[state].Uvdw += (V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp index 006a3c0e..2b062d48 100644 --- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp @@ -15,14 +15,14 @@ void calc_nonbonded_qq_forces() { auto *excluded = ctx.excluded->cpu_data_p; auto *q_elscales = ctx.q_elscales->cpu_data_p; int ai, aj; - real_t crg_i, crg_j; + double crg_i, crg_j; double elscale, scaling; bool bond23, bond14; 
coord_t da; - real_t r2a, ra, r6a; - real_t Vela, V_a, V_b; - real_t dva; - real_t ai_aii, aj_aii, ai_bii, aj_bii; + double r2a, ra, r6a; + double Vela, V_a, V_b; + double dva; + double ai_aii, aj_aii, ai_bii, aj_bii; for (int state = 0; state < ctx.n_lambdas; state++) { for (int qi = 0; qi < ctx.n_qatoms; qi++) { @@ -54,11 +54,11 @@ void calc_nonbonded_qq_forces() { da.x = coords[aj].x - coords[ai].x; da.y = coords[aj].y - coords[ai].y; da.z = coords[aj].z - coords[ai].z; - r2a = static_cast(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z); - ra = static_cast(std::sqrt(r2a)); + r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2)); + ra = sqrt(r2a); r6a = r2a * r2a * r2a; - Vela = static_cast(scaling * ctx.topo.coulomb_constant * elscale) * crg_i * crg_j * ra; + Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra * elscale; ai_aii = bond14 ? qi_type.aii_1_4 : qi_type.aii_normal; aj_aii = bond14 ? qj_type.aii_1_4 : qj_type.aii_normal; @@ -70,8 +70,7 @@ void calc_nonbonded_qq_forces() { } else { calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b); } - dva = r2a * (-Vela - static_cast(12.0) * V_a + static_cast(6.0) * V_b) * - static_cast(lambdas[state]); + dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * lambdas[state]; dvelocities[ai].x -= dva * da.x; dvelocities[ai].y -= dva * da.y; @@ -81,8 +80,8 @@ void calc_nonbonded_qq_forces() { dvelocities[aj].y += dva * da.y; dvelocities[aj].z += dva * da.z; - ctx.EQ_nonbond_qq[state].Ucoul += static_cast(Vela); - ctx.EQ_nonbond_qq[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qq[state].Ucoul += Vela; + ctx.EQ_nonbond_qq[state].Uvdw += (V_a - V_b); } } } diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp index 8d18bc55..17530a16 100644 --- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp @@ -13,17 +13,17 @@ void calc_nonbonded_qw_forces() { auto *excluded = ctx.excluded->cpu_data_p; int i; 
coord_t dO, dH1, dH2; - real_t r2O, rH1, rH2, rO, r2H1, r2H2; - real_t dvO, dvH1, dvH2; - real_t V_a, V_b, VelO, VelH1, VelH2; - real_t ai_aii, ai_bii; + double r2O, rH1, rH2, r6O, rO, r2H1, r2H2; + double dvO, dvH1, dvH2; + double V_a, V_b, VelO, VelH1, VelH2; + double ai_aii, ai_bii; // Loop over O-atoms, q-atoms for (int j = ctx.n_atoms_solute; j < ctx.n_atoms; j += 3) { const catype_t& ow_type = ctx.unified_catype(j, 0); - const real_t ow_charge = ctx.unified_ccharge(j, 0).charge; - const real_t hw1_charge = ctx.unified_ccharge(j + 1, 0).charge; - const real_t hw2_charge = ctx.unified_ccharge(j + 2, 0).charge; + const double ow_charge = ctx.unified_ccharge(j, 0).charge; + const double hw1_charge = ctx.unified_ccharge(j + 1, 0).charge; + const double hw2_charge = ctx.unified_ccharge(j + 2, 0).charge; for (int qi = 0; qi < ctx.n_qatoms; qi++) { i = ctx.q_atoms[qi]; if (excluded[i] || excluded[j]) continue; @@ -36,12 +36,13 @@ void calc_nonbonded_qw_forces() { dH2.x = coords[j + 2].x - coords[i].x; dH2.y = coords[j + 2].y - coords[i].y; dH2.z = coords[j + 2].z - coords[i].z; - r2O = dO.x * dO.x + dO.y * dO.y + dO.z * dO.z; - rH1 = static_cast(std::sqrt(static_cast(1.0) / (dH1.x * dH1.x + dH1.y * dH1.y + dH1.z * dH1.z))); - rH2 = static_cast(std::sqrt(static_cast(1.0) / (dH2.x * dH2.x + dH2.y * dH2.y + dH2.z * dH2.z))); - r2O = static_cast(1.0) / r2O; - rO = static_cast(std::sqrt(r2O)); - const real_t r6Oinv = r2O * r2O * r2O; // 1/r^6 for vdW calculation + r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2); + rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2))); + rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2))); + r6O = r2O * r2O * r2O; + r2O = 1.0 / r2O; + rO = sqrt(r2O); + double r6Oinv = r2O * r2O * r2O; // 1/r^6 for vdW calculation r2H1 = rH1 * rH1; r2H2 = rH2 * rH2; @@ -62,21 +63,19 @@ void calc_nonbonded_qw_forces() { calc_vdw_arithmetic(ai_aii, ow_type.aii_normal, ai_bii, ow_type.bii_normal, r6Oinv, &V_a, &V_b); } - const real_t 
q_charge = ctx.unified_ccharge(i, state).charge; - const real_t coulomb_constant = static_cast(ctx.topo.coulomb_constant); - VelO = coulomb_constant * ow_charge * q_charge * rO; - VelH1 = coulomb_constant * hw1_charge * q_charge * rH1; - VelH2 = coulomb_constant * hw2_charge * q_charge * rH2; + const double q_charge = ctx.unified_ccharge(i, state).charge; + VelO = ctx.topo.coulomb_constant * ow_charge * q_charge * rO; + VelH1 = ctx.topo.coulomb_constant * hw1_charge * q_charge * rH1; + VelH2 = ctx.topo.coulomb_constant * hw2_charge * q_charge * rH2; // if (state == 0 && qi == 1) printf("j = %d ai__aii = %f A_O = %f B_O = %f V_a = %f V_b = %f r6O = %f\n", j, ai_aii, A_O, B_O, V_a, V_b, r6O); - const real_t lambda = static_cast(lambdas[state]); - dvO += r2O * (-VelO - (static_cast(12.0) * V_a - static_cast(6.0) * V_b)) * lambda; - dvH1 -= r2H1 * VelH1 * lambda; - dvH2 -= r2H2 * VelH2 * lambda; + dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * lambdas[state]; + dvH1 -= r2H1 * VelH1 * lambdas[state]; + dvH2 -= r2H2 * VelH2 * lambdas[state]; - ctx.EQ_nonbond_qw[state].Ucoul += static_cast(VelO + VelH1 + VelH2); - ctx.EQ_nonbond_qw[state].Uvdw += static_cast(V_a - V_b); + ctx.EQ_nonbond_qw[state].Ucoul += (VelO + VelH1 + VelH2); + ctx.EQ_nonbond_qw[state].Uvdw += (V_a - V_b); } // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!! 
diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp index 3be5e6f0..505dd45a 100644 --- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp +++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp @@ -1,21 +1,18 @@ #include "cpu_nonbonded_ww_force.h" -#include - #include "constants.h" #include "context.h" #include "vdw_rules.h" namespace { -void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, real_t* vdw_a, real_t* vdw_b) { +void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, double* vdw_a, double* vdw_b) { const catype_t& oi_type = ctx.unified_catype(oxygen_i, 0); const catype_t& oj_type = ctx.unified_catype(oxygen_j, 0); if (ctx.topo.vdw_rule == VDW_GEOMETRIC) { *vdw_a = oi_type.aii_normal * oj_type.aii_normal; *vdw_b = oi_type.bii_normal * oj_type.bii_normal; } else { - calc_vdw_arithmetic( - oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, oj_type.bii_normal, static_cast(1.0), vdw_a, vdw_b); + calc_vdw_arithmetic(oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, oj_type.bii_normal, 1.0, vdw_a, vdw_b); } } } // namespace @@ -23,33 +20,33 @@ void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, void accumulate_pair_force(Context& ctx, int atom_i, int atom_j, - real_t qi, - real_t qj, + double qi, + double qj, bool include_vdw, - real_t vdw_a, - real_t vdw_b, + double vdw_a, + double vdw_b, E_nonbonded_t& energy) { auto &coords = ctx.coords->cpu_data_p; auto &dvelocities = ctx.dvelocities->cpu_data_p; - const real_t dx = coords[atom_j].x - coords[atom_i].x; - const real_t dy = coords[atom_j].y - coords[atom_i].y; - const real_t dz = coords[atom_j].z - coords[atom_i].z; + const double dx = coords[atom_j].x - coords[atom_i].x; + const double dy = coords[atom_j].y - coords[atom_i].y; + const double dz = coords[atom_j].z - coords[atom_i].z; - const real_t r2inv = static_cast(1.0) / (dx * dx + dy * dy + dz * dz); - const real_t 
rinv = static_cast(std::sqrt(r2inv)); - const real_t ecoul = static_cast(ctx.topo.coulomb_constant) * qi * qj * rinv; + const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz); + const double rinv = std::sqrt(r2inv); + const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv; - real_t evdw = 0.0; - real_t dva = -ecoul; + double evdw = 0.0; + double dva = -ecoul; if (include_vdw) { - const real_t r6inv = r2inv * r2inv * r2inv; - const real_t v_a = vdw_a * r6inv * r6inv; - const real_t v_b = vdw_b * r6inv; + const double r6inv = r2inv * r2inv * r2inv; + const double v_a = vdw_a * r6inv * r6inv; + const double v_b = vdw_b * r6inv; evdw = v_a - v_b; - dva -= static_cast(12.0) * v_a - static_cast(6.0) * v_b; + dva -= 12.0 * v_a - 6.0 * v_b; } - const real_t scale = r2inv * dva; + const double scale = r2inv * dva; dvelocities[atom_i].x -= scale * dx; dvelocities[atom_i].y -= scale * dy; @@ -59,8 +56,8 @@ void accumulate_pair_force(Context& ctx, dvelocities[atom_j].y += scale * dy; dvelocities[atom_j].z += scale * dz; - energy.Ucoul += static_cast(ecoul); - energy.Uvdw += static_cast(evdw); + energy.Ucoul += ecoul; + energy.Uvdw += evdw; } void calc_nonbonded_ww_forces() { @@ -73,8 +70,8 @@ void calc_nonbonded_ww_forces() { const int base_i = ctx.n_atoms_solute + 3 * water_i; for (int water_j = water_i + 1; water_j < ctx.n_waters; ++water_j) { const int base_j = ctx.n_atoms_solute + 3 * water_j; - real_t oxygen_vdw_a = 0.0; - real_t oxygen_vdw_b = 0.0; + double oxygen_vdw_a = 0.0; + double oxygen_vdw_b = 0.0; calc_oxygen_vdw_parameters(ctx, base_i, base_j, &oxygen_vdw_a, &oxygen_vdw_b); for (int atom_i = 0; atom_i < 3; ++atom_i) { for (int atom_j = 0; atom_j < 3; ++atom_j) { diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu index dcd044ce..7c49cffb 100644 --- a/src/core/cuda/src/cuda_angle_force.cu +++ b/src/core/cuda/src/cuda_angle_force.cu @@ -48,14 +48,14 @@ __global__ void calc_angle_forces_kernel(int start, int end, 
angle_t* angles, co atomicAdd(energy_sum, energy); coord_t di = { - static_cast(f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length))), - static_cast(f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length))), - static_cast(f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length)))}; + f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length)), + f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length)), + f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length))}; coord_t dk = { - static_cast(f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length))), - static_cast(f1 * (rji.y / (rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length))), - static_cast(f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length)))}; + f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length)), + f1 * (rji.y / (rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length)), + f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length))}; atomicAdd(&dvelocities[i].x, dv * di.x); atomicAdd(&dvelocities[i].y, dv * di.y); diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu index 78707b12..e44678e0 100644 --- a/src/core/cuda/src/cuda_improper2_force.cu +++ b/src/core/cuda/src/cuda_improper2_force.cu @@ -51,8 +51,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z; rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x; - bj2inv = 1 / (rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z); - bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z); + bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2)); + bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + 
pow(rnk.z, 2)); bjinv = sqrt(bj2inv); bkinv = sqrt(bk2inv); diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu index 1e010f7e..49312337 100644 --- a/src/core/cuda/src/cuda_leapfrog.cu +++ b/src/core/cuda/src/cuda_leapfrog.cu @@ -45,20 +45,6 @@ __global__ void calc_leapfrog_kernel( coords[i].z += velocities[i].z * dt; } -__global__ void update_velocities_from_positions_kernel( - vel_t* velocities, - const coord_t* coords, - const coord_t* xcoords, - int n_atoms, - double dt) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= n_atoms) return; - - velocities[idx].x = (coords[idx].x - xcoords[idx].x) / dt; - velocities[idx].y = (coords[idx].y - xcoords[idx].y) / dt; - velocities[idx].z = (coords[idx].z - xcoords[idx].z) / dt; -} - void calc_leapfrog_host() { auto& host = Context::instance(); auto d_atypes = host.atypes->gpu_data_p; @@ -84,17 +70,24 @@ void calc_leapfrog_host() { host.dt); check_cuda(cudaDeviceSynchronize()); + host.velocities->download(); + host.dvelocities->download(); + host.coords->download(); + host.xcoords->download(); + // shake + // todo: Here is some problem, it writes into cpu memory, but we use gpu.. 
printf("n_shake_constraints: %d\n", host.n_shake_constraints); if (host.n_shake_constraints > 0) { calc_shake_constraints_host(); - update_velocities_from_positions_kernel<<>>( - d_velocities, - d_coords, - d_xcoords, - host.n_atoms, - host.dt); - check_cuda(cudaDeviceSynchronize()); + auto &velocities = host.velocities->cpu_data_p; + auto &coords = host.coords->cpu_data_p; + auto *xcoords = host.xcoords->cpu_data_p; + for (int i = 0; i < host.n_atoms; i++) { + velocities[i].x = (coords[i].x - xcoords[i].x) / host.dt; + velocities[i].y = (coords[i].y - xcoords[i].y) / host.dt; + velocities[i].z = (coords[i].z - xcoords[i].z) / host.dt; + } } } diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu index 78c4bc91..fa404ee7 100644 --- a/src/core/cuda/src/cuda_nonbonded_14_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu @@ -12,14 +12,6 @@ int* d_atom_to_qi = nullptr; double* d_evdw_totals = nullptr; double* d_ecoul_totals = nullptr; -__device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) { -#ifdef QDYN_SPFP - return rsqrtf(value); -#else - return rsqrt(value); -#endif -} - __device__ __forceinline__ int unified_parameter_index( int atom_idx, int state, @@ -37,53 +29,37 @@ __device__ __forceinline__ int unified_parameter_index( __device__ void calculate_nonbonded_14_pair( const coord_t& x, const coord_t& y, - real_t x_charge, - real_t y_charge, - real_t x_aii, - real_t y_aii, - real_t x_bii, - real_t y_bii, - nonbond_work_t coulomb_constant, - nonbond_work_t scaling, + double x_charge, + double y_charge, + double x_aii, + double y_aii, + double x_bii, + double y_bii, + double coulomb_constant, + double scaling, int vdw_rule, - nonbond_work_t lambda, - nonbond_work_t& evdw, - nonbond_work_t& ecoul, - nonbond_work_t& dv) { - const nonbond_work_t dx = static_cast(x.x - y.x); - const nonbond_work_t dy = static_cast(x.y - y.y); - const nonbond_work_t dz = static_cast(x.z - y.z); - const 
nonbond_work_t r = nonbond14_rsqrt(dx * dx + dy * dy + dz * dz); - const nonbond_work_t r2 = r * r; - const nonbond_work_t r6 = r2 * r2 * r2; + double lambda, + double& evdw, + double& ecoul, + double& dv) { + const double3 d = {x.x - y.x, x.y - y.y, x.z - y.z}; + const double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z); + const double r2 = r * r; + const double r6 = r2 * r2 * r2; ecoul = scaling * coulomb_constant * x_charge * y_charge * r * lambda; - nonbond_work_t v_a = 0.0; - nonbond_work_t v_b = 0.0; + double v_a = 0.0; + double v_b = 0.0; if (vdw_rule == VDW_GEOMETRIC) { - calc_vdw_geometric( - static_cast(x_aii), - static_cast(y_aii), - static_cast(x_bii), - static_cast(y_bii), - r6, - &v_a, - &v_b); + calc_vdw_geometric(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b); } else { - calc_vdw_arithmetic( - static_cast(x_aii), - static_cast(y_aii), - static_cast(x_bii), - static_cast(y_bii), - r6, - &v_a, - &v_b); + calc_vdw_arithmetic(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b); } v_a *= lambda; v_b *= lambda; evdw = v_a - v_b; - dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); + dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); } __global__ void calc_nonbonded_14_force_kernel( @@ -126,10 +102,10 @@ __global__ void calc_nonbonded_14_force_kernel( const coord_t ri = d_coords[ai]; const coord_t rj = d_coords[aj]; - nonbond_work_t evdw = 0.0; - nonbond_work_t ecoul = 0.0; - nonbond_work_t dv = 0.0; - const nonbond_work_t pair_lambda = static_cast((mode == NONBONDED_14_PP) ? 1.0 : lambda); + double evdw = 0.0; + double ecoul = 0.0; + double dv = 0.0; + const double pair_lambda = (mode == NONBONDED_14_PP) ? 
1.0 : lambda; calculate_nonbonded_14_pair( ri, @@ -140,23 +116,21 @@ __global__ void calc_nonbonded_14_force_kernel( aj_type.aii_1_4, ai_type.bii_1_4, aj_type.bii_1_4, - static_cast(d_topo.coulomb_constant), - static_cast(d_topo.el14_scale), + d_topo.coulomb_constant, + d_topo.el14_scale, d_topo.vdw_rule, pair_lambda, evdw, ecoul, dv); - const nonbond_work_t dx = static_cast(rj.x - ri.x); - const nonbond_work_t dy = static_cast(rj.y - ri.y); - const nonbond_work_t dz = static_cast(rj.z - ri.z); - atomicAdd(&d_dvelocities[ai].x, -dv * dx); - atomicAdd(&d_dvelocities[ai].y, -dv * dy); - atomicAdd(&d_dvelocities[ai].z, -dv * dz); - atomicAdd(&d_dvelocities[aj].x, dv * dx); - atomicAdd(&d_dvelocities[aj].y, dv * dy); - atomicAdd(&d_dvelocities[aj].z, dv * dz); + const double3 d = {rj.x - ri.x, rj.y - ri.y, rj.z - ri.z}; + atomicAdd(&d_dvelocities[ai].x, -dv * d.x); + atomicAdd(&d_dvelocities[ai].y, -dv * d.y); + atomicAdd(&d_dvelocities[ai].z, -dv * d.z); + atomicAdd(&d_dvelocities[aj].x, dv * d.x); + atomicAdd(&d_dvelocities[aj].y, dv * d.y); + atomicAdd(&d_dvelocities[aj].z, dv * d.z); atomicAdd(&evdw_totals[mode], evdw); atomicAdd(&ecoul_totals[mode], ecoul); diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu index ce3f73ae..432a7137 100644 --- a/src/core/cuda/src/cuda_nonbonded_force.cu +++ b/src/core/cuda/src/cuda_nonbonded_force.cu @@ -9,20 +9,6 @@ namespace CudaNonbondedForce { bool is_initialized = false; double *d_evdw_total, *d_ecoul_total; -struct nonbond_vec_t { - nonbond_work_t x; - nonbond_work_t y; - nonbond_work_t z; -}; - -__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) { -#ifdef QDYN_SPFP - return rsqrtf(value); -#else - return rsqrt(value); -#endif -} - __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) { x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f); y = t - (x * n - (x * (x - 1) >> 1)); @@ -33,13 +19,7 @@ __device__ 
__forceinline__ void idx2xy(int n, int t, int& x, int& y) { y += x; } -template -__device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffffffffu) { - return __shfl_sync(mask, v, srcLane); -} - -template <> -__device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) { +__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) { int2 a = *reinterpret_cast(&v); a.x = __shfl_sync(mask, a.x, srcLane); a.y = __shfl_sync(mask, a.y, srcLane); @@ -47,9 +27,9 @@ __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mas } __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) { - v.x = shfl_value(v.x, srcLane, mask); - v.y = shfl_value(v.y, srcLane, mask); - v.z = shfl_value(v.z, srcLane, mask); + v.x = shfl(v.x, srcLane, mask); + v.y = shfl(v.y, srcLane, mask); + v.z = shfl(v.z, srcLane, mask); return v; } @@ -57,23 +37,21 @@ __device__ void calculate_unforce_bound( const coord_t& x, const coord_t& y, - const real_t charge_product, + const double charge_product, const vdw_pair_param_t& pair_param, - const nonbond_work_t coulomb_constant, + const double coulomb_constant, - const nonbond_work_t scaling, - const nonbond_work_t lambda, + const double scaling, + const double lambda, - nonbond_work_t& evdw, - nonbond_work_t& ecoul, - nonbond_work_t& dv) { - const nonbond_work_t dx = static_cast(x.x - y.x); - const nonbond_work_t dy = static_cast(x.y - y.y); - const nonbond_work_t dz = static_cast(x.z - y.z); - const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz); - const nonbond_work_t r2 = r * r; - const nonbond_work_t r6 = r2 * r2 * r2; + double& evdw, + double& ecoul, + double& dv) { + double3 d = {x.x - y.x, x.y - y.y, x.z - y.z}; + double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z); + double r2 = r * r; + double r6 = r2 * r2 * r2; // double v_a = r6 * r6; // double v_b = r6; // ecoul = r; @@ -82,10 +60,10 @@ 
__device__ void calculate_unforce_bound( ecoul = scaling * coulomb_constant * charge_product * r * lambda; - const nonbond_work_t v_a = static_cast(pair_param.a) * r6 * r6 * lambda; - const nonbond_work_t v_b = static_cast(pair_param.b) * r6 * lambda; + double v_a = pair_param.a * r6 * r6 * lambda; + double v_b = pair_param.b * r6 * lambda; evdw = v_a - v_b; - dv = r2 * (-ecoul - static_cast(12.0) * v_a + static_cast(6.0) * v_b); + dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b); } __global__ void calc_nonbonded_force_kernel( @@ -94,7 +72,7 @@ __global__ void calc_nonbonded_force_kernel( const int* x_charges_types, const int* y_charges_types, - const real_t* charge_pair_products, + const double* charge_pair_products, const int* x_atypes_types, const int* y_atypes_types, @@ -161,7 +139,7 @@ __global__ void calc_nonbonded_force_kernel( int x_atom_idx = (x_idx < nx) ? x_idx_list[x_idx] : -1; int y_atom_idx = (y_idx < ny) ? y_idx_list[y_idx] : -1; - coord_t invalid = {static_cast(-1e9), static_cast(-1e9), static_cast(-1e9)}; + coord_t invalid = {-1e9, -1e9, -1e9}; coord_t x_coord = (x_atom_idx >= 0) ? d_coords[x_atom_idx] : invalid; coord_t y_coord = (y_atom_idx >= 0) ? d_coords[y_atom_idx] : invalid; @@ -174,8 +152,8 @@ __global__ void calc_nonbonded_force_kernel( int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1; int y_catype_type_idx = (y_idx < ny) ? 
y_atypes_types[y_idx] : -1; - nonbond_vec_t x_force = {0.0, 0.0, 0.0}; - nonbond_vec_t y_force = {0.0, 0.0, 0.0}; + double3 x_force = {0.0, 0.0, 0.0}; + double3 y_force = {0.0, 0.0, 0.0}; double evdw_sum = 0.0; double ecoul_sum = 0.0; @@ -216,9 +194,9 @@ __global__ void calc_nonbonded_force_kernel( y_charge_type_idx = __shfl_sync(mask, y_charge_type_idx, src); y_catype_type_idx = __shfl_sync(mask, y_catype_type_idx, src); - y_force.x = shfl_value(y_force.x, src, mask); - y_force.y = shfl_value(y_force.y, src, mask); - y_force.z = shfl_value(y_force.z, src, mask); + y_force.x = shfl(y_force.x, src, mask); + y_force.y = shfl(y_force.y, src, mask); + y_force.z = shfl(y_force.z, src, mask); }; if (disable_water_h_lj) { @@ -230,15 +208,13 @@ __global__ void calc_nonbonded_force_kernel( } } - const nonbond_work_t kernel_lambda = static_cast(lambda); - const nonbond_work_t coulomb_constant = static_cast(d_topo.coulomb_constant); const int charge_pair_row = x_charge_type_idx * n_charge_types; const int pair_row = (x_catype_type_idx >= 0) ? 
x_catype_type_idx * n_catype_types : 0; for (int i = 0; i < 32; i++) { if (is_valid()) { - nonbond_work_t scaling = static_cast(1.0); - real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx]; + double scaling = 1.0; + double charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx]; vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx]; // todo: Now the idx is wrong, should optimize it later @@ -249,16 +225,16 @@ __global__ void calc_nonbonded_force_kernel( // } // } - nonbond_work_t evdw = 0, ecoul = 0, dv = 0; + double evdw = 0, ecoul = 0, dv = 0; calculate_unforce_bound( x_coord, y_coord, charge_product, pair_param, - coulomb_constant, + d_topo.coulomb_constant, scaling, - kernel_lambda, + lambda, evdw, ecoul, dv); @@ -266,16 +242,14 @@ __global__ void calc_nonbonded_force_kernel( evdw_sum += evdw; ecoul_sum += ecoul; - const nonbond_work_t dx = static_cast(x_coord.x - y_coord.x); - const nonbond_work_t dy = static_cast(x_coord.y - y_coord.y); - const nonbond_work_t dz = static_cast(x_coord.z - y_coord.z); - y_force.x -= dv * dx; - y_force.y -= dv * dy; - y_force.z -= dv * dz; + double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z}; + y_force.x -= dv * d.x; + y_force.y -= dv * d.y; + y_force.z -= dv * d.z; - x_force.x += dv * dx; - x_force.y += dv * dy; - x_force.z += dv * dz; + x_force.x += dv * d.x; + x_force.y += dv * d.y; + x_force.z += dv * d.z; } do_shuffle(); } diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu index 13c37fbc..9b0eb667 100644 --- a/src/core/cuda/src/cuda_polx_water_force.cu +++ b/src/core/cuda/src/cuda_polx_water_force.cu @@ -46,7 +46,7 @@ __global__ void calc_polx_theta_and_shells( rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y; rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z; - rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z); + rm = sqrt(pow(rmu.x, 2) + 
pow(rmu.y, 2) + pow(rmu.z, 2)); rmu.x /= rm; rmu.y /= rm; @@ -55,7 +55,7 @@ __global__ void calc_polx_theta_and_shells( rcu.x = coords[wi].x - topo.solvent_center.x; rcu.y = coords[wi].y - topo.solvent_center.y; rcu.z = coords[wi].z - topo.solvent_center.z; - rc = sqrt(rcu.x * rcu.x + rcu.y * rcu.y + rcu.z * rcu.z); + rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2)); rcu.x /= rc; rcu.y /= rc; rcu.z /= rc; @@ -106,19 +106,18 @@ __global__ void calc_polx_water_forces_kernel( if (theta_val > M_PI) theta_val = M_PI; avtdum += theta[ii]; - const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr; - ener = .5 * md.polarisation_force * dtheta * dtheta; + ener = .5 * md.polarisation_force * pow(theta[ii] - theta_val + wshells[is].theta_corr, 2); // E_restraint.Upolx += ener; atomicAdd(energy, ener); - dv = md.polarisation_force * dtheta; + dv = md.polarisation_force * (theta[ii] - theta_val + wshells[is].theta_corr); wi = n_atoms_solute + 3 * ii; rmu.x = coords[wi + 1].x + coords[wi + 2].x - 2 * coords[wi].x; rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y; rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z; - rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z); + rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2)); rmu.x /= rm; rmu.y /= rm; @@ -127,7 +126,7 @@ __global__ void calc_polx_water_forces_kernel( rcu.x = coords[wi].x - topo.solvent_center.x; rcu.y = coords[wi].y - topo.solvent_center.y; rcu.z = coords[wi].z - topo.solvent_center.z; - rc = sqrt(rcu.x * rcu.x + rcu.y * rcu.y + rcu.z * rcu.z); + rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2)); rcu.x /= rc; rcu.y /= rc; rcu.z /= rc; diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu index 5221cb9e..a01fb536 100644 --- a/src/core/cuda/src/cuda_pshell_force.cu +++ b/src/core/cuda/src/cuda_pshell_force.cu @@ -34,7 +34,7 @@ __global__ void calc_pshell_force_kernel( dr.x = coords[i].x - coords_init[i].x; dr.y = 
coords[i].y - coords_init[i].y; dr.z = coords[i].z - coords_init[i].z; - r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; + r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); ener = 0.5 * k * r2; // printf("dr = %f %f %f\n", dr.x, dr.y, dr.z); diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu index f037e9db..06f5f5a3 100644 --- a/src/core/cuda/src/cuda_radix_water_force.cu +++ b/src/core/cuda/src/cuda_radix_water_force.cu @@ -29,18 +29,18 @@ __global__ void calc_radix_water_forces_kernel( dr.x = coords[i].x - topo.solvent_center.x; dr.y = coords[i].y - topo.solvent_center.y; dr.z = coords[i].z - topo.solvent_center.z; - double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); + double b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2)); double db = b - (topo.solvent_radius - shift); double ener, dv; if (db > 0) { - ener = 0.5 * md.radial_force * db * db - Dwmz; + ener = 0.5 * md.radial_force * pow(db, 2) - Dwmz; dv = md.radial_force * db / b; } else { if (b > 0.0) { double fexp = exp(awmz * db); - ener = Dwmz * (fexp * fexp - 2 * fexp); - dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b; + ener = Dwmz * (pow(fexp, 2) - 2 * fexp); + dv = -2 * Dwmz * awmz * (fexp - pow(fexp, 2)) / b; } else { dv = 0; ener = 0; @@ -91,6 +91,7 @@ void calc_radix_water_forces_host() { d_dvelocities, d_energy); check_cuda(cudaDeviceSynchronize()); + host.dvelocities->download(); check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost)); host.E_restraint.Uradx += energy; } diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu index b214aee9..eb0813f5 100644 --- a/src/core/cuda/src/cuda_restrang_force.cu +++ b/src/core/cuda/src/cuda_restrang_force.cu @@ -45,8 +45,8 @@ __global__ void calc_restrang_force_kernel( lambda = 1; } - r2ij = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; - r2jk = dr2.x * dr2.x + dr2.y * dr2.y + dr2.z * dr2.z; + r2ij = pow(dr.x, 2) + pow(dr.y, 2) + 
pow(dr.z, 2); + r2jk = pow(dr2.x, 2) + pow(dr2.y, 2) + pow(dr2.z, 2); rij = sqrt(r2ij); rjk = sqrt(r2jk); @@ -60,7 +60,7 @@ __global__ void calc_restrang_force_kernel( th = acos(cos_th); dth = th - to_radians_device(restrangs[ir].ang); - ener = .5 * restrangs[ir].k * dth * dth; + ener = .5 * restrangs[ir].k * pow(dth, 2); dv = lambda * restrangs[ir].k * dth; f1 = sin(th); diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu index 14f9b466..9aacf977 100644 --- a/src/core/cuda/src/cuda_restrdis_force.cu +++ b/src/core/cuda/src/cuda_restrdis_force.cu @@ -40,7 +40,7 @@ __global__ void calc_restrdis_forces_kernel( lambda = 1; } - b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); + b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2)); if (b < restrdists[ir].d1) { db = b - restrdists[ir].d1; } else if (b > restrdists[ir].d2) { @@ -50,7 +50,7 @@ __global__ void calc_restrdis_forces_kernel( return; } - ener = .5 * restrdists[ir].k * db * db; + ener = .5 * restrdists[ir].k * pow(db, 2); dv = lambda * restrdists[ir].k * db / b; atomicAdd(&dvelocities[j].x, dr.x * dv); diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu index 695e2b33..5f479364 100644 --- a/src/core/cuda/src/cuda_restrpos_force.cu +++ b/src/core/cuda/src/cuda_restrpos_force.cu @@ -39,9 +39,9 @@ __global__ void calc_restrpos_forces_kernel( lambda = 1; } - x2 = dr.x * dr.x; - y2 = dr.y * dr.y; - z2 = dr.z * dr.z; + x2 = pow(dr.x, 2); + y2 = pow(dr.y, 2); + z2 = pow(dr.z, 2); ener = .5 * restrspos[ir].k.x * x2 + .5 * restrspos[ir].k.y * y2 + .5 * restrspos[ir].k.z * z2; diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu index 71835e4e..b5db3552 100644 --- a/src/core/cuda/src/cuda_restrseq_force.cu +++ b/src/core/cuda/src/cuda_restrseq_force.cu @@ -46,7 +46,7 @@ __global__ void calc_restrseq_forces_kernel( dr.x /= n_ctr; dr.y /= n_ctr; dr.z /= n_ctr; - r2 = dr.x * dr.x + dr.y * 
dr.y + dr.z * dr.z; + r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); ener = .5 * k * r2; atomicAdd(upres_energy, ener); @@ -77,7 +77,7 @@ __global__ void calc_restrseq_forces_kernel( dr.x /= totmass; dr.y /= totmass; dr.z /= totmass; - r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; + r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); ener = .5 * k * r2; atomicAdd(upres_energy, ener); @@ -100,7 +100,7 @@ __global__ void calc_restrseq_forces_kernel( dr.y = coords[i].y - coords_init[i].y; dr.z = coords[i].z - coords_init[i].z; - r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; + r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2); ener = .5 * k * r2; atomicAdd(upres_energy, ener); diff --git a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu index c928bb71..12d890ad 100644 --- a/src/core/cuda/src/cuda_restrwall_force.cu +++ b/src/core/cuda/src/cuda_restrwall_force.cu @@ -29,11 +29,11 @@ __global__ void calc_restrwall_forces_kernel( dr.y = coords[i].y - topo.solvent_center.y; dr.z = coords[i].z - topo.solvent_center.z; - b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z); + b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2)); db = b - restrwalls[ir].d; if (db > 0) { - ener = .5 * k * db * db - restrwalls[ir].dMorse; + ener = .5 * k * pow(db, 2) - restrwalls[ir].dMorse; dv = k * db / b; } else { fexp = exp(restrwalls[ir].aMorse * db); diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu index bda47e50..e9dfd051 100644 --- a/src/core/cuda/src/cuda_shake_constraints.cu +++ b/src/core/cuda/src/cuda_shake_constraints.cu @@ -48,7 +48,7 @@ __global__ void calc_shake_constraints_kernel( xij.x = coords[ai].x - coords[aj].x; xij.y = coords[ai].y - coords[aj].y; xij.z = coords[ai].z - coords[aj].z; - xij2 = xij.x * xij.x + xij.y * xij.y + xij.z * xij.z; + xij2 = pow(xij.x, 2) + pow(xij.y, 2) + pow(xij.z, 2); diff = shake_bonds[shake + i].dist2 - xij2; if (fabs(diff) < shake_tol * shake_bonds[shake + 
i].dist2) { shake_bonds[shake + i].ready = true; @@ -86,7 +86,7 @@ __global__ void calc_shake_constraints_kernel( xxij.x = xcoords[ai].x - xcoords[aj].x; xxij.y = xcoords[ai].y - xcoords[aj].y; xxij.z = xcoords[ai].z - xcoords[aj].z; - xxij2 = xxij.x * xxij.x + xxij.y * xxij.y + xxij.z * xxij.z; + xxij2 = pow(xxij.x, 2) + pow(xxij.y, 2) + pow(xxij.z, 2); printf(">>> Shake failed, i = %d,j = %d, d = %f, d0 = %f", ai, aj, sqrt(xxij2), shake_bonds[shake + i].dist2); } return; @@ -154,5 +154,6 @@ int calc_shake_constraints_host() { d_mol_shake_offset); cudaDeviceSynchronize(); cudaMemcpy(&total_iterations_host, d_total_iterations, sizeof(int), cudaMemcpyDeviceToHost); + host.coords->download(); return host.n_molecules == 0 ? 0 : total_iterations_host / host.n_molecules; } diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu index baba687e..a02c6cf7 100644 --- a/src/core/cuda/src/cuda_temperature.cu +++ b/src/core/cuda/src/cuda_temperature.cu @@ -19,10 +19,7 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_atoms) return; double mass_i = catypes[atypes[idx].code - 1].m; - const double vx = velocities[idx].x; - const double vy = velocities[idx].y; - const double vz = velocities[idx].z; - double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz); + double ener = .5 * mass_i * (pow(velocities[idx].x, 2) + pow(velocities[idx].y, 2) + pow(velocities[idx].z, 2)); bool is_solute = (idx < n_atoms_solute); bool is_excluded = excluded[idx]; diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu index 97b687a6..6ef7cd45 100644 --- a/src/core/cuda/src/cuda_torsion_force.cu +++ b/src/core/cuda/src/cuda_torsion_force.cu @@ -57,8 +57,8 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z; rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x; - bj2inv = 1 / 
(rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z); - bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z); + bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2)); + bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + pow(rnk.z, 2)); bjinv = sqrt(bj2inv); bkinv = sqrt(bk2inv);